# Setup

In [None]:

from pathlib import Path
import os
import pprint
import pandas as pd
import pygwalker as pyg
import dabl

import sys
from pathlib import Path
#### For Path setup
def setup_project_path():
    """프로젝트 루트 경로를 찾아서 파이썬 경로에 추가"""
    current = Path.cwd()
    while current != current.parent:
        if (current / '.git').exists():
            if str(current) not in sys.path:
                sys.path.append(str(current))
                print(f'Project root found: {current}')
            return current
        current = current.parent
    return None

# 프로젝트 경로 설정
project_root = setup_project_path()
if project_root is None:
    # 프로젝트 루트를 찾지 못했다면 직접 지정
    project_root = Path("D:/dev/upstageailab5-ml-regression-ml_r4")
    if str(project_root) not in sys.path:
        sys.path.append(str(project_root))

#jupyter nbextension enable --py widgetsnbextension

import seaborn as sns
import matplotlib.pyplot as plt
plt.rc('font', family='Malgun Gothic')
plt.rcParams['axes.unicode_minus'] = False
from src.logger import Logger
from src.preprocessing import DataPrep
from src.eda import EDA
from src.feature import FeatureEngineer, Clustering, XAI
from src.train import Model
from src.visualization import Visualizer
from src.utils import Utils, PathManager
## memory management
import gc
gc.collect()
########################################################################################################################################
logger_instance = Logger()
logger = logger_instance.logger
utils = Utils(logger)
utils.setup_font_and_path_platform()
current_platform = utils.current_platform
#os.environ['PYTHONPATH'] = r'D:\dev\upstageailab5-ml-regression-ml_r4'
current_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
logger.info(f'#### Current workspalce: {current_path}')
if current_platform == 'Windows':
    base_path = Path(r'D:\dev\upstageailab5-ml-regression-ml_r4')
    logger.info(f'{current_platform} platform. Path: {base_path}')
elif current_platform == 'Darwin':          # Mac
    base_path = Path('/data/ephemeral/home/dev/upstageailab5-ml-regression-ml_r4')
    logger.info(f'{current_platform} platform. Path: {base_path}')
else:
    base_path = Path('/data/ephemeral/home/dev/upstageailab5-ml-regression-ml_r4')    # Linux
    logger.info(f'{current_platform} platform. Path: {base_path}')
########################################################################################################################################


########################################################################################################################################
### Data Prep


pm = PathManager(base_path)
pm.config

# PathManager에서 경로 가져오기
data_path = pm.get_path('data', as_str=True)
prep_path = pm.get_path('processed_path', as_str=True)
output_path = pm.get_path('output', as_str=True)

# 설정 딕셔너리 생성
config = {   
    'out_path': output_path,
    'base_path': str(pm.base_path),
    'data_paths': {
        'subway_feature': str(Path(data_path) / 'subway_feature.csv'),
        'bus_feature': str(Path(data_path) / 'bus_feature.csv'),
    },
    'prep_paths': {
        'baseline': str(Path(prep_path) / 'df_feature.csv'),
        'auto': str(Path(prep_path) / 'df_auto_prep.csv'),
        'scaled': str(Path(prep_path) / 'df_scaled.csv'),
        'encoded': str(Path(prep_path) / 'df_encoded.csv')
    },
    'logger': logger_instance
}

# 필요한 경우 개별 경로 접근
path_baseline = config['prep_paths']['baseline']
path_auto = config['prep_paths']['auto']
path_scaled = config['prep_paths']['scaled']
path_encoded = config['prep_paths']['encoded']

config.update(pm.paths)
pprint.pprint(config)

# from src.utils import Utils
# concat =Utils.clean_df(concat)
# feat_cluster =Utils.clean_df(feat_cluster)
# feat_transport =Utils.clean_df(feat_transport)




2024-11-14 01:35:43,684 - root - INFO - Initialized Logger.
2024-11-14 01:35:43,685 - root - INFO - Linux platform. Font: ['NanumGothic']
2024-11-14 01:35:43,686 - root - INFO - #### Current workspalce: /data/ephemeral/home/dev/upstageailab5-ml-regression-ml_r4
2024-11-14 01:35:43,686 - root - INFO - Linux platform. Path: /data/ephemeral/home/dev/upstageailab5-ml-regression-ml_r4
{'base_path': '/data/ephemeral/home/dev/upstageailab5-ml-regression-ml_r4',
 'config': PosixPath('/data/ephemeral/home/dev/upstageailab5-ml-regression-ml_r4/config'),
 'data': PosixPath('/data/ephemeral/home/dev/upstageailab5-ml-regression-ml_r4/data'),
 'data_paths': {'bus_feature': '/data/ephemeral/home/dev/upstageailab5-ml-regression-ml_r4/data/bus_feature.csv',
                'subway_feature': '/data/ephemeral/home/dev/upstageailab5-ml-regression-ml_r4/data/subway_feature.csv'},
 'logger': <src.logger.Logger object at 0x7f82fd8fda00>,
 'logs': PosixPath('/data/ephemeral/home/dev/upstageailab5-ml-regressio

In [7]:

#concat = pd.read_csv(os.path.join(prep_path, 'df_baseline.csv'), index_col=0)
# feat_cluster = pd.read_csv(os.path.join(prep_path, 'feat_cluster.csv'), index_col=0)
# feat_transport = pd.read_csv(os.path.join(prep_path, 'feat_transport.csv'), index_col=0)
baseline =pd.read_csv(path_baseline)
auto = pd.read_csv(path_auto)



In [18]:
df=pd.read_csv(os.path.join(prep_path, 'df_null-preped_freq-encoded.csv'))

In [19]:
df.columns

Index(['Unnamed: 0', '시군구', '번지', '본번', '부번', '아파트명', '도로명',
       'k-단지분류(아파트,주상복합등등)', 'k-세대타입(분양형태)', 'k-관리방식', 'k-복도유형', 'k-난방방식',
       'k-건설사(시공사)', 'k-시행사', 'k-사용검사일-사용승인일', 'k-홈페이지', 'k-수정일자', '경비비관리형태',
       '세대전기계약방법', '청소비관리형태', '기타/의무/임대/임의=1/2/3/4', '단지승인일', '사용허가여부',
       '관리비 업로드', '단지신청일', '전용면적', '계약년월', '계약일', '층', '건축년도', 'k-전체동수',
       'k-전체세대수', 'k-연면적', 'k-주거전용면적', 'k-관리비부과면적', 'k-전용면적별세대현황(60㎡이하)',
       'k-전용면적별세대현황(60㎡~85㎡이하)', 'k-85㎡~135㎡이하', '건축면적', '주차대수', '좌표X', '좌표Y',
       'target', 'is_test'],
      dtype='object')

In [17]:
baseline.columns

Index(['번지', '본번', '부번', '아파트명', '전용면적', '계약일', '층', '건축년도', '도로명',
       'k-단지분류(아파트,주상복합등등)', 'k-세대타입(분양형태)', 'k-관리방식', 'k-복도유형', 'k-난방방식',
       'k-전체동수', 'k-전체세대수', 'k-건설사(시공사)', 'k-시행사', 'k-사용검사일-사용승인일', 'k-연면적',
       'k-주거전용면적', 'k-관리비부과면적', 'k-전용면적별세대현황(60㎡이하)', 'k-전용면적별세대현황(60㎡~85㎡이하)',
       'k-85㎡~135㎡이하', 'k-수정일자', '경비비관리형태', '세대전기계약방법', '청소비관리형태', '건축면적',
       '주차대수', '기타/의무/임대/임의=1/2/3/4', '단지승인일', '사용허가여부', '관리비 업로드', '좌표X',
       '좌표Y', '단지신청일', 'target', '구', '동', '계약년', '계약월', '강남여부', '신축여부',
       'is_test'],
      dtype='object')

In [16]:
encoded.columns

Index(['k-건설사(시공사)', 'k-관리방식', 'k-난방방식', 'k-단지분류(아파트,주상복합등등)', 'k-복도유형',
       'k-사용검사일-사용승인일', 'k-세대타입(분양형태)', 'k-수정일자', 'k-시행사', '경비비관리형태',
       '관리비 업로드', '기타/의무/임대/임의=1/2/3/4', '단지승인일', '단지신청일', '도로명', '번지',
       '사용허가여부', '세대전기계약방법', '아파트명', '청소비관리형태', '강남여부', '신축여부', '구', '동',
       'cluster_dist_transport', 'cluster_dist_transport_count',
       'cluster_select', 'cluster_select_count', 'bus_direct_influence_count',
       'subway_zone_type', 'subway_station_area_count', 'bus_zone_type',
       'bus_indirect_influence_count', 'bus_station_area_count',
       'subway_indirect_influence_count', 'subway_direct_influence_count',
       'k-85㎡~135㎡이하', 'k-관리비부과면적', 'k-연면적', 'k-전용면적별세대현황(60㎡~85㎡이하)',
       'k-전용면적별세대현황(60㎡이하)', 'k-전체동수', 'k-전체세대수', 'k-주거전용면적', '건축년도', '건축면적',
       '계약일', '본번', '부번', '전용면적', '좌표X', '좌표Y', '주차대수', '층', '계약년', '계약월',
       'subway_shortest_distance', 'bus_shortest_distance', 'is_test',
       'target'],
      dtype='object')

In [14]:
scaled = pd.read_csv(path_scaled)


In [15]:
scaled.head()

Unnamed: 0,k-85㎡~135㎡이하,k-관리비부과면적,k-연면적,k-전용면적별세대현황(60㎡~85㎡이하),k-전용면적별세대현황(60㎡이하),k-전체동수,k-전체세대수,k-주거전용면적,건축년도,건축면적,...,bus_direct_influence_count,subway_zone_type,subway_station_area_count,bus_zone_type,bus_indirect_influence_count,bus_station_area_count,subway_indirect_influence_count,subway_direct_influence_count,target,is_test
0,-1.421189,-1.408835,-1.627235,0.011021,-1.361858,-0.036579,-1.266376,-1.337363,-1.0,-0.054029,...,16,3,0,1,150,5,7,0,124000.0,0
1,-1.421189,-1.408835,-1.627235,0.011021,-1.361858,-0.036579,-1.266376,-1.337363,-1.0,-0.054029,...,16,3,0,1,150,5,7,0,123500.0,0
2,-1.421189,-1.408835,-1.627235,0.011021,-1.361858,-0.036579,-1.266376,-1.337363,-1.0,-0.054029,...,16,3,0,1,150,5,7,0,91500.0,0
3,-1.421189,-1.408835,-1.627235,0.011021,-1.361858,-0.036579,-1.266376,-1.337363,-1.0,-0.054029,...,16,3,0,1,150,5,7,0,130000.0,0
4,-1.421189,-1.408835,-1.627235,0.011021,-1.361858,-0.036579,-1.266376,-1.337363,-1.0,-0.054029,...,16,3,0,1,150,5,7,0,117000.0,0


In [None]:
from src.preprocessing import DataPrep

df_raw = pd.read_csv(os.path.join(prep_path, 'df_raw.csv'), index_col=0)

profile_df = DataPrep.get_data_profile(df_raw, 'raw')

In [1]:
concat.head()

NameError: name 'concat' is not defined

In [None]:
dict_feat = {'feat_baseline': '',
            'feat_cluster': '', 
            'feat_transport': '', 
            'feat_gangnam_apt_dist': ''}

def load_feat(dict_feat, prep_path):
    for key, _ in dict_feat.items():
        df=pd.read_csv(os.path.join(prep_path,f'{key}.csv'))
        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
        print(f'{len(df.columns)}', df.shape, df.columns)
        dict_feat[key] = df
    return dict_feat



In [None]:
dict_feat = load_feat(dict_feat, prep_path)



6 (1128094, 6) Index(['구', '동', '계약년', '계약월', '강남여부', '신축여부'], dtype='object')
4 (1128094, 4) Index(['cluster_dist_transport', 'cluster_dist_transport_count',
       'cluster_select', 'cluster_select_count'],
      dtype='object')
10 (1128094, 10) Index(['subway_station_area_count', 'subway_direct_influence_count',
       'subway_indirect_influence_count', 'subway_shortest_distance',
       'subway_zone_type', 'bus_station_area_count',
       'bus_direct_influence_count', 'bus_indirect_influence_count',
       'bus_shortest_distance', 'bus_zone_type'],
      dtype='object')
1 (1128094, 1) Index(['대장아파트_거리'], dtype='object')


In [24]:
renamed_dfs = {key: df.add_prefix(f"") for key, df in dict_feat.items()}
df_feat = pd.concat(renamed_dfs.values(),axis=1)

In [25]:
df_feat.head()
df_feat.to_csv(os.path.join(prep_path, 'feat_concat_raw.csv'))