# Setup

In [None]:

from pathlib import Path
import os
import pprint
import pandas as pd
import pygwalker as pyg
import dabl

import sys
from pathlib import Path
#### For Path setup
def setup_project_path():
    """Locate the project root and make it importable.

    Walks upward from the current working directory and stops at the first
    directory that contains a ``.git`` entry. That directory is appended to
    ``sys.path`` (and a notice is printed) unless it is already listed there.

    Returns:
        The detected root as a ``Path``, or ``None`` when no inspected
        directory contains ``.git``.
    """
    cwd = Path.cwd()
    # The last element is the filesystem root, which is never inspected.
    for candidate in [cwd, *cwd.parents][:-1]:
        if not (candidate / '.git').exists():
            continue
        root_entry = str(candidate)
        if root_entry not in sys.path:
            sys.path.append(root_entry)
            print(f'Project root found: {candidate}')
        return candidate
    return None

# Resolve the project root, falling back to a fixed location when no
# enclosing directory with a .git entry is found.
project_root = setup_project_path() or Path("D:/dev/upstageailab5-ml-regression-ml_r4")
# When detection succeeded the root is already on sys.path, so this only
# adds the hard-coded fallback.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

#jupyter nbextension enable --py widgetsnbextension

import seaborn as sns
import matplotlib.pyplot as plt
# Use the 'Malgun Gothic' font family for matplotlib text (presumably so
# Korean labels render — confirm the font is installed on this machine).
plt.rc('font', family='Malgun Gothic')
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False
from src.logger import Logger
from src.preprocessing import DataPrep
from src.eda import EDA
from src.feature import FeatureEngineer, Clustering, XAI
from src.train import Model
from src.visualization import Visualizer
from src.utils import Utils, PathManager
## memory management
import gc
gc.collect()
########################################################################################################################################
# Logger and platform helpers used by the cells below.
logger_instance = Logger()
logger = logger_instance.logger
utils = Utils(logger)
utils.setup_font_and_path_platform()
current_platform = utils.current_platform

# Parent of the current working directory, reported in the log.
current_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
logger.info(f'#### Current workspalce: {current_path}')

# Windows uses a local checkout; macOS ('Darwin') and Linux share the same
# location, so every non-Windows platform takes the second branch.
if current_platform == 'Windows':
    base_path = Path(r'D:\dev\upstageailab5-ml-regression-ml_r4')
else:
    base_path = Path('/data/ephemeral/home/dev/upstageailab5-ml-regression-ml_r4')
logger.info(f'{current_platform} platform. Path: {base_path}')
########################################################################################################################################


########################################################################################################################################
### Data Prep


pm = PathManager(base_path)
# Bare attribute access: the value is discarded because this is not the last
# expression of the cell. Kept in case the attribute is a property with side
# effects — TODO confirm and remove if it is a plain attribute.
pm.config

# Fetch the directories managed by PathManager as plain strings.
data_path = pm.get_path('data', as_str=True)
prep_path = pm.get_path('processed_path', as_str=True)
output_path = pm.get_path('output', as_str=True)

# Assemble the notebook-wide settings dictionary.
config = {
    'out_path': output_path,
    'base_path': str(pm.base_path),
    'data_paths': {
        'subway_feature': str(Path(data_path) / 'subway_feature.csv'),
        'bus_feature': str(Path(data_path) / 'bus_feature.csv'),
    },
    'prep_paths': {
        'baseline': str(Path(prep_path) / 'df_feature.csv'),
        'auto': str(Path(prep_path) / 'df_auto_prep.csv'),
        'scaled': str(Path(prep_path) / 'df_scaled.csv'),
        'encoded': str(Path(prep_path) / 'df_encoded.csv')
    },
    'logger': logger_instance
}

# Convenience aliases for individual preprocessed-file paths.
path_baseline = config['prep_paths']['baseline']
path_auto = config['prep_paths']['auto']
# 'feat' and 'feat_add' are not among the prep_paths keys defined above, so
# plain indexing raised KeyError and aborted the cell before config.update()
# and pprint ran. .get() leaves these as None until matching entries are
# added to prep_paths.
path_feat = config['prep_paths'].get('feat')
path_feat_add = config['prep_paths'].get('feat_add')

# Merge PathManager's own path table into config (same-named keys are
# overwritten) and show the result.
config.update(pm.paths)
pprint.pprint(config)

# from src.utils import Utils
# concat =Utils.clean_df(concat)
# feat_cluster =Utils.clean_df(feat_cluster)
# feat_transport =Utils.clean_df(feat_transport)




Project root found: /data/ephemeral/home/dev/upstageailab5-ml-regression-ml_r4
2024-11-14 01:33:41,665 - root - INFO - Initialized Logger.
2024-11-14 01:33:41,666 - root - INFO - Linux platform. Font: ['NanumGothic']
2024-11-14 01:33:41,667 - root - INFO - #### Current workspalce: /data/ephemeral/home/dev/upstageailab5-ml-regression-ml_r4
2024-11-14 01:33:41,667 - root - INFO - Linux platform. Path: /data/ephemeral/home/dev/upstageailab5-ml-regression-ml_r4
{'base_path': '/data/ephemeral/home/dev/upstageailab5-ml-regression-ml_r4',
 'config': PosixPath('/data/ephemeral/home/dev/upstageailab5-ml-regression-ml_r4/config'),
 'data': PosixPath('/data/ephemeral/home/dev/upstageailab5-ml-regression-ml_r4/data'),
 'data_paths': {'bus_feature': '/data/ephemeral/home/dev/upstageailab5-ml-regression-ml_r4/data/bus_feature.csv',
                'subway_feature': '/data/ephemeral/home/dev/upstageailab5-ml-regression-ml_r4/data/subway_feature.csv'},
 'logger': <src.logger.Logger object at 0x7f82558

In [4]:

# Load the baseline table and two feature tables from the preprocessed-data
# directory; the first CSV column of each file becomes the index.
concat = pd.read_csv(Path(prep_path) / 'df_baseline.csv', index_col=0)
feat_cluster = pd.read_csv(Path(prep_path) / 'feat_cluster.csv', index_col=0)
feat_transport = pd.read_csv(Path(prep_path) / 'feat_transport.csv', index_col=0)

# One line with shape and column labels for each of the three tables.
print(
    concat.shape, concat.columns,
    feat_cluster.shape, feat_cluster.columns,
    feat_transport.shape, feat_transport.columns,
)

FileNotFoundError: [Errno 2] No such file or directory: '/data/ephemeral/home/dev/upstageailab5-ml-regression-ml_r4/data/preprocessed/df_baseline.csv'

In [None]:
from src.preprocessing import DataPrep

# Read the raw table, using the first CSV column as the index.
df_raw = pd.read_csv(Path(prep_path) / 'df_raw.csv', index_col=0)
# Presumably builds a profile of the raw table labelled 'raw' — confirm
# against DataPrep.get_data_profile.
profile_df = DataPrep.get_data_profile(df_raw, 'raw')

In [1]:
# Preview the first rows of `concat`; the name must already be bound by the
# cell that reads 'df_baseline.csv', otherwise this raises NameError.
concat.head()

NameError: name 'concat' is not defined

In [None]:
# Base names of the feature CSV files; each empty-string value is a
# placeholder that load_feat() overwrites with the loaded DataFrame.
dict_feat = dict.fromkeys(
    ('feat_baseline', 'feat_cluster', 'feat_transport', 'feat_gangnam_apt_dist'),
    '',
)

def load_feat(dict_feat, prep_path):
    """Fill ``dict_feat`` with the CSV tables named by its keys.

    For every key, ``<prep_path>/<key>.csv`` is read, columns whose label
    matches ``^Unnamed`` are dropped, and a summary line (column count,
    shape, column index) is printed.

    Args:
        dict_feat: Mapping whose keys are CSV base names; its values are
            overwritten in place with the loaded DataFrames.
        prep_path: Directory that contains the CSV files.

    Returns:
        The same ``dict_feat`` object, now mapping each key to a DataFrame.
    """
    for name in dict_feat:
        table = pd.read_csv(os.path.join(prep_path, f'{name}.csv'))
        # Columns labelled 'Unnamed...' are discarded before storing.
        unnamed = table.columns.str.contains('^Unnamed')
        table = table.loc[:, ~unnamed]
        print(f'{len(table.columns)}', table.shape, table.columns)
        dict_feat[name] = table
    return dict_feat



In [None]:
# Replace the placeholder values in dict_feat with the DataFrames read from
# prep_path; load_feat also prints one summary line per table.
dict_feat = load_feat(dict_feat, prep_path)



6 (1128094, 6) Index(['구', '동', '계약년', '계약월', '강남여부', '신축여부'], dtype='object')
4 (1128094, 4) Index(['cluster_dist_transport', 'cluster_dist_transport_count',
       'cluster_select', 'cluster_select_count'],
      dtype='object')
10 (1128094, 10) Index(['subway_station_area_count', 'subway_direct_influence_count',
       'subway_indirect_influence_count', 'subway_shortest_distance',
       'subway_zone_type', 'bus_station_area_count',
       'bus_direct_influence_count', 'bus_indirect_influence_count',
       'bus_shortest_distance', 'bus_zone_type'],
      dtype='object')
1 (1128094, 1) Index(['대장아파트_거리'], dtype='object')


In [24]:
# Pass every table through add_prefix with an empty prefix (string column
# labels stay as they are), then place the tables side by side.
renamed_dfs = {
    name: frame.add_prefix("")
    for name, frame in dict_feat.items()
}
df_feat = pd.concat(list(renamed_dfs.values()), axis=1)

In [25]:
# Not the last expression in the cell, so this preview is computed but not
# displayed.
df_feat.head()
# Persist the combined table; to_csv writes the index by default, which
# yields an unnamed first column in the file.
df_feat.to_csv(os.path.join(prep_path, 'feat_concat_raw.csv'))

In [26]:
# Display the column labels of the combined feature table.
df_feat.columns

Index(['구', '동', '계약년', '계약월', '강남여부', '신축여부', 'cluster_dist_transport',
       'cluster_dist_transport_count', 'cluster_select',
       'cluster_select_count', 'subway_station_area_count',
       'subway_direct_influence_count', 'subway_indirect_influence_count',
       'subway_shortest_distance', 'subway_zone_type',
       'bus_station_area_count', 'bus_direct_influence_count',
       'bus_indirect_influence_count', 'bus_shortest_distance',
       'bus_zone_type', '대장아파트_거리'],
      dtype='object')

In [22]:
# Summarise dtypes and non-null counts of the combined table.
# NOTE(review): the recorded output shows 'feat_baseline_'/'feat_cluster_'
# prefixed names, apparently from an earlier run (execution count 22) that
# predates the unprefixed concat — re-run this cell to refresh it.
df_feat.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1128094 entries, 0 to 1128093
Data columns (total 21 columns):
 #   Column                                          Non-Null Count    Dtype  
---  ------                                          --------------    -----  
 0   feat_baseline_구                                 1128094 non-null  object 
 1   feat_baseline_동                                 1128094 non-null  object 
 2   feat_baseline_계약년                               1128094 non-null  int64  
 3   feat_baseline_계약월                               1128094 non-null  int64  
 4   feat_baseline_강남여부                              1128094 non-null  int64  
 5   feat_baseline_신축여부                              1128094 non-null  int64  
 6   feat_cluster_cluster_dist_transport             1128094 non-null  int64  
 7   feat_cluster_cluster_dist_transport_count       1128094 non-null  int64  
 8   feat_cluster_cluster_select                     1128094 non-null  int64  
 9   feat_cluster_