# Setup

In [2]:

from pathlib import Path
import os
import pprint
import pandas as pd
import pygwalker as pyg
import dabl

import sys
from pathlib import Path
#### For Path setup
def setup_project_path():
    """Find the project root and add it to the Python import path.

    Starting at the current working directory and moving upward, the first
    directory that contains a ``.git`` entry is treated as the project root.
    The filesystem root itself is never examined.

    Returns:
        The project root as a ``Path``, or ``None`` when no examined
        directory contains ``.git``. When the root is found and is not yet
        on ``sys.path``, it is appended and a message is printed.
    """
    start = Path.cwd()
    # The last element of the chain is the filesystem root; leave it out.
    lineage = [start, *start.parents]
    for candidate in lineage[:-1]:
        if not (candidate / '.git').exists():
            continue
        root_entry = str(candidate)
        if root_entry not in sys.path:
            sys.path.append(root_entry)
            print(f'Project root found: {candidate}')
        return candidate
    return None

# Set up the project path
project_root = setup_project_path()
if project_root is None:
    # The project root could not be found, so specify it directly
    # (hard-coded location on a D: drive).
    project_root = Path("D:/dev/upstageailab5-ml-regression-ml_r4")
    # Same guard as inside setup_project_path(): avoid duplicate sys.path entries.
    if str(project_root) not in sys.path:
        sys.path.append(str(project_root))

#jupyter nbextension enable --py widgetsnbextension

import seaborn as sns
import matplotlib.pyplot as plt
# Default plot font. utils.setup_font_and_path_platform() is called further down
# in this cell — NOTE(review): presumably it replaces this font per platform; confirm.
plt.rc('font', family='Malgun Gothic')
# Draw minus signs as an ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

from src.logger import Logger
from src.preprocessing import DataPrep
from src.eda import EDA
from src.feature import FeatureEngineer, Clustering, XAI
from src.train import Model
from src.utils import Utils, PathManager
## memory management
import gc
gc.collect()  # run a garbage-collection pass before the heavy cells
########################################################################################################################################
# Shared logger and the project utility helper built on top of it.
logger_instance = Logger()
logger = logger_instance.logger
utils = Utils(logger)
utils.setup_font_and_path_platform()
current_platform = utils.current_platform
#os.environ['PYTHONPATH'] = r'D:\dev\upstageailab5-ml-regression-ml_r4'
# Workspace = parent of the current working directory.
current_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
logger.info(f'#### Current workspalce: {current_path}')
# Only Windows has its own checkout location; every other platform
# (Darwin/Mac and Linux alike) uses the same server path.
if current_platform == 'Windows':
    base_path = Path(r'D:\dev\upstageailab5-ml-regression-ml_r4')
else:
    base_path = Path('/data/ephemeral/home/dev/upstageailab5-ml-regression-ml_r4')
logger.info(f'{current_platform} platform. Path: {base_path}')
########################################################################################################################################

########################################################################################################################################
### Data Prep


# Path manager rooted at the platform-specific base_path selected earlier in this cell.
pm = PathManager(base_path)
# Bare attribute access; mid-cell it displays nothing.
# NOTE(review): looks like leftover inspection — confirm it has no needed side effect.
pm.config

# Fetch the paths from PathManager
data_path = pm.get_path('data', as_str=True)
prep_path = pm.get_path('processed_path', as_str=True)
output_path = pm.get_path('output', as_str=True)

# Build the settings dictionary
config = {   
    'out_path': output_path,
    'base_path': str(pm.base_path),
    'data_paths': {
        'subway_feature': str(Path(data_path) / 'subway_feature.csv'),
        'bus_feature': str(Path(data_path) / 'bus_feature.csv'),
    },
    'prep_paths': {
        'baseline': str(Path(prep_path) / 'df_raw.csv'),
        'auto': str(Path(prep_path) / 'df_auto_prep.csv'),
        'feature': str(Path(prep_path) / 'feat_concat_raw.csv'),
        'df_feat_add': str(Path(prep_path) / 'df_feature.csv')
    },
    'logger': logger_instance
}
# Shortcuts to individual paths, for use where needed
path_baseline = config['prep_paths']['baseline']
path_auto = config['prep_paths']['auto']
path_feat = config['prep_paths']['feature']
path_feat_add = config['prep_paths']['df_feat_add']

# Merge PathManager's own path entries into config; dict.update overwrites
# any key that already exists under the same name.
config.update(pm.paths)
pprint.pprint(config)


2024-11-14 15:16:36,123 - root - INFO - Initialized Logger.
2024-11-14 15:16:36,123 - root - INFO - Linux platform. Font: ['NanumGothic']
2024-11-14 15:16:36,124 - root - INFO - #### Current workspalce: /data/ephemeral/home/dev/upstageailab5-ml-regression-ml_r4
2024-11-14 15:16:36,125 - root - INFO - Linux platform. Path: /data/ephemeral/home/dev/upstageailab5-ml-regression-ml_r4
{'base_path': '/data/ephemeral/home/dev/upstageailab5-ml-regression-ml_r4',
 'config': PosixPath('/data/ephemeral/home/dev/upstageailab5-ml-regression-ml_r4/config'),
 'data': PosixPath('/data/ephemeral/home/dev/upstageailab5-ml-regression-ml_r4/data'),
 'data_paths': {'bus_feature': '/data/ephemeral/home/dev/upstageailab5-ml-regression-ml_r4/data/bus_feature.csv',
                'subway_feature': '/data/ephemeral/home/dev/upstageailab5-ml-regression-ml_r4/data/subway_feature.csv'},
 'logger': <src.logger.Logger object at 0x7f2869bab280>,
 'logs': PosixPath('/data/ephemeral/home/dev/upstageailab5-ml-regressio

In [4]:
from sweep.feature_selection import DataPrep, Utils

In [6]:
# Load df_raw.csv from the processed-data directory, using the CSV's first column as the index.
df = pd.read_csv(os.path.join(prep_path, 'df_raw.csv'), index_col=0)

In [7]:
# Column-name groups. NOTE(review): apart from cols_to_str and cols_to_num,
# none of these lists is referenced in the visible cells — confirm whether
# later cells still need them.
cols_id = ['is_test', 'target']
cols = ['계약년', '전용면적', '강남여부', '구', '건축년도', '좌표X', '좌표Y', '동']
cols_daniel = ['전용면적', '계약년', '계약월', '구', '동', '층', '건축년도'] # + address # 시군구+번지
cols_to_remove = ['등기신청일자', '거래유형', '중개사소재지'] 
cols_to_remove2 = ['홈페이지','k-전화번호', 'k-팩스번호', '고용보험관리번호']
cols_add = ['시군구+번지']

# NOTE(review): this value is replaced by an empty list before its first use below.
cols_to_str = ['본번', '부번'] 
cols_to_str_2 = ['구', '동', '강남여부', '신축여부', 'cluster_dist_transport', 'cluster_dist_transport_count', 'cluster_select','subway_zone_type', 'bus_zone_type']

cols_date = ['단지신청일', '단지승인일','k-사용검사일-사용승인일']

group_cols = ['시군구', '도로명', '아파트명']
cols_to_num = []#['좌표X', '좌표Y', '위도', '경도']
#cols_to_select = ['시군구', '전용면적', '계약년월', '건축년도']

# 1. Remove null-heavy columns
# The cell output splits columns at a null ratio of 0.9; presumably those above
# the threshold are dropped — confirm against DataPrep.remove_null.
thr_null = 0.9
df = DataPrep.remove_null(df, thr_null)
# 2. Prep null
# Both conversion lists are empty at this point, so no columns are named for conversion.
cols_to_str = []


df = DataPrep.convert_dtype(df, cols_to_str, cols_to_num)
continuous_columns, categorical_columns = Utils.categorical_numeric(df)
df = DataPrep.prep_null(df, continuous_columns, categorical_columns)
# Feature engineering
feat_eng = FeatureEngineer()
df = feat_eng.prep_feat(df, 'address')
# 3. Encode categorical 
# NOTE(review): categorical_columns was computed before prep_feat(); if that step
# adds or removes columns the list may be stale — confirm.
df_train, X_test = Utils.unconcat_train_test(df)
y_train = df_train['target']
X_train = df_train.drop(columns=['target'])
# df_train is rebound here to the first value returned by encode_label.
df_train, df_test, label_encoders = DataPrep.encode_label(X_train, X_test, categorical_columns)


* 결측치 비율이 0.9 이하인 변수들: ['시군구', '번지', '본번', '부번', '아파트명', '전용면적', '계약년월', '계약일', '층', '건축년도', '도로명', '등기신청일자', '거래유형', '중개사소재지', 'k-단지분류(아파트,주상복합등등)', 'k-전화번호', 'k-팩스번호', 'k-세대타입(분양형태)', 'k-관리방식', 'k-복도유형', 'k-난방방식', 'k-전체동수', 'k-전체세대수', 'k-건설사(시공사)', 'k-시행사', 'k-사용검사일-사용승인일', 'k-연면적', 'k-주거전용면적', 'k-관리비부과면적', 'k-전용면적별세대현황(60㎡이하)', 'k-전용면적별세대현황(60㎡~85㎡이하)', 'k-85㎡~135㎡이하', 'k-홈페이지', 'k-수정일자', '고용보험관리번호', '경비비관리형태', '세대전기계약방법', '청소비관리형태', '건축면적', '주차대수', '기타/의무/임대/임의=1/2/3/4', '단지승인일', '사용허가여부', '관리비 업로드', '좌표X', '좌표Y', '단지신청일', 'target', 'is_test']
* 결측치 비율이 0.9 초과인 변수들: ['해제사유발생일', '단지소개기존clob', 'k-135㎡초과', 'k-등록일자']

#### Interpolation for Null values
보간 전 shape: (1128094, 49)
보간 전 null 개수: 시군구                             0
번지                            227
본번                             75
부번                             75
아파트명                         2136
전용면적                            0
계약년월                            0
계약일                             0
층                          

100%|██████████| 28/28 [00:07<00:00,  3.73it/s]


In [None]:
from sweep.sweep_config import config_baseline

# NOTE(review): this rebinds `config`, replacing the path/settings dictionary
# built in the setup cell.
config = config_baseline.get('parameters')
    
pprint.pprint(config)
# NOTE(review): `xgb` is not imported in any visible cell; unless it is imported
# elsewhere, this call raises NameError.
# NOTE(review): hyperparameters are read by attribute access; if
# .get('parameters') returns a plain dict this raises AttributeError — confirm the type.
model = xgb.XGBRegressor(
            eta=config.xgboost_eta,
            max_depth=config.xgboost_max_depth,
            subsample=config.xgboost_subsample,
            colsample_bytree=config.xgboost_colsample_bytree,
            gamma=config.xgboost_gamma,
            reg_lambda=config.xgboost_reg_lambda,  
            reg_alpha=config.xgboost_alpha,  # NOTE(review): name lacks the `reg_` part used for reg_lambda — confirm the key
            )

In [5]:

#concat = pd.read_csv(os.path.join(prep_path, 'df_baseline.csv'), index_col=0)
# feat_cluster = pd.read_csv(os.path.join(prep_path, 'feat_cluster.csv'), index_col=0)
# feat_transport = pd.read_csv(os.path.join(prep_path, 'feat_transport.csv'), index_col=0)
# Load the 'baseline' (df_raw.csv) and 'auto' (df_auto_prep.csv) files through the
# shortcuts taken from config['prep_paths'].
# NOTE(review): other cells read df_raw.csv with index_col=0; here the saved index
# stays as an ordinary column — confirm that is intended.
baseline =pd.read_csv(path_baseline)
auto = pd.read_csv(path_auto)



In [None]:
from src.preprocessing import DataPrep

# Read df_raw.csv again, with the first CSV column as the index.
df_raw = pd.read_csv(os.path.join(prep_path, 'df_raw.csv'), index_col=0)

# Profile the raw frame; 'raw' is passed as the second argument —
# presumably a label for the generated profile, TODO confirm.
profile_df = DataPrep.get_data_profile(df_raw, 'raw')

In [None]:
# Feature-set names mapped to an empty-string placeholder; each name doubles as
# the stem of the CSV file that load_feat() reads for it.
dict_feat = dict.fromkeys(
    ('feat_baseline', 'feat_cluster', 'feat_transport', 'feat_gangnam_apt_dist'),
    '',
)

def load_feat(dict_feat, prep_path):
    """Fill ``dict_feat`` with the DataFrame read from ``<key>.csv`` for each key.

    Args:
        dict_feat: Mapping whose keys are CSV file stems; its values are
            overwritten in place with the loaded frames.
        prep_path: Directory that holds the ``<key>.csv`` files.

    Returns:
        The same ``dict_feat`` object, now mapping each key to its DataFrame.
    """
    for feat_name in dict_feat:
        csv_file = os.path.join(prep_path, f'{feat_name}.csv')
        frame = pd.read_csv(csv_file)
        # Drop columns whose name starts with "Unnamed" (e.g. a saved index column).
        keep_mask = ~frame.columns.str.contains('^Unnamed')
        frame = frame.loc[:, keep_mask]
        print(f'{len(frame.columns)}', frame.shape, frame.columns)
        # Overwriting an existing key does not resize the dict, so this is safe mid-iteration.
        dict_feat[feat_name] = frame
    return dict_feat



In [None]:
# Replace each placeholder in dict_feat with the frame read from <key>.csv under prep_path.
dict_feat = load_feat(dict_feat, prep_path)



In [24]:
# The prefix is an empty string, so every column keeps its original name.
# NOTE(review): if a per-feature prefix was intended, same-named columns from
# different frames will collide in the concatenated result — confirm.
renamed_dfs = {
    feat_key: feat_df.add_prefix("")
    for feat_key, feat_df in dict_feat.items()
}
# Place all feature frames side by side (column-wise concatenation).
df_feat = pd.concat(renamed_dfs.values(), axis=1)

In [25]:
# Not the last expression of the cell, so by default this preview is not displayed.
df_feat.head()
# Save the concatenated features; config['prep_paths']['feature'] points to this same file name.
df_feat.to_csv(os.path.join(prep_path, 'feat_concat_raw.csv'))