In [1]:
# sales_shop_modeling.ipynb 내용 예시

import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# 1. Data loading
# The data root defaults to the original author's local checkout. Set the
# SKN13_DATA_DIR environment variable to run the notebook on another machine
# without editing this cell.
DATA_DIR = os.environ.get(
    "SKN13_DATA_DIR",
    "/Users/giwonjun/Desktop/boot/02_proj_ML/SKN13-2nd-4TEAM/data",
)
# Folders whose CSV files are loaded into sales_df and shops_df respectively.
sales_path = os.path.join(DATA_DIR, "expected_sales")
shops_path = os.path.join(DATA_DIR, "the_number_of_shops")

def load_csvs_from_folder(folder):
    """Read every ``.csv`` file directly inside ``folder`` into one DataFrame.

    Files are read in sorted path order so the row order of the result is
    reproducible; ``os.listdir`` itself returns names in arbitrary order.

    Parameters
    ----------
    folder : str or os.PathLike
        Directory to scan (not recursive). Only entries whose name ends in
        ``.csv`` are read; everything else is ignored.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files, re-indexed from 0.

    Raises
    ------
    ValueError
        If ``folder`` contains no ``.csv`` files.
    """
    csv_paths = sorted(
        os.path.join(folder, name)
        for name in os.listdir(folder)
        if name.endswith(".csv")
    )
    if not csv_paths:
        # pd.concat([]) would fail with a generic "No objects to concatenate";
        # raise the same exception type with a message that names the folder.
        raise ValueError(f"No .csv files found in {folder!r}")
    frames = [pd.read_csv(path) for path in csv_paths]
    return pd.concat(frames, ignore_index=True)

# Load each folder's CSV files into a single DataFrame (sales first, then shops).
sales_df, shops_df = (
    load_csvs_from_folder(source_dir) for source_dir in (sales_path, shops_path)
)

In [9]:
# Print the column index of both loaded frames, each under its own heading.
for heading, frame in (("sales_df 컬럼명:", sales_df), ("\nshop_df 컬럼명:", shops_df)):
    print(heading)
    print(frame.columns)

sales_df 컬럼명:
Index(['기준_년분기_코드', '상권_구분_코드', '상권_구분_코드_명', '상권_코드', '상권_코드_명', '서비스_업종_코드',
       '서비스_업종_코드_명', '당월_매출_금액', '당월_매출_건수', '주중_매출_금액', '주말_매출_금액',
       '월요일_매출_금액', '화요일_매출_금액', '수요일_매출_금액', '목요일_매출_금액', '금요일_매출_금액',
       '토요일_매출_금액', '일요일_매출_금액', '시간대_00~06_매출_금액', '시간대_06~11_매출_금액',
       '시간대_11~14_매출_금액', '시간대_14~17_매출_금액', '시간대_17~21_매출_금액',
       '시간대_21~24_매출_금액', '남성_매출_금액', '여성_매출_금액', '연령대_10_매출_금액',
       '연령대_20_매출_금액', '연령대_30_매출_금액', '연령대_40_매출_금액', '연령대_50_매출_금액',
       '연령대_60_이상_매출_금액', '주중_매출_건수', '주말_매출_건수', '월요일_매출_건수', '화요일_매출_건수',
       '수요일_매출_건수', '목요일_매출_건수', '금요일_매출_건수', '토요일_매출_건수', '일요일_매출_건수',
       '시간대_건수~06_매출_건수', '시간대_건수~11_매출_건수', '시간대_건수~14_매출_건수',
       '시간대_건수~17_매출_건수', '시간대_건수~21_매출_건수', '시간대_건수~24_매출_건수', '남성_매출_건수',
       '여성_매출_건수', '연령대_10_매출_건수', '연령대_20_매출_건수', '연령대_30_매출_건수',
       '연령대_40_매출_건수', '연령대_50_매출_건수', '연령대_60_이상_매출_건수'],
      dtype='object')

shop_df 컬럼명:
Index(['기준_년분기_코드', '상권_구분_코드', '상권_구분_코드_명

In [11]:
# 2. Merge
# Why merge: the sales data and the store data have to be combined per
# quarter, commercial-district type/name and service-category name before
# they can be used together for prediction.
merge_keys = ['기준_년분기_코드', '상권_구분_코드_명', '상권_코드_명', '서비스_업종_코드_명']

# Columns that exist in both frames but are not merge keys (the printed column
# lists show at least '상권_구분_코드' in both) would otherwise come back
# duplicated as '<name>_x' / '<name>_y' and end up twice in the feature matrix.
# Keep the sales_df copy and drop the shops_df one before merging; the set of
# matched rows is unchanged because the keys are the same.
duplicate_cols = [
    col for col in shops_df.columns
    if col in sales_df.columns and col not in merge_keys
]
data = pd.merge(
    sales_df,
    shops_df.drop(columns=duplicate_cols),
    on=merge_keys,
    how='inner',
)

In [13]:
# 3. Derived features: closed-store and opened-store counts, each divided by
#    the store count of the same row.
# NOTE(review): a row with '점포_수' == 0 divides by zero here; presumably the
# inf/NaN handling later in the notebook is meant to cover that - confirm.
store_count = data['점포_수']
for rate_col, count_col in (('폐업률_정제', '폐업_점포_수'), ('개업률_정제', '개업_점포_수')):
    data[rate_col] = data[count_col] / store_count

In [14]:
# 4. Remove every row that contains at least one missing value
data = data.dropna(axis=0, how='any')

In [15]:
# 5. Select the prediction target (e.g. '당월_매출_금액')
target = '당월_매출_금액'

# Going by their names, every other '*_매출_금액' / '*_매출_건수' column is a
# breakdown of the same period's sales (weekday/weekend, day of week, time
# slot, gender, age band). Leaving them in X leaks the target: the model can
# rebuild the total from its own components, and none of these values would be
# known when predicting sales for an unseen period. Drop them from the
# features.
# NOTE(review): adjust LEAKY_SUFFIXES if the transaction-count columns are
# intentionally meant to stay in as predictors.
LEAKY_SUFFIXES = ('_매출_금액', '_매출_건수')
leaky_cols = [
    col for col in data.columns
    if col != target and col.endswith(LEAKY_SUFFIXES)
]

X = data.drop(columns=[target] + leaky_cols)
y = data[target]

In [16]:
# 6. Split the feature names by dtype: int64/float64 columns are treated as
#    numeric, object columns as categorical.
numeric_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_cols = list(X.select_dtypes(include=['object']).columns)

In [17]:
# 7. Build the preprocessing + model pipeline
# Step names ('scaler', 'encoder', 'preprocessor', 'regressor') are looked up
# by later cells, so they must stay exactly as written.

# Numeric columns: standardize.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical columns: one-hot encode; categories not seen during fit are ignored.
categorical_transformer = Pipeline([('encoder', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group to its transformer ('num' first, then 'cat').
column_specs = [
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(column_specs)

model = RandomForestRegressor(random_state=42)

pipeline = Pipeline([('preprocessor', preprocessor), ('regressor', model)])

In [18]:
# 8. Hyperparameter tuning setup
# Candidate values for the random forest; each key is prefixed with the
# pipeline step name 'regressor'.
param_grid = dict(
    regressor__n_estimators=[100, 200],
    regressor__max_depth=[None, 10, 20],
    regressor__min_samples_split=[2, 5],
)

# 3-fold grid search scored by negative MSE, using all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# Hold out 20% of the rows as the test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [20]:
# In both splits, turn +/-inf into NaN and then fill every NaN with 0
# (the mean or median would be alternative fill values).
X_train, X_test = (
    split.replace([np.inf, -np.inf], np.nan).fillna(0)
    for split in (X_train, X_test)
)

In [21]:
# 9. Train: run the grid search on the training split
grid_search.fit(X=X_train, y=y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


KeyboardInterrupt: 

In [None]:
# 10. Evaluate: score the best estimator from the search on the test split
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
for label, value in (("Best Params:", grid_search.best_params_), ("Test MSE:", mse)):
    print(label, value)

In [None]:
# 11. Show the ten most important features
regressor = best_model.named_steps['regressor']
if hasattr(regressor, 'feature_importances_'):
    fitted_preprocessor = grid_search.best_estimator_.named_steps['preprocessor']

    # Column list that was handed to the first ('num') transformer.
    numeric_names = fitted_preprocessor.transformers_[0][2]

    # The second ('cat') transformer is a pipeline; its 'encoder' step supplies
    # the expanded one-hot column names.
    encoder = fitted_preprocessor.transformers_[1][1].named_steps['encoder']
    encoded_names = list(encoder.get_feature_names_out(categorical_cols))

    # Numeric names first, then encoded names - the same order as the
    # transformers are listed in the preprocessor.
    feature_names = numeric_names + encoded_names
    importances = regressor.feature_importances_

    feature_importance_df = pd.DataFrame(
        {'feature': feature_names, 'importance': importances}
    )
    feature_importance_df = feature_importance_df.sort_values(
        'importance', ascending=False
    )
    print(feature_importance_df.head(10))