In [None]:
# Usage: copy this cell and use it after removing only the comment markers.
import sys
from pathlib import Path

# Add the project root to sys.path so that project-local packages can be imported.
# Path().resolve() is the current working directory; BASE_DIR is two levels above it.
# NOTE(review): the original note says a notebook sitting directly in src/ needs only
# `.parent`, while the code uses `.parent.parent` — confirm where this notebook lives.
BASE_DIR    = Path().resolve().parent.parent    # use .parent if the notebook is directly inside src/
config_path = BASE_DIR / 'config' / 'data_paths.yaml'  # not referenced again in the visible cells
sys.path.insert(0, str(BASE_DIR))


# With BASE_DIR on sys.path, the project-local loader can be imported directly
from scripts.data_loader import load_data

In [None]:
# Load the 'Merged_Data_with_Season' dataset from the 'processed' section via the
# project loader (presumably returns a pandas DataFrame — it is used as one below).
df_merged = load_data('Merged_Data_with_Season', section='processed')

# Keep only rows that have a pm10 value. The explicit .copy() makes `df` an
# independent frame, so the columns written into it later (year_month / year / month)
# do not trigger pandas' SettingWithCopyWarning.
df = df_merged.dropna(subset=['pm10']).copy()

In [None]:
import lightgbm as lgb
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


# Parse the "%Y-%m" formatted year_month values once, then derive the calendar
# features (year, month) from the parsed timestamps.
parsed_year_month = pd.to_datetime(df['year_month'], format="%Y-%m")
df['year_month'] = parsed_year_month
df['year'] = parsed_year_month.dt.year
df['month'] = parsed_year_month.dt.month

# Feature matrix (X) and regression target (y)
feature_columns = ['region', 'gender', 'age_group', 'pm10', 'season', 'year', 'month']
X = df[feature_columns]
y = df['visit_count']

# Two-stage split: first hold out 20% of the rows as the test set, then take 20% of
# the remaining 80% as the validation set (0.2 * 0.8 = 0.16 of all rows), which
# leaves roughly 64% for training. The fixed random_state keeps the split reproducible.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.2, random_state=42
)

# Print the shapes of each split (train, validation, test) as a quick sanity check
for features_part, target_part in ((X_train, y_train), (X_val, y_val), (X_test, y_test)):
    print(features_part.shape, target_part.shape)

# Column groups, each routed to its own preprocessing branch
categorical_features = ['region', 'gender', 'season']
# NOTE(review): age_group is handled as numeric (median-imputed) — confirm it is
# stored as a number rather than a text label.
numeric_features = ['age_group', 'pm10', 'year', 'month']

# Categorical branch: fill missing values with the most frequent value, then one-hot
# encode; handle_unknown='ignore' keeps transform from failing on unseen categories.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

# Numeric branch: fill missing values with the column median (no scaling is applied)
numeric_steps = [('imputer', SimpleImputer(strategy='median'))]
numeric_transformer = Pipeline(numeric_steps)

# Apply each branch to its column group and concatenate the results
preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_features),
    ('num', numeric_transformer, numeric_features),
])


# Create a pipeline: the preprocessor followed by a LightGBM regressor.
# 'regression' is LightGBM's squared-error (L2) objective; the previous value
# 'reg:squarederror' is the XGBoost spelling and is not a LightGBM objective name.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', lgb.LGBMRegressor(objective='regression', random_state=42)),
])

In [None]:
# Train the full pipeline (preprocessing + regressor) on the training split
model.fit(X_train, y_train)

# Predict on the validation split only; the test split is scored in the last cell
y_val_pred = model.predict(X_val)

# RMSE is computed as sqrt(MSE). The `squared=False` argument of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6, so taking the square root
# explicitly keeps this cell working across versions.
val_rmse = mean_squared_error(y_val, y_val_pred) ** 0.5
val_r2 = r2_score(y_val, y_val_pred)

print(f"✅ RMSE: {val_rmse:.2f}")
print(f"✅ R-squared: {val_r2:.2f}")

In [None]:
import matplotlib.pyplot as plt
import matplotlib
# Font setup: 'Malgun Gothic' is presumably chosen so Korean text renders in plots
# (confirm the font is installed on the machine running this notebook);
# axes.unicode_minus=False makes minus signs use the ASCII hyphen.
matplotlib.rcParams['font.family'] = 'Malgun Gothic'
matplotlib.rcParams['axes.unicode_minus'] = False

# Feature-importance plot.
# lgb.plot_importance accepts a Booster or LGBMModel, not a scikit-learn Pipeline,
# so the fitted 'regressor' step is pulled out of the pipeline first.
# NOTE(review): the regressor is fit on the preprocessor's output rather than on the
# original DataFrame, so the plotted labels are likely generic column names — verify.
fitted_regressor = model.named_steps['regressor']
lgb.plot_importance(fitted_regressor, max_num_features=10, importance_type='gain')
plt.title("Top 10 Feature Importance (by Gain)")
plt.show()

## Last step: evaluate on the held-out test set (run only after the final model is chosen)

In [None]:
# Final evaluation: score the fitted pipeline on the held-out test split
y_test_pred = model.predict(X_test)

# RMSE is computed as sqrt(MSE). The `squared=False` argument of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6, so taking the square root
# explicitly keeps this cell working across versions.
test_rmse = mean_squared_error(y_test, y_test_pred) ** 0.5
test_r2 = r2_score(y_test, y_test_pred)

print(f"✅ RMSE: {test_rmse:.2f}")
print(f"✅ R-squared: {test_r2:.2f}")