In [1]:
import pandas as pd
import numpy as np
import scipy.stats as ss
import pingouin as pg
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.formula.api as smf

df = pd.read_csv("1st-dataset-prepressed-baseball.csv")     # 1차 데이터셋

pd.set_option("display.max_columns", None)

# 분석용 편의 함수
def pct(n): return f"{n:.1%}"

# df['accident_score'] = df['accident_count'] + 3 * df['injury_count'] + 10 * df['death_count']
df['accident_score'] = df['accident_count'] + 3 * df['injury_count']


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2072 entries, 0 to 2071
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   game_id         2072 non-null   int64  
 1   stadium_code    2072 non-null   object 
 2   sports_type     2072 non-null   object 
 3   game_date       2072 non-null   object 
 4   day_of_week     2072 non-null   object 
 5   is_holiday      2072 non-null   int64  
 6   start_time      2072 non-null   object 
 7   home_team_win   2072 non-null   int64  
 8   audience        2072 non-null   float64
 9   region          2072 non-null   object 
 10  accident_count  2072 non-null   float64
 11  injury_count    2072 non-null   float64
 12  death_count     2072 non-null   float64
 13  temperature     2072 non-null   float64
 14  precipitation   2072 non-null   float64
 15  snow_depth      2072 non-null   float64
 16  start_hour      2072 non-null   int64  
 17  match_시범경기      2072 non-null   b

In [7]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression, PoissonRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# 독립 변수와 종속 변수 정의
X = df.drop(columns=["accident_score", "accident_count", "injury_count", "death_count", "game_id", "stadium_code","sports_type","game_date", "day_of_week", "start_time", "region", "snow_depth"])
y = df['accident_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ▶ 기본 회귀 모델들 정의
base_estimators = [
    ('rf', RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)),
    ('gb', GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42))
]

# ▶ 최종 예측기 (Meta-learner): 선형회귀
# meta_model = LinearRegression()
meta_model = PoissonRegressor(max_iter=1000)
""""
# ▶ 스태킹 앙상블 모델 생성
stacking_model = StackingRegressor(
    estimators=base_estimators,
    final_estimator=meta_model,
    cv=5,  # 5-fold cross-validation for base models
    n_jobs=-1
)

# ▶ 파라미터 그리드 정의 (base estimator에 접근할 때 '__' 사용)
param_grid = {
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [5, 10],
    'gb__learning_rate': [0.05, 0.1],
    'gb__n_estimators': [100, 150],
    # 'final_estimator__alpha': [0.1, 1.0, 10.0]
}


# ▶ GridSearchCV로 튜닝
grid_search = GridSearchCV(
    estimator=stacking_model,
    param_grid=param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    verbose=2
)
"""

# ▶ 스태킹 앙상블 모델 생성
stacking_model = StackingRegressor(
    estimators=base_estimators,
    final_estimator=meta_model,
    cv=5,
    passthrough=True,
    n_jobs=-1
)

# ▶ 파라미터 그리드 정의
param_grid = {
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [5, 10],
    'gb__learning_rate': [0.05, 0.1],
    'gb__n_estimators': [100, 150]
    # meta_model은 PoissonRegressor이며, alpha 등의 파라미터는 제외함 (필요시 추가 가능)
}

# ▶ GridSearchCV로 튜닝
grid_search = GridSearchCV(
    estimator=stacking_model,
    param_grid=param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    verbose=2
)

# ▶ 학습
grid_search.fit(X_train, y_train)

# ▶ 예측
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Best 모델 및 평가 지표 출력
print("Best Params:", grid_search.best_params_)
print(f"MSE: {mean_squared_error(y_test, y_pred):.4f}")
print(f"R² Score: {r2_score(y_test, y_pred):.4f}")


Fitting 3 folds for each of 16 candidates, totalling 48 fits


  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


[CV] END gb__learning_rate=0.05, gb__n_estimators=100, rf__max_depth=5, rf__n_estimators=100; total time=   1.4s
[CV] END gb__learning_rate=0.05, gb__n_estimators=100, rf__max_depth=5, rf__n_estimators=100; total time=   1.5s
[CV] END gb__learning_rate=0.05, gb__n_estimators=100, rf__max_depth=5, rf__n_estimators=100; total time=   1.6s


  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


[CV] END gb__learning_rate=0.05, gb__n_estimators=100, rf__max_depth=10, rf__n_estimators=100; total time=   1.9s
[CV] END gb__learning_rate=0.05, gb__n_estimators=100, rf__max_depth=10, rf__n_estimators=100; total time=   2.0s
[CV] END gb__learning_rate=0.05, gb__n_estimators=100, rf__max_depth=10, rf__n_estimators=100; total time=   2.0s


  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


[CV] END gb__learning_rate=0.05, gb__n_estimators=100, rf__max_depth=5, rf__n_estimators=200; total time=   2.5s
[CV] END gb__learning_rate=0.05, gb__n_estimators=100, rf__max_depth=5, rf__n_estimators=200; total time=   2.6s
[CV] END gb__learning_rate=0.05, gb__n_estimators=100, rf__max_depth=5, rf__n_estimators=200; total time=   2.7s


  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


[CV] END gb__learning_rate=0.05, gb__n_estimators=100, rf__max_depth=10, rf__n_estimators=200; total time=   3.4s
[CV] END gb__learning_rate=0.05, gb__n_estimators=150, rf__max_depth=5, rf__n_estimators=100; total time=   1.8s


  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


[CV] END gb__learning_rate=0.05, gb__n_estimators=150, rf__max_depth=5, rf__n_estimators=100; total time=   1.7s


  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


[CV] END gb__learning_rate=0.05, gb__n_estimators=150, rf__max_depth=5, rf__n_estimators=100; total time=   1.9s


  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


[CV] END gb__learning_rate=0.05, gb__n_estimators=150, rf__max_depth=10, rf__n_estimators=100; total time=   2.0s
[CV] END gb__learning_rate=0.05, gb__n_estimators=150, rf__max_depth=5, rf__n_estimators=200; total time=   2.7s


  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


[CV] END gb__learning_rate=0.05, gb__n_estimators=100, rf__max_depth=10, rf__n_estimators=200; total time=   3.6s
[CV] END gb__learning_rate=0.05, gb__n_estimators=100, rf__max_depth=10, rf__n_estimators=200; total time=   3.7s


  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


[CV] END gb__learning_rate=0.05, gb__n_estimators=150, rf__max_depth=5, rf__n_estimators=200; total time=   2.7s
[CV] END gb__learning_rate=0.05, gb__n_estimators=150, rf__max_depth=5, rf__n_estimators=200; total time=   2.8s


  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


[CV] END gb__learning_rate=0.05, gb__n_estimators=150, rf__max_depth=10, rf__n_estimators=100; total time=   2.2s
[CV] END gb__learning_rate=0.05, gb__n_estimators=150, rf__max_depth=10, rf__n_estimators=100; total time=   2.2s


  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


[CV] END gb__learning_rate=0.1, gb__n_estimators=100, rf__max_depth=5, rf__n_estimators=100; total time=   1.5s


  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


[CV] END gb__learning_rate=0.1, gb__n_estimators=100, rf__max_depth=5, rf__n_estimators=100; total time=   1.7s
[CV] END gb__learning_rate=0.1, gb__n_estimators=100, rf__max_depth=5, rf__n_estimators=100; total time=   1.7s


  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


[CV] END gb__learning_rate=0.1, gb__n_estimators=100, rf__max_depth=10, rf__n_estimators=100; total time=   1.9s
[CV] END gb__learning_rate=0.05, gb__n_estimators=150, rf__max_depth=10, rf__n_estimators=200; total time=   4.0s
[CV] END gb__learning_rate=0.05, gb__n_estimators=150, rf__max_depth=10, rf__n_estimators=200; total time=   3.8s


  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


[CV] END gb__learning_rate=0.1, gb__n_estimators=100, rf__max_depth=5, rf__n_estimators=200; total time=   2.6s
[CV] END gb__learning_rate=0.1, gb__n_estimators=100, rf__max_depth=5, rf__n_estimators=200; total time=   2.5s
[CV] END gb__learning_rate=0.1, gb__n_estimators=100, rf__max_depth=10, rf__n_estimators=100; total time=   1.6s


  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


[CV] END gb__learning_rate=0.1, gb__n_estimators=100, rf__max_depth=5, rf__n_estimators=200; total time=   2.6s


  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


[CV] END gb__learning_rate=0.1, gb__n_estimators=100, rf__max_depth=10, rf__n_estimators=100; total time=   1.6s
[CV] END gb__learning_rate=0.05, gb__n_estimators=150, rf__max_depth=10, rf__n_estimators=200; total time=   3.8s


  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


[CV] END gb__learning_rate=0.1, gb__n_estimators=150, rf__max_depth=5, rf__n_estimators=100; total time=   1.8s
[CV] END gb__learning_rate=0.1, gb__n_estimators=150, rf__max_depth=5, rf__n_estimators=100; total time=   1.7s
[CV] END gb__learning_rate=0.1, gb__n_estimators=150, rf__max_depth=5, rf__n_estimators=100; total time=   1.7s


  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


[CV] END gb__learning_rate=0.1, gb__n_estimators=100, rf__max_depth=10, rf__n_estimators=200; total time=   3.3s


  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


[CV] END gb__learning_rate=0.1, gb__n_estimators=150, rf__max_depth=10, rf__n_estimators=100; total time=   1.9s
[CV] END gb__learning_rate=0.1, gb__n_estimators=150, rf__max_depth=5, rf__n_estimators=200; total time=   2.5s
[CV] END gb__learning_rate=0.1, gb__n_estimators=150, rf__max_depth=5, rf__n_estimators=200; total time=   2.4s


  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


[CV] END gb__learning_rate=0.1, gb__n_estimators=100, rf__max_depth=10, rf__n_estimators=200; total time=   3.1s
[CV] END gb__learning_rate=0.1, gb__n_estimators=150, rf__max_depth=5, rf__n_estimators=200; total time=   2.3s
[CV] END gb__learning_rate=0.1, gb__n_estimators=100, rf__max_depth=10, rf__n_estimators=200; total time=   3.0s


  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


[CV] END gb__learning_rate=0.1, gb__n_estimators=150, rf__max_depth=10, rf__n_estimators=100; total time=   1.6s
[CV] END gb__learning_rate=0.1, gb__n_estimators=150, rf__max_depth=10, rf__n_estimators=100; total time=   1.6s


  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


[CV] END gb__learning_rate=0.1, gb__n_estimators=150, rf__max_depth=10, rf__n_estimators=200; total time=   2.2s


  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


[CV] END gb__learning_rate=0.1, gb__n_estimators=150, rf__max_depth=10, rf__n_estimators=200; total time=   2.0s
[CV] END gb__learning_rate=0.1, gb__n_estimators=150, rf__max_depth=10, rf__n_estimators=200; total time=   1.9s
Best Params: {'gb__learning_rate': 0.05, 'gb__n_estimators': 100, 'rf__max_depth': 5, 'rf__n_estimators': 100}
MSE: 553.0140
R² Score: -0.0048


  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
