In [1]:
import pandas as pd
import numpy as np
import random
import os
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder
from sklearn.ensemble import StackingRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
import warnings

# Suppress every warning emitted after this point, for the whole kernel session.
# NOTE(review): this blanket filter also hides deprecation notices from
# pandas / scikit-learn; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

  from pandas.core import (


In [2]:
# Fix the random seeds
def seed_everything(seed):
    """Seed Python's `random` module and NumPy's global RNG with `seed`.

    Also stores `str(seed)` in the PYTHONHASHSEED environment variable.

    Args:
        seed: Integer seed handed to `random.seed` and `np.random.seed`.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


seed_everything(42)

In [3]:
# Load the training set, the test set, and the submission template
# (the template uses its first column as the index).
train, test = (pd.read_csv(csv_path) for csv_path in ('cal.train.csv', 'cal.test.csv'))
sample_submission = pd.read_csv('cal.sample_submission.csv', index_col=0)

In [4]:
# Preprocessing: integer-encode the categorical columns in train and test
ordinal_features = ['Weight_Status', 'Gender']
for feature in ordinal_features:
    le = LabelEncoder()
    # Codes are learned from the training column only.
    train[feature] = le.fit_transform(train[feature])
    # Labels that appear only in the test column are appended after the fitted
    # classes so that transform() accepts them instead of raising.
    unseen = [label for label in np.unique(test[feature]) if label not in le.classes_]
    if unseen:
        le.classes_ = np.append(le.classes_, unseen)
    test[feature] = le.transform(test[feature])

In [5]:
# Choose the model inputs and expand them with polynomial terms
unused_columns = ['ID', 'Weight_Status', 'Height(Remainder_Inches)', 'Height(Feet)']

train_y = train['Calories_Burned']
train_x = train.drop(columns=['ID', 'Calories_Burned'] + unused_columns[1:])
test_x = test.drop(columns=unused_columns)

# Degree-3 expansion: generates all higher-order terms up to the third degree.
# The expansion is fit on the training inputs and reused for the test inputs.
poly = PolynomialFeatures(degree=3)
train_poly = poly.fit(train_x).transform(train_x)
test_poly = poly.transform(test_x)

In [6]:
# Feature selection: keep the 40 polynomial terms with the best f_regression scores.
# NOTE(review): the selector is fit on the full training set, and the later
# cross_val_score calls reuse this already-selected matrix, so the CV folds are
# not independent of the selection step -- consider moving it into a Pipeline.
selector = SelectKBest(score_func=f_regression, k=40).fit(train_poly, train_y)
train_poly = selector.transform(train_poly)
test_poly = selector.transform(test_poly)

In [7]:
# Stacking ensemble: three base regressors combined by a Ridge final estimator
base_models = list(zip(
    ('linear', 'ridge', 'rf'),
    (
        LinearRegression(),
        Ridge(),
        RandomForestRegressor(n_estimators=50, random_state=42),  # random forest added as a base learner
    ),
))

# Ridge is used as the final estimator on top of the base models.
meta_learner = Ridge(alpha=1.0)
stacking = StackingRegressor(estimators=base_models, final_estimator=meta_learner)

In [8]:
# Voting ensemble built from the same named base models as the stacking ensemble
voting = VotingRegressor(base_models)

In [9]:
# 5-fold cross-validation (Stacking Regressor): per-fold RMSE from negated MSE
neg_mse_stacking = cross_val_score(
    stacking, train_poly, train_y, scoring='neg_mean_squared_error', cv=5
)
cv_scores_stacking = np.sqrt(-neg_mse_stacking)
print(f"Stacking Regressor Cross-validation RMSE: {cv_scores_stacking.mean()}")

# 5-fold cross-validation (Voting Regressor): same procedure
neg_mse_voting = cross_val_score(
    voting, train_poly, train_y, scoring='neg_mean_squared_error', cv=5
)
cv_scores_voting = np.sqrt(-neg_mse_voting)
print(f"Voting Regressor Cross-validation RMSE: {cv_scores_voting.mean()}")

Stacking Regressor Cross-validation RMSE: 0.2904624297587722
Voting Regressor Cross-validation RMSE: 0.9399324847927749


In [10]:
# Hold out 10% of the (shuffled) training rows as a validation split
X_train, X_val, y_train, y_val = train_test_split(
    train_poly,
    train_y,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

In [11]:
# Fit each ensemble on the training split and report its RMSE on the hold-out split.
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6, so passing it raises a
# TypeError on current releases. np.sqrt works on every version.

# Stacking Regressor: train and evaluate on the validation split
stacking.fit(X_train, y_train)
y_pred_stacking = stacking.predict(X_val)
rmse_stacking = np.sqrt(mean_squared_error(y_val, y_pred_stacking))
print(f"Stacking Regressor Validation RMSE: {rmse_stacking}")

# Voting Regressor: train and evaluate on the validation split
voting.fit(X_train, y_train)
y_pred_voting = voting.predict(X_val)
rmse_voting = np.sqrt(mean_squared_error(y_val, y_pred_voting))
print(f"Voting Regressor Validation RMSE: {rmse_voting}")

Stacking Regressor Validation RMSE: 0.289088034192128
Voting Regressor Validation RMSE: 0.8995721450451558


In [12]:
# Predict on the test set with the Voting model and write the submission file.
# NOTE(review): the recorded CV / validation outputs in this notebook show a lower
# RMSE for the Stacking model than for Voting -- confirm Voting is the intended choice.
test_preds_voting = voting.predict(test_poly)
# Predictions are rounded to the nearest integer before being stored.
sample_submission['Calories_Burned'] = test_preds_voting.round()
sample_submission.to_csv('submission_voting.csv')