In [7]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' may also hide deprecation and convergence
# warnings raised by the estimators fitted later — consider narrowing the filter.
# `warnings` itself is presumably imported in an earlier cell; verify.
warnings.filterwarnings('ignore')

In [8]:
# Fix the random seeds
def seed_everything(seed):
    """Seed the random number generators used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both Python's ``random`` module and NumPy's global RNG.

    Parameters
    ----------
    seed : int
        Value passed to ``random.seed`` and ``np.random.seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


seed_everything(42)

In [9]:
# Load the data: train/test feature tables and the submission template.
train, test = (pd.read_csv(csv_path) for csv_path in ('cal.train.csv', 'cal.test.csv'))
# The first CSV column of the template becomes the index, so it is written
# back out as the leading column when the submission is saved.
sample_submission = pd.read_csv('cal.sample_submission.csv', index_col=0)

In [10]:
# Data preprocessing: integer-encode the categorical columns in place.
# Note that LabelEncoder numbers categories by sorted label order; despite the
# list name, the codes carry no domain-specific ordering.
ordinal_features = ['Weight_Status', 'Gender']
for feature in ordinal_features:
    le = LabelEncoder()
    # Fit on the labels of both splits so that a category appearing only in
    # test still receives a code. This avoids patching the fitted `classes_`
    # array by hand, which leaves it unsorted and can mis-encode numeric labels.
    # Only the label vocabulary is taken from test; no target information is used.
    le.fit(pd.concat([train[feature], test[feature]], ignore_index=True))
    train[feature] = le.transform(train[feature])
    test[feature] = le.transform(test[feature])

In [11]:
# Feature selection and polynomial feature expansion.
# Columns removed from the model inputs of both splits.
unused_columns = ['ID', 'Weight_Status', 'Height(Remainder_Inches)', 'Height(Feet)']

train_y = train['Calories_Burned']
train_x = train.drop(columns=['Calories_Burned'] + unused_columns)
test_x = test.drop(columns=unused_columns)

# Generate all polynomial terms up to degree 3; the expansion is fit on the
# training inputs and then applied to both splits.
poly = PolynomialFeatures(degree=3)
poly.fit(train_x)
train_poly, test_poly = poly.transform(train_x), poly.transform(test_x)

# Keep the 40 expanded features with the highest univariate F-score against the target.
# NOTE(review): the selector is fit on every training row and its target, and
# the resulting matrix is what the later cells cross-validate on, so those CV
# scores include the feature-selection step's view of the validation folds.
# Wrapping the expansion + selection + model in a Pipeline would avoid this.
selector = SelectKBest(score_func=f_regression, k=40)
selector.fit(train_poly, train_y)
train_poly, test_poly = selector.transform(train_poly), selector.transform(test_poly)

In [12]:
# Build the stacking model.
# Base learners keyed by the name StackingRegressor will know them by; the
# insertion order below is the order in which they are handed to the ensemble.
base_learners_by_name = {
    'linear': LinearRegression(),
    'ridge': Ridge(),
    'rf': RandomForestRegressor(n_estimators=50, random_state=42),
    'xgb': XGBRegressor(n_estimators=100, random_state=42),
    'lgbm': LGBMRegressor(n_estimators=100, random_state=42),
    'svr': SVR(kernel='rbf'),
    'elastic': ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42),
    'mlp': MLPRegressor(hidden_layer_sizes=(50, 50), max_iter=1000, random_state=42),
    'knn': KNeighborsRegressor(n_neighbors=5),
}
base_models = list(base_learners_by_name.items())

# Ridge is the final estimator that combines the base learners' predictions.
meta_learner = Ridge(alpha=1.0)
stacking = StackingRegressor(estimators=base_models, final_estimator=meta_learner)

In [16]:
# Cross-validation: 5-fold RMSE of the stacking ensemble on the selected features.
cv_scores = np.sqrt(-cross_val_score(stacking, train_poly, train_y, cv=5, scoring='neg_mean_squared_error'))
print(f"Cross-validation RMSE: {cv_scores.mean()}")

# Split into training and validation sets (30% of the rows held out).
X_train, X_val, y_train, y_val = train_test_split(train_poly, train_y, test_size=0.3, random_state=42, shuffle=True)

# Fit the stacking regressor on the 70% training split.
stacking.fit(X_train, y_train)

# Evaluate on the held-out rows.
# The RMSE is taken as sqrt(MSE) rather than via `squared=False`: that keyword
# is deprecated in scikit-learn 1.4 and removed in 1.6, while this form works
# on every version and matches how the CV score above is computed.
y_pred_stacking = stacking.predict(X_val)
rmse_stacking = np.sqrt(mean_squared_error(y_val, y_pred_stacking))
print(f"Stacking Regressor Validation RMSE: {rmse_stacking}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001136 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7542
[LightGBM] [Info] Number of data points in the train set: 6000, number of used features: 40
[LightGBM] [Info] Start training from score 89.411500
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000741 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7539
[LightGBM] [Info] Number of data points in the train set: 4800, number of used features: 40
[LightGBM] [Info] Start training from score 89.167292
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001244 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7542
[LightGBM] [Info] Number of data points in the train set: 4800, number of used features: 40
[LightGBM] [Info] Start trai

In [17]:
# Show both scores again on their own; presumably so they can be read without
# scrolling through the LightGBM training log printed by the previous cell.
mean_cv_rmse = cv_scores.mean()
print(f"Cross-validation RMSE: {mean_cv_rmse}")
print(f"Stacking Regressor Validation RMSE: {rmse_stacking}")

Cross-validation RMSE: 0.29053215319441944
Stacking Regressor Validation RMSE: 0.28688648736655203


In [18]:
# Predict on the test data and write the submission file (stacking model).
# The evaluation cell leaves `stacking` fitted on only the 70% training split,
# so refit it on every labelled row before producing the final predictions.
stacking.fit(train_poly, train_y)
test_preds_stacking = stacking.predict(test_poly)

# Work on a copy so the loaded template stays untouched if this cell is re-run.
submission_stacking = sample_submission.copy()
# Predictions are rounded to the nearest whole number before being saved.
submission_stacking['Calories_Burned'] = np.round(test_preds_stacking)
submission_stacking.to_csv('submission_stacking.csv', index=True)

In [19]:
# Build the stacking model (reduced ensemble).
# Same configuration as the earlier nine-model ensemble, but with the SVR, MLP
# and KNN base learners left out. Insertion order is the order in which the
# learners are handed to StackingRegressor.
base_learners_by_name = {
    'linear': LinearRegression(),
    'ridge': Ridge(),
    'rf': RandomForestRegressor(n_estimators=50, random_state=42),
    'xgb': XGBRegressor(n_estimators=100, random_state=42),
    'lgbm': LGBMRegressor(n_estimators=100, random_state=42),
    'elastic': ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42),
}
base_models = list(base_learners_by_name.items())

# Ridge is the final estimator that combines the base learners' predictions.
meta_learner = Ridge(alpha=1.0)
stacking = StackingRegressor(estimators=base_models, final_estimator=meta_learner)

In [20]:
# Cross-validation: 5-fold RMSE of the stacking ensemble on the selected features.
cv_scores = np.sqrt(-cross_val_score(stacking, train_poly, train_y, cv=5, scoring='neg_mean_squared_error'))
print(f"Cross-validation RMSE: {cv_scores.mean()}")

# Split into training and validation sets (30% of the rows held out).
X_train, X_val, y_train, y_val = train_test_split(train_poly, train_y, test_size=0.3, random_state=42, shuffle=True)

# Fit the stacking regressor on the 70% training split.
stacking.fit(X_train, y_train)

# Evaluate on the held-out rows.
# The RMSE is taken as sqrt(MSE) rather than via `squared=False`: that keyword
# is deprecated in scikit-learn 1.4 and removed in 1.6, while this form works
# on every version and matches how the CV score above is computed.
y_pred_stacking = stacking.predict(X_val)
rmse_stacking = np.sqrt(mean_squared_error(y_val, y_pred_stacking))
print(f"Stacking Regressor Validation RMSE: {rmse_stacking}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001210 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7542
[LightGBM] [Info] Number of data points in the train set: 6000, number of used features: 40
[LightGBM] [Info] Start training from score 89.411500
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000910 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7539
[LightGBM] [Info] Number of data points in the train set: 4800, number of used features: 40
[LightGBM] [Info] Start training from score 89.167292
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001517 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7542
[LightGBM] [Info] Number of data points in the train set: 4800, number of used features: 40
[LightGBM] [Info] Start trai

In [21]:
# Show both scores again on their own; presumably so they can be read without
# scrolling through the LightGBM training log printed by the previous cell.
mean_cv_rmse = cv_scores.mean()
print(f"Cross-validation RMSE: {mean_cv_rmse}")
print(f"Stacking Regressor Validation RMSE: {rmse_stacking}")

Cross-validation RMSE: 0.2905202198629908
Stacking Regressor Validation RMSE: 0.28714395580088653
