In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')

from FeatureEngineering import create_advanced_features

# Read the raw training and test tables from CSV files under Data/
df = pd.read_csv('Data/train_data.csv')
test_features = pd.read_csv('Data/test_data.csv')

# Run both tables through the same feature-engineering function
# (training table first, then the test table)
print("Creating features...")
train_df, test_df = (
    create_advanced_features(raw_frame) for raw_frame in (df, test_features)
)

# Build the model inputs and the target from the engineered training table.
# Every column except those listed in drop_columns is used as a feature;
# 'radiant_win' is read separately as the target.
drop_columns = ['ID', 'radiant_win']
excluded = set(drop_columns)
feature_cols = [name for name in train_df.columns if name not in excluded]

y = train_df['radiant_win']
X = train_df[feature_cols]

# Stratified, shuffled K-fold splitter with a fixed seed
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Zero-filled accumulators, one independent array per model:
#   oof_*          -> one slot per training row (out-of-fold predictions)
#   predictions_*  -> one slot per test row (test-set predictions)
n_train_rows, n_test_rows = len(X), len(test_df)
oof_lgb, oof_xgb, oof_cat = (np.zeros(n_train_rows) for _ in range(3))
predictions_lgb, predictions_xgb, predictions_cat = (
    np.zeros(n_test_rows) for _ in range(3)
)

# Model parameters
# Keyword arguments unpacked into LGBMClassifier(**lgb_params).
lgb_params = {
    'n_estimators': 2000,
    'learning_rate': 0.01,
    'num_leaves': 63,
    'max_depth': 7,
    'min_child_samples': 20,
    'subsample': 0.8,
    # LightGBM only performs row subsampling (bagging) when the bagging
    # frequency is non-zero; the sklearn wrapper defaults it to 0, which
    # silently disables 'subsample'. 1 = re-sample rows at every iteration.
    'subsample_freq': 1,
    'colsample_bytree': 0.8,
    'random_state': 42,
    # Silence the per-fit [Info]/[Warning] log lines, matching CatBoost below.
    'verbose': -1
}

# Keyword arguments unpacked into XGBClassifier(**xgb_params).
xgb_params = {
    'n_estimators': 2000,
    'learning_rate': 0.01,
    'max_depth': 7,
    'min_child_weight': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42
}

# Keyword arguments unpacked into CatBoostClassifier(**cat_params).
cat_params = {
    'iterations': 2000,
    'learning_rate': 0.01,
    'depth': 7,
    'l2_leaf_reg': 3,
    'random_seed': 42,
    'verbose': False
}

print("Training models with cross-validation...")
# One pass per stratified fold: fit every model on the fold's training rows,
# store its out-of-fold probabilities, and add 1/n_folds of its test-set
# probabilities to the running test prediction.
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f'Fold {fold + 1}')
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # The scaler is fitted on this fold's training rows only, then reused
    # unchanged for the validation rows and the test table.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(test_df[feature_cols])

    # Fresh, unfitted estimators for this fold
    lgb_model = LGBMClassifier(**lgb_params)
    xgb_model = XGBClassifier(**xgb_params)
    cat_model = CatBoostClassifier(**cat_params)

    # Identical fit / predict routine, run for LightGBM, XGBoost and CatBoost
    # in that order, each paired with its own accumulator arrays.
    for model, oof, test_sum in (
        (lgb_model, oof_lgb, predictions_lgb),
        (xgb_model, oof_xgb, predictions_xgb),
        (cat_model, oof_cat, predictions_cat),
    ):
        model.fit(X_train_scaled, y_train)
        # Keep column index 1 of predict_proba (the second class)
        oof[val_idx] = model.predict_proba(X_val_scaled)[:, 1]
        # In-place add, so the module-level predictions_* array is updated
        test_sum += model.predict_proba(X_test_scaled)[:, 1] / n_folds

# ROC AUC of each model, measured on its out-of-fold predictions
print("\nCross-validation scores:")
lgb_auc = roc_auc_score(y, oof_lgb)
print(f"LightGBM CV score: {lgb_auc:.4f}")
xgb_auc = roc_auc_score(y, oof_xgb)
print(f"XGBoost CV score: {xgb_auc:.4f}")
cat_auc = roc_auc_score(y, oof_cat)
print(f"CatBoost CV score: {cat_auc:.4f}")

Creating features...
Training models with cross-validation...
Fold 1
[LightGBM] [Info] Number of positive: 12420, number of negative: 11320
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015963 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37553
[LightGBM] [Info] Number of data points in the train set: 23740, number of used features: 307
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.523168 -> initscore=0.092737
[LightGBM] [Info] Start training from score 0.092737
Fold 2
[LightGBM] [Info] Number of positive: 12421, number of negative: 11319
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014651 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37560
[LightGBM] [Info] Number of data points in the train set: 23740, number of used features: 307
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.523210 -> initscor

In [None]:
# Find optimal weights
from scipy.optimize import minimize

def objective(weights):
    """Return the negated ROC AUC of the weighted out-of-fold blend.

    ``weights`` is indexed at positions 0, 1 and 2, which scale the LightGBM,
    XGBoost and CatBoost out-of-fold predictions respectively. The score is
    negated because this function is passed to a minimiser.
    """
    blend = weights[0] * oof_lgb
    blend = blend + weights[1] * oof_xgb
    blend = blend + weights[2] * oof_cat
    return -roc_auc_score(y, blend)

# Weight search.
#
# ROC AUC depends only on the ordering of the blended scores, so as a function
# of the weights it is piecewise constant: a tiny step in the weights almost
# never changes it. A gradient-based method such as SLSQP estimates gradients
# from exactly such tiny finite-difference steps, sees zero (or noise), and
# tends to stop at the starting point. Nelder-Mead is derivative-free and only
# compares objective values, so it can actually move.
#
# AUC is also unchanged when every weight is multiplied by the same positive
# factor, so the "weights sum to 1" condition is applied by normalising the
# result afterwards instead of as an optimiser constraint (Nelder-Mead accepts
# bounds but not constraints; bounds support needs SciPy >= 1.7).
initial_weights = [1/3, 1/3, 1/3]
bounds = [(0, 1), (0, 1), (0, 1)]  # keeps every weight non-negative
result = minimize(
    objective,
    initial_weights,
    method='Nelder-Mead',
    bounds=bounds,
    # fatol is tightened from the 1e-4 default because AUC differences in the
    # fourth decimal place are what this search is trying to resolve.
    options={'xatol': 1e-4, 'fatol': 1e-6},
)
if not result.success:
    # The best point found so far is still usable; just make the status visible.
    print(f"Weight search stopped early: {result.message}")

weight_total = np.sum(result.x)
if weight_total > 0:
    optimal_weights = result.x / weight_total
else:
    # All-zero weights give a constant blend with no ranking information;
    # fall back to the equal-weight blend.
    optimal_weights = np.array(initial_weights)

print("\nOptimal weights:", optimal_weights)

# Blend the out-of-fold predictions with the selected weights and score the blend
w_lgb, w_xgb, w_cat = optimal_weights
oof_weighted = w_lgb * oof_lgb + w_xgb * oof_xgb + w_cat * oof_cat

ensemble_auc = roc_auc_score(y, oof_weighted)
print(f"\nFinal ensemble CV score: {ensemble_auc:.4f}")

# Apply the same weights to the fold-averaged test-set predictions
lgb_share = optimal_weights[0] * predictions_lgb
xgb_share = optimal_weights[1] * predictions_xgb
cat_share = optimal_weights[2] * predictions_cat
test_pred = lgb_share + xgb_share + cat_share

# Two-column submission table: the test IDs next to the blended prediction,
# written to CSV without the DataFrame index
submission_columns = {'ID': test_df['ID'], 'radiant_win': test_pred}
submission = pd.DataFrame(submission_columns)
submission.to_csv('optimized_ensemble_submission.csv', index=False)
print("\nSubmission file created!")