In [1]:
import xgboost as xgb
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error
import pandas as pd

In [17]:
def optimize_xgboost(X_train, X_val, y_train, y_val, initial_model=None):
    """
    Optimize XGBoost model using a systematic approach.
    Returns the best model and a dictionary of results.
    """
    # Step 1: Define base model if none provided
    if initial_model is None:
        initial_model = xgb.XGBRegressor(
            objective='reg:squarederror',
            random_state=42,
            eval_metric='rmse'  # Add eval metric here
        )
    
    # Step 2: Define parameter grid for initial search
    param_grid = {
        'max_depth': [3, 4, 5],
        'learning_rate': [0.01, 0.05, 0.1],
        'n_estimators': [100, 200],
        'min_child_weight': [1, 3],
        'subsample': [0.8, 0.9],
        'colsample_bytree': [0.8, 0.9]
    }
    
    # Step 3: Perform grid search with cross-validation
    grid_search = GridSearchCV(
        estimator=initial_model,
        param_grid=param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=1
    )
    
    # Fit grid search without early stopping
    grid_search.fit(X_train, y_train)
    
    # Step 4: Fine-tune around best parameters
    best_params = grid_search.best_params_
    fine_param_grid = {
        'max_depth': [best_params['max_depth'] - 1, best_params['max_depth'], best_params['max_depth'] + 1],
        'learning_rate': [best_params['learning_rate'] * 0.5, best_params['learning_rate'], best_params['learning_rate'] * 1.5],
        'n_estimators': [best_params['n_estimators'] - 50, best_params['n_estimators'], best_params['n_estimators'] + 50],
        'reg_alpha': [0, 0.001, 0.01],  # L1 regularization
        'reg_lambda': [0, 0.001, 0.01]   # L2 regularization
    }
    
    # Perform fine-tuning grid search
    fine_grid_search = GridSearchCV(
        estimator=initial_model,
        param_grid=fine_param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=1
    )
    
    fine_grid_search.fit(X_train, y_train)
    
    # Step 5: Create final model with best parameters and early stopping
    final_params = fine_grid_search.best_params_.copy()
    final_model = xgb.XGBRegressor(
        **final_params,
        eval_metric='rmse',
        random_state=42
    )
    
    # Fit final model with early stopping
    eval_set = [(X_train, y_train), (X_val, y_val)]
    final_model.fit(
        X_train,
        y_train,
        eval_set=eval_set,
        verbose=False
    )
    
    # Step 6: Evaluate final model
    train_pred = final_model.predict(X_train)
    val_pred = final_model.predict(X_val)
    
    results = {
        'best_params': fine_grid_search.best_params_,
        'train_mse': mean_squared_error(y_train, train_pred),
        'val_mse': mean_squared_error(y_val, val_pred),
        'feature_importance': pd.DataFrame({
            'feature': X_train.columns,
            'importance': final_model.feature_importances_
        }).sort_values('importance', ascending=False)
    }
    
    return final_model, results

In [18]:
#Read/Adjust X_train and y_train Data

X_train_full = pd.read_csv('/Users/powellshayne/Desktop/githubrepos/VictorVis2.0/Shayne/X_train.csv')
X_train_full = X_train_full.set_index('nickname')
X_train_full = X_train_full[['series_count', 'game_count', 'kills_per_game', 'deaths_per_game', 'avg_kills', 'avg_deaths']]

y_train_full = pd.read_csv('/Users/powellshayne/Desktop/githubrepos/VictorVis2.0/Shayne/y_train.csv')
y_train_full = y_train_full.set_index('nickname')

In [19]:
#Train, Test, Split
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=60)

In [20]:
#Run Optimization Function
optimized_model, optimization_results = optimize_xgboost(X_train, X_val, y_train, y_val)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 243 candidates, totalling 1215 fits


In [21]:
#Print Results
print("Best Parameters:", optimization_results['best_params'])
print("\nTraining MSE:", optimization_results['train_mse'])
print("Validation MSE:", optimization_results['val_mse'])
print("\nTop 10 Important Features:")
print(optimization_results['feature_importance'].head(10))


Best Parameters: {'learning_rate': 0.15000000000000002, 'max_depth': 2, 'n_estimators': 250, 'reg_alpha': 0, 'reg_lambda': 0.01}

Training MSE: 3.342552570570211e-05
Validation MSE: 0.0028784309165266536

Top 10 Important Features:
           feature  importance
3  deaths_per_game    0.524894
2   kills_per_game    0.426764
4        avg_kills    0.041488
5       avg_deaths    0.004760
1       game_count    0.001654
0     series_count    0.000439


In [22]:
#Final Evaluation on test set

X_test = pd.read_csv('/Users/powellshayne/Desktop/githubrepos/VictorVis2.0/Shayne/X_test.csv')
X_test = X_test.set_index('nickname')
X_test = X_test[['series_count', 'game_count', 'kills_per_game', 'deaths_per_game', 'avg_kills', 'avg_deaths']]

y_test = pd.read_csv('/Users/powellshayne/Desktop/githubrepos/VictorVis2.0/Shayne/y_test.csv')
y_test = y_test.set_index('nickname')

test_pred = optimized_model.predict(X_test)
test_mse = mean_squared_error(y_test, test_pred)
print('\nTest MSE:', test_mse)



Test MSE: 0.002730083611904867


## Optimization Results

Before optimizing the XGBRegressor model, the Mean-Squared-Error was:
> ~0.007

After optimization, the XGBRegressor model's Mean-Squared-Error is:
> ~0.0027