In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import joblib
from sklearn.preprocessing import StandardScaler

# Load processed data
df = pd.read_csv('../data/processed/ml_data12.csv')

# Prepare training data (features and target)
X = df.drop(columns=['universityRankingNum', 'uniqueID'])
y = df['universityRankingNum']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost Model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Hyperparameter tuning for XGBoost using GridSearchCV
xgb_param_grid = {
    'n_estimators': [100, 300, 500, 700],  # Higher number of estimators
    'max_depth': [3, 5, 6, 10],  # Depth control for trees
    'learning_rate': [0.01, 0.05, 0.1],  # Learning rate
    'subsample': [0.7, 0.8, 0.9, 1.0],  # Subsampling
    'colsample_bytree': [0.7, 0.8, 0.9],  # Feature subsampling
    'reg_alpha': [0.01, 0.1, 1.0],  # L1 regularization
    'reg_lambda': [0.01, 0.1, 1.0]  # L2 regularization
}

# Perform GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(xgb_model, xgb_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters from GridSearchCV
print(f"Best parameters from XGBoost GridSearchCV: {grid_search.best_params_}")

# Train the best XGBoost model
best_xgb_model = grid_search.best_estimator_

# Training with early stopping
eval_set = [(X_train, y_train), (X_test, y_test)]  # Evaluation set

# Specify the parameters for early stopping and eval_metric
params = {
    'eval_metric': 'rmse',  # Use RMSE for evaluation
    'early_stopping_rounds': 50  # Stop after 50 rounds with no improvement
}

# Fit the model with early stopping using eval_set
best_xgb_model.fit(
    X_train, y_train,
    eval_set=eval_set,
    verbose=True,
    **params  # Include eval_metric and early_stopping_rounds
)

# Make predictions and evaluate the model
y_pred = best_xgb_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"✅ XGBoost Test RMSE: {rmse:.2f}")
print(f"✅ XGBoost R²: {r2:.2f}")

# Save the best XGBoost model
joblib.dump(best_xgb_model, '../models/xgboost_model_best.joblib')
print("✅ Best XGBoost model saved to '../models/xgboost_model_best.joblib'")

# Optional: Evaluate feature importance after training
import matplotlib.pyplot as plt

# Plot feature importance
xgb.plot_importance(best_xgb_model, importance_type='weight', max_num_features=10)
plt.title('XGBoost Feature Importance')
plt.show()

Best parameters from XGBoost GridSearchCV: {'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300, 'reg_alpha': 0.01, 'reg_lambda': 0.1, 'subsample': 1.0}


TypeError: XGBModel.fit() got an unexpected keyword argument 'eval_metric'