In [None]:
import pandas as pd
import xgboost as xgb
import optuna #hyperparameter optimization
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.model_selection import RandomizedSearchCV, cross_val_predict
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer, r2_score


In [None]:
file_path = 'student-por.csv'
df = pd.read_csv(file_path)

print(df.head())

In [None]:

X = df[['G1', 'G2']]
X = pd.concat([X, df.drop(columns=['G3'])], axis=1) #include all other features except G3
y = df[['G3']] #g3 is target var

#one hot encoding for categorical variables
X = pd.get_dummies(X, drop_first=True)


Remove Outliers 
(Using IQR to remove outliers)

In [None]:
#from scipy import stats
import numpy as np 

#only complete task on numerical columns
numerical_columns = X.select_dtypes(include=[np.number])

#calculate q1 and q1
Q1 = numerical_columns.quantile(0.25)
Q3 = numerical_columns.quantile(0.75)
IQR = Q3 - Q1

#outlier bounds
lower_bound = numerical_columns < (Q1 - 1.5 * IQR)
upper_bound = numerical_columns > (Q3 + 1.5 * IQR)


outliers = lower_bound | upper_bound
non_outliers = ~outliers.any(axis=1)

#remove outliers
X_clean = X[non_outliers]
y_clean = y[non_outliers]



Attempt with RandomizedSearchCV

In [None]:
#split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, test_size=0.2, random_state=42)

y_train = y_train.squeeze()
y_test = y_test.squeeze()
#make sure features are in correct format
X_train = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
X_test = X_test.values if isinstance(X_test, pd.DataFrame) else X_test

#define XG boost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

#define parameter grid
#specifies range of hyperparams that will be searched through
#RSCV --> randomly samples fixed # of combinations, evaluates useing cross-val
param_grid = {
    'n_estimators': [100, 200, 300], # num. trees in ensemble
    'max_depth': [3, 4, 5], #max depth of tree
    'learning_rate': [0.1, 0.2], #how much model adjusts each round
    'subsample': [0.7, 0.8], #fraction of training data used to build tree
    'colsample_bytree': [0.7, 0.8], #fraction of features selected for each tree
    'reg_alpha': [0.5, 1, 2], #l1 reg.
    'reg_lambda': [1.5, 2, 3], #l2 reg.
}

#set up scoring functions
scorers = {
    'mse': make_scorer(mean_squared_error, greater_is_better=False),
    'mae': make_scorer(mean_absolute_error, greater_is_better=False)
}

#set up RandomizedSearchCV for mse
random_search_mse = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_grid, 
                                   n_iter=100, cv=5, verbose=2, random_state=42, 
                                   scoring='neg_mean_squared_error')

random_search_mse.fit(X_train, y_train.values.ravel())

#setup RandomizedSearchCV for mae
random_search_mae = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_grid, 
                                       n_iter=100, cv=5, verbose=2, random_state=42, 
                                       scoring=scorers['mae'])
random_search_mae.fit(X_train, y_train)

#eval on test(validation) set
y_pred_mse = random_search_mse.best_estimator_.predict(X_test)
y_pred_mae = random_search_mae.best_estimator_.predict(X_test)

mse_final = mean_squared_error(y_test, y_pred_mse)
mae_final = mean_absolute_error(y_test, y_pred_mae)
print(f'XGBoost Model Mean Squared Error: {mse_final}')
print(f'XGBoost Model Mean ABS Error: {mae_final}')


With RandomizedSearchCV, MSE --> 1.62, MAE --> 0.74

After removing outliers, MSE 1.36, MAE 0.85

Attempt with Optuna

In [25]:
import optuna #hyperparam optimizer(bayesian optimization)

X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, test_size=0.2, random_state=42)
y_train = y_train.squeeze()
y_test = y_test.squeeze()
#make sure features are in correct format
X_train = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
X_test = X_test.values if isinstance(X_test, pd.DataFrame) else X_test


def objective_mse(trial):
    param = {
        'objective': 'reg:squarederror',
        'random_state': 42,
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        'tree_method': 'hist',  #use histogram-based method (CPU)
        'device': 'cuda'        #use GPU (CUDA) for training
    }

    #train model with the current set of hyperparameters
    xgb_model = xgb.XGBRegressor(**param)

    #use cross validation for stabler evaluation
    cv_scores = cross_val_score(xgb_model, X_train, y_train.values.ravel(), cv=5, scoring='neg_mean_squared_error')

    return -1 * cv_scores.mean()

def objective_mae(trial):
    param = {
        'objective': 'reg:squarederror',
        'random_state': 42,
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
    }

    #train model with the current set of hyperparameters
    xgb_model = xgb.XGBRegressor(**param)

    #use cross validation for stabler evaluation
    cv_scores = cross_val_score(xgb_model, X_train, y_train.values.ravel(), cv=5, scoring='neg_mean_absolute_error')

    return -1 * cv_scores.mean()

#create study and optimize
study_mse = optuna.create_study(direction='minimize')
study_mse.optimize(objective_mse, n_trials=300)

#study_mae = optuna.create_study(direction='minimize')
#study_mae.optimize(objective_mae, n_trials=250)

#obtain best params + train final model
best_params_mse = study_mse.best_params
#best_params_mae = study_mae.best_params

best_xgb_model_mse = xgb.XGBRegressor(**best_params_mse)
best_xgb_model_mse.fit(X_train, y_train.values.ravel())
y_pred_final_mse = best_xgb_model_mse.predict(X_test)
mse_final = mean_squared_error(y_test, y_pred_final_mse)

#best_xgb_model_mae = xgb.XGBRegressor(**best_params_mae)
# best_xgb_model_mae.fit(X_train, y_train.values.ravel())
# y_pred_final_mae = best_xgb_model_mae.predict(X_test)
# mae_final = mean_absolute_error(y_test, y_pred_final_mae)

print(f'Final Model Mean Squared Error: {mse_final}')
#print(f'Final Model Mean Absolute Error: {mae_final}')





[I 2024-08-12 16:04:39,519] A new study created in memory with name: no-name-77871733-90cf-41b3-b8aa-08f5482819ba
[I 2024-08-12 16:04:51,001] Trial 0 finished with value: 1.259661946396306 and parameters: {'n_estimators': 501, 'max_depth': 17, 'learning_rate': 0.29744836946008013, 'subsample': 0.6656843344048304, 'colsample_bytree': 0.8040149492631081, 'reg_alpha': 9.084053418259344, 'reg_lambda': 7.795201045695941, 'min_child_weight': 1, 'gamma': 0.04363189360224229}. Best is trial 0 with value: 1.259661946396306.
[I 2024-08-12 16:04:57,858] Trial 1 finished with value: 1.121434037019144 and parameters: {'n_estimators': 568, 'max_depth': 10, 'learning_rate': 0.2708570465436004, 'subsample': 0.8601015995181187, 'colsample_bytree': 0.504277562654605, 'reg_alpha': 7.162313953473563, 'reg_lambda': 7.058843042801874, 'min_child_weight': 3, 'gamma': 2.5964599382130538}. Best is trial 1 with value: 1.121434037019144.
[I 2024-08-12 16:05:06,936] Trial 2 finished with value: 1.1352938459806157

KeyboardInterrupt: 

With optuna(bayesian optimization), obtained 1.56 MSE, 0.747 MAE

After removing outliers, obtained 1.37 MSE