In this file, we move to using XGBoost (with gridsearch) to actually predict. 

In [59]:
# load libraries that may be of use
import numpy as np
import sklearn as sk
import pandas as pd
import matplotlib.pyplot as plt
import torch
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
import random
from scipy.stats import randint, uniform
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
# seed (idk how necessary this is)
np.random.seed(324324)

In [40]:
# read in the data appropriately
X_train = pd.read_csv("/Users/senadkokic/Desktop/W2024/STAT 841/Final Project/X_train_cleaned.csv")
y_train = pd.read_csv("/Users/senadkokic/Desktop/W2024/STAT 841/Final Project/Y_train.csv")
X_test = pd.read_csv("/Users/senadkokic/Desktop/W2024/STAT 841/Final Project/X_test_cleaned.csv")
# drop the first column in X_train and X_test
X_train = X_train.drop("Unnamed: 0", axis = 1)
X_test = X_test.drop("Unnamed: 0", axis = 1)
# using a label encoder to use proper class 
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train['label'])

Now, we get to the meat and potatos: using grid search. 

In [43]:
param_dist = {
    'learning_rate': uniform(0.001, 0.3),
    'n_estimators': randint(100, 1000),
    'max_depth': randint(3, 10),
    'min_child_weight': randint(1, 10),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5),
    'gamma': uniform(0, 0.5),
    'reg_alpha': uniform(0, 0.5),
    'reg_lambda': uniform(0, 0.5)
}
model = XGBClassifier()
# define the random search
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, n_iter=500,
                                   cv=5, n_jobs = -1, random_state=314, verbose=10)

# perform random search
random_search.fit(X_train, y_train)

# best parameters found
print("Best parameters:", random_search.best_params_)

# best score found
print("Best accuracy:", random_search.best_score_)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
[CV 3/5; 1/500] START colsample_bytree=0.9584367907464837, gamma=0.29427095682096105, learning_rate=0.08051432535173861, max_depth=9, min_child_weight=1, n_estimators=619, reg_alpha=0.45790412957726045, reg_lambda=0.4776390333769898, subsample=0.5222089374828354
[CV 1/5; 1/500] START colsample_bytree=0.9584367907464837, gamma=0.29427095682096105, learning_rate=0.08051432535173861, max_depth=9, min_child_weight=1, n_estimators=619, reg_alpha=0.45790412957726045, reg_lambda=0.4776390333769898, subsample=0.5222089374828354
[CV 2/5; 1/500] START colsample_bytree=0.9584367907464837, gamma=0.29427095682096105, learning_rate=0.08051432535173861, max_depth=9, min_child_weight=1, n_estimators=619, reg_alpha=0.45790412957726045, reg_lambda=0.4776390333769898, subsample=0.5222089374828354
[CV 4/5; 1/500] START colsample_bytree=0.9584367907464837, gamma=0.29427095682096105, learning_rate=0.08051432535173861, max_depth=9, min_child_wei



[CV 5/5; 9/500] START colsample_bytree=0.5222854728887216, gamma=0.2150259542109662, learning_rate=0.2959095248772951, max_depth=7, min_child_weight=6, n_estimators=850, reg_alpha=0.04503979988576001, reg_lambda=0.26509020723410753, subsample=0.6340953335022488
[CV 2/5; 7/500] END colsample_bytree=0.6300628867450083, gamma=0.331449805391708, learning_rate=0.04291598997819727, max_depth=5, min_child_weight=6, n_estimators=820, reg_alpha=0.3051176479260391, reg_lambda=0.43369455797627104, subsample=0.5744675103078685;, score=0.598 total time=  40.3s
[CV 1/5; 10/500] START colsample_bytree=0.6737932636208017, gamma=0.10675089313296093, learning_rate=0.13051684593814772, max_depth=9, min_child_weight=3, n_estimators=667, reg_alpha=0.17099171194171292, reg_lambda=0.020846294415557742, subsample=0.9706829256231915
[CV 5/5; 8/500] END colsample_bytree=0.9320312413356666, gamma=0.22495693490642205, learning_rate=0.21905471070254431, max_depth=6, min_child_weight=9, n_estimators=276, reg_alpha=

In [44]:
params = random_search.best_params_
mod = XGBClassifier(**params)
mod.fit(X_train, y_train)
y_pred = mod.predict_proba(X_test)
for i in range(0, len(X_test)):
    print(i, ",", y_pred[i][0], ",", y_pred[i][1], ",", y_pred[i][2], ",", y_pred[i][3], ",", y_pred[i][4], sep = '')

0,0.005636112,0.3784677,0.32603356,0.22395554,0.06590705
1,0.007885923,0.137171,0.6776898,0.15907457,0.018178718
2,0.003985125,0.08919668,0.68433523,0.20741801,0.015064994
3,0.018112108,0.08322373,0.46647012,0.33137035,0.10082369
4,0.0106324395,0.070766956,0.51733196,0.338238,0.06303061
5,0.0036633732,0.56979495,0.37056696,0.042070996,0.013903732
6,0.0018610717,0.3699399,0.5319056,0.083676524,0.012616905
7,0.006911261,0.05717552,0.40466854,0.42570764,0.105537
8,0.0027566967,0.67970294,0.25627422,0.046327434,0.014938747
9,0.011196803,0.34568354,0.45889014,0.14814495,0.036084566
10,0.0037265925,0.6970486,0.24148373,0.039090898,0.018650128
11,0.0066376817,0.012886194,0.045725003,0.42087066,0.51388043
12,0.002346277,0.5106455,0.44396162,0.035429128,0.00761741
13,0.0026904687,0.38963103,0.5490564,0.0506912,0.007930865
14,0.0054907724,0.24549042,0.61076266,0.11410805,0.024148138
15,0.0031083948,0.51733935,0.42919925,0.043732177,0.006620838
16,0.003924155,0.21930005,0.68061304,0.0810964,0.015

... What about using the first 100 again?

In [48]:
X_train = pd.read_csv("/Users/senadkokic/Desktop/W2024/STAT 841/Final Project/X_train.csv")
y_train = pd.read_csv("/Users/senadkokic/Desktop/W2024/STAT 841/Final Project/Y_train.csv")
X_test = pd.read_csv("/Users/senadkokic/Desktop/W2024/STAT 841/Final Project/X_test.csv")
# conversion from "object" columns to "category" ones
object_columns = X_train.select_dtypes(include=['object']).columns
X_train[object_columns] = X_train[object_columns].astype('category')
label_encoders = {}
for column in X_train.select_dtypes(include=['category']).columns:
    label_encoders[column] = LabelEncoder()
    X_train[column] = label_encoders[column].fit_transform(X_train[column])

object_columns = X_test.select_dtypes(include=['object']).columns
X_test[object_columns] = X_test[object_columns].astype('category')
label_encoders = {}
for column in X_test.select_dtypes(include=['category']).columns:
    label_encoders[column] = LabelEncoder()
    X_test[column] = label_encoders[column].fit_transform(X_test[column])
# using a label encoder to use proper class 
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train['label'])
column_names = X_train.columns
# imputing missing values
imputer = SimpleImputer(strategy='median') 
X_train = imputer.fit_transform(X_train)
X_test = imputer.fit_transform(X_test)
# reconvert to data frames
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
# changing back the column names
X_train = X_train.set_axis(column_names, axis = 1)
X_test = X_test.set_axis(column_names, axis = 1)

In [49]:
model = XGBClassifier()
model.fit(X_train, y_train)

In [50]:
# extract the importance of the variables
imps = model.feature_importances_
# extract the names of the features
feats = X_train.columns
# make a dictionary based on the above
feature_importance_dict = dict(zip(feats, imps))
# sort it out
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
# print or use the sorted feature importance
#for feature, importance in sorted_feature_importance:
#    print(f"{feature}: {importance}")
te = 0
list_bad_feats = []
for feature, importance in sorted_feature_importance:
    if (importance == 0.0):
        print(f"{feature}: {importance}")
        te += 1
        list_bad_feats.append(feature)

f46_IT: 0.0
v72_DE: 0.0
v73_DE: 0.0
v74_DE: 0.0
v75_DE: 0.0
v76_DE: 0.0
v77_DE: 0.0
v78_DE: 0.0
v79_DE: 0.0
f112_SE: 0.0
v182_DK: 0.0
v183_DK: 0.0
age_r: 0.0
age_r2: 0.0
age_r3: 0.0
v243_edulvlb_2: 0.0
v243_edulvlb_1: 0.0
v243_ISCED_2: 0.0
v243_r: 0.0
v252_edulvlb_2: 0.0
v252_edulvlb_1: 0.0
v252_ISCED_2: 0.0
v252_r: 0.0
v252_cs_GB1: 0.0
v262_edulvlb_2: 0.0
v262_edulvlb_1: 0.0
v262_ISCED_2: 0.0
v263_edulvlb_2: 0.0
v263_edulvlb_1: 0.0
v263_ISCED_2: 0.0


In [53]:
firsthundo = []
for i in range(0,100):
    firsthundo.append(sorted_feature_importance[i][0])
X_train = X_train[firsthundo]
X_test = X_test[firsthundo]

In [56]:
param_dist = {
    'learning_rate': uniform(0.001, 0.3),
    'n_estimators': randint(100, 1000),
    'max_depth': randint(3, 10),
    'min_child_weight': randint(1, 10),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5),
    'gamma': uniform(0, 0.5),
    'reg_alpha': uniform(0, 0.5),
    'reg_lambda': uniform(0, 0.5)
}
model = XGBClassifier()
# define the random search
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, n_iter=500,
                                   cv=5, n_jobs = -1, scoring="neg_log_loss", random_state=314, verbose=10)

# perform random search
random_search.fit(X_train, y_train)

# best parameters found
print("Best parameters:", random_search.best_params_)

# best score found
print("Best accuracy:", random_search.best_score_)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
[CV 4/5; 1/500] START colsample_bytree=0.9584367907464837, gamma=0.29427095682096105, learning_rate=0.08051432535173861, max_depth=9, min_child_weight=1, n_estimators=619, reg_alpha=0.45790412957726045, reg_lambda=0.4776390333769898, subsample=0.5222089374828354
[CV 2/5; 1/500] START colsample_bytree=0.9584367907464837, gamma=0.29427095682096105, learning_rate=0.08051432535173861, max_depth=9, min_child_weight=1, n_estimators=619, reg_alpha=0.45790412957726045, reg_lambda=0.4776390333769898, subsample=0.5222089374828354
[CV 1/5; 2/500] START colsample_bytree=0.8716941994121796, gamma=0.4699458936630061, learning_rate=0.13508996485108737, max_depth=3, min_child_weight=5, n_estimators=273, reg_alpha=0.3181590564027635, reg_lambda=0.03265591278702479, subsample=0.9592911313406354
[CV 2/5; 2/500] START colsample_bytree=0.8716941994121796, gamma=0.4699458936630061, learning_rate=0.13508996485108737, max_depth=3, min_child_weigh



[CV 2/5; 12/500] END colsample_bytree=0.9340815928362433, gamma=0.1704813984226623, learning_rate=0.19046978513044358, max_depth=5, min_child_weight=3, n_estimators=115, reg_alpha=0.3533145650685386, reg_lambda=0.13709151919629659, subsample=0.8141955890238539;, score=-0.845 total time=   4.2s
[CV 2/5; 13/500] START colsample_bytree=0.719338335888264, gamma=0.4113852579605368, learning_rate=0.2521333738408973, max_depth=9, min_child_weight=8, n_estimators=157, reg_alpha=0.21550162063556727, reg_lambda=0.4214543486348736, subsample=0.5577430022148953
[CV 3/5; 12/500] END colsample_bytree=0.9340815928362433, gamma=0.1704813984226623, learning_rate=0.19046978513044358, max_depth=5, min_child_weight=3, n_estimators=115, reg_alpha=0.3533145650685386, reg_lambda=0.13709151919629659, subsample=0.8141955890238539;, score=-0.843 total time=   4.1s
[CV 4/5; 12/500] END colsample_bytree=0.9340815928362433, gamma=0.1704813984226623, learning_rate=0.19046978513044358, max_depth=5, min_child_weight=

In [57]:
params = random_search.best_params_
mod = XGBClassifier(**params)
mod.fit(X_train, y_train)
y_pred = mod.predict_proba(X_test)
for i in range(0, len(X_test)):
    print(i, ",", y_pred[i][0], ",", y_pred[i][1], ",", y_pred[i][2], ",", y_pred[i][3], ",", y_pred[i][4], sep = '')

0,0.0018802086,0.81817394,0.1314453,0.03745534,0.011045179
1,0.0035775315,0.09212921,0.81912935,0.07996158,0.0052022655
2,0.0069242422,0.16411161,0.55966496,0.25273445,0.016564747
3,0.012854427,0.06279712,0.7008484,0.20085604,0.022643974
4,0.0066757994,0.23826571,0.62881684,0.10478171,0.021459954
5,0.0020351498,0.6913148,0.26976126,0.030744502,0.0061442903
6,0.0018784457,0.74185014,0.2254817,0.018026736,0.012762923
7,0.00878817,0.028792199,0.34342808,0.3500794,0.26891214
8,0.0022905185,0.49775156,0.40706965,0.07240583,0.0204824
9,0.0021142904,0.8282532,0.11978684,0.040118996,0.009726658
10,0.0067299637,0.52528757,0.31905887,0.119423665,0.02949994
11,0.027762981,0.057788763,0.22590522,0.44504386,0.2434991
12,0.00074911036,0.89215386,0.09529162,0.010078305,0.0017270871
13,0.002317137,0.31657234,0.63840437,0.034950785,0.007755389
14,0.0019605146,0.11540349,0.53309673,0.33287898,0.016660308
15,0.0025816287,0.5264444,0.4371816,0.029707465,0.004084983
16,0.0012739509,0.8276601,0.14156595,0.0

How about a Bayesian optimization?

In [60]:
# Define the parameter space for Bayesian optimization
param_space = {
    'learning_rate': Real(0.001, 0.3, 'log-uniform'),
    'n_estimators': Integer(100, 1000),
    'max_depth': Integer(3, 10),
    'min_child_weight': Integer(1, 10),
    'subsample': Real(0.5, 1.0),
    'colsample_bytree': Real(0.5, 1.0),
    'gamma': Real(0.0, 0.5),
    'reg_alpha': Real(0.0, 0.5),
    'reg_lambda': Real(0.0, 0.5),
}

bayesmod = XGBClassifier()

# Setup BayesSearchCV
bayes_search = BayesSearchCV(
    estimator=bayesmod,
    search_spaces=param_space,
    n_iter=500,
    cv=5,
    n_jobs=-1,
    scoring="neg_log_loss",
    random_state=314,
    verbose=10
)

In [61]:
# perform the search
bayes_search.fit(X_train, y_train)

# best parameters found
print("Best parameters:", bayes_search.best_params_)

# best score found
print("Best accuracy:", bayes_search.best_score_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 5/5; 1/1] START colsample_bytree=0.571850507639275, gamma=0.18888500946820663, learning_rate=0.005441328203749632, max_depth=5, min_child_weight=4, n_estimators=494, reg_alpha=0.3687790287998559, reg_lambda=0.4954834960293144, subsample=0.9120133147395711
[CV 4/5; 1/1] START colsample_bytree=0.571850507639275, gamma=0.18888500946820663, learning_rate=0.005441328203749632, max_depth=5, min_child_weight=4, n_estimators=494, reg_alpha=0.3687790287998559, reg_lambda=0.4954834960293144, subsample=0.9120133147395711
[CV 2/5; 1/1] START colsample_bytree=0.571850507639275, gamma=0.18888500946820663, learning_rate=0.005441328203749632, max_depth=5, min_child_weight=4, n_estimators=494, reg_alpha=0.3687790287998559, reg_lambda=0.4954834960293144, subsample=0.9120133147395711
[CV 1/5; 1/1] START colsample_bytree=0.571850507639275, gamma=0.18888500946820663, learning_rate=0.005441328203749632, max_depth=5, min_child_weight=4, n_estimat

In [62]:
params = bayes_search.best_params_
modbayes = XGBClassifier(**params)
modbayes.fit(X_train, y_train)
y_pred = modbayes.predict_proba(X_test)
for i in range(0, len(X_test)):
    print(i, ",", y_pred[i][0], ",", y_pred[i][1], ",", y_pred[i][2], ",", y_pred[i][3], ",", y_pred[i][4], sep = '')

0,0.001970269,0.8317465,0.11104516,0.043724995,0.011513046
1,0.0032498206,0.084209785,0.83366084,0.07372769,0.0051518558
2,0.007360144,0.15142082,0.56378126,0.2633588,0.014078954
3,0.01626934,0.054515127,0.6961875,0.20045051,0.032577507
4,0.0057734554,0.18299586,0.702398,0.0915401,0.017292572
5,0.0024243821,0.7084472,0.24822207,0.0346396,0.006266724
6,0.0020915808,0.8162478,0.15258005,0.019248057,0.009832522
7,0.006973886,0.022733107,0.3296378,0.35219666,0.2884586
8,0.0027886338,0.43371347,0.4508882,0.08654832,0.026061414
9,0.0022262735,0.82663506,0.11299415,0.048962492,0.009182061
10,0.006222518,0.4695071,0.37798733,0.11002187,0.036261205
11,0.024388112,0.06495285,0.19074285,0.5213779,0.19853827
12,0.0011045844,0.8838918,0.098445736,0.013621995,0.0029359083
13,0.0026484327,0.4302129,0.517658,0.041129235,0.008351445
14,0.0026204402,0.11455502,0.5534159,0.31273147,0.016677167
15,0.0029139803,0.52596307,0.4302025,0.035692256,0.0052281446
16,0.0016422507,0.82491916,0.14203444,0.027672296,