### Nikolaos Giannopoulos AM 5199
### Team: Trump Tariffed My Datasets

In [None]:
import numpy as np
import csv
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, log_loss

In [None]:
# Load the three arrays used throughout the notebook from .npy files in the
# current working directory (loaded in the order listed).
X_train, y_train, X_test = map(
    np.load,
    ("X_train.npy", "y_train.npy", "X_test.npy"),
)

# XGBoost Hyper-Tuning

In [None]:
# Recorded result of an earlier run (its parameters contain no learning_rate,
# so it presumably predates learning_rate being part of the search space):
#Best is trial 105 with value: 0.10577266665304749 and {'n_estimators': 491, 'max_depth': 8, 'subsample': 0.8979238963378958, 'colsample_bytree': 0.7931631519474577, 'min_child_weight': 4}
def objective(trial):
    """Optuna objective for XGBoost: mean 5-fold cross-validated log-loss.

    Args:
        trial: the Optuna trial used to sample one hyper-parameter set.

    Returns:
        The mean log-loss over the 5 folds as a positive number (the sign of
        scikit-learn's 'neg_log_loss' is flipped), so lower is better.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 300, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        # suggest_float needs both a lower and an upper bound; with only the
        # lower bound (0.05) the call raised a TypeError on every trial.
        # The upper bound 0.3 matches the LightGBM and CatBoost studies.
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'eval_metric': 'logloss',
        # NOTE(review): deprecated in recent XGBoost releases - confirm
        # whether this flag is still needed for the installed version.
        'use_label_encoder': False
    }

    model = XGBClassifier(**params, device='cuda')
    return -cross_val_score(model, X_train, y_train, scoring='neg_log_loss', cv=5).mean()

# Minimise the cross-validated log-loss over 150 trials.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=150)

print("Best trial:")
print(study.best_trial.params)

In [None]:
# Refit XGBoost on the full training set using the parameters sampled by the
# study's best trial, then predict on the test set.
# Author's note: a learning rate of 0.05 gave the best results. The code
# below uses the study's parameters as-is; it does not override the rate.
xbg = XGBClassifier(device='cuda', **study.best_trial.params)
xbg.fit(X_train, y_train)

# Keep only the second predict_proba column (the class at index 1).
y_pred_xgb = xbg.predict_proba(X_test)[:, 1]
print(y_pred_xgb)

# LightGBM Hyper-Tuning

In [None]:
# Recorded result of an earlier run, made before subsample_freq was set (so
# the 'subsample' value in it had no influence on that result):
#Best is trial 47 with value: 0.10584998479257306 and {'n_estimators': 905, 'learning_rate': 0.12903899598984012, 'max_depth': 3, 'num_leaves': 93, 'min_child_samples': 77, 'subsample': 0.9799477602168288, 'colsample_bytree': 0.9520928492544529}

def objective_lgb(trial):
    """Optuna objective for LightGBM: mean 5-fold cross-validated log-loss.

    Args:
        trial: the Optuna trial used to sample one hyper-parameter set.

    Returns:
        The mean log-loss over the 5 folds as a positive number (the sign of
        scikit-learn's 'neg_log_loss' is flipped), so lower is better.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'num_leaves': trial.suggest_int('num_leaves', 15, 100),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }

    # LightGBM only applies row subsampling (bagging) when subsample_freq is
    # positive; with the default of 0 the sampled 'subsample' value is
    # ignored. subsample_freq is passed outside `params`, so it does not
    # appear in study_lgb.best_params - pass subsample_freq=1 again when
    # rebuilding a model from the best parameters.
    model = LGBMClassifier(**params, subsample_freq=1, random_state=42, device='gpu')
    return -cross_val_score(model, X_train, y_train, scoring='neg_log_loss', cv=5).mean()

# Minimise the cross-validated log-loss over 100 trials.
study_lgb = optuna.create_study(direction='minimize')
study_lgb.optimize(objective_lgb, n_trials=100)
print(study_lgb.best_params)

# CatBoost Hyper-Tuning

In [None]:
# Recorded result of an earlier run.
# NOTE(review): the trial number and value are identical to the ones recorded
# for the LightGBM study - possibly copy-pasted; confirm against the logs.
#Best is trial 47 with value: 0.10584998479257306 and {'learning_rate': 0.07728841406235608, 'depth': 7, 'l2_leaf_reg': 9.09712685218602, 'border_count': 77}
def objective_cat(trial):
    """Optuna objective for CatBoost: mean 5-fold cross-validated log-loss.

    Args:
        trial: the Optuna trial used to sample one hyper-parameter set.

    Returns:
        The mean log-loss over the 5 folds as a positive number (the sign of
        scikit-learn's 'neg_log_loss' is flipped), so lower is better.
    """
    # Hyper-parameters sampled per trial.
    search_space = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
        'border_count': trial.suggest_int('border_count', 32, 255),
    }
    # Settings shared by every trial.
    fixed_settings = {
        'verbose': 0,
        'loss_function': 'Logloss',
        'random_seed': 42,
        'task_type': "GPU",
    }

    model = CatBoostClassifier(**search_space, **fixed_settings)
    fold_scores = cross_val_score(model, X_train, y_train, scoring='neg_log_loss', cv=5)
    return -fold_scores.mean()

# Minimise the cross-validated log-loss over 50 trials.
study_cat = optuna.create_study(direction='minimize')
study_cat.optimize(objective_cat, n_trials=50)
print(study_cat.best_params)

In [None]:
# Final CatBoost model, trained on the full training set.
# Author's note: learning rate reduced to 1/3 and border_count increased to
# (about) double, relative to the tuned values.
# NOTE(review): no eval_set is passed to fit(), so early_stopping_rounds
# presumably has nothing to monitor and all 10000 iterations run - confirm.
catboost = CatBoostClassifier(
    verbose=0,
    loss_function='Logloss',
    task_type="GPU",
    learning_rate=0.025,
    depth=7,  # tree depth is an integer parameter (was written as 7.0)
    l2_leaf_reg=9.09712685218602,
    border_count=140,
    iterations=10000,
    early_stopping_rounds=2000,
)
catboost.fit(X_train, y_train)

# Keep only the second predict_proba column (the class at index 1).
y_pred_cat = catboost.predict_proba(X_test)[:, 1]
print(y_pred_cat)

# Logistic Regression Baseline (cross-validation only, no hyper-parameter tuning)

In [None]:
# Cross-validated log-loss of a logistic regression with fixed settings.
# n_jobs is not passed: scikit-learn ignores it for the 'liblinear' solver
# and only emits a warning when it is set to more than 1.
model = LogisticRegression(solver='liblinear', random_state=42)

# 'neg_log_loss' is negated so that the reported value is a positive loss.
logloss = -cross_val_score(model, X_train, y_train, scoring='neg_log_loss', cv=5).mean()

print("Log-loss of Logistic Regression is ",logloss)

# Random Forest Baseline (cross-validation only, no hyper-parameter tuning)

In [None]:
# Cross-validated log-loss of a 400-tree random forest with fixed settings.
model = RandomForestClassifier(
    n_estimators=400,
    random_state=42,
    n_jobs=-1,
)

# cross_val_score returns one negated log-loss per fold; average the 5 folds
# and flip the sign to report a positive loss.
cv_scores = cross_val_score(model, X_train, y_train, scoring='neg_log_loss', cv=5)
logloss = -cv_scores.mean()

print("Log-loss of Random Forest is ", logloss)

# Model Stacking

In [None]:
#Model Stacking
# Recorded results of earlier runs:
#CatBoost + XGBoost: Log-loss of Stacked Models is  0.12020644024186593
#Log-loss of Stacked Models is  0.1443349556964726

# Base learners with hard-coded hyper-parameters (not read from the studies).
best_xgb_model = XGBClassifier(
    colsample_bytree=0.9,
    learning_rate=0.05,
    max_depth=5,
    min_child_weight=6,
    subsample=0.8864966515073411,
    n_estimators=494,
    device='cuda',
)
# NOTE(review): no eval_set reaches CatBoost through the stacking/CV
# wrappers, so early_stopping_rounds presumably has no effect - confirm.
best_cat_model = CatBoostClassifier(
    verbose=0,
    loss_function='Logloss',
    task_type="GPU",
    learning_rate=0.025,
    depth=7,  # tree depth is an integer parameter (was written as 7.0)
    l2_leaf_reg=9.09712685218602,
    border_count=140,
    iterations=10000,
    early_stopping_rounds=2000,
)
best_light_model = LGBMClassifier(
    n_estimators=905,
    learning_rate=0.12903899598984012,
    max_depth=3,
    num_leaves=93,
    min_child_samples=77,
    subsample=0.9799477602168288,
    colsample_bytree=0.9520928492544529,
    device='gpu',
)

# A logistic regression combines the base learners' predicted probabilities;
# the stacker uses its own 5-fold split to produce them.
stacked_model = StackingClassifier(
    estimators=[
        ('xgb', best_xgb_model),
        ('cat', best_cat_model),
        ('light', best_light_model),
    ],
    final_estimator=LogisticRegression(),
    cv=5,
    stack_method='predict_proba'  #needed for log loss
)

# Score the whole stack with an outer 2-fold CV (sign flipped to a positive
# loss), then refit it on the full training set for the test predictions.
logloss = -cross_val_score(stacked_model, X_train, y_train, scoring='neg_log_loss', cv=2).mean()
stacked_model.fit(X_train, y_train)

# Keep only the second predict_proba column (the class at index 1).
y_pred_stacked = stacked_model.predict_proba(X_test)[:, 1]
print("Log-loss of Stacked Models is ",logloss)
print(y_pred_stacked)

In [None]:
# Write predictions to a file: a header row followed by one (ID, Label) row
# per test sample, where ID is the 0-based row index.
# Only the XGBoost predictions (y_pred_xgb) are written here.
predictions = enumerate(y_pred_xgb)
# newline="" is required by the csv module; without it the writer's own
# line terminators get translated again and blank rows appear on Windows.
with open("submissions.csv", "w", newline="") as pred:
    csv_out = csv.writer(pred)
    csv_out.writerow(['ID', 'Label'])
    csv_out.writerows(predictions)

In [None]:
# Re-create the three models with hard-coded hyper-parameters; only the
# CatBoost model is evaluated in this cell.
best_xgb_model = XGBClassifier(
    colsample_bytree=0.9,
    learning_rate=0.05,
    max_depth=5,
    min_child_weight=6,
    subsample=0.8864966515073411,
    n_estimators=494,
    device='cuda',
)
# NOTE(review): cross_val_score passes no eval_set to CatBoost, so
# early_stopping_rounds presumably has no effect - confirm.
best_cat_model = CatBoostClassifier(
    verbose=0,
    loss_function='Logloss',
    task_type="GPU",
    learning_rate=0.025,
    depth=7,  # tree depth is an integer parameter (was written as 7.0)
    l2_leaf_reg=9.09712685218602,
    border_count=140,
    iterations=10000,
    early_stopping_rounds=2000,
)
best_light_model = LGBMClassifier(
    n_estimators=905,
    learning_rate=0.12903899598984012,
    max_depth=3,
    num_leaves=93,
    min_child_samples=77,
    subsample=0.9799477602168288,
    colsample_bytree=0.9520928492544529,
    device='gpu',
)

# The built-in 'neg_log_loss' scorer is used directly. The hand-built
# make_scorer(log_loss, needs_proba=True) object was never used and relied on
# an argument that newer scikit-learn releases no longer accept, so it has
# been dropped.
scores = cross_val_score(best_cat_model, X_train, y_train, cv=10, scoring='neg_log_loss')
print("Cross-validated log loss:", -scores.mean())  #take negative since sklearn returns negative log loss