In [58]:
# Enable IPython's autoreload extension in mode 2: all imported modules are
# reloaded before each cell executes, so edits to the local `src` modules used
# below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [59]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import xgboost as xgb


In [86]:
from src.dataset import load_data
from src.preprocessing import impute_missing_values, split_data, remove_name_columns, encode_target_variable

# Load features/target, then pass them through the project helpers
# `remove_name_columns` (features) and `encode_target_variable` (target).
x, y = load_data()
x = remove_name_columns(x)
y = encode_target_variable(y)
# Partition into train / validation / test (features, labels) pairs.
(x_train, y_train), (x_val, y_val), (x_test, y_test) = split_data(x, y)

# The imputer returned by the call on the training split is reused for the
# validation and test splits, so all three share the same imputation
# (presumably fitted on the training data only -- confirm in src.preprocessing).
x_train, imputer = impute_missing_values(x_train)
x_val, _ = impute_missing_values(x_val, imputer=imputer)
x_test, _ = impute_missing_values(x_test, imputer=imputer)

In [87]:
# Hyper-parameters copied from a sweep run that was launched with these flags:
#   --colsample_bylevel=0.21270862659553913
#   --colsample_bynode=0.3425437174862951
#   --colsample_bytree=0.5109253108234333
#   --early_stopping_rounds=8
#   --eta=0.28997103267508106
#   --gamma=0.17127497737359776
#   --l1_reg=0.03016313633404244
#   --l2_reg=1.9098450598114027
#   --max_depth=14
#   --max_leaves=3
#   --min_child_weight=0.971287126583725
#   --num_boost_round=961
#   --subsample=0.6839398034341768
# The sweep's `l1_reg` / `l2_reg` values are stored below under the keys
# `alpha` / `lambda`.
sweep_result = {
    # Column subsampling ratios.
    "colsample_bylevel": 0.21270862659553913,
    "colsample_bynode": 0.3425437174862951,
    "colsample_bytree": 0.5109253108234333,
    # Learning rate and minimum split loss.
    "eta": 0.28997103267508106,
    "gamma": 0.17127497737359776,
    # L1 / L2 regularisation (l1_reg / l2_reg in the sweep).
    "alpha": 0.03016313633404244,
    "lambda": 1.9098450598114027,
    # Tree shape constraints.
    "max_depth": 14,
    "max_leaves": 3,
    "min_child_weight": 0.971287126583725,
    # Row subsampling ratio.
    "subsample": 0.6839398034341768,
}

# Training-loop settings from the same sweep run; they are kept outside the
# dict above and stored in their own variables.
num_boost_round_sweep = 961
early_stopping_rounds_sweep = 8

In [88]:
# Build the training configuration as a NEW dict: the tuned values from the
# sweep plus the fixed task settings. Unpacking into a fresh dict (instead of
# aliasing `sweep_result`) keeps `sweep_result` limited to the swept
# hyper-parameters, so it is not silently mutated by this cell.
xgb_params = {
    **sweep_result,
    "booster": "gbtree",
    "device": "cpu",
    "objective": "multi:softmax",  # 3-class classification
    "num_class": 3,
    "eval_metric": "merror",  # multiclass error rate
    "verbosity": 0,
}

num_boost_round = num_boost_round_sweep
early_stopping_rounds = early_stopping_rounds_sweep

# Wrap each split in XGBoost's DMatrix container together with its labels.
dtrain = xgb.DMatrix(x_train, label=y_train)
dval = xgb.DMatrix(x_val, label=y_val)
dtest = xgb.DMatrix(x_test, label=y_test)

# XGBoost uses the LAST entry of `evals` for early stopping, so the validation
# set must stay last in this list.
evals = [(dtrain, "train"), (dval, "val")]

bst = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=evals,
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=False,
)

In [89]:
from src.evaluate import evaluate_model

# Score the trained booster on the validation split and on the labelled test
# split. `evaluate_model` is a project helper; its result is reported below as
# an accuracy (presumably a fraction in [0, 1] -- confirm in src.evaluate).
acc_val = evaluate_model(bst, dval, y_val)
acc_test = evaluate_model(bst, dtest, y_test)

# Report both scores with four decimal places.
print(f"Validation accuracy: {acc_val:.4f}")
print(f"Test accuracy: {acc_test:.4f}")

Validation accuracy: 0.5074
Test accuracy: 0.5022


In [147]:
from src.postprocessing import  compute_prediction, save_predictions

# Load the prediction set (train=False returns features only) and apply the
# same preprocessing as for training, reusing the existing `imputer`.
# Distinct names are used on purpose: reusing `x_test` / `dtest` would clobber
# the labelled hold-out split, and re-running the evaluation cell afterwards
# would then score this data against `y_test`.
x_submission = load_data(train=False)
x_submission = remove_name_columns(x_submission)
x_submission, _ = impute_missing_values(x_submission, imputer=imputer)

dsubmission = xgb.DMatrix(x_submission)

# `iteration_range` is half-open, [begin, end). `best_iteration` is the 0-based
# index of the best boosting round, so the end must be `best_iteration + 1`
# for that round's trees to be included in the prediction.
y_pred = bst.predict(dsubmission, iteration_range=(0, bst.best_iteration + 1))
predictions = compute_prediction(y_pred, x_submission)

save_predictions(predictions, "xgboost.csv")