In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (here the project's own `src.*` helpers) are re-imported before each cell
# executes and edits to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [36]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import xgboost as xgb


In [69]:
from src.dataset import load_data
from src.preprocessing import impute_missing_values, split_data, remove_name_columns, encode_target_variable

# Load the raw features/target, drop the name columns from the features and
# encode the target, using the project's preprocessing helpers.
x, y = load_data()
x, y = remove_name_columns(x), encode_target_variable(y)

# split_data returns three (features, target) pairs: train, validation, test.
train_split, val_split, test_split = split_data(x, y)
x_train, y_train = train_split
x_val, y_val = val_split
x_test, y_test = test_split

# The first call (no imputer passed) returns the imputer alongside the imputed
# training features; that same imputer is then passed in for the validation
# and test features, whose second return value is discarded.
x_train, imputer = impute_missing_values(x_train)
x_val, _ = impute_missing_values(x_val, imputer=imputer)
x_test, _ = impute_missing_values(x_test, imputer=imputer)

In [75]:
# Booster configuration passed verbatim to xgb.train.
xgb_params = {
    # General settings
    "booster": "gbtree",
    "device": "cpu",
    # Tree-booster hyperparameters
    "eta": 0.3,
    "gamma": 0,
    "max_depth": 6,
    "lambda": 1,
    "alpha": 0,
    # Learning task: 3-class softmax; "merror" is the metric printed for each
    # watched dataset on every boosting round.
    "objective": "multi:softmax",
    "num_class": 3,
    "eval_metric": "merror",
}

# Upper bound on boosting rounds, and the early-stopping patience handed to
# xgb.train below.
num_boost_round = 1000
early_stopping_rounds = 100

# Wrap each split (features + labels) in a DMatrix, in train/val/test order.
dtrain, dval, dtest = (
    xgb.DMatrix(features, label=labels)
    for features, labels in (
        (x_train, y_train),
        (x_val, y_val),
        (x_test, y_test),
    )
)

# Datasets whose metric is reported during training; "val" is listed last.
evals = [(dtrain, "train"), (dval, "val")]

bst = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=evals,
    early_stopping_rounds=early_stopping_rounds,
)

[0]	train-merror:0.43884	val-merror:0.51498
[1]	train-merror:0.41242	val-merror:0.51701
[2]	train-merror:0.39350	val-merror:0.52209
[3]	train-merror:0.37851	val-merror:0.51651
[4]	train-merror:0.36289	val-merror:0.51041
[5]	train-merror:0.34777	val-merror:0.51498
[6]	train-merror:0.32643	val-merror:0.51092
[7]	train-merror:0.30814	val-merror:0.50940
[8]	train-merror:0.28820	val-merror:0.50889
[9]	train-merror:0.26648	val-merror:0.50990
[10]	train-merror:0.24565	val-merror:0.50635
[11]	train-merror:0.22952	val-merror:0.50787
[12]	train-merror:0.21351	val-merror:0.51397
[13]	train-merror:0.19916	val-merror:0.50787
[14]	train-merror:0.18506	val-merror:0.51295
[15]	train-merror:0.17033	val-merror:0.50787
[16]	train-merror:0.15902	val-merror:0.50990
[17]	train-merror:0.14721	val-merror:0.51041
[18]	train-merror:0.13565	val-merror:0.51295
[19]	train-merror:0.12676	val-merror:0.50838
[20]	train-merror:0.11482	val-merror:0.51651
[21]	train-merror:0.10822	val-merror:0.51955
[22]	train-merror:0.

In [80]:
from src.evaluate import evaluate_model

# Score the trained booster on the validation split first, then on the
# labelled test split; the returned values are reported as accuracies.
acc_val, acc_test = (
    evaluate_model(bst, dmatrix, labels)
    for dmatrix, labels in ((dval, y_val), (dtest, y_test))
)

print(
    f"Validation accuracy: {acc_val:.4f}",
    f"Test accuracy: {acc_test:.4f}",
    sep="\n",
)

Validation accuracy: 0.4901
Test accuracy: 0.4807


In [126]:
from src.postprocessing import  compute_prediction, save_predictions

# Produce predictions for the data returned by load_data(train=False)
# (presumably the unlabeled submission set -- confirm in src.dataset) and
# write them to "xgboost.csv".
#
# Dedicated names (x_submission / dsubmission) are used on purpose: rebinding
# x_test / dtest here would clobber the labelled hold-out split built earlier,
# so re-running the evaluation cell afterwards would pair these unlabeled
# features with y_test.
x_submission = load_data(train=False)

# Apply the same preprocessing as for training, reusing the imputer that was
# fitted on the training features.
x_submission = remove_name_columns(x_submission)
x_submission, _ = impute_missing_values(x_submission, imputer=imputer)

dsubmission = xgb.DMatrix(x_submission)

# iteration_range is a half-open interval [begin, end), so the end must be
# best_iteration + 1 for the best boosting round itself to be included.
y_pred = bst.predict(dsubmission, iteration_range=(0, bst.best_iteration + 1))
predictions = compute_prediction(y_pred, x_submission)

save_predictions(predictions, "xgboost.csv")