In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [36]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import xgboost as xgb


In [149]:
from src.dataset import load_data
from src.preprocessing import impute_missing_values, split_data, remove_name_columns, encode_target_variable

# Load the raw features/target, drop the name columns and encode the target.
x, y = load_data()
x = remove_name_columns(x)
y = encode_target_variable(y)

# split_data yields three (features, target) pairs: train, validation, test.
train_split, val_split, test_split = split_data(x, y)
x_train, y_train = train_split
x_val, y_val = val_split
x_test, y_test = test_split

# The imputer returned for the training features is passed back in for the
# validation and test features, so all three go through the same imputer.
x_train, imputer = impute_missing_values(x_train)
x_val, _ = impute_missing_values(x_val, imputer=imputer)
x_test, _ = impute_missing_values(x_test, imputer=imputer)

In [158]:
# Training budget: upper bound on boosting rounds, and the number of rounds
# without improvement on the monitored set before training stops early.
num_boost_round = 1000
early_stopping_rounds = 10

xgb_params = {
    # General: tree booster, trained on CPU.
    "booster": "gbtree",
    "device": "cpu",

    # Tree booster: learning rate, minimum split loss, depth, L2 / L1 penalties.
    "eta": 0.3,
    "gamma": 0,
    "max_depth": 6,
    "lambda": 1,
    "alpha": 0,

    # Learning task: 3-class classification emitting class labels, scored by
    # multiclass error rate.
    "objective": "multi:softmax",
    "num_class": 3,
    "eval_metric": "merror",

    # Keep the library silent.
    "verbosity": 0,
}

# Wrap each split in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(x_train, label=y_train)
dval = xgb.DMatrix(x_val, label=y_val)
dtest = xgb.DMatrix(x_test, label=y_test)

# Sets monitored during training; "val" is listed last.
evals = [
    (dtrain, "train"),
    (dval, "val"),
]

bst = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=evals,
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=False,
)

In [159]:
from src.evaluate import evaluate_model

# Score the trained booster on the validation and held-out test splits.
acc_val = evaluate_model(bst, dval, y_val)
acc_test = evaluate_model(bst, dtest, y_test)

# Report both accuracies, one per line, to four decimals.
print(
    f"Validation accuracy: {acc_val:.4f}",
    f"Test accuracy: {acc_test:.4f}",
    sep="\n",
)

Validation accuracy: 0.4901
Test accuracy: 0.4807


In [147]:
from src.postprocessing import  compute_prediction, save_predictions

# Produce predictions for the non-training data (`train=False`) and save them.
#
# NOTE(review): this cell rebinds `x_test` and `dtest`, which earlier cells use
# for the held-out split. Re-running the evaluation cell after this one would
# score against the wrong data; consider dedicated names (e.g. `x_submission`).
x_test = load_data(train=False)

# Apply the same preprocessing as the training data, reusing the imputer
# obtained from the training features.
x_test = remove_name_columns(x_test)
x_test, _ = impute_missing_values(x_test, imputer=imputer)

dtest = xgb.DMatrix(x_test)

# `iteration_range` is half-open, [begin, end), so the end must be
# `best_iteration + 1` to include the best round. Without the +1 the best
# round is dropped, and when `best_iteration == 0` the range becomes (0, 0),
# which XGBoost interprets as "use every tree".
y_pred = bst.predict(dtest, iteration_range=(0, bst.best_iteration + 1))
predictions = compute_prediction(y_pred, x_test)

save_predictions(predictions, "xgboost.csv")