This notebook fits the baseline model. It predicts on the test data without doing any feature engineering, using the xgboost library.

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split

data = pd.read_csv('data/train.csv')



# Data Prep for xgboost

In [2]:
# cast discrete columns to the "category" dtype for the xgboost library. It can handle category dtypes without a numeric encoding.
# list of discrete columns
cats = data.select_dtypes(exclude=np.number).columns.tolist()

for col in data:
   data[col] = data[col].astype('category')

# create features and target arrays
X, y = data.drop('Status', axis=1), data[['Status']]

In [3]:
# the target needs to be numerical for multiclass classification
# create a recode mapping
# they must be zero indexed
mapping = {'C': 0, 'CL': 1, 'D': 2}
y = y.replace(mapping).astype('int')

In [4]:
# do train/test split on the data
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=.2, random_state=1)

In [5]:
# Create regression matrices in xboost's data format
dtrain = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest = xgb.DMatrix(X_dev, y_dev, enable_categorical=True)

# Fit Model

In [6]:
# Define hyperparameters
params = {
    # objective for multiclass classification.
    # for regression it would be reg:squarederror, for example.
    "objective": "multi:softprob", 
    "num_class": 3,
    "tree_method": "hist"} # optimization without a GPU

# specify the names of the data for xgboost to use
evals = [(dtrain, "train"), (dtest, "validation")]
n = 100

model = xgb.train(
   params=params,
   dtrain=dtrain,
   num_boost_round=n,
   evals=evals,
)

[0]	train-mlogloss:0.82211	validation-mlogloss:0.88628
[1]	train-mlogloss:0.64673	validation-mlogloss:0.76094
[2]	train-mlogloss:0.52433	validation-mlogloss:0.67524
[3]	train-mlogloss:0.43285	validation-mlogloss:0.61589
[4]	train-mlogloss:0.36035	validation-mlogloss:0.57424
[5]	train-mlogloss:0.30329	validation-mlogloss:0.54201
[6]	train-mlogloss:0.25777	validation-mlogloss:0.52011
[7]	train-mlogloss:0.22074	validation-mlogloss:0.50427
[8]	train-mlogloss:0.18997	validation-mlogloss:0.49518
[9]	train-mlogloss:0.16608	validation-mlogloss:0.48933
[10]	train-mlogloss:0.14472	validation-mlogloss:0.48124
[11]	train-mlogloss:0.12796	validation-mlogloss:0.47738
[12]	train-mlogloss:0.11410	validation-mlogloss:0.47362
[13]	train-mlogloss:0.10178	validation-mlogloss:0.47369
[14]	train-mlogloss:0.09167	validation-mlogloss:0.47244
[15]	train-mlogloss:0.08334	validation-mlogloss:0.47452
[16]	train-mlogloss:0.07612	validation-mlogloss:0.47584
[17]	train-mlogloss:0.06995	validation-mlogloss:0.47843
[1

# to-do

1. checkpointing (maybe with wandb)
2. cross-validation