This notebook fits the baseline model. It predicts on the test data without doing any feature engineering, using the xgboost library.

In [None]:
import os
import time

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Training rows, minus the row identifier column.
df_train = pd.read_csv('data/train.csv').drop(columns=['id'])

# Test rows: keep the ids in `test_ids`, and the identifier-free frame in `dt_test`.
# NOTE(review): `dt_test` looks like a typo for `df_test`; it is not referenced
# in the visible cells — confirm before renaming.
df_test = pd.read_csv('data/test.csv')
test_ids = df_test.id
dt_test = df_test.drop(columns=['id'])

# Supplemental rows; the identifier column is spelled 'ID' in this file.
df_supp = pd.read_csv('data/cirrhosis.csv').drop(columns=['ID'])

# merge supplemental data: stack it under the training rows with a fresh 0..n-1 index
df_train = pd.concat([df_train, df_supp], ignore_index=True)

# Name of the label column.
TARGET = 'Status'

# Prep Data for XGBoost

In [None]:
# XGBoost's categorical support (enable_categorical=True) needs pandas
# 'category' columns, so convert every object-typed feature column.
# Iterating a DataFrame yields column *names*, so the dtype has to be read
# from df_train[col]. The target is skipped: it is recoded to integers below.
for col in df_train.columns:
    if col != TARGET and df_train[col].dtype == object:
        df_train[col] = df_train[col].astype('category')

# create features and target arrays
X = df_train.drop(columns=[TARGET])

# the target needs to be numerical for multiclass classification,
# and the class codes must be zero indexed
mapping = {'C': 0, 'CL': 1, 'D': 2}
y = df_train[TARGET].map(mapping)

# .map leaves NaN for any label that is missing from `mapping` (or already NaN);
# fail loudly here instead of with an opaque cast error on the next line.
if y.isna().any():
    unexpected = df_train.loc[y.isna(), TARGET].unique().tolist()
    raise ValueError(f"Unmapped {TARGET!r} values: {unexpected}")

# keep y as a single-column DataFrame named after the target
y = y.astype('int').to_frame()

In [None]:
# Hold out 20% of the rows as a dev set; the fixed seed makes the split repeatable.
X_train, X_dev, y_train, y_dev = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Wrap both splits in XGBoost's DMatrix data format, passing the labels alongside
# the features. enable_categorical lets XGBoost use pandas 'category' columns.
# Note: `dtest` is built from the dev split (X_dev / y_dev), not from data/test.csv.
dtrain = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtest = xgb.DMatrix(data=X_dev, label=y_dev, enable_categorical=True)

# Fit Model

In [None]:
# define training function
def train(objective='multi:softprob', n_rounds=100, model_note="", early_stopping_rounds=100):
    """Train a booster on the module-level ``dtrain`` and checkpoint it to disk.

    The function reads the globals ``dtrain`` and ``dtest`` (the train and
    dev ``DMatrix`` objects) rather than taking them as arguments.

    Parameters
    ----------
    objective : str
        XGBoost learning objective. The default is a multiclass objective;
        for regression it would be ``reg:squarederror``, for example.
    n_rounds : int
        Upper bound on the number of boosting rounds (``num_boost_round``).
    model_note : str
        Free-text tag used as the prefix of the checkpoint file name.
    early_stopping_rounds : int
        Forwarded to ``xgb.train``. Per the XGBoost docs, training stops when
        the metric on the last ``evals`` entry ("validation" here) has not
        improved for this many rounds.

    Returns
    -------
    The trained model object returned by ``xgb.train``. It is also written to
    ``checkpoints/{model_note}_{unix_timestamp}.json``.
    """
    # Define hyperparameters
    params = {
        "objective": objective,
        # three target classes, matching the 0/1/2 recoding of the label
        "num_class": 3,
        # optimization without a GPU
        "tree_method": "hist",
    }

    # specify the names of the data for xgboost to use when reporting metrics
    evals = [(dtrain, "train"), (dtest, "validation")]

    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=n_rounds,
        evals=evals,
        early_stopping_rounds=early_stopping_rounds,
    )

    # save_model does not create missing directories, so make sure the
    # checkpoint folder exists before writing into it.
    os.makedirs('checkpoints', exist_ok=True)

    # The checkpoint is written as JSON: per the original author's note, any
    # other format raised an error.
    # NOTE(review): presumably due to the categorical columns — confirm.
    model.save_model(f'checkpoints/{model_note}_{time.time()}.json')

    # Hand the booster back so it can be used for prediction in this session
    # without reloading the checkpoint.
    return model

In [None]:
# baseline: every train() default (multi:softprob objective, 100 boosting rounds,
# early_stopping_rounds=100); the checkpoint file name is prefixed with "first".
train(model_note="first")

In [None]:
# Same setup as the baseline, but with a much shorter early-stopping patience of 3 rounds.
# NOTE(review): this run reuses the "first" tag, so its checkpoint differs from the
# baseline's only by the timestamp in the file name.
train(early_stopping_rounds=3, model_note="first")

# to-do

1. checkpointing (maybe with wandb)
2. cross-validation