This notebook fits the baseline model: an xgboost classifier trained on the raw features, without any feature engineering, which is then used to predict on the test data.

In [13]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Read the raw training table; the relative path is resolved against the
# current working directory.
train_path = 'data/train.csv'
data = pd.read_csv(train_path)

# Data Prep for xgboost

In [14]:
# Cast the discrete (non-numeric) columns to the "category" dtype for the
# xgboost library. It can handle category dtypes without a numeric encoding.
# list of discrete columns
cats = data.select_dtypes(exclude=np.number).columns.tolist()

# Convert only the columns listed in `cats`. Numeric columns must keep their
# numeric dtype; casting them to "category" would make every distinct value
# its own unordered level. `astype` returns a new frame, so re-running this
# cell is safe, and an empty `cats` list is simply a no-op.
data = data.astype({col: 'category' for col in cats})

# create features and target arrays
# y is kept as a one-column DataFrame (double brackets), not a Series.
X, y = data.drop('Status', axis=1), data[['Status']]

In [15]:
# The target needs to be numerical for multiclass classification.
# xgboost's multiclass objectives expect integer class labels in
# [0, num_class), so the recode mapping starts at 0 rather than 1.
mapping = {'C': 0, 'CL': 1, 'D': 2}
# Any label missing from `mapping` is left untouched by replace() and then
# makes astype('int') raise, so unexpected classes fail loudly here.
y = y.replace(mapping).astype('int')

In [16]:
# Hold out 20% of the rows as a development set; the fixed random_state keeps
# the split reproducible across runs.
X_train, X_dev, y_train, y_dev = train_test_split(
    X,
    y,
    test_size=.2,
    random_state=1,
)

In [18]:
# Wrap both splits in xgboost's DMatrix data format. enable_categorical=True
# lets xgboost consume pandas "category" columns directly.
# Note: `dtest` holds the held-out development split (X_dev / y_dev).
dtrain = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtest = xgb.DMatrix(data=X_dev, label=y_dev, enable_categorical=True)

# Fit Model

In [None]:
# Define hyperparameters

# The multiclass objectives require `num_class`. Labels are integer class ids,
# so the count is derived from the largest label in the training matrix; this
# covers every label whether the recoding starts at 0 or at 1.
num_class = int(dtrain.get_label().max()) + 1

params = {
    # objective for multiclass classification.
    # for regression it would be reg:squarederror, for example.
    "objective": "multi:softprob",
    # number of classes; mandatory for the multi:* objectives
    "num_class": num_class,
    # optimization without a GPU
    "tree_method": "hist",
}

# specify the names of the data for xgboost to use
# (each evaluation set is reported under its name after every boosting round)
evals = [(dtrain, "train"), (dtest, "validation")]
n = 100  # number of boosting rounds

model = xgb.train(
   params=params,
   dtrain=dtrain,
   num_boost_round=n,
   evals=evals,
)