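This notebook fits an XGBoost model to the training data, starting from the baseline setup and adding a first round of feature engineering (a symptom count, log transforms of the skewed features, and age at diagnosis). The model is evaluated on a held-out dev split; predicting on the test data is not done here yet.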

In [None]:
import os
import time

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

# Column roles, declared first so the loading code below can refer to them.
TARGET = 'Status'
SKEWED_FEATS = ['Bilirubin', 'Cholesterol', 'Copper', 'Prothrombin', 'Alk_Phos']
CAT_FEATS = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Stage']

# training data, tagged with where it came from
df_train = pd.read_csv('data/train.csv').drop(columns=['id'])
df_train['source'] = 'simulation'

# test data; the ids are set aside before the column is dropped
df_test = pd.read_csv('data/test.csv')
test_ids = df_test['id']
# NOTE(review): `dt_test` looks like a typo for `df_test`; the name is kept as-is.
dt_test = df_test.drop(columns=['id'])

# supplemental data, tagged separately
df_supp = pd.read_csv('data/cirrhosis.csv').drop(columns=['ID'])
df_supp['source'] = 'original'

# merge supplemental data (rows are renumbered 0..n-1)
df_train = pd.concat([df_train, df_supp], ignore_index=True)
train_target = df_train[TARGET]

# numeric features: every column that is not categorical, the target, or the 'source' tag
NUM_FEATS = [
    col for col in df_train.columns
    if col not in CAT_FEATS and col not in (TARGET, 'source')
]
ORIG_FEATS = df_train.drop(columns=[TARGET]).columns.tolist()


# Prep Data for XGBoost

## Helper Functions

In [None]:
def log_transformation(data, skewed_features):
    """Return a new frame holding the natural log of each requested column.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing every column named in `skewed_features`.
    skewed_features : iterable of str
        Names of the columns to transform.

    Returns
    -------
    pd.DataFrame
        One `<col>_log` column per requested column, in the order given.
        The untransformed columns of `data` are not included.
    """
    return pd.DataFrame(
        {f'{col}_log': np.log(data[col]) for col in skewed_features}
    )

def encode(data):
    """Split `data` into ordinal-encoded categorical and numeric feature frames.

    Rows with a missing value in *any* column of `data` are dropped first.
    The CAT_FEATS and NUM_FEATS columns are then selected by name, so the
    position of the columns in `data` does not matter and any other columns
    are left out of the result.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain every column listed in CAT_FEATS and NUM_FEATS.

    Returns
    -------
    (X_cat, X_num) : tuple of pd.DataFrame
        X_cat holds the CAT_FEATS columns as OrdinalEncoder codes; X_num holds
        the NUM_FEATS columns unchanged. Both keep the row index of the rows
        of `data` that survived `dropna`, so a caller can select the matching
        target rows with that index.

    Notes
    -----
    A new OrdinalEncoder is fitted on every call, so codes produced by
    separate calls (e.g. train vs. test data) are only comparable if both
    inputs contain the same set of categories.
    """
    X_raw = data.dropna()

    # encode the categorical features
    oe = OrdinalEncoder()
    X_cat = pd.DataFrame(
        oe.fit_transform(X_raw[CAT_FEATS]),
        columns=CAT_FEATS,
        index=X_raw.index,
    )
    X_num = pd.DataFrame(
        X_raw[NUM_FEATS].to_numpy(),
        columns=NUM_FEATS,
        index=X_raw.index,
    )
    return X_cat, X_num

## Encoding

In [None]:
# create features and target arrays
# encode() drops every row that has a missing value. Drop those rows here first
# and take the target from the same surviving rows; otherwise X and y end up
# with different lengths and the train/dev split fails.
df_complete = df_train.dropna()
X_complete, y_labels = df_complete.drop(TARGET, axis=1), df_complete[TARGET]

# encode features
X_cat, X_num = encode(X_complete)
X = pd.concat([X_cat, X_num], axis=1)

# encode target
# The raw labels are strings; XGBoost needs numeric labels, so y holds the
# LabelEncoder class ids (use le.inverse_transform to map them back).
le = LabelEncoder()
y_mi = le.fit_transform(y_labels)
y = pd.DataFrame({TARGET: y_mi}, index=X.index)

assert len(X) == len(y), "features and target must have the same number of rows"

## Transformations

In [None]:
# n symptoms
# Edema "S" and "Y" both indicate edema for the purpose of this count variable,
# so the count uses a 0/1 "any edema" flag (assumes the ordinal codes are
# N=0, S=1, Y=2). The flag is a temporary: X['Edema'] itself is left untouched,
# so the model still sees all three levels.
edema_present = (X['Edema'] > 0).astype(X['Edema'].dtype)
X['N_Symptoms'] = edema_present + X.Spiders + X.Ascites + X.Hepatomegaly

# logarithmic transformations
def get_logs(data, skewed_features):
    """Replace the skewed columns of `data` with their natural logs.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing every column named in `skewed_features`.
    skewed_features : list of str
        Columns to replace with `<col>_log` versions.

    Returns
    -------
    pd.DataFrame
        The `<col>_log` columns first, followed by all remaining columns of
        `data` in the order returned by `Index.difference` (sorted when the
        labels are sortable). The raw skewed columns are not included.
    """
    # Read from the `data` argument, not the notebook-level X, so the function
    # transforms whatever frame it is given.
    X_log = log_transformation(data[skewed_features], skewed_features)
    other_feats = data.columns.difference(skewed_features).tolist()
    X_other = data[other_feats]
    return pd.concat([X_log, X_other], axis=1)

# Swap the skewed columns for their logs, then add Age at Diagnosis
# (Age minus N_Days; neither column is in SKEWED_FEATS, so both are still raw).
X = get_logs(X, SKEWED_FEATS).assign(
    Dgns_Age=lambda frame: frame['Age'] - frame['N_Days']
)

## Train/Test Split

In [None]:
# Hold out 20% of the rows as a dev set; the fixed seed keeps the split reproducible.
X_train, X_dev, y_train, y_dev = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

## Create data matrices in XGBoost's DMatrix format

In [None]:
# Wrap each split in a DMatrix; note that d_test is built from the dev split.
d_train = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
d_test = xgb.DMatrix(data=X_dev, label=y_dev, enable_categorical=True)

# Fit Model

## Define wandb sweep parameters

In [None]:
# Search space for a hyperparameter sweep: each entry under "parameters" lists
# the candidate values to draw from.
sweep_config = dict(
    method="random",  # sweep method
    metric=dict(name="accuracy", goal="maximize"),
    parameters=dict(
        # tree-based or linear functions?
        booster=dict(values=["gbtree", "gblinear"]),
        # depth of individual trees
        max_depth=dict(values=[3, 6, 9, 12]),
        learning_rate=dict(values=[0.1, 0.05, 0.2]),
        # the proportion of instances to take a sub-sample per iteration
        subsample=dict(values=[1, 0.5, 0.3]),
    ),
)

In [None]:
# define training function
def train(data_train, data_dev, model_note, objective='multi:softprob', n_rounds=10_000, early_stopping_rounds=100, lr=.1):
    """Train an XGBoost booster with early stopping and save it as a JSON checkpoint.

    Parameters
    ----------
    data_train : xgb.DMatrix
        Data the booster is fitted on.
    data_dev : xgb.DMatrix
        Held-out data. It is monitored during training and, being the last
        entry of the evaluation list, is the set early stopping is based on.
        The returned predictions are made on it.
    model_note : str
        Prefix of the checkpoint file name.
    objective : str
        XGBoost objective; the default is multiclass with class probabilities.
    n_rounds : int
        Maximum number of boosting rounds.
    early_stopping_rounds : int
        Stop when the dev metric has not improved for this many rounds.
    lr : float
        Learning rate.

    Returns
    -------
    (model, y_pred) : tuple
        The trained booster and its predictions on `data_dev`.
    """
    # Define hyperparameters
    params = {
        # objective for multiclass classification.
        # for regression it would be reg:squarederror, for example.
        "objective": objective,
        "learning_rate": lr,
        "num_class": 3,
        "tree_method": "hist"} # optimization without a GPU

    # xgb.train expects `evals` as a list of (DMatrix, name) pairs
    evals = [(data_train, "train"), (data_dev, "validation")]

    model = xgb.train(
        params=params,
        dtrain=data_train,
        num_boost_round=n_rounds,
        evals=evals,
        early_stopping_rounds=early_stopping_rounds,
    )

    # get predictions on dev set
    y_pred = model.predict(data_dev)

    # Make sure the target directory exists so a finished run is not lost at save time.
    os.makedirs('checkpoints', exist_ok=True)
    # Saved as JSON: per the original note, other formats raised an error here
    # (XGBoost's categorical support needs JSON/UBJSON serialization — TODO
    # confirm that is the cause).
    model.save_model(f'checkpoints/{model_note}_{time.time()}.json')

    return model, y_pred

In [None]:
model_note = "initial_fe"
# baseline
# train() takes the training and dev DMatrix objects as its first two
# positional arguments; these are d_train / d_test from the DMatrix cell.
baseline_run = train(d_train, d_test, model_note=model_note)

In [None]:
# early stopping
# Same data as the baseline run, but stop after 3 rounds without improvement.
early_stop_run = train(d_train, d_test, model_note="first", early_stopping_rounds=3)

# to-do

1. cross-validation
2. hyperparameter tuning