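This notebook fits the baseline model with the xgboost library. It uses only minimal feature engineering (a symptom count, log transforms of the skewed features, and an age-at-diagnosis feature) and tunes hyperparameters with a Weights & Biases sweep scored on a held-out dev split. The test data is loaded, but prediction on it is not yet implemented in the cells below.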

In [None]:
import time
import pandas as pd
import numpy as np
import wandb
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.metrics import accuracy_score

# Column-name constants. TARGET is defined before its first use so the label
# column name lives in exactly one place.
TARGET = 'Status'
SKEWED_FEATS = ['Bilirubin', 'Cholesterol', 'Copper', 'Prothrombin', 'Alk_Phos']
CAT_FEATS = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Stage']

# Training data: drop the row identifier and tag the rows' provenance.
df_train = pd.read_csv('data/train.csv').drop(['id'], axis=1)
df_train['source'] = 'simulation'

# Test data: keep the ids aside, then drop them from the frame.
# (The original bound the id-free frame to `dt_test`, a typo that left
# `df_test` still carrying the `id` column.)
df_test = pd.read_csv('data/test.csv')
test_ids = df_test.id
df_test = df_test.drop(['id'], axis=1)

# Supplemental data, tagged with a different provenance label.
df_supp = pd.read_csv('data/cirrhosis.csv').drop(['ID'], axis=1)
df_supp['source'] = 'original'

# merge supplemental data
df_train = pd.concat([df_train, df_supp]).reset_index(drop=True)
train_target = df_train[TARGET]

# Numeric features are every column that is not categorical, not the target,
# and not the provenance tag.
NUM_FEATS = [
    col for col in df_train.columns
    if col not in CAT_FEATS and col not in (TARGET, 'source')
]
ORIG_FEATS = df_train.drop(TARGET, axis=1).columns.tolist()

# Prep Data for XGBoost

## Helper Functions

In [None]:
def log_transformation(data, skewed_features):
    """Return a new frame holding the natural log of each requested column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in ``skewed_features``.
    skewed_features : iterable of str
        Names of the columns to transform.

    Returns
    -------
    pandas.DataFrame
        One column per requested feature, named ``<feature>_log``, in the
        order given. The input frame is not modified.
    """
    logged = {f'{feat}_log': np.log(data[feat]) for feat in skewed_features}
    return pd.DataFrame(logged)

def encode(data, encoder=None):
    """Split ``data`` into ordinal-encoded categorical and numeric frames.

    Columns are selected by name (``CAT_FEATS`` / ``NUM_FEATS``) rather than
    by position, so any extra columns in ``data`` (e.g. ``source``) are
    ignored regardless of where they sit in the frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the ``CAT_FEATS`` and ``NUM_FEATS`` columns.
    encoder : fitted OrdinalEncoder, optional
        When given, it is used as-is (``transform`` only) so that a second
        dataset can be encoded with the same category mapping. When omitted,
        a fresh ``OrdinalEncoder`` is fitted on ``data``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``X_cat`` with the encoded categorical columns and ``X_num`` with the
        numeric columns. Both keep the row index of ``data``.
    """
    cat_values = data[CAT_FEATS]
    if encoder is None:
        encoded = OrdinalEncoder().fit_transform(cat_values)
    else:
        encoded = encoder.transform(cat_values)

    # Rebuild both frames on the caller's index so rows stay aligned with
    # `data` even when its index is not a fresh RangeIndex.
    X_cat = pd.DataFrame(encoded, columns=CAT_FEATS, index=data.index)
    X_num = pd.DataFrame(data[NUM_FEATS].to_numpy(), columns=NUM_FEATS, index=data.index)
    return X_cat, X_num

## Encoding

In [None]:
# Separate the predictors from the label column.
X = df_train.drop(columns=TARGET)
y = df_train[[TARGET]]

# Ordinal-encode the categoricals, then rejoin them with the numeric columns
# (categorical columns first).
X_cat, X_num = encode(X)
X = pd.concat([X_cat, X_num], axis=1)

# Map the class labels to integer codes; `le` is kept so the codes can be
# mapped back to labels later.
le = LabelEncoder()
y = le.fit_transform(y.to_numpy().ravel())

## Transformations

In [None]:
# Symptom count.
# Edema codes 1 and 2 (the "S" and "Y" categories per the original note) are
# collapsed to 1 so that any edema counts as a single symptom.
X['Edema'] = X['Edema'].mask(X['Edema'] == 2, 1)

# Row-wise total of the four symptom indicators; skipna=False keeps the
# result missing whenever any indicator is missing, as plain `+` would.
X['N_Symptoms'] = X[['Edema', 'Spiders', 'Ascites', 'Hepatomegaly']].sum(axis=1, skipna=False)

# logarithmic transformations
def get_logs(data, skewed_features):
    """Replace the skewed columns of ``data`` with their natural logs.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in ``skewed_features``.
    skewed_features : list of str
        Columns to log-transform. The raw versions are dropped from the
        result and replaced by ``<feature>_log`` columns.

    Returns
    -------
    pandas.DataFrame
        The log columns first, followed by all remaining columns of ``data``
        in the (sorted) order produced by ``Index.difference``.
    """
    # Operate on the `data` argument, not on the notebook-global `X`, so the
    # function also works for frames other than the training features.
    other_feats = data.columns.difference(skewed_features).tolist()
    X_log = log_transformation(data[skewed_features], skewed_features)
    return pd.concat([X_log, data[other_feats]], axis=1)

# Swap the skewed columns for their logs, then append Dgns_Age
# ("Age at Diagnosis" per the original note), computed as Age minus N_Days.
X = get_logs(X, SKEWED_FEATS).assign(
    Dgns_Age=lambda feats: feats['Age'] - feats['N_Days']
)

## Train/Test Split

In [None]:
# Hold out 20% of the rows as a dev set; the fixed random_state keeps the
# split the same on every run.
X_train, X_dev, y_train, y_dev = train_test_split(
    X,
    y,
    test_size=.2,
    random_state=1,
)

# Fit Model

## Define wandb sweep parameters

In [None]:
# Random-search sweep that maximizes the "accuracy" value logged by each run.
sweep_config = dict(
    method="random",  # sweep method
    metric=dict(name="accuracy", goal="maximize"),
    parameters=dict(
        # tree-based or linear functions?
        booster=dict(value='gbtree'),
        # depth of individual trees
        max_depth=dict(values=[5, 6, 7]),
        learning_rate=dict(distribution='uniform', min=.04, max=.09),
        # the proportion of instances to take as a sub-sample per iteration
        subsample=dict(distribution='uniform', min=.33, max=1),
    ),
)

# Register the sweep and keep its id for the agent.
sweep_id = wandb.sweep(sweep_config, project='cirrhosis-xgb-sweeps')

In [None]:
# define training function
def train():
    """Run one sweep trial: fit an XGBoost classifier and log dev accuracy.

    Meant to be passed to ``wandb.agent``. Reads ``X_train``, ``y_train``,
    ``X_dev`` and ``y_dev`` from the notebook namespace. Hyperparameters come
    from the wandb run config: the defaults below apply unless the sweep
    supplies a value for the same key.

    Returns
    -------
    None
        The dev-set accuracy is printed and logged to wandb under the key
        ``"accuracy"`` (the metric name the sweep optimizes).
    """
    config_defaults = {
        "booster": "gbtree",
        "max_depth": 3,
        "learning_rate": 0.1,
        "subsample": 1,
        "seed": 1,
    }

    # The context manager finishes the run even if fitting raises, so a failed
    # trial does not leave a run open for the next one.
    with wandb.init(config=config_defaults) as run:
        config = run.config

        model = xgb.XGBClassifier(
            # Upper bound only; early stopping on the dev eval set decides the
            # actual number of boosting rounds.
            n_estimators=10_000,
            early_stopping_rounds=2,
            learning_rate=config.learning_rate,
            booster=config.booster,
            max_depth=config.max_depth,
            subsample=config.subsample,
            # Previously the configured seed was never handed to the model.
            random_state=config.seed,
        )

        model.fit(X_train, y_train, eval_set=[(X_dev, y_dev)])

        # predict() already returns class labels, so no rounding is needed.
        y_pred = model.predict(X_dev)

        # evaluate predictions
        accuracy = accuracy_score(y_dev, y_pred)
        print(f"Accuracy: {accuracy:.0%}")
        run.log({"accuracy": accuracy})

    # NOTE: model checkpointing is currently disabled. The original author
    # observed that, with a categorical target, save_model raised an error
    # unless the file was saved with a .json extension.

In [None]:
# Authenticate with Weights & Biases before starting the agent.
wandb.login()

# Run the sweep: the agent calls `train` once per trial, for 200 trials.
wandb.agent(sweep_id, function=train, count=200)

# to-do

- [ ] Switch the wandb sweep metric from accuracy to validation loss. The metric name in the sweep config and the key logged with `wandb.log()` apparently need to match.
