In [1]:
import pandas as pd
import numpy as np
import wandb
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

# Authenticate with Weights & Biases before any sweep is created.
wandb.login()

# Column roles. Declared before the data is loaded so the loading code can use
# TARGET instead of repeating the literal column name.
TARGET = 'Status'
SKEWED_FEATS = ['Bilirubin', 'Cholesterol', 'Copper', 'Prothrombin', 'Alk_Phos']
CAT_FEATS = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Stage']

# Training data; 'source' records which file each row came from.
df_train = pd.read_csv('data/train.csv').drop(['id'], axis=1)
df_train['source'] = 'simulation'

# Test data: keep the ids aside, then drop them from the feature frame.
df_test = pd.read_csv('data/test.csv')
test_ids = df_test.id
df_test = df_test.drop(['id'], axis=1)
# The id-less frame used to be bound to the misspelled name `dt_test`, which
# left `df_test` still carrying its id column. The alias keeps the old name valid.
dt_test = df_test

df_supp = pd.read_csv('data/cirrhosis.csv').drop(['ID'], axis=1)
df_supp['source'] = 'original'

# merge supplemental data
df_train = pd.concat([df_train, df_supp]).reset_index(drop=True)
train_target = df_train[TARGET]

# Numeric features: everything that is not categorical, the target, or the
# 'source' bookkeeping column.
NUM_FEATS = [
    col for col in df_train.columns
    if col not in CAT_FEATS and col not in (TARGET, 'source')
]
ORIG_FEATS = df_train.drop(TARGET, axis=1).columns.tolist()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mreedmerrill[0m ([33mreeds-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


# Prep Data

## Preprocessing Functions

In [2]:
# ordinal-encode the categorical features and separate them from the numeric ones
def encode_x(data):
    """Split ``data`` into an encoded categorical frame and a numeric frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every column named in ``CAT_FEATS`` and ``NUM_FEATS``.
        Columns outside those two lists are ignored.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``X_cat``: the ``CAT_FEATS`` columns passed through a freshly fitted
        ``OrdinalEncoder``. ``X_num``: the ``NUM_FEATS`` columns. Both frames
        carry ``data``'s index so they line up with it when concatenated.

    Raises
    ------
    KeyError
        If a column listed in ``CAT_FEATS`` or ``NUM_FEATS`` is missing.
    """
    # Columns are selected by name, so no positional slice of `data` is needed.
    oe = OrdinalEncoder()
    X_cat = pd.DataFrame(
        oe.fit_transform(data[CAT_FEATS]),
        columns=CAT_FEATS,
        index=data.index,
    )
    # Round-trip through a NumPy array (as before) so the numeric block ends up
    # with a single common dtype and does not share memory with `data`.
    X_num = pd.DataFrame(
        np.array(data[NUM_FEATS]),
        columns=NUM_FEATS,
        index=data.index,
    )
    return X_cat, X_num

# logarithmic transformation
def log_transformation(data, skewed_features):
    """Return a frame holding the natural log of each requested column.

    Every column ``c`` listed in ``skewed_features`` is read from ``data`` and
    emitted under the name ``c_log``. No other column is carried over.
    """
    logged = {f'{feat}_log': np.log(data[feat]) for feat in skewed_features}
    return pd.DataFrame(logged)

def get_logs(data, skewed_features):
    """Swap the skewed columns of ``data`` for their log-transformed versions.

    The ``*_log`` columns produced by ``log_transformation`` come first. They
    are followed by every remaining column of ``data``, in the order returned
    by ``Index.difference`` (which sorts the labels when pandas can sort them).
    """
    untouched_cols = data.columns.difference(skewed_features).tolist()
    logged = log_transformation(data[skewed_features], skewed_features)
    return pd.concat([logged, data[untouched_cols]], axis=1)

## Encoding

In [3]:
# Separate the feature columns from the target column.
X = df_train.drop(columns=[TARGET])
y = df_train[[TARGET]]

# Encode the features, then glue the categorical and numeric parts back
# together (categorical columns first).
X_cat, X_num = encode_x(X)
X = pd.concat((X_cat, X_num), axis=1)

# Encode the target labels as integers; `le` is kept so the mapping can be
# inverted later.
le = LabelEncoder()
y = le.fit_transform(y.to_numpy().ravel())

## Transformations

In [4]:
# Number of diseases/symptoms related to liver disease.
# For this count only, Edema code 2 is folded into code 1 so that both "S" and
# "Y" count as a single symptom. The recode is applied to a temporary series:
# overwriting X['Edema'] itself (as this cell used to do) would also erase the
# distinction between the two levels from the Edema feature used for modelling.
edema_present = X['Edema'].mask(X['Edema'] == 2, 1)
X['N_Symptoms'] = edema_present + X.Spiders + X.Ascites + X.Hepatomegaly
# log transformations
X = get_logs(X, SKEWED_FEATS)
# Age at Diagnosis
X['Dgns_Age'] = X['Age'] - X['N_Days']

## Train/Test Split

In [5]:
# Hold out 20% of the rows as a dev set; the fixed random_state keeps the split
# reproducible across runs.
X_train, X_dev, y_train, y_dev = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

## Imputation

In [6]:
# Mean-impute missing feature values. The imputer is fitted on the training
# split only and then applied unchanged to the dev split, so no dev-set
# statistics are used during fitting.
imputer = SimpleImputer(strategy='mean')
X_cols = X_train.columns
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=X_cols)
X_dev_imputed = pd.DataFrame(imputer.transform(X_dev), columns=X_cols)
# Backward-compatible alias: the imputed dev *features* used to be stored under
# this misleading name (they are neither targets nor training rows).
y_train_imputed = X_dev_imputed


## Calculate Class Weights

In [7]:
# Inverse-frequency class weights: n_train / (number of training rows in the class).
class_counts = pd.Series(y_train).value_counts()
n_train = X_train.shape[0]
# Key every weight by the label value_counts() reports for it. A hard-coded
# label order would silently attach weights to the wrong classes as soon as the
# class frequencies (and therefore the value_counts ordering) change.
class_weights = {label: n_train / count for label, count in class_counts.items()}
print(class_counts)
print(class_weights)

0    4166
2    2265
1     227
Name: count, dtype: int64
{0: 1.5981757081132981, 2: 2.939514348785872, 1: 29.330396475770925}


# Fit XGBoost Model

## Define wandb sweep parameters

In [8]:
# Search space for the XGBoost sweep: random search, ranked by the logged
# "accuracy" value (higher is better).
xgb_sweep_config = dict(
    method="random",  # sweep method
    metric=dict(name="accuracy", goal="maximize"),
    parameters=dict(
        # tree-based or linear functions?
        booster=dict(value='gbtree'),
        # depth of individual trees
        max_depth=dict(values=[6, 7]),
        learning_rate=dict(distribution='uniform', min=.04, max=.09),
        # the proportion of instances to take as a sub-sample per iteration
        subsample=dict(distribution='uniform', min=.33, max=1),
    ),
)

# Register the sweep with wandb and keep its id for the agent.
xgb_sweep_id = wandb.sweep(xgb_sweep_config, project='cirrhosis-xgb-sweeps')

Create sweep with ID: llwdythv
Sweep URL: https://wandb.ai/reeds-team/cirrhosis-xgb-sweeps/sweeps/llwdythv


In [9]:
# define training function
def train_xgb():
    """Train one XGBoost classifier for a wandb sweep run and log its dev accuracy.

    Takes no arguments and returns nothing. Hyper-parameters come from the
    wandb run config: the defaults below are used unless the sweep overrides
    them. The model is fitted on the module-level ``X_train`` / ``y_train``,
    with ``X_dev`` / ``y_dev`` as the early-stopping evaluation set, and the
    dev-set accuracy is logged under the key ``"accuracy"``.
    """
    config_defaults = {
        "booster": "gbtree",
        "max_depth": 3,
        "learning_rate": 0.1,
        "subsample": 1,
        "seed": 1,
    }

    # Using the run as a context manager closes it when training finishes or
    # raises, so consecutive sweep runs do not leak into each other.
    with wandb.init(config=config_defaults) as run:  # defaults may be overridden by the sweep
        config = run.config

        xgb_model = xgb.XGBClassifier(
            n_estimators=10_000,
            early_stopping_rounds=1,
            learning_rate=config.learning_rate,
            booster=config.booster,
            max_depth=config.max_depth,
            subsample=config.subsample,
            # The seed was declared in the config but never reached the model.
            random_state=config.seed,
        )

        xgb_model.fit(X_train, y_train, eval_set=[(X_dev, y_dev)])

        # predict() already returns class labels, so no rounding is needed
        y_pred = xgb_model.predict(X_dev)

        # evaluate predictions
        accuracy = accuracy_score(y_dev, y_pred)
        run.log({"accuracy": accuracy})

    #model.save_model(f'checkpoints/{config.booster}"-"{config.max_depth}"-"{config.learning_rate}"-"{config.subsample}"-"{time.time()}.json') 
    # with a categorical target, if its not json then it throws an error

In [10]:
# specify the sweep, the training function, and the number of sweeps
#wandb.agent(xgb_sweep_id, train_xgb, count=200)

### to-do

- [ ] Switch the wandb sweep metric from accuracy to validation loss. It seems the metric named in the sweep config must match a key that is logged with `wandb.log()`.

# Deep Neural Network

## Prep Data

In [11]:
import tensorflow as tf
from keras import layers
from keras import regularizers
from keras.callbacks import ModelCheckpoint, EarlyStopping

# one-hot encode the y vector
y_train_oh = pd.get_dummies(y_train)
y_dev_oh = pd.get_dummies(y_dev)

# model.fit() requires tf.float32 data types. It's easiest to make the conversion before the data is a tf.dataset
y_train_oh = np.float32(y_train_oh)
y_dev_oh = np.float32(y_dev_oh)

# Features for the network. X_train / X_dev are the un-imputed splits (the
# imputation cell writes its result to separate frames), so any missing value
# in them would reach the network and propagate into the loss as NaN. Reuse the
# imputer fitted on the training split and convert to float32 up front.
X_train_nn = imputer.transform(X_train).astype(np.float32)
X_dev_nn = imputer.transform(X_dev).astype(np.float32)

# get input shape
n_features = X_train_nn.shape[1]

# convert to tf.dataset
train = tf.data.Dataset.from_tensor_slices((X_train_nn, y_train_oh))
dev = tf.data.Dataset.from_tensor_slices((X_dev_nn, y_dev_oh))

# specify the batch size
def make_batched_data(batch_size=1):
    """Batch the module-level ``train`` and ``dev`` datasets.

    Returns a ``(train_batched, dev_batched)`` tuple, each element obtained by
    calling ``.batch(batch_size)`` on the corresponding dataset.
    """
    return tuple(split.batch(batch_size) for split in (train, dev))

2024-01-01 19:43:29.243331: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-01 19:43:29.247263: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-01 19:43:29.307231: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-01 19:43:29.307291: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-01 19:43:29.307351: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

In [12]:
def build_model(hidden_layers=4, initial_nodes=32, regularizer='L2', dropout_rate=None):
    """Build an (uncompiled) fully connected softmax classifier.

    Parameters
    ----------
    hidden_layers : int
        Number of ReLU layers stacked after the input layer.
    initial_nodes : int
        Width of the input layer. Each following hidden layer is half as wide
        as the one before it, but never narrower than one unit.
    regularizer : str or None
        ``'L1'`` or ``'L2'`` applies that kernel penalty (factor 0.01) to the
        input layer. Any other value means no penalty.
    dropout_rate : float or None
        When not None, a Dropout layer with this rate follows every hidden layer.

    Returns
    -------
    tf.keras.Sequential
        Model ending in a 3-unit softmax layer. The input width is read from
        the module-level ``n_features``.
    """
    # select parameter norm penalty (None -> unregularized input layer)
    penalties = {'L1': regularizers.l1, 'L2': regularizers.l2}
    penalty = penalties.get(regularizer)
    kernel_regularizer = penalty(0.01) if penalty is not None else None

    model = tf.keras.Sequential()
    # The unregularized case used to read `nodes` before it was assigned and
    # crashed; every case now shares this single call.
    model.add(layers.Dense(
        initial_nodes,
        input_dim=n_features,
        kernel_regularizer=kernel_regularizer))

    # select layers and nodes per layer
    # Integer halving with a floor of 1: true division produced fractional
    # widths that shrink below one unit for deep, narrow configurations.
    nodes = max(1, int(initial_nodes) // 2)
    for _ in range(hidden_layers):
        model.add(layers.Dense(nodes, activation='relu'))
        if dropout_rate is not None:
            model.add(layers.Dropout(dropout_rate))
        nodes = max(1, nodes // 2)

    # output layer
    model.add(layers.Dense(
        units=3,
        activation='softmax'))

    return model

In [13]:
# Search space for the network's training hyper-parameters: random search,
# ranked by the logged "val_loss" value (lower is better).
nn_hyperp_sweep_config = dict(
    method="random",  # sweep method
    metric=dict(name="val_loss", goal="minimize"),
    parameters=dict(
        lr=dict(distribution='uniform', min=1e-5, max=1e-1),
        dropout_rate=dict(distribution='uniform', min=0.0, max=0.7),
        threshold=dict(distribution='uniform', min=0.4, max=0.5),
        batch_size=dict(values=[1, 4, 8, 16, 32, 64, 128, 256]),
    ),
)

# Register the sweep with wandb and keep its id for the agent.
nn_hyperp_sweep_id = wandb.sweep(nn_hyperp_sweep_config, project='cirrhosis-nn-hyperp-sweeps')

Create sweep with ID: ov8kg1h6
Sweep URL: https://wandb.ai/reeds-team/cirrhosis-nn-hyperp-sweeps/sweeps/ov8kg1h6


In [14]:
# Grid search over network architecture, ranked by the logged "val_loss"
# value (lower is better). The batch size is held fixed.
nn_arch_metric = {"name": "val_loss", "goal": "minimize"}
nn_arch_parameters = {
    'lr': {'values': [0.1, 0.01, 0.001]},
    'hidden_layers': {'values': [1, 2, 3, 4]},
    'initial_nodes': {'values': [16, 8]},
    'batch_size': {'value': 64},
}
nn_arch_sweep_config = {
    "method": "grid",
    "metric": nn_arch_metric,
    "parameters": nn_arch_parameters,
}

# Register the sweep with wandb and keep its id for the agent.
nn_arch_sweep_id = wandb.sweep(nn_arch_sweep_config, project='cirrhosis-nn-arch-sweeps')

KeyboardInterrupt: 

In [None]:
def train_nn():
    """Train one neural network for a wandb sweep run, logging metrics per epoch.

    Takes no arguments and returns nothing. Hyper-parameters come from the
    wandb run config: the defaults below are used unless the sweep overrides
    them. Data comes from ``make_batched_data`` and the network from
    ``build_model``.
    """
    # Default keys must match the names read from `config` below, otherwise a
    # sweep that does not set one of them fails with a missing-key error.
    config_defaults = {
        "lr": 0.1,
        "batch_size": 1,
        "dropout_rate": None,
        "threshold": .5,
        "initial_nodes": 16,
        "hidden_layers": 1,
    }

    # Using the run as a context manager closes it when training finishes or
    # raises, so consecutive sweep runs do not leak into each other.
    with wandb.init(config=config_defaults) as run:  # defaults may be overridden by the sweep
        config = run.config

        sweep_train, sweep_dev = make_batched_data(batch_size=config.batch_size)

        model = build_model(
            hidden_layers=config.hidden_layers,
            initial_nodes=config.initial_nodes,
            # the swept dropout rate previously never reached the model
            dropout_rate=config.dropout_rate)

        model.compile(
            optimizer=tf.keras.optimizers.Adam(config.lr),
            loss=tf.keras.losses.categorical_crossentropy,
            metrics=['accuracy'])

        class WandbMetricsLogger(tf.keras.callbacks.Callback):
            """Forward the Keras per-epoch metrics dict to the wandb run."""

            def on_epoch_end(self, epoch, logs=None):
                if logs:
                    run.log(logs)

        # set callbacks
        callbacks = [
            WandbMetricsLogger(),
            tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=.05, patience=2)
            ]

        # train the model
        model.fit(
            sweep_train,
            epochs=5,
            validation_data=sweep_dev,
            callbacks=callbacks)

In [None]:
# Start a sweep agent that calls train_nn for configurations drawn from the
# architecture sweep.
wandb.agent(sweep_id=nn_arch_sweep_id, function=train_nn)

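NaN losses for both validation and training. This is likely caused by either exploding or vanishing gradients. Missing values (NaN) or unscaled features in the network inputs would produce the same symptom and should be ruled out first.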

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# One-vs-rest, L2-penalized logistic regression, weighted by class_weights and
# fitted on the mean-imputed training features.
logit_params = dict(
    penalty='l2',
    multi_class='ovr',
    solver='newton-cholesky',
    class_weight=class_weights,
    max_iter=500,
)
logit = LogisticRegression(**logit_params)

logit.fit(X_train_imputed, y_train)

The Newton-Cholesky solver works well when the number of instances is much greater than the number of features, especially when some of the features are one-hot encoded. This explains why the model converged with this solver but not with the others.

Failed solvers: "liblinear", "saga"