In [25]:
import pandas as pd
import numpy as np
import wandb
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

wandb.login()

df_train = pd.read_csv('data/train.csv').drop(['id'], axis=1)
df_train['source'] = 'simulation'

df_test = pd.read_csv('data/test.csv')
test_ids = df_test.id
dt_test = df_test.drop(['id'], axis=1)

df_supp = pd.read_csv('data/cirrhosis.csv').drop(['ID'], axis=1)
df_supp['source'] = 'original'

# merge supplemental data
df_train = pd.concat([df_train, df_supp]).reset_index(drop=True)
train_target = df_train['Status']

TARGET = 'Status'
SKEWED_FEATS = ['Bilirubin', 'Cholesterol', 'Copper', 'Prothrombin', 'Alk_Phos']
CAT_FEATS = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Stage']
NUM_FEATS = [x for x in df_train.columns if x not in CAT_FEATS and x != TARGET]
NUM_FEATS.remove('source')
ORIG_FEATS = df_train.drop(TARGET, axis=1).columns.tolist()

# Prep Data

## Preprocessing Functions

In [26]:
def log_transformation(data, skewed_features):
    df_log_feats = pd.DataFrame()
    for col in skewed_features:
        name = f'{col}_log'
        df_log_feats[name] = np.log(data[col])
    return df_log_feats

def encode(data):
    X_raw = data.iloc[:, 0:18]

    # encode the categorical features
    X_cat = X_raw[CAT_FEATS]
    X_num = np.array(X_raw[NUM_FEATS])
    oe = OrdinalEncoder()
    X_cat = oe.fit_transform(X_cat)
    X_cat = pd.DataFrame(X_cat, columns=CAT_FEATS)
    X_num = pd.DataFrame(X_num, columns=NUM_FEATS)
    return X_cat, X_num
    
# logarithmic transformations
def get_logs(data, skewed_features):
    X_log = X[skewed_features]
    other_feats = data.columns.difference(skewed_features).tolist()
    X_other = X[other_feats]
    X_log = log_transformation(X_log, skewed_features)
    return pd.concat([X_log, X_other], axis=1)

## Encoding

In [27]:
# create features and target arrays
X, y = df_train.drop(TARGET, axis=1), df_train[[TARGET]]

# encode features
X_cat, X_num = encode(X)
X = pd.concat([X_cat, X_num], axis=1)

# encode target
le = LabelEncoder()
y = le.fit_transform(np.ravel(y))

## Transformations

In [28]:
# recode edema "S" and "Y" to both indicate edema for the purpose of this count variable
X['Edema'] = np.where(X['Edema'] == 2, 1, X['Edema'])
# Number of diseases/symptoms related to liver disease
X['N_Symptoms'] = X.Edema + X.Spiders + X.Ascites + X.Hepatomegaly
# log transformations
X = get_logs(X, SKEWED_FEATS)
# Age at Diagnosis
X['Dgns_Age'] = X['Age'] - X['N_Days']

## Train/Test Split

In [32]:
# do train/test split on the data
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=.2, random_state=1)
print(X_train.head)

<bound method NDFrame.head of       Bilirubin_log  Cholesterol_log  Copper_log  Prothrombin_log  \
2812      -0.510826         5.697093    4.060443         2.272126   
7638      -0.693147         5.513429    3.663562         2.312535   
8121       2.433613         6.410175    4.174387         2.379546   
8069       1.386294         5.278115    4.382027         2.424803   
4278       1.193922         5.170484    5.634790         2.433613   
...             ...              ...         ...              ...   
2895       1.987874         7.151485    5.509388         2.379546   
7813      -0.510826         5.407172    3.784190         2.397895   
905        1.609438         7.377759    4.317488         2.388763   
5192       1.163151         5.616771    4.736198         2.302585   
235       -0.356675         5.463832    4.077537         2.282382   

      Alk_Phos_log      Age  Albumin  Ascites  Drug  Edema  Hepatomegaly  \
2812      6.712956  12641.0     3.50      0.0   1.0    0.0       

AttributeError: 'DataFrame' object has no attribute 'type'

## Imputation

In [38]:
imputer = SimpleImputer(strategy='mean')
X_cols = X_train.columns
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=X_cols)
y_train_imputed = pd.DataFrame(imputer.transform(X_dev), columns=X_cols)

# Fit XGBoost Model

## Define wandb sweep parameters

In [None]:
xgb_sweep_config = {
    "method": "random", # sweep method
    "metric": {
        "name": "accuracy",
        "goal": "maximize"
    },
    "parameters": {
        "booster": { # tree-based or linear functions?
            "value": 'gbtree'
        },
        "max_depth": { # depth of individual trees
            "values": [6, 7]
        },
        "learning_rate": {
            'distribution': 'uniform',
            'min': .04,
            'max': .09,
        },
        "subsample": { # the proportion of instances to take as a sub-sample per iteration
            'distribution': 'uniform',
            'min': .33,
            'max': 1
        }
    }
}

xgb_sweep_id = wandb.sweep(xgb_sweep_config, project='cirrhosis-xgb-sweeps')

In [None]:
# define training function
def train():
    config_defaults = {
        "booster": "gbtree",
        "max_depth": 3,
        "learning_rate": 0.1,
        "subsample": 1,
        "seed": 1,
    }

    wandb.init(config=config_defaults) # set defaults. They'll be over-ridden later.
    config = wandb.config

    xgb_model = xgb.XGBClassifier(
        n_estimators=10_000,
        early_stopping_rounds=1,
        learning_rate=config.learning_rate,
        booster=config.booster,
        max_depth=config.max_depth,
        subsample=config.subsample
    )
    
    xgb_model.fit(X_train, y_train, eval_set=[(X_dev, y_dev)])

    # get predictions on dev set
    y_pred = xgb_model.predict(X_dev)
    predictions = [round(value) for value in y_pred]

    # evaluate predictions
    accuracy = accuracy_score(y_dev, predictions)
    wandb.log({"accuracy": accuracy})

    #model.save_model(f'checkpoints/{config.booster}"-"{config.max_depth}"-"{config.learning_rate}"-"{config.subsample}"-"{time.time()}.json') 
    # with a categorical target, if its not json then it throws an error

In [None]:
# specify the sweep, the training function, and the number of sweeps
wandb.agent(xgb_sweep_id, train, count=200)

### to-do

- [] switch wandb metric from accuracy to val loss. Seems like the config's metric and the one logged using wandb.log() need to be the same.

# Deep Neural Network Model

## Prep Data

In [None]:
import tensorflow as tf
from keras import layers
from keras.callbacks import ModelCheckpoint, EarlyStopping

# one-hot encode the y vector
y_train_oh = pd.get_dummies(y_train)
y_dev_oh = pd.get_dummies(y_dev)

# model.fit() requires tf.float32 data types. It's easiest to make the conversion before the data is a tf.dataset
y_train_oh = np.float32(y_train_oh)
y_dev_oh = np.float32(y_dev_oh)

# get input shape
n_features = X_train.shape[1]

# convert to tf.dataset
train = tf.data.Dataset.from_tensor_slices((X_train, y_train_oh))
dev = tf.data.Dataset.from_tensor_slices((X_dev, y_dev_oh))

# specify the batch size
train = train.batch(1)
dev = dev.batch(1)

In [None]:
def build_model():
    model = tf.keras.Sequential()
    model.add(layers.InputLayer(input_shape=n_features))
    model.add(layers.Dense(4, activation='relu'))
    model.add(layers.Dense(
        units=3,
        activation='softmax'))
    return model

model = build_model()
print(model.summary())

In [None]:
model.compile(
    optimizer=tf.keras.optimizers.SGD(.001),
    loss=tf.keras.losses.categorical_crossentropy,
    metrics=['accuracy'])

class WandbMetricsLogger(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        wandb.log(logs)
    
# set callbacks
callbacks = [
    WandbMetricsLogger()
    ]

# train the model
#model.fit(
#   train,
#   epochs=1,
#   validation_data=dev,
#   callbacks=callbacks) # uncomment for wandb

Decided not to do hyperparameter tuning on this model because it isn't funtioning very well. There are a few reasons why this might be:

1. Not enough data. This could be getting exascerbated by my choice to do a training/validation split rather k-fold cross validation.
2. The features, as they stand, aren't appropriate for logistic regression. Compared to my Keras implementation, Sci-kit Learn's logistic regression does L1/L2 regularization by default, probably because often when we work with tabular data the features are messy.
3. My feature engineering didn't account for extreme values well enough. 
4. There are highly correlated features (like N_Days and Dgns_Age and Age) that might be ruining the optimization.

# Sci-Kit Learn Logistic Regression

## Calculate Class Weights

In [41]:
class_counts = pd.Series(y_train).value_counts()
n_train = X_train.shape[0]
class_weights = [n_train / count for count in class_counts]
class_weights = dict(zip([0, 2, 1], class_weights))
print(class_counts)
print(class_weights)

0    4166
2    2265
1     227
Name: count, dtype: int64
{0: 1.5981757081132981, 2: 2.939514348785872, 1: 29.330396475770925}


In [55]:
from sklearn.linear_model import LogisticRegression

logit = LogisticRegression(
    penalty='l2',
    multi_class='ovr',
    solver='newton-cholesky',
    class_weight=class_weights,
    max_iter=500
)

logit.fit(X_train_imputed, y_train)

The Newton-Cholesky solver is good for when the number of instances is much greater than the number of features, especially if there are one-hot encoded features. This explains why the model converged with this solver but not others. 

Failed solvers: "liblinear", "saga"