In [None]:
import zipfile

import numpy as np
import pandas as pd

In [None]:
PATH = "/content/playground-series-s3e26.zip"

# Unpack the competition archive. extractall() is called without a target,
# so the files land in the current working directory.
# The handle is named `archive` rather than `zip` so the built-in zip() is
# not shadowed for every later cell in the notebook.
with zipfile.ZipFile(PATH) as archive:
    archive.extractall()


In [None]:
# Load both competition splits, using the `id` column as the row index.
train_data, test_data = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ("/content/train.csv", "/content/test.csv")
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import log_loss, make_scorer
import xgboost as xgb

# Some helper functions

In [None]:
def days_to_years(df, columns):
    """Return the selected columns of ``df`` converted from days to years.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the day-valued columns. It is not modified.
    columns : list of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only ``columns``, each value divided by
        365.25 (average year length including leap years).
    """
    days_per_year = 365.25
    return df[columns] / days_per_year


def get_cv_score(X, y, params=None):
    """Cross-validate an XGBoost classifier and return the per-fold log loss.

    Parameters
    ----------
    X : feature matrix passed straight to ``cross_val_score``.
    y : class labels; also used to stratify the folds.
    params : dict, optional
        Keyword arguments for ``xgb.XGBClassifier``. A falsy value
        (``None`` or an empty dict) means the library defaults are used.

    Returns
    -------
    The array produced by ``cross_val_score`` with 5 stratified folds.

    Notes
    -----
    The scorer is built with ``greater_is_better=True``, so the values are
    the raw (positive) log loss and LOWER is better. Do not hand this scorer
    to a model-selection routine that maximises the score.
    NOTE(review): ``needs_proba`` is deprecated in newer scikit-learn
    releases in favour of ``response_method`` - confirm against the
    installed version before upgrading.
    """
    estimator = xgb.XGBClassifier(**params) if params else xgb.XGBClassifier()

    fold_splitter = StratifiedKFold(n_splits=5)
    log_loss_scorer = make_scorer(log_loss, greater_is_better=True, needs_proba=True)

    return cross_val_score(
        estimator,
        X,
        y,
        cv=fold_splitter,
        scoring=log_loss_scorer,
    )


def plot_train_curve(histories):
    """Draw one training-vs-validation loss figure per run.

    Parameters
    ----------
    histories : iterable
        Each item must expose a ``history`` mapping with ``'loss'`` and
        ``'val_loss'`` entries; those two series are plotted against their
        position (labelled "epoch" on the x axis).
    """
    curves = (('loss', 'train loss'), ('val_loss', 'val loss'))

    for run in histories:
        # A fresh small figure per run keeps the folds visually separate.
        plt.figure(figsize=(5, 3))
        for key, label in curves:
            plt.plot(run.history[key], label=label)
        plt.title('model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend()


def neural_cv_score(model_func=None, X=None, y=None, folds=5, epochs=10, **kwargs):
    """Run stratified k-fold cross-validation for a Keras model.

    Parameters
    ----------
    model_func : callable
        Zero-argument factory returning a freshly compiled model; it is
        called once per fold so no weights carry over between folds.
    X : list
        One entry per model input. Every entry must support ``.iloc``
        row selection; the first entry is used to generate the splits.
    y : labels supporting ``.iloc``; also used to stratify the folds.
    folds : int
        Number of stratified folds.
    epochs : int
        Maximum number of epochs per fold.
    **kwargs
        Extra keyword arguments forwarded to ``model.fit``. If ``callbacks``
        is not supplied, a default ``keras.callbacks.EarlyStopping()`` is
        used; pass ``callbacks=[]`` to train for the full ``epochs``.

    Returns
    -------
    dict
        ``{"val_loss": ndarray, "val_accuracy": ndarray}`` with one value
        per fold, taken from positions 0 and 1 of ``model.evaluate``.
        This assumes the model is compiled with a loss followed by an
        accuracy metric, as the model factories in this notebook are.

    Side effects
    ------------
    Plots the loss curves of every fold via ``plot_train_curve``.
    """
    val_loss = []
    val_acc = []
    stories = []

    sampler = StratifiedKFold(n_splits=folds)

    # The early-stopping callback used to be created and then dropped; wire
    # it into fit() unless the caller provided their own callbacks.
    fit_kwargs = dict(kwargs)
    fit_kwargs.setdefault("callbacks", [keras.callbacks.EarlyStopping()])

    for train_idx, val_idx in sampler.split(X[0], y):
        # Slice every model input with the same row positions. Local names
        # deliberately differ from the notebook-level train_data/test_data.
        train_inputs = [frame.iloc[train_idx, :] for frame in X]
        val_inputs = [frame.iloc[val_idx, :] for frame in X]

        train_y = y.iloc[train_idx]
        val_y = y.iloc[val_idx]

        model = model_func()
        # validation_data is passed as the documented (inputs, targets) tuple.
        history = model.fit(train_inputs, train_y, epochs=epochs, batch_size=32,
                            validation_data=(val_inputs, val_y), **fit_kwargs)

        results = model.evaluate(val_inputs, val_y)
        val_loss.append(results[0])
        val_acc.append(results[1])

        stories.append(history)
        # Same `keras` namespace as the rest of the helpers, so the function
        # does not additionally require a `tf` global.
        keras.backend.clear_session()

    plot_train_curve(stories)
    return {"val_loss": np.array(val_loss), "val_accuracy": np.array(val_acc)}


def get_two_input_model(num_features=None, cat_features=13, vocab_size=6, embed_dim=3):
    """Build and compile a two-input classifier (numeric + embedded categorical).

    Parameters
    ----------
    num_features : int, optional
        Width of the numerical input. Defaults to the number of columns of
        the notebook-level ``num_cols`` frame, as in the original version.
    cat_features : int
        Width of the categorical input.
    vocab_size : int
        ``input_dim`` of the embedding layer.
    embed_dim : int
        ``output_dim`` of the embedding layer.

    Returns
    -------
    keras.Model
        Model with inputs ``[num_input, cat_input]`` and a 3-way softmax
        output, compiled with rmsprop, sparse categorical cross-entropy
        and an accuracy metric.
    """
    if num_features is None:
        num_features = num_cols.shape[1]

    # Shapes must be tuples: `(n)` is just the integer n, `(n,)` is a 1-tuple.
    numerical_input = keras.Input(shape=(num_features,), name='num_input')
    categorical_input = keras.Input(shape=(cat_features,), name='cat_input')

    embedding_layer = keras.layers.Embedding(input_dim=vocab_size,
                                             output_dim=embed_dim)(categorical_input)
    # Flatten the per-feature embeddings so they can be concatenated with
    # the numerical features.
    flattened_embed_layer = keras.layers.Flatten()(embedding_layer)

    concat_layer = keras.layers.Concatenate()([numerical_input,
                                               flattened_embed_layer])
    x = keras.layers.Dense(64, activation='relu')(concat_layer)
    x = keras.layers.Dense(32, activation='relu')(x)
    x = keras.layers.Dense(16, activation='relu')(x)
    output = keras.layers.Dense(3, activation='softmax')(x)

    model = keras.Model(inputs=[numerical_input, categorical_input], outputs=output)
    model.compile(
        optimizer='rmsprop',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    return model


def get_simple_network(input_dim=25):
    """Build and compile a small fully connected 3-class classifier.

    Parameters
    ----------
    input_dim : int
        Number of input features. Defaults to 25, the width that was
        previously hard-coded.

    Returns
    -------
    keras.Sequential
        Dense 64 -> 32 -> 16 (ReLU) followed by a 3-way softmax, compiled
        with Adam, sparse categorical cross-entropy and an accuracy metric.
    """
    net = keras.Sequential([
        keras.layers.Dense(input_dim=input_dim, units=64, activation='relu'),
        keras.layers.Dense(units=32, activation='relu'),
        keras.layers.Dense(units=16, activation='relu'),
        keras.layers.Dense(3, activation='softmax'),
    ])

    net.compile(optimizer='Adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])
    return net

# Preprocess the data

In [None]:
# Peek at the first rows of the training split before any preprocessing.
train_data.head()

Unnamed: 0_level_0,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,999,D-penicillamine,21532,M,N,N,N,N,2.3,316.0,3.35,172.0,1601.0,179.8,63.0,394.0,9.7,3.0,D
1,2574,Placebo,19237,F,N,N,N,N,0.9,364.0,3.54,63.0,1440.0,134.85,88.0,361.0,11.0,3.0,C
2,3428,Placebo,13727,F,N,Y,Y,Y,3.3,299.0,3.55,131.0,1029.0,119.35,50.0,199.0,11.7,4.0,D
3,2576,Placebo,18460,F,N,N,N,N,0.6,256.0,3.5,58.0,1653.0,71.3,96.0,269.0,10.7,3.0,C
4,788,Placebo,16658,F,N,Y,N,N,1.1,346.0,3.65,63.0,1181.0,125.55,96.0,298.0,10.6,4.0,C


In [None]:
# Convert the day-valued columns of both splits to years.
# NOTE: this overwrites the columns in place, so re-running the cell divides
# the values again; reload the CSVs before executing it a second time.
DAY_COLUMNS = ['N_Days', 'Age']
train_data[DAY_COLUMNS] = days_to_years(train_data, DAY_COLUMNS)
test_data[DAY_COLUMNS] = days_to_years(test_data, DAY_COLUMNS)

In [None]:
# Integer-encode the target; train_data itself is left untouched.
labels = train_data['Status'].map({'C': 0, 'CL': 1, 'D': 2})

# Feature frame without the target column (drop returns a new frame).
data = train_data.drop(columns='Status')

# Split features by dtype: numeric columns are kept as they are, object
# columns are one-hot encoded below.
num_cols = data.select_dtypes(include=np.number)
cat_cols = data.select_dtypes(include='object')

data = pd.concat(
    [num_cols, pd.get_dummies(cat_cols, dtype='int32')],
    axis=1,
)


In [None]:
# Baseline: mean 5-fold log loss of an XGBClassifier with default parameters
# (get_cv_score returns the raw, un-negated log loss, so lower is better).
get_cv_score(data, labels).mean()

0.5153932501026709

# Testing simple stacking model (RandomForest and XGBoost)

In [None]:
# from sklearn.ensemble import RandomForestClassifier, StackingClassifier

# sampler = StratifiedKFold(n_splits=5)
# scorer = make_scorer(log_loss, greater_is_better=True, needs_proba=True)
# p = {'lambda': 8.366432100590675,
#      'alpha': 0.012595143782964113,
#      'n_estimators': 250,
#      'max_depth': 11,
#      'min_child_weight': 1,
#      'learning_rate': 0.057251606328022064,
#      'subsample': 0.8,
#      'colsample_bytree': 0.11768228992035056}

# model = StackingClassifier([
#         ('rf', RandomForestClassifier(max_depth=6, min_samples_split=2)),
#         ('xgb', xgb.XGBClassifier(**p))
#     ])

# model = xgb.XGBClassifier(**p)

# cross_val_score(model, data, labels, cv=sampler, scoring=scorer).mean()


# Wrap the Keras model as a scikit-learn estimator to use cross_val_score

In [None]:
# %pip install scikeras

In [None]:
# import tensorflow as tf
# from tensorflow import keras
# from scikeras.wrappers import KerasClassifier

# sampler = StratifiedKFold(n_splits=5)
# scorer = make_scorer(log_loss, greater_is_better=True, needs_proba=True)
# keras_model = KerasClassifier(get_simple_network, epochs=10, batch_size=32, verbose=0)
# cross_val_score(keras_model, data, labels, cv=sampler, scoring=scorer)

# Testing a two-input model (categorical Embedding + Dense)

In [None]:
# encoder = OneHotEncoder(sparse=False).set_output(transform='pandas')
# cat_cols_encoded = encoder.fit_transform(cat_cols)

# scores = neural_cv_score(get_two_input_model, X=[num_cols, cat_cols_encoded], y=labels, epochs=10)

# scores['val_loss'].mean()

# Play with simple NN parameters

In [None]:
# num_cols = train_data.select_dtypes(np.number)
# cat_cols = train_data.select_dtypes('object').drop('Status', axis=1)

# scalers = {}
# for col in num_cols.columns:
#     scalers[col] = StandardScaler()
#     num_cols[col] = scalers[col].fit_transform(num_cols.loc[:,col].to_numpy().reshape(-1,1))


# scaled_data = pd.concat([num_cols, pd.get_dummies(cat_cols, dtype='int32')], axis=1)


# def get_model():
#     model = keras.Sequential([
#         keras.layers.Dense(input_shape=(None,25), units=64, activation='relu'),
#         keras.layers.Dense(32, activation='relu'),
#         keras.layers.Dense(16, activation='relu'),
#         keras.layers.Dense(3, activation='softmax')
#     ])

#     model.compile(optimizer='Adam',
#                   loss='sparse_categorical_crossentropy',
#                   metrics=['accuracy'])
#     return model

In [None]:
# scores = neural_cv_score(get_two_input_model, X=[num_cols, cat_cols_encoded], y=labels, epochs=20, verbose=1)
# scores['val_loss'].mean()

In [None]:
# scores = neural_cv_score(get_simple_network, X=[scaled_data], y=labels, epochs=10, verbose=0)
# scores['val_loss'].mean()