This notebook fits the baseline model. It predicts on the test data without doing any feature engineering, using the xgboost library.

In [29]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

df_train = pd.read_csv('data/train.csv').drop(['id'], axis=1)
df_train['source'] = 'simulation'

df_test = pd.read_csv('data/test.csv')
test_ids = df_test.id
dt_test = df_test.drop(['id'], axis=1)

df_supp = pd.read_csv('data/cirrhosis.csv').drop(['ID'], axis=1)
df_supp['source'] = 'original'

# merge supplemental data
df_train = pd.concat([df_train, df_supp]).reset_index(drop=True)
train_target = df_train['Status']

# list of discrete columns
TARGET = 'Status'
CAT_FEATS = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Stage']
NUM_FEATS = [x for x in df_train.columns if x not in CAT_FEATS and x != TARGET]
NUM_FEATS.remove('source')
ORIG_FEATS = df_train.drop(TARGET, axis=1).columns.tolist()

Index(['N_Days', 'Drug', 'Age', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders',
       'Edema', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos',
       'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin', 'Stage', 'Status',
       'source'],
      dtype='object')


# Exploratory Data Analysis

In [None]:
print(f'shape: {df_train.shape}')

In [None]:
desc_df = df_train.describe(include="all")
desc_df = desc_df.T
desc_df['unique'] = desc_df['unique'].fillna(df_train.nunique())
desc_df['count'] = desc_df['count'].astype('int16')
desc_df['missing'] = df_train.shape[0] - desc_df['count']
desc_df['dtypes'] = df_train.dtypes
desc_df

- Missing values are present but not frequent
- only 50 unique values of "Prothrombin"

dig into missingness:

In [None]:
for column in CAT_FEATS:
    print(df_train.loc[:, column].value_counts())

In [None]:
plt.figure(figsize=(14, len(NUM_FEATS) * 2))
for i, col in enumerate(NUM_FEATS):
    plt.subplot(len(CAT_FEATS) // 2 + 1, 3, i + 1)
    sns.histplot(x=col, data=df_train)
    plt.title(f"{col}")
    plt.tight_layout()

Several variables are skewed. Let's see if a logarithmic transformation will help with the outliers.

In [None]:
SKEWED_FEATS = ['Bilirubin', 'Cholesterol', 'Copper', 'Prothrombin', 'Alk_Phos']
df_log_feats = pd.DataFrame()
for col in SKEWED_FEATS:
    name = f'{col}_log'
    df_log_feats[name] = np.log(df_train[col])
    
for i, col in enumerate(df_log_feats.columns):
    plt.subplot(len(df_log_feats.columns) // 2 + 1, 3, i + 1)
    sns.histplot(x=col, data=df_log_feats)
    plt.title(f"{col}")
    plt.tight_layout()

This is probably an improvement.


Distribution of the Target:

In [None]:
# Counting the observations for each category
status_counts = df_train[TARGET].value_counts()
labels = status_counts.index
sizes = status_counts.values

# Calculating the percentage of each category
percentages = 100.*sizes/sizes.sum()

# Creating the pie chart with percentages in the labels
plt.figure(figsize=(10, 6))
plt.pie(sizes, labels=[f"{l}, {s:.1f}%" for l, s in zip(labels, percentages)], startangle=90)
plt.gca().set_aspect("equal")
plt.legend(loc="upper right", bbox_to_anchor=(1.2, 1), labels=labels, title=TARGET)
plt.title(f"Distribution of {TARGET}")
plt.show()

The classes are imbalanced and CL is a very small class.

Distribution of the features by the target classes:

In [None]:
plt.figure(figsize=(14, len(CAT_FEATS) * 2))
for i, col in enumerate(CAT_FEATS):
    plt.subplot(len(CAT_FEATS) // 2 + 1, 3, i + 1)
    sns.countplot(x=col, hue=TARGET, data=df_train)
    plt.title(f"{col} vs {TARGET}")
    plt.tight_layout()


- "drug" appears to have no effect on the target
- death is most common outcome for males, censored for females
- ascites, hepatomegaly, spiders, edema, and stage are associated with death

Distribution of the Target by Continuous Features:

In [None]:
fig, axes = plt.subplots(2, 3, figsize=(16, 10))
for i, ax in enumerate(axes.flatten()):
    sns.violinplot(x=TARGET, y=NUM_FEATS[i], data=df_train, ax=ax)
    # Set x ticks to be the original labels (inverse transform)
    ax.set_title(f"{NUM_FEATS[i]} vs {TARGET}")
plt.tight_layout()
plt.show()

They are distributed somewhat normally across classes of the target, but there are outliers.

Correlation of the features:

In [None]:
plt.figure(figsize=(12, 8))
mask = np.triu(np.ones_like(df_train[NUM_FEATS].corr(), dtype=bool))
sns.heatmap(df_train[NUM_FEATS].corr(), annot=True, mask=mask, cmap='vlag')
plt.show()

- bilirubin is correlated with albumin, copper, and n days

To investigate correlation by the target classes, use a pairplot:

In [None]:
pairplot = sns.pairplot(
    df_train[NUM_FEATS + [TARGET]].sample(frac=.01), 
    hue=TARGET, 
    corner=True)

Mutual Information:

In [32]:
def mi(X, y, discrete):
    X = X.copy()
    for colname in X.select_dtypes(["object", "category"]):
        X[colname], _ = X[colname].factorize()
    # All discrete features should now have integer dtypes
    discrete_features = discrete
    mi_scores = mutual_info_classif(X, y, discrete_features=discrete_features, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

X_mi_raw = df_train.dropna().copy()
y_mi_raw = X_mi_raw['Status']

# encode the categorical features
X_mi_cat = X_mi_raw[CAT_FEATS]
X_mi_num = np.array(X_mi_raw[NUM_FEATS])
oe = OrdinalEncoder()
X_mi_cat = oe.fit_transform(X_mi_cat)
X_mi = np.concatenate((X_mi_cat, X_mi_num), axis=1)
COLUMNS = CAT_FEATS + NUM_FEATS
X_mi = pd.DataFrame(X_mi, columns=COLUMNS)

# encode the target
le = LabelEncoder()
y_mi = le.fit_transform(y_mi_raw)

In [33]:
# prep data for MI
mi_scores = mi(X_mi, y_mi, CAT_FEATS)
print(mi_scores)

ValueError: dtype='numeric' is not compatible with arrays of bytes/strings.Convert your data to numeric values explicitly instead.