In [None]:
# essentials
import os
import pathlib

import pandas as pd
import numpy as np
from tqdm import tqdm

# visualisation
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.calibration import CalibratedClassifierCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, log_loss
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import NearestCentroid, KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC, NuSVC

from sklearn.utils import compute_sample_weight, compute_class_weight


# others
import xgboost as xgb 

In [None]:
# Seed passed to the train/validation split and to the seeded estimators.
RANDOM_SEED = 64

# Local data directory; on Kaggle use /kaggle/input/playground-series-s3e26
input_folder = "./data"

# Both CSV files are read with their "id" column as the index.
train_df, test_df = (
    pd.read_csv(f"{input_folder}/{csv_name}", index_col="id")
    for csv_name in ("train.csv", "test.csv")
)

# Name of the label column in the training frame.
target_column = "Status"

# Columns routed to the one-hot encoder.
categorical_features = [
    "Drug",
    "Sex",
    "Ascites",
    "Hepatomegaly",
    "Spiders",
    "Edema",
    "Stage",
]

# Columns routed to the standard scaler.
numerical_features = [
    "N_Days",
    "Age",
    "Bilirubin",
    "Cholesterol",
    "Albumin",
    "Copper",
    "Alk_Phos",
    "SGOT",
    "Tryglicerides",
    "Platelets",
    "Prothrombin",
]

train_df

In [None]:
# Separate the label column from the feature columns.
y = train_df[target_column]
X = train_df.drop(columns=target_column)

# Hold out 10% of the rows for validation, stratified on the label.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    shuffle=True,
    random_state=RANDOM_SEED,
)

print(f"Number of training examples: {len(X_train)}")
print(f"Number of validation examples: {len(X_val)}")

print("Number of examples per class in training set", y_train.value_counts(), sep="\n")

print("Number of examples per class in validation set", y_val.value_counts(), sep="\n")

# Learn the label -> integer mapping on the training labels only, then apply
# the same mapping to both splits.
le = LabelEncoder().fit(y_train)

y_train = le.transform(y_train)
y_val = le.transform(y_val)



In [None]:
# Per-class weights from scikit-learn's "balanced" heuristic, keyed by the
# encoded class label.
class_weight_keys = np.unique(y_train)
class_weight_values = compute_class_weight("balanced", classes=class_weight_keys, y=y_train)

class_weights = dict(zip(class_weight_keys, class_weight_values))

# One weight per training row, looked up from that row's class.
sample_weights = [class_weights[label] for label in y_train]

# Last expression of the cell so the mapping is actually displayed.
class_weights

In [None]:

# Candidate classifiers, keyed by the name reported in the comparison table.
# Insertion order is the order in which they are evaluated.
models = dict(
    # gradient-boosted trees (xgboost)
    xgboost=xgb.XGBClassifier(
        objective="multi:softprob", random_state=RANDOM_SEED, n_jobs=-1
    ),
    # linear models
    logistic_regression=LogisticRegression(
        random_state=RANDOM_SEED, multi_class="ovr", n_jobs=-1
    ),
    sgd=SGDClassifier(random_state=RANDOM_SEED, n_jobs=-1, loss="log_loss"),
    # tree-based models
    decision_tree=DecisionTreeClassifier(random_state=RANDOM_SEED),
    random_forest=RandomForestClassifier(random_state=RANDOM_SEED, n_jobs=-1),
    gradient_boosting=GradientBoostingClassifier(random_state=RANDOM_SEED),
    # naive Bayes
    naive_bayes=GaussianNB(),
    # support vector machines
    svc=SVC(random_state=RANDOM_SEED, probability=True),
    linear_svc=LinearSVC(random_state=RANDOM_SEED),
    nu_svc=NuSVC(random_state=RANDOM_SEED),
    # neighbours and Gaussian process
    kneighbors=KNeighborsClassifier(n_jobs=-1),
    gaussian_process=GaussianProcessClassifier(
        random_state=RANDOM_SEED, multi_class="one_vs_rest", n_jobs=-1
    ),
)

In [None]:
# pipeline

# Numerical columns are standardised.
numeric_transformer = Pipeline(
    [
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns are one-hot encoded; categories not seen during fit are
# ignored at transform time instead of raising.
categorical_transformer = Pipeline(
    [
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# One {"model_name", "log_loss"} record per candidate.
models_scores = []

for model_name, model in tqdm(models.items()):
    try:
        # Every candidate is wrapped in a 3-fold probability calibrator so
        # that all of them are scored through predict_proba.
        clf = Pipeline(
            [
                ("preprocessor", preprocessor),
                ("classifier", CalibratedClassifierCV(model, cv=3)),
            ]
        )

        clf.fit(X_train, y_train, classifier__sample_weight=sample_weights)

        # Only probabilities are needed for the log-loss comparison; hard
        # predictions are not computed here, which avoids a second inference
        # pass per candidate and a stale `y_pred` left over from the last one.
        y_pred_proba = clf.predict_proba(X_val)

        log_loss_score = log_loss(y_val, y_pred_proba)
    except Exception as e:
        # Best effort: report the failure and keep evaluating the remaining
        # candidates; the failed one is recorded with a missing score.
        print("Problem with model: ", model_name)
        print(e)
        log_loss_score = None

    models_scores.append({
        "model_name": model_name,
        "log_loss": log_loss_score
    })

# Lowest log loss first; candidates with a missing score sort last.
models_scores_df = pd.DataFrame(models_scores).sort_values(by="log_loss", ascending=True)
models_scores_df

In [None]:
# The score table is sorted by ascending log loss, so the first row is the winner.
best_model_name = models_scores_df.iloc[0]["model_name"]

# A missing score in the first row means every candidate failed to fit or score.
if pd.isna(models_scores_df.iloc[0]["log_loss"]):
    raise RuntimeError("No candidate model was scored successfully; nothing to select.")

best_model = models[best_model_name]

# Refit the winner with the same calibrated wrapper that produced its score in
# the comparison loop. This keeps the log loss reported below consistent with
# the table, and it keeps the cell working for estimators that have no
# predict_proba of their own or whose fit() does not take sample_weight.
clf = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", CalibratedClassifierCV(best_model, cv=3)),
    ]
)

clf.fit(X_train, y_train, classifier__sample_weight=sample_weights)
y_pred_proba = clf.predict_proba(X_val)
y_pred = clf.predict(X_val)

print(f"Log loss: {log_loss(y_val, y_pred_proba)}")
print(classification_report(y_val, y_pred, target_names=le.classes_))

In [None]:
# visualise confusion matrix

# Pin the label set so the matrix always has one row/column per known class,
# in the same order as le.classes_, even if a class is absent from y_val/y_pred.
encoded_labels = np.arange(len(le.classes_))
val_confusion = confusion_matrix(y_val, y_pred, labels=encoded_labels)

fig, ax = plt.subplots(1, 1, figsize=(12, 8))
ax.set_title("Confusion matrix")

sns.heatmap(
    val_confusion,
    annot=True,
    fmt="d",  # integer counts; the default ".2g" would render 496 as "5e+02"
    xticklabels=list(le.classes_),
    yticklabels=list(le.classes_),
    ax=ax,
)

# Columns are predictions, rows are ground truth.
ax.set_xlabel("Predicted values")
ax.set_ylabel("True values")
plt.show()