In [4]:
# essentials
import os
import pathlib

import pandas as pd
import numpy as np
from tqdm import tqdm

# visualisation
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn imports
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MaxAbsScaler, PowerTransformer, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, chi2, f_classif, SequentialFeatureSelector, RFECV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.base import clone as clone_model
from sklearn.metrics import classification_report, confusion_matrix, log_loss


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier, StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# others
import xgboost as xgb 
import lightgbm as lgb
import catboost as cb

from tpot import TPOTClassifier

RANDOM_SEED = 64

In [5]:
IN_KAGGLE = False
kaggle_folder = "/kaggle/input/"
local_folder = "./data/"
train_df = pd.read_csv(kaggle_folder if IN_KAGGLE else local_folder + "playground-series-s3e26/train.csv", index_col="id")
test_df = pd.read_csv(kaggle_folder if IN_KAGGLE else local_folder  + "playground-series-s3e26/test.csv", index_col="id")
target_column = "Status"

target_map = {"C": "censored", "CL": "censored due to liver transplant", "D": "death"} # for readability of charts
train_df[target_column] = train_df[target_column].map(target_map)

categorical_features = ["Drug", "Sex", "Ascites", "Hepatomegaly", "Spiders", "Edema", "Stage"]
numerical_features = ["N_Days", "Age", "Bilirubin", "Cholesterol", "Albumin", "Copper", "Alk_Phos", "SGOT", "Tryglicerides", "Platelets", "Prothrombin"]

In [14]:
y_train.value_counts()

Status
censored                            4468
death                               2398
censored due to liver transplant     248
Name: count, dtype: int64

In [15]:
y_val.value_counts()

Status
censored                            497
death                               267
censored due to liver transplant     27
Name: count, dtype: int64

In [6]:
from sklearn.metrics import log_loss, make_scorer


X = pd.get_dummies(train_df.drop(columns=target_column), columns=categorical_features, drop_first=True)
y = train_df[target_column]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=RANDOM_SEED, stratify=y, shuffle=True)

tpot = TPOTClassifier(cv=3, 
                        n_jobs=-1,
                        generations=200,
                      random_state=RANDOM_SEED, verbosity=2, 
                      scoring=make_scorer(log_loss, greater_is_better=False, needs_proba=True))
tpot.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/20100 [00:00<?, ?pipeline/s]




Generation 1 - Current best internal CV score: -0.4537661454067711

Generation 2 - Current best internal CV score: -0.4523003015405807

Generation 3 - Current best internal CV score: -0.4523003015405807

Generation 4 - Current best internal CV score: -0.4514864046668561

Generation 5 - Current best internal CV score: -0.4514864046668561

Generation 6 - Current best internal CV score: -0.45101978749745214

Generation 7 - Current best internal CV score: -0.45101978749745214

Generation 8 - Current best internal CV score: -0.45101978749745214

Generation 9 - Current best internal CV score: -0.44902864940667353

Generation 10 - Current best internal CV score: -0.4487660615609132

Generation 11 - Current best internal CV score: -0.4487660615609132

Generation 12 - Current best internal CV score: -0.4483743357515048

Generation 13 - Current best internal CV score: -0.4477751535930479

Generation 14 - Current best internal CV score: -0.44776182730226227

Generation 15 - Current best internal

In [16]:
tpot.score(X_train, y_train)

ValueError: could not convert string to float: 'death'

In [17]:
tpot.export('tpot_pipeline_2.py')