# try lazypredict

> Lazy Predict helps build a lot of basic models without much code and helps understand which models works better without any parameter tuning.

In [8]:
# essentials
import os
import pathlib
from copy import copy


import pandas as pd
import numpy as np
from tqdm import tqdm

# visualisation
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn imports
import sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MaxAbsScaler, PowerTransformer, FunctionTransformer, StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MaxAbsScaler, PowerTransformer, FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline, make_union, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, chi2, f_classif, SequentialFeatureSelector, RFECV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.base import clone as clone_model
from sklearn.metrics import classification_report, confusion_matrix, log_loss
from sklearn.impute import SimpleImputer, MissingIndicator, KNNImputer


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier, StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import RocCurveDisplay, roc_auc_score, make_scorer, roc_curve

from sklearn.preprocessing import Binarizer, Normalizer, RobustScaler, StandardScaler
from tpot.builtins import StackingEstimator, ZeroCount
from tpot.export_utils import set_param_recursive
from sklearn.preprocessing import FunctionTransformer

# others
from lazypredict.Supervised import LazyClassifier

RANDOM_SEED = 64

In [9]:
IN_KAGGLE = False

kaggle_folder = "/kaggle/input/"
local_folder = "./data/"
train_df = pd.read_csv(kaggle_folder if IN_KAGGLE else local_folder + "playground-series-s3e26/train.csv", index_col="id")
test_df = pd.read_csv(kaggle_folder if IN_KAGGLE else local_folder  + "playground-series-s3e26/test.csv", index_col="id")
original_df = pd.read_csv(kaggle_folder if IN_KAGGLE else local_folder  + "cirrhosis-prediction-dataset/cirrhosis.csv", index_col="ID")

train_df = pd.concat([train_df, original_df], axis=0)
train_df = train_df.reset_index(drop=True)

target_column = "Status"

target_map = {"C": "censored", "CL": "censored due to liver transplant", "D": "death"} # for readability of charts
train_df[target_column] = train_df[target_column].map(target_map)

categorical_features = ["Drug", "Sex", "Ascites", "Hepatomegaly", "Spiders", "Edema", "Stage"]
numerical_features = ["N_Days", "Age", "Bilirubin", "Cholesterol", "Albumin", "Copper", "Alk_Phos", "SGOT", "Tryglicerides", "Platelets", "Prothrombin"]

#categorical_features += ["generated"]

def num_features_1(df):
    df['bilirubin_increased_levels'] = df['Bilirubin'] > 1.1
    df['cholesterol_increased'] = df['Cholesterol'] > 240
    df["albumin_low"] = df['Albumin'] < 3.5
    df["urinary_copper_increased"] = df['Copper'] > 40
    df["Alk_Phos_increased"] = df['Alk_Phos'] > 1400
    df['SGOT_increased'] = df["SGOT"] > 80 
    df["Tryglicerides_normal"] = df['Tryglicerides'] < 150 
    df["Platelets_normal"] = (df['Platelets'] >= 150) & (df['Platelets'] <= 400)
    
    threshold_platelets = 150
    df['thrombocytopenia'] = np.where(df['Platelets'] < threshold_platelets, 1, 0)

    new_cat_feature_names = [
        "thrombocytopenia"
    ]
    new_num_feature_names = [
        'bilirubin_increased_levels',
        'cholesterol_increased',
        'albumin_low',
        'urinary_copper_increased',
        'Alk_Phos_increased',
        'SGOT_increased',
        'Tryglicerides_normal',
        'Platelets_normal',
    ]
    return df, new_cat_feature_names, new_num_feature_names

def num_features_2(df):
    normal_ranges = {
        'Bilirubin': (0.1, 1.2),
        'Cholesterol': (0, 200),
        'Albumin': (3.5, 5.5),
        'Copper': (10, 30),
        'Alk_Phos': (40, 129),
        'SGOT': (8, 45),
        'Tryglicerides': (48.68, 168.15),
        'Platelets': (150, 400),
        'Prothrombin': (9.4, 12.5)
    }

    for feature, (normal_range_min, normal_range_max) in normal_ranges.items():
        if feature == 'Albumin':
            df[f'{feature}_is_normal'] = (df[feature] >= normal_range_min)
        else:
            df[f'{feature}_is_normal'] = (df[feature] >= normal_range_min) & (df[feature] <= normal_range_max) 

        # Add deviation calculation as before
        df.loc[~df[f'{feature}_is_normal'], f'{feature}_deviation'] = df[feature] - ((normal_range_min + normal_range_max) / 2)

    threshold_platelets = 150
    df['thrombocytopenia'] = np.where(df['Platelets'] < threshold_platelets, 1, 0)


    df.fillna(0, inplace=True)
    
    new_cat_feature_names = [f'{feature}_is_normal' for feature in normal_ranges.keys() ]
    new_num_feature_names = [f'{feature}_deviation' for feature in normal_ranges.keys()]

    new_cat_feature_names.append('thrombocytopenia')
    
    return df, new_cat_feature_names, new_num_feature_names

def num_features_3(df):
    ##initial pass as creating features for meaningful cutoffs
    df['APRI']=100 * (df['SGOT'])/df['Platelets']
    df['under769days']=np.where(df['N_Days']<769, 1, 0)
    df['bilirubin_1.2']=np.where(df['Bilirubin']>1.2, 1, 0)
    df['albumin_low']=np.where(df['Albumin']<2.23, 1, 0)
    df['copper_high']=np.where(df['Copper']>73, 1, 0)
    df['SGOT_high']=np.where(df['SGOT']>73, 1, 0)
    df['Prothrombin_high']=np.where(df['Prothrombin']>10.8, 1, 0)
    df['Edema_yn']=np.where(df['Edema']=='N', 0, 1)
    df['bilirubin_3']=np.where(df['Bilirubin']>3, 1, 0)
    df['high_cholesteroal']=np.where(df['Cholesterol']>240, 1, 0)
    df['age_over_70']=np.where((df['Age']/365)>=70, 1, 0)
    df['abnormal_alp']=np.where(((df['Alk_Phos']<30 )| (df['Alk_Phos']>147)), 1, 0)
    df['very_high_tri']=np.where(df['Tryglicerides']>500, 1, 0)
    df['high_tri']=np.where(df['Tryglicerides']>200, 1, 0)
    df['copper_deficient']=np.where(((df['Sex']=='F') & (df['Copper']<80) |(df['Sex']=='M') & (df['Copper']<70)), 1, 0)
    df['FIB4']=(df['Age']/365)* (df['SGOT']/df['Platelets'])
    df['ALBI']=.66*np.log(df['Bilirubin'])-.085 * df['Albumin']

    new_cat_feature_names = [
        "under769days",
        "bilirubin_1.2",
        "albumin_low",
        "copper_high",
        "SGOT_high",
        "Prothrombin_high",
        "Edema_yn",
        "bilirubin_3",
        "high_cholesteroal",
        "age_over_70",
        "abnormal_alp",
        "very_high_tri",
        "high_tri",
        "copper_deficient",
    ]
    new_num_feature_names = [
        "APRI",
        "FIB4",
        "ALBI",
    ]

    
    return df, new_cat_feature_names, new_num_feature_names

def base_feature_engineering(df):
    df['Drug'] = df['Drug'].map({"D-penicillamine": 1,"Placebo": 0})
    df['Sex'] = df['Sex'].map({"F": 1,"M": 0})
    
    df["Stage"] = df["Stage"].apply(lambda x: str(x))
    df['Stage'] = df['Stage'].astype('category')

    for c in categorical_features:
        df[c] = df[c].astype('category')
    return df


train_df, new_cat, new_num = num_features_2(train_df)
train_df = base_feature_engineering(train_df)


categorical_features += new_cat
numerical_features += new_num

X = train_df.drop(columns=target_column)
y = train_df[target_column]

le = LabelEncoder()

y = le.fit_transform(y)

# preprocessing
num_imputer = SimpleImputer(strategy="constant", fill_value=0, add_indicator=True)
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing", add_indicator=True)

numeric_transformer = Pipeline(
    [
        ("num_imputer", num_imputer),
        ("scaler", StandardScaler()),
    ]
)

categorical_transformer = Pipeline(
    [
        ("cast as str", FunctionTransformer(lambda x: x.astype(str), validate=False)),
        ("cat_imputer", cat_imputer),
        ("onehot", OneHotEncoder(handle_unknown="ignore", drop="if_binary")),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ])

X = preprocessor.fit_transform(X)


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y, shuffle=True)

In [13]:
log_loss_metric = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
log_loss_metric.__name__ = "log_loss"

clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_val, y_train, y_val)
models

 97%|█████████▋| 28/29 [00:13<00:00,  2.32it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001091 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3071
[LightGBM] [Info] Number of data points in the train set: 6658, number of used features: 49
[LightGBM] [Info] Start training from score -0.471025
[LightGBM] [Info] Start training from score -3.322935
[LightGBM] [Info] Start training from score -1.080012


100%|██████████| 29/29 [00:14<00:00,  2.07it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
XGBClassifier,0.83,0.64,,0.83,0.77
LGBMClassifier,0.83,0.63,,0.82,0.41
NearestCentroid,0.68,0.61,,0.73,0.04
RandomForestClassifier,0.83,0.6,,0.82,1.06
BaggingClassifier,0.81,0.6,,0.8,0.59
AdaBoostClassifier,0.8,0.59,,0.79,0.47
BernoulliNB,0.76,0.58,,0.76,0.03
ExtraTreesClassifier,0.81,0.58,,0.8,0.64
DecisionTreeClassifier,0.75,0.57,,0.75,0.1
GaussianNB,0.65,0.56,,0.68,0.02
