In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [2]:
data = pd.read_csv('liver-patient-clean.csv')
data.drop(["Unnamed: 0"], axis = 1, inplace = True)

In [3]:
X = data.drop(["Liver Patient"], axis = 1).copy()
y = data["Liver Patient"].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [4]:
models = {"Logistic Regression": LogisticRegression(),
          "Random Forest": RandomForestClassifier()}

parameters = {"Logistic Regression": {'C': [0.001,0.01,0.1,1,10],
                                      'max_iter': [1000, 2500, 5000]},
              "Random Forest": {"n_estimators": [100, 300, 500, 800, 1200],
                                "max_depth": [5, 10, 15, 25, 30],
                                "min_samples_split": [2, 5, 10, 25, 100],
                                "min_samples_leaf": [1, 2, 5, 10]}}

trained_models = {key: None for key in parameters.keys()}

classfications = {key: None for key in parameters.keys()}

In [5]:
def parameter_tuning(name, model, models, parameters, X, y, i, kfold = 5):
    random = RandomizedSearchCV(model, parameters, cv = kfold)
    random.fit(X, y)
    
    if i == 0:
        return LogisticRegression(penalty = 'l2', **random.best_params_)
    elif i == 1:
        return RandomForestClassifier(**random.best_params_)

def feature_selection(model, X, y, kfold = 5):
    features = RFECV(model, cv = kfold)
    features.fit(X, y)
    
    return X[X.columns[features.support_]], X.columns[features.support_]

def cross_test(model, X, y, score, kfold = 5):
    cv_results = cross_val_score(model, X, y, cv = kfold, scoring = score)
    return [cv_results.mean(), cv_results.std()]

In [6]:
k_fold = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 3)

In [7]:
for name, model, i in zip(models.keys(), models.values(), range(0, 2)):
    model = parameter_tuning(name, model, models, parameters[name], X_train, y_train, i, kfold = k_fold)
    X_train, columns = feature_selection(model, X_train, y_train, kfold = k_fold)
    model.fit(X_train, y_train)
    trained_models[name] = model
    train_results = cross_test(model, X_train, y_train, score = 'accuracy', kfold = k_fold)
    print(f"Train Accuracy {name}: {train_results[0]} ({train_results[1]})")
    report = classification_report(y_test, model.predict(X_test[columns]), output_dict = True)
    report_simple = {0: report["0"], 1: report["1"]}
    classfications[name] = pd.DataFrame(report_simple)

Train Accuracy Logistic Regression: 0.7299154334038055 (0.0306544798289862)
Train Accuracy Random Forest: 0.727431289640592 (0.04446099978974973)


In [8]:
classfications["Logistic Regression"]

Unnamed: 0,0,1
precision,0.666667,0.742188
recall,0.266667,0.940594
f1-score,0.380952,0.829694
support,45.0,101.0


In [9]:
classfications["Random Forest"]

Unnamed: 0,0,1
precision,0.428571,0.712
recall,0.2,0.881188
f1-score,0.272727,0.787611
support,45.0,101.0
