<a href="https://colab.research.google.com/github/Raduchak/Hepatitis-prediction/blob/main/Hepatitis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocessing

### Libraries loading


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from scipy import stats
from google.colab import files
import os
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import ADASYN

### Data import

In [None]:
# Name of the dataset file expected in the current working directory.
file = 'hepatitis.data'
# Only prompt for a manual upload through Colab when the file is not there yet.
dataset_present = file in os.listdir()
if not dataset_present:
  uploaded = files.upload()

In [None]:
# Column names supplied explicitly to read_csv, in file order.
header_names = [
    "Class", "AGE", "SEX", "STEROID", "ANTIVIRALS",
    "FATIGUE", "MALAISE", "ANOREXIA", "LIVER BIG", "LIVER FIRM",
    "SPLEEN PALPABLE", "SPIDERS", "ASCITES", "VARICES", "BILIRUBIN",
    "ALK PHOSPHATE", "SGOT", "ALBUMIN", "PROTIME", "HISTOLOGY",
]
# '?' entries in the raw file are parsed as NaN.
df = pd.read_csv(file, names=header_names, na_values='?')
df.head()

Unnamed: 0,Class,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
0,2,30,2,1.0,2,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,85.0,18.0,4.0,,1
1,2,50,1,1.0,2,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,0.9,135.0,42.0,3.5,,1
2,2,78,1,2.0,2,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,96.0,32.0,4.0,,1
3,2,31,1,,1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,46.0,52.0,4.0,80.0,1
4,2,34,1,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,,200.0,4.0,,1


In [None]:
# Number of missing values in each column.
df.isna().sum()

Class               0
AGE                 0
SEX                 0
STEROID             1
ANTIVIRALS          0
FATIGUE             1
MALAISE             1
ANOREXIA            1
LIVER BIG          10
LIVER FIRM         11
SPLEEN PALPABLE     5
SPIDERS             5
ASCITES             5
VARICES             5
BILIRUBIN           6
ALK PHOSPHATE      29
SGOT                4
ALBUMIN            16
PROTIME            67
HISTOLOGY           0
dtype: int64

### Empty values removal

In [None]:
# Share of missing values per column. Despite its name, `drop_condition` is True
# for the columns that are KEPT (those with less than 10% missing values).
missing_fraction = df.isnull().sum() / len(df.index)
drop_condition = missing_fraction < 0.1
print(drop_condition)

# Keep the flagged columns, then fill the remaining gaps with each column's median.
df = df.loc[:, drop_condition.values]
column_medians = df.median(axis=0)
df = df.fillna(value=column_medians)
df.head()

Class               True
AGE                 True
SEX                 True
STEROID             True
ANTIVIRALS          True
FATIGUE             True
MALAISE             True
ANOREXIA            True
LIVER BIG           True
LIVER FIRM          True
SPLEEN PALPABLE     True
SPIDERS             True
ASCITES             True
VARICES             True
BILIRUBIN           True
ALK PHOSPHATE      False
SGOT                True
ALBUMIN            False
PROTIME            False
HISTOLOGY           True
dtype: bool


Unnamed: 0,Class,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,SGOT,HISTOLOGY
0,2,30,2,1.0,2,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,18.0,1
1,2,50,1,1.0,2,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,0.9,42.0,1
2,2,78,1,2.0,2,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,32.0,1
3,2,31,1,2.0,1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,52.0,1
4,2,34,1,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,200.0,1


### Outliers finding and removal

In [None]:
# Row count before outlier removal.
print(f'Number of instances = {len(df)}')

Number of instances = 155


In [None]:
# Z-score of every value in the frame.
z_scores = stats.zscore(df)
abs_z_scores = np.abs(z_scores)

# A row survives only if all of its columns have an absolute z-score below 3.
within_three_sigma = abs_z_scores < 3
filtered_entries = within_three_sigma.all(axis=1)
df = df[filtered_entries]

In [None]:
# Row count once the outlier rows have been filtered out.
print(f'Number of instances after outliers removal = {len(df)}')

Number of instances after outliers removal = 150


### Normalization

In [None]:
# Min-max scale every column of the frame (the 'Class' column included).
min_max_scaler = MinMaxScaler()
scaled_values = min_max_scaler.fit_transform(df)

# fit_transform returns a plain array, so rebuild the frame with the original
# column names; the row index is regenerated from 0.
original_columns = df.columns
df = pd.DataFrame(data=scaled_values, columns=original_columns)
df.head()

Unnamed: 0,Class,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,SGOT,HISTOLOGY
0,1.0,0.323944,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.155556,0.015152,0.0
1,1.0,0.605634,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.133333,0.106061,0.0
2,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.088889,0.068182,0.0
3,1.0,0.338028,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.088889,0.143939,0.0
4,1.0,0.380282,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.155556,0.704545,0.0


### Preprocessed data before class balancing (SMOTE)

In [None]:
# Display the scaled frame as it stands before the target column is split off.
df

Unnamed: 0,Class,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,SGOT,HISTOLOGY
0,1.0,0.323944,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.155556,0.015152,0.0
1,1.0,0.605634,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.133333,0.106061,0.0
2,1.0,1.000000,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.088889,0.068182,0.0
3,1.0,0.338028,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.088889,0.143939,0.0
4,1.0,0.380282,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.155556,0.704545,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,1.0,0.408451,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.066667,0.060606,1.0
146,1.0,0.521127,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.133333,0.484848,1.0
147,1.0,0.760563,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.111111,0.022727,1.0
148,1.0,0.647887,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.266667,0.018939,1.0


### Data reduction

In [None]:
# Split the target from the features WITHOUT mutating `df`.
# `df.pop('Class')` removed the column in place, so re-running this cell raised
# a KeyError and `x` was just an alias of the mutated frame.
y = df['Class']
x = df.drop(columns='Class')
print(f'Number of attributes = {x.shape[1]}')

Number of attributes = 16


In [None]:
# PCA configured with n_components=0.95; `x` is replaced by its projection.
pca = PCA(n_components=0.95)
x = pca.fit_transform(x)
n_components_kept = x.shape[1]
print(f'Number of attributes after PCA = {n_components_kept}')

Number of attributes after PCA = 13


In [None]:
# Display the PCA-transformed feature array.
x

array([[-0.76611315, -0.24855758,  0.73890279, ..., -0.52829586,
        -0.19686574,  0.00569281],
       [-0.31945868, -0.45143977,  0.55243082, ..., -0.42874342,
        -0.62376797, -0.00520745],
       [-0.63475402, -0.15772775, -0.33080907, ..., -0.04363929,
        -0.26304124,  0.00224009],
       ...,
       [ 1.1690073 ,  0.06127515,  0.68965601, ..., -0.21058372,
         0.12473716,  0.44690351],
       [ 0.65437377,  0.62260338, -0.0910443 , ...,  0.17525554,
        -0.56352491,  0.06345966],
       [ 0.45518963,  0.88422066, -0.79296423, ..., -0.59035629,
         0.16176327, -0.44807697]])

### Handling imbalanced data set

In [None]:
# Resampled labels / features, keyed by a short name of the balancing method.
y_data_balanced = dict()
x_data_balanced = dict()

In [None]:
# Resample with SMOTETomek (fixed seed) and store the result under 'smtt'.
smtt = SMOTETomek(random_state=42)
x_balanced, y_balanced = smtt.fit_resample(x, y)
x_data_balanced['smtt'], y_data_balanced['smtt'] = x_balanced, y_balanced

In [None]:
# Resample with SMOTEENN (fixed seed) and store the result under 'smte'.
smte = SMOTEENN(random_state=42)
x_balanced, y_balanced = smte.fit_resample(x, y)
x_data_balanced['smte'], y_data_balanced['smte'] = x_balanced, y_balanced

In [None]:
# Resample with ADASYN (fixed seed) and store the result under 'adasyn'.
adasyn = ADASYN(random_state=42)
x_balanced, y_balanced = adasyn.fit_resample(x, y)
x_data_balanced['adasyn'], y_data_balanced['adasyn'] = x_balanced, y_balanced

In [None]:
# Report the size and class distribution of each resampled label set,
# followed by the size of the original (unbalanced) labels.
for key, balanced_labels in y_data_balanced.items():
  print(f'{key}: {len(balanced_labels)}')
  print(balanced_labels.value_counts())
print(f'Original data: {len(y)}')

smtt: 236
1.0    118
0.0    118
Name: Class, dtype: int64
smte: 182
0.0    102
1.0     80
Name: Class, dtype: int64
adasyn: 242
1.0    121
0.0    121
Name: Class, dtype: int64
Original data: 150


# Classification with non-optimized models

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn import tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, ConfusionMatrixDisplay, f1_score, roc_auc_score, roc_curve, plot_roc_curve

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import time

import seaborn as sns

# Seed passed to the train/test splits and to the seeded estimators (SVC, decision tree).
RAND_SEED = 123

In [None]:
# Baseline estimators, in the order: linear SVM, decision tree, KNN.
# Later cells pair this list with labels and parameter grids by position.
models = [
    SVC(
        kernel="linear",
        tol=1e-3,
        decision_function_shape="ovo",
        random_state=RAND_SEED,
    ),
    DecisionTreeClassifier(
        criterion="gini",
        max_depth=2,
        max_leaf_nodes=5,
        max_features=5,
        min_samples_split=2,
        random_state=RAND_SEED,
    ),
    KNeighborsClassifier(n_neighbors=5),
]

In [None]:
# Row labels (one per entry of `models`, same order) and metric column labels.
model_labels = ["Linear SVM", "Decision tree", "KNN"]

metric_labels = ["Test score", "Training score", "F1 score", "ROC score"]

# One results table per balancing method: rows are models, columns are metrics.
models_dfs = {}

# Labels and estimators are paired by position, so the lists must line up.
assert len(model_labels) == len(models), "model_labels and models must have the same length"

for balancing_method in x_data_balanced:

  # Identical split settings for every balancing method.
  X_train, X_test, y_train, y_test = train_test_split(x_data_balanced[balancing_method],
                                                      y_data_balanced[balancing_method],
                                                      test_size=0.33,
                                                      random_state=RAND_SEED)

  models_dfs[balancing_method] = pd.DataFrame(index=model_labels, columns=metric_labels)

  for label, model in zip(model_labels, models):
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    train_score = model.score(X_train, y_train)
    y_predicted = model.predict(X_test)

    # ROC AUC must be computed from continuous scores, not hard 0/1 predictions
    # (hard labels reduce the curve to a single operating point).
    # Use the decision function when the estimator has one, otherwise the
    # predicted probability of the positive (second) class.
    if hasattr(model, "decision_function"):
      y_score = model.decision_function(X_test)
    else:
      y_score = model.predict_proba(X_test)[:, 1]

    # Write the row by label rather than by position.
    models_dfs[balancing_method].loc[label] = [test_score,
                                               train_score,
                                               f1_score(y_test, y_predicted),
                                               roc_auc_score(y_test, y_score)]



In [None]:
# Results table for the SMOTETomek-balanced data (stray ')' removed: it was a SyntaxError).
models_dfs["smtt"]

Unnamed: 0,Test score,Training score,F1 score,ROC score
Linear SVM,0.846154,0.898734,0.833333,0.847176
Decision tree,0.769231,0.835443,0.75,0.769435
KNN,0.871795,0.886076,0.848485,0.865116


In [None]:
# Results table for the SMOTEENN-balanced data.
models_dfs["smte"]

Unnamed: 0,Test score,Training score,F1 score,ROC score
Linear SVM,0.967213,1.0,0.965517,0.966667
Decision tree,0.934426,0.975207,0.931034,0.933871
KNN,0.983607,0.991736,0.983051,0.983333


In [None]:
# Results table for the ADASYN-balanced data.
models_dfs["adasyn"]

Unnamed: 0,Test score,Training score,F1 score,ROC score
Linear SVM,0.82716,0.864198,0.815789,0.828049
Decision tree,0.753086,0.839506,0.714286,0.754878
KNN,0.851852,0.839506,0.833333,0.853354


# Classification with model optimization

In [None]:
# Hyper-parameter grids, one per entry of `models` and in the same order:
# [0] SVC, [1] decision tree, [2] KNN.
grid_params = []

grid_params.append({"kernel":["linear", "sigmoid", "rbf"],
                    "decision_function_shape":["ovo", "ovr"],
                    "C": [.001, .002, .1, .2 , .5,  1, 5, 10, 20, 50],
                    "gamma": [1, .1, .01, .001, .0001]})

grid_params.append({"max_depth": [2, 5, 10],
                    "min_samples_split": [2, 5, 10, 20],
                    "max_leaf_nodes": [2, 5, 10, 20]})

grid_params.append({"n_neighbors":[1, 2, 3, 4, 5,  6, 7, 8, 9, 10],
                    "weights": ["uniform", "distance"],
                    "algorithm": ["ball_tree", "kd_tree", "brute"],
                    "leaf_size": [5, 10, 20, 30, 40, 50],
                    "p": [1, 2, 3, 4, 5]})

# Labels, estimators and grids are paired by position, so the lists must line up.
assert len(model_labels) == len(models) == len(grid_params), \
    "model_labels, models and grid_params must have the same length"

# NOTE: the rows written below overwrite the non-optimized results already
# stored in `models_dfs` for the same balancing method and model.
for balancing_method in x_data_balanced:

  # Same split settings (size and seed) as the non-optimized evaluation.
  X_train, X_test, y_train, y_test = train_test_split(x_data_balanced[balancing_method],
                                                      y_data_balanced[balancing_method],
                                                      test_size=0.33,
                                                      random_state=RAND_SEED)

  for label, model, param_grid in zip(model_labels, models, grid_params):
    # refit=True retrains the best configuration on the whole training split,
    # so `md` can be scored and used for prediction directly.
    md = GridSearchCV(model, param_grid, refit=True, verbose=0)
    md.fit(X_train, y_train)
    test_score = md.score(X_test, y_test)
    train_score = md.score(X_train, y_train)
    y_predicted = md.predict(X_test)

    # ROC AUC must be computed from continuous scores, not hard 0/1 predictions.
    # Use the decision function when the tuned estimator has one, otherwise the
    # predicted probability of the positive (second) class.
    if hasattr(md, "decision_function"):
      y_score = md.decision_function(X_test)
    else:
      y_score = md.predict_proba(X_test)[:, 1]

    # Write the row by label rather than by position.
    models_dfs[balancing_method].loc[label] = [test_score,
                                               train_score,
                                               f1_score(y_test, y_predicted),
                                               roc_auc_score(y_test, y_score)]

In [None]:
# Results table for the SMOTETomek-balanced data (now holding the grid-search results).
models_dfs["smtt"]

In [None]:
# Results table for the SMOTEENN-balanced data (now holding the grid-search results).
models_dfs["smte"]

In [None]:
# Results table for the ADASYN-balanced data (now holding the grid-search results).
models_dfs["adasyn"]

# Conclusion

Uzyskane wyniki zaprezentowane wyżej w tabelach wskazują, że najlepszym algorytmem jest KNN. Dla zoptymalizowanych i zbilansowanych danych przy użyciu metody SMOTEENN uzyskano 100% skuteczność zarówno dla danych testowych, jak i treningowych, oraz metryki f1-score czy ROC. Ciężko jest natomiast stwierdzić, czy modele osiągają zaprezentowaną skuteczność, ze względu na początkowe duże niezbilansowanie danych, które mogło mieć wpływ na proces uczenia. W trakcie analizy modeli oraz procesów uczenia udało się jednak zaobserwować, że model oparty o SVM wydaje się być najbardziej niezależny od tego, jak bardzo niezbilansowane lub zbilansowane są klasy. Objawiało się to w miarę stałą skutecznością modelu niezależnie od zbilansowania. Przedstawione rozwiązanie w problemie klasyfikacji może być jeszcze rozwijane pod względem optymalizacji modeli w węższych zakresach parametrów oraz metod balansowania danych.