In [18]:
import pandas as pd
import numpy as np

df = pd.read_csv('hepatitis.csv')
df

Unnamed: 0,Class,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
0,2,30,2,1,2,2,2,2,1,2,2,2,2,2,1,85,18,4,?,1
1,2,50,1,1,2,1,2,2,1,2,2,2,2,2,0.9,135,42,3.5,?,1
2,2,78,1,2,2,1,2,2,2,2,2,2,2,2,0.7,96,32,4,?,1
3,2,31,1,?,1,2,2,2,2,2,2,2,2,2,0.7,46,52,4,80,1
4,2,34,1,2,2,2,2,2,2,2,2,2,2,2,1,?,200,4,?,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,1,46,1,2,2,1,1,1,2,2,2,1,1,1,7.6,?,242,3.3,50,2
151,2,44,1,2,2,1,2,2,2,1,2,2,2,2,0.9,126,142,4.3,?,2
152,2,61,1,1,2,1,1,2,1,1,2,1,2,2,0.8,75,20,4.1,?,2
153,2,53,2,1,2,1,2,2,2,2,1,1,2,1,1.5,81,19,4.1,48,2


In [2]:
df.isnull().sum()

Unnamed: 0,0
Class,0
AGE,0
SEX,0
STEROID,0
ANTIVIRALS,0
FATIGUE,0
MALAISE,0
ANOREXIA,0
LIVER BIG,0
LIVER FIRM,0


In [3]:
df.isna().sum()

Unnamed: 0,0
Class,0
AGE,0
SEX,0
STEROID,0
ANTIVIRALS,0
FATIGUE,0
MALAISE,0
ANOREXIA,0
LIVER BIG,0
LIVER FIRM,0


In [19]:
for col in df.columns:
  df = df[df[col] != '?']

In [20]:
for col in df.columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')
print(df.dtypes)

Class                int64
AGE                  int64
SEX                  int64
STEROID              int64
ANTIVIRALS           int64
FATIGUE              int64
MALAISE              int64
ANOREXIA             int64
LIVER BIG            int64
LIVER FIRM           int64
SPLEEN PALPABLE      int64
SPIDERS              int64
ASCITES              int64
VARICES              int64
BILIRUBIN          float64
ALK PHOSPHATE        int64
SGOT                 int64
ALBUMIN            float64
PROTIME              int64
HISTOLOGY            int64
dtype: object


In [21]:
cols = ['AGE', 'BILIRUBIN', 'ALK PHOSPHATE', 'SGOT', 'ALBUMIN',	'PROTIME']
Q1 = df[cols].quantile(0.25)
Q3 = df[cols].quantile(0.75)

IQR = Q3-Q1
lower = Q1-1.5*IQR
upper = Q3+1.5*IQR

for col in cols:
  df = df[(df[col] >= lower[col]) & (df[col] <= upper[col])]

In [22]:
from sklearn.preprocessing import StandardScaler


sc = StandardScaler()
scaled = sc.fit_transform(df[cols])

In [23]:
df_scaled = df.copy()

In [24]:
df_scaled[cols] = scaled
df_scaled

Unnamed: 0,Class,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
5,2,-0.568180,1,2,2,2,2,2,2,2,2,2,2,2,-0.231881,0.058017,-0.845364,0.108348,0.457450,1
10,2,-0.131620,1,1,1,2,2,2,1,1,2,2,2,2,0.839570,-0.340066,-0.801183,0.893135,0.891574,1
12,2,0.043004,1,2,1,1,2,2,2,1,2,2,2,2,-0.231881,-0.269816,-0.138476,-0.087849,-0.541035,1
13,2,-0.917428,1,2,2,1,2,2,2,1,2,2,2,2,3.250336,-0.831817,1.717103,1.874119,0.587687,1
15,2,-0.218932,1,1,2,1,1,1,2,2,2,2,1,2,2.714610,-0.480566,0.502141,-2.049818,-0.801509,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,2,0.392252,1,2,1,2,2,2,2,2,2,2,2,2,0.839570,-0.176149,-0.491920,0.500741,0.891574,2
143,1,0.741500,1,1,2,1,1,2,2,2,1,1,2,2,1.107433,-0.176149,0.082426,-0.872637,-1.279045,2
145,2,-0.830116,1,1,2,1,2,2,2,2,2,2,2,2,0.571707,-0.410316,2.357720,0.500741,-0.454210,2
153,2,1.090748,2,1,2,1,2,2,2,2,1,1,2,1,1.375296,-0.269816,-1.044176,0.304544,-0.714684,2


In [28]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

X = df_scaled.drop('Class', axis=1)
y = df_scaled['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [29]:
lr = LogisticRegression(max_iter=2000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
print(accuracy_score(y_pred_lr, y_test))

0.8235294117647058


In [30]:
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
print(accuracy_score(y_pred_nb, y_test))

0.8235294117647058


In [31]:
print((y_pred_lr == y_pred_nb).all())

False


In [32]:
from sklearn.metrics import classification_report
print("Logistic Regression:\n", classification_report(y_test, y_pred_lr))
print("Gaussian NB:\n", classification_report(y_test, y_pred_nb))

Logistic Regression:
               precision    recall  f1-score   support

           1       0.00      0.00      0.00         2
           2       0.88      0.93      0.90        15

    accuracy                           0.82        17
   macro avg       0.44      0.47      0.45        17
weighted avg       0.77      0.82      0.80        17

Gaussian NB:
               precision    recall  f1-score   support

           1       0.33      0.50      0.40         2
           2       0.93      0.87      0.90        15

    accuracy                           0.82        17
   macro avg       0.63      0.68      0.65        17
weighted avg       0.86      0.82      0.84        17



In [46]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
print(accuracy_score(y_pred_knn, y_test))

0.8823529411764706
