In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the liver-patient CSV directly from its raw GitHub URL into a DataFrame.
liver_csv_url='https://raw.githubusercontent.com/DrSaadLa/PythonTuts/main/TreeBasedModels/liver.csv'
liver=pd.read_csv(liver_csv_url)

In [3]:
# Print the column names, non-null counts and dtypes of the loaded frame.
liver.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               583 non-null    int64  
 1   gender            583 non-null    object 
 2   tot_bilirubin     583 non-null    float64
 3   direct_bilirubin  583 non-null    float64
 4   tot_proteins      583 non-null    int64  
 5   albumin           583 non-null    int64  
 6   ag_ratio          583 non-null    int64  
 7   sgpt              583 non-null    float64
 8   sgot              583 non-null    float64
 9   alkphos           579 non-null    float64
 10  is_patient        583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


In [4]:
# Number of missing values in each column.
liver.isna().sum()

age                 0
gender              0
tot_bilirubin       0
direct_bilirubin    0
tot_proteins        0
albumin             0
ag_ratio            0
sgpt                0
sgot                0
alkphos             4
is_patient          0
dtype: int64

In [5]:
# Show the rows whose `alkphos` value is missing.
liver.loc[liver['alkphos'].isna()]

Unnamed: 0,age,gender,tot_bilirubin,direct_bilirubin,tot_proteins,albumin,ag_ratio,sgpt,sgot,alkphos,is_patient
209,45,Female,0.9,0.3,189,23,33,6.6,3.9,,1
241,51,Male,0.8,0.2,230,24,46,6.5,3.1,,1
253,35,Female,0.6,0.2,180,12,15,5.2,2.7,,2
312,27,Male,1.3,0.6,106,25,54,8.5,4.8,,2


**Replace the missing `alkphos` values with the column mean**

In [6]:
# Fill the missing `alkphos` entries with the mean of the observed values.
# NOTE(review): the imputer is fitted on the whole frame, i.e. before the
# train/test split performed later in the notebook, so the imputed mean also
# reflects rows that end up in the test set. Consider fitting it on the
# training rows only.
imp=SimpleImputer(missing_values=np.nan,strategy='mean')
# fit_transform returns an (n_rows, 1) array; flatten it so a plain 1-D
# column is assigned back to the frame.
liver['alkphos']=imp.fit_transform(liver[['alkphos']]).ravel()

In [7]:
# Confirm that no missing `alkphos` values remain after imputation.
liver['alkphos'].isna().sum()

0

In [8]:
# List the distinct values present in the `gender` column.
pd.unique(liver['gender'])

array(['Female', 'Male'], dtype=object)

In [9]:
# Encode gender as 1 for 'Male' and 0 for any other value.
# The dtype guard makes the cell safe to re-run: without it, a second run
# would compare the already-encoded integers to 'Male' and silently turn
# every row into 0.
if not pd.api.types.is_numeric_dtype(liver['gender']):
    liver['gender']=(liver['gender']=='Male').astype('int64')
liver['gender'].unique()

array([0, 1])

In [10]:
# Pairwise Pearson correlation between all columns of the frame.
liver.corr(method='pearson')

Unnamed: 0,age,gender,tot_bilirubin,direct_bilirubin,tot_proteins,albumin,ag_ratio,sgpt,sgot,alkphos,is_patient
age,1.0,0.05656,0.011763,0.007529,0.080425,-0.086883,-0.01991,-0.187461,-0.265924,-0.216089,-0.137351
gender,0.05656,1.0,0.089291,0.100436,-0.027496,0.082332,0.080336,-0.089121,-0.093799,-0.003404,-0.082416
tot_bilirubin,0.011763,0.089291,1.0,0.874618,0.206669,0.214065,0.237831,-0.008099,-0.22225,-0.206159,-0.220208
direct_bilirubin,0.007529,0.100436,0.874618,1.0,0.234939,0.233894,0.257544,-0.000139,-0.228531,-0.200004,-0.246046
tot_proteins,0.080425,-0.027496,0.206669,0.234939,1.0,0.12568,0.167196,-0.028514,-0.165453,-0.23396,-0.184866
albumin,-0.086883,0.082332,0.214065,0.233894,0.12568,1.0,0.791966,-0.042518,-0.029742,-0.002374,-0.163416
ag_ratio,-0.01991,0.080336,0.237831,0.257544,0.167196,0.791966,1.0,-0.025645,-0.08529,-0.070024,-0.151934
sgpt,-0.187461,-0.089121,-0.008099,-0.000139,-0.028514,-0.042518,-0.025645,1.0,0.784053,0.233904,0.035008
sgot,-0.265924,-0.093799,-0.22225,-0.228531,-0.165453,-0.029742,-0.08529,0.784053,1.0,0.686322,0.161388
alkphos,-0.216089,-0.003404,-0.206159,-0.200004,-0.23396,-0.002374,-0.070024,0.233904,0.686322,1.0,0.162319


In [11]:
# Predictors are every column except the `is_patient` target.
x=liver.drop(columns='is_patient')
# Target labels as a NumPy array.
y=liver['is_patient'].to_numpy()
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train,x_test,y_train,y_test=train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

**Scale the features (scaler fitted on the training set only)**

In [12]:
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both the training and the test features.
scaler=StandardScaler().fit(x_train)
x_train=scaler.transform(x_train)
x_test=scaler.transform(x_test)

In [13]:
# The three base learners that are evaluated individually and then combined.
# Logistic regression with a fixed seed.
logreg=LogisticRegression(
    random_state=1,
)
# k-nearest neighbours using 27 neighbours.
knn=KNN(
    n_neighbors=27,
)
# Decision tree; per the scikit-learn docs a float min_samples_leaf is read as
# a fraction of the training samples required in each leaf.
dt=DecisionTreeClassifier(
    min_samples_leaf=0.13,
    random_state=1,
)

In [14]:
# (name, estimator) pairs; the same list is reused to build the voting ensemble.
classifiers=[
    ('logistic regression',logreg),
    ('k nearest',knn),
    ('classification tree',dt),
]
# Fit each base learner on the training data and print its test-set accuracy.
for clf_name,clf in classifiers:
    clf.fit(x_train,y_train)
    y_pred=clf.predict(x_test)
    # accuracy_score is documented as (y_true, y_pred). Accuracy is symmetric
    # so the value is unchanged, but the documented order keeps this pattern
    # correct if an asymmetric metric is swapped in later.
    accuracy=accuracy_score(y_test,y_pred)
    print(clf_name,accuracy)

logistic regression 0.7257142857142858
k nearest 0.72
classification tree 0.7257142857142858


In [15]:
# Combine the base learners in a voting ensemble (default voting settings)
# and evaluate it on the held-out test set.
vc=VotingClassifier(estimators=classifiers)
vc.fit(x_train,y_train)
pred=vc.predict(x_test)
# Pass the true labels first, matching accuracy_score's documented
# (y_true, y_pred) order; the resulting accuracy is unchanged.
acc=accuracy_score(y_test,pred)
print("voting classifier ",acc)

voting classifier  0.7371428571428571
