In [131]:
# importing libraries 
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import VotingClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold 

In [132]:
# loading iris dataset 
df = pd.read_csv('indian_liver_patient.csv') 
df.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [133]:
#shape of data
df.shape

(583, 11)

In [134]:
#columns to keep
columns_to_retain = ['Total_Bilirubin', 'Direct_Bilirubin', 'Alkaline_Phosphotase', 'Alamine_Aminotransferase', 'Aspartate_Aminotransferase', 'Total_Protiens', 'Albumin','Albumin_and_Globulin_Ratio','Dataset']

#after droping unnecessary columns, new dataset
df = df.drop([col for col in df.columns if not col in columns_to_retain], axis=1)

#drop the rows with missing values
df = df.dropna(axis=0)

In [135]:
#transform the non-numeric data in the columns into number
for column in df.columns:
    if df[column].dtype == np.number:
        continue
    df[column] = LabelEncoder().fit_transform(df[column])

  This is separate from the ipykernel package so we can avoid doing imports until


In [136]:
df.head()

Unnamed: 0,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,0.7,0.1,71,6,8,6.8,3.3,0.9,0
1,10.9,5.5,235,54,86,7.5,3.2,0.74,0
2,7.3,4.1,201,50,58,7.0,3.3,0.89,0
3,1.0,0.4,66,4,10,6.8,3.4,1.0,0
4,3.9,2.0,79,17,49,7.3,2.4,0.4,0


In [137]:
#splitting the dataset into independant (x) data set for the features and dependant (y) dataset for the target
X = df.drop(['Dataset'], axis=1)
Y = df['Dataset']

In [138]:
#scaling the dataset to bring all the values between 0 and 1
X_scaler = MinMaxScaler()
X_scaler.fit(X)
column_names = X.columns
X[column_names] = X_scaler.transform(X)

In [139]:
# train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X,  
                                                    Y,  
                                                    test_size = 0.20,  
                                                    random_state = 42) 

In [140]:
# group / ensemble of models 
estimator = [] 
estimator.append(('LR',  
                  LogisticRegression(solver ='lbfgs',  
                                     multi_class ='multinomial',  
                                     max_iter = 200))) 
estimator.append(('SVC', SVC(gamma ='auto', probability = True))) 
estimator.append(('DTC', DecisionTreeClassifier())) 
estimator.append(('randomforest', RandomForestClassifier()))
  
# Voting Classifier with hard voting 
vot_hard = VotingClassifier(estimators = estimator, voting ='hard') 
vot_hard.fit(X_train, y_train) 
y_pred = vot_hard.predict(X_test) 
  
# using accuracy_score metric to predict accuracy 
score = accuracy_score(y_test, y_pred) 
print("Hard Voting Score % f" % score) 
  
# Voting Classifier with soft voting 
vot_soft = VotingClassifier(estimators = estimator, voting ='soft') 
vot_soft.fit(X_train, y_train) 
y_pred = vot_soft.predict(X_test) 
  
# using accuracy_score 
score = accuracy_score(y_test, y_pred) 
print("Soft Voting Score % f" % score)

Hard Voting Score  0.612069
Soft Voting Score  0.629310
