In [1]:
#importing librabries

import pandas as pd   
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

In [2]:
data = pd.read_csv('indian_liver_patient.csv')

## A little data prep
========================================================================================================================

In [3]:
data.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [4]:
data.isnull().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    4
Dataset                       0
dtype: int64

In [5]:
data.shape

(583, 11)

In [6]:
data.dropna(inplace=True)

#### Here we one-hot encode the gender column using get_dummies(), dropping the first column.


In [7]:
gender = pd.get_dummies(data['Gender'],drop_first=True)

In [8]:
data.drop(columns='Gender', inplace=True)

In [9]:
data = pd.concat([data,gender], axis=1)

In [10]:
data.head()

Unnamed: 0,Age,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset,Male
0,65,0.7,0.1,187,16,18,6.8,3.3,0.9,1,0
1,62,10.9,5.5,699,64,100,7.5,3.2,0.74,1,1
2,62,7.3,4.1,490,60,68,7.0,3.3,0.89,1,1
3,58,1.0,0.4,182,14,20,6.8,3.4,1.0,1,1
4,72,3.9,2.0,195,27,59,7.3,2.4,0.4,1,1


In [11]:
data = data.astype(float)

 ##### We rename the columns using the common medical abbreviations to make them easier to use.
 

In [12]:
data.columns=['Age', 'Total_BUN', 'Direct_BUN',
              'ALP', 'ALAT', 'AST', 'Total_Protiens',
              'ALB', 'ALB:BLOB','Dataset', 'Female']

In [13]:
data.head()

Unnamed: 0,Age,Total_BUN,Direct_BUN,ALP,ALAT,AST,Total_Protiens,ALB,ALB:BLOB,Dataset,Female
0,65.0,0.7,0.1,187.0,16.0,18.0,6.8,3.3,0.9,1.0,0.0
1,62.0,10.9,5.5,699.0,64.0,100.0,7.5,3.2,0.74,1.0,1.0
2,62.0,7.3,4.1,490.0,60.0,68.0,7.0,3.3,0.89,1.0,1.0
3,58.0,1.0,0.4,182.0,14.0,20.0,6.8,3.4,1.0,1.0,1.0
4,72.0,3.9,2.0,195.0,27.0,59.0,7.3,2.4,0.4,1.0,1.0


####  The classes are unbalanced(we will deal with this later using upsampling or by choosing a model that does well with inbalance). The README said the dataset had an unbalanced class distribution of 416 patients with liver disease and 167 patients without liver disease.
I'll use this information to make a succinct y column out of the data['Dataset'] column.


In [14]:
data['Dataset'].value_counts() 

1.0    414
2.0    165
Name: Dataset, dtype: int64

 #### The counts are 2 patients less on each from the dropna called at the beginning of the notebook. Patients with the value 1 have liver disease, those with the value 2 do not. 
 Below I reassigned the patients without liver disease(2.0) to 0  and renamed the column to 'Diagnosis' because I like my columns simple to interpret.

In [15]:
data.Dataset = data.Dataset.astype(int)
data.rename(columns={'Dataset':'Diagnosis'}, inplace=True)

In [16]:
data.Diagnosis.replace({2:0}, inplace=True)

In [17]:
data.head(10)

Unnamed: 0,Age,Total_BUN,Direct_BUN,ALP,ALAT,AST,Total_Protiens,ALB,ALB:BLOB,Diagnosis,Female
0,65.0,0.7,0.1,187.0,16.0,18.0,6.8,3.3,0.9,1,0.0
1,62.0,10.9,5.5,699.0,64.0,100.0,7.5,3.2,0.74,1,1.0
2,62.0,7.3,4.1,490.0,60.0,68.0,7.0,3.3,0.89,1,1.0
3,58.0,1.0,0.4,182.0,14.0,20.0,6.8,3.4,1.0,1,1.0
4,72.0,3.9,2.0,195.0,27.0,59.0,7.3,2.4,0.4,1,1.0
5,46.0,1.8,0.7,208.0,19.0,14.0,7.6,4.4,1.3,1,1.0
6,26.0,0.9,0.2,154.0,16.0,12.0,7.0,3.5,1.0,1,0.0
7,29.0,0.9,0.3,202.0,14.0,11.0,6.7,3.6,1.1,1,0.0
8,17.0,0.9,0.3,202.0,22.0,19.0,7.4,4.1,1.2,0,1.0
9,55.0,0.7,0.2,290.0,53.0,58.0,6.8,3.4,1.0,1,1.0


### We will upsample the minority class to prevent the inbalance from affecting our model*


In [18]:
#seperating the dataset by the y column

positive = data[data['Diagnosis'] == 1]  
negative = data[data['Diagnosis'] == 0]

# upsampling the negative class

negative_up = resample(negative,
                      replace=True,
                      n_samples=414,
                      random_state=42)

In [19]:
# Creating the new dataframe with balanced classes

data = pd.concat([positive, negative_up], axis=0)

#confirming our classes are balanced so we can move onto modeling

data.Diagnosis.value_counts()

1    414
0    414
Name: Diagnosis, dtype: int64

## Modeling

### Below we will create a model for our baseline then use gridsearch to choose a model and parameters

=====================================================================================================================
####  Random Forest base accuracy model



In [20]:
#creating my X and y

X = data.drop(columns=['Diagnosis'])
y = data.Diagnosis

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, # creating a train and test set 
                                                    random_state=42) # Random state for reproducability

# scaling for the models that need it

ss= StandardScaler()
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.transform(X_test)

In [22]:
#instaniating our model

rfc = RandomForestClassifier()
rfc.fit(X_train_ss, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [23]:
preds = rfc.predict(X_test_ss)

### We got a base accuracy of 80% using a Random Forest on  the balanced Dataset

In [24]:
print(accuracy_score(y_test, preds))

0.8309178743961353


#### Next we will try a basic logistic regression and a support vector machine

In [25]:
log = LogisticRegression()
log.fit(X_train_ss, y_train)
log.score(X_test_ss, y_test)



0.6666666666666666

In [26]:
svc = SVC(class_weight='balanced')
svc.fit(X_train, y_train)
svc.score(X_test, y_test)



0.9178743961352657

### The SVM did a whopping 92% out of the box, now lets see if we can improve on that!

## Model  improvement 
==========================

Here we will attempt to improve our score through tuning our models hyperparameters using gridsearch

In [30]:
# creating our gridsearch params

params = {
    'C':[2, 3, 1.5, 1],
    'kernel':['rbf','linear']
    
    }

# instaniating our gridsearch

gs = GridSearchCV(svc, params, cv=3)


In [31]:


'C=1.0', "kernel='rbf'", 'degree=3', "gamma='auto_deprecated'", 'coef0=0.0',
'shrinking=True', 'probability=False', 'tol=0.001', 'cache_size=200',
'class_weight=None', 'verbose=False', 'max_iter=-1',
"decision_function_shape='ovr'", 'random_state=None'

("decision_function_shape='ovr'", 'random_state=None')

In [32]:
gs.fit(X_train, y_train)



GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [2, 3, 1.5, 1], 'kernel': ['rbf', 'linear']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [33]:
gs.best_params_


{'C': 2, 'kernel': 'rbf'}

In [38]:
gs.score(X_test, y_test)

0.9130434782608695

In [37]:
preds = svc.predict(X_test)
confusion_matrix(y_test, preds)

array([[91, 17],
       [ 0, 99]])