In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split as tts
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# Column labels for the headerless diabetes CSV; 'Class' is the outcome column.
column_names = [
    "Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI",
    "DiabetesPedigreeFunction", "Age", "Class",
]

# The file has no header row, so read it raw and attach the labels afterwards.
data = pd.read_csv('../data/pima-indians-diabetes.csv', header=None)
data.columns = column_names

# Preview the first ten rows.
data.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [3]:
# Count the rows per outcome label to see how balanced the classes are.
class_counts = data['Class'].value_counts()
class_counts

0    500
1    268
Name: Class, dtype: int64

In [4]:
# Separate the target from the features without mutating `data`.
# Selecting and dropping (rather than popping the column in place) leaves `data`
# intact, so this cell can be re-run safely and earlier views of `data` stay valid.
y = data['Class']
X = data.drop(columns=['Class'])

# Sanity check: one target value per row, eight feature columns.
y.shape, X.shape

((768,), (768, 8))

In [5]:
# Stratified 75/25 hold-out split; the fixed seed keeps the partition repeatable.
split_options = dict(test_size=.25, random_state=42, stratify=y)
trainInputs, testInputs, trainTarget, testTarget = tts(X, y, **split_options)

# Show the feature/target shapes of each partition (train first, then test).
for inputs, target in ((trainInputs, trainTarget), (testInputs, testTarget)):
    print(inputs.shape, target.shape)

(576, 8) (576,)
(192, 8) (192,)


In [6]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

In [7]:
# Fit each baseline classifier on the raw training split.
# Every estimator with internal randomness gets an explicit seed so the reported
# scores are reproducible after a kernel restart.
# NOTE(review): the features are not scaled here and warnings are suppressed at the
# top of the notebook, so LogisticRegression / LinearSVC may be stopping before
# convergence without any visible message — confirm.
grbt = GradientBoostingClassifier(random_state = 0, max_depth = 1).fit(trainInputs, trainTarget)
# -----------
forest = RandomForestClassifier(random_state = 0).fit(trainInputs, trainTarget)
# -----------
logistic = LogisticRegression(C = 100).fit(trainInputs, trainTarget)
# -----------
tree = DecisionTreeClassifier(random_state = 0).fit(trainInputs, trainTarget)
# -----------
knn = KNeighborsClassifier(n_neighbors = 3).fit(trainInputs, trainTarget)
# -----------
svm = LinearSVC(C = 100, random_state = 0).fit(trainInputs, trainTarget)

In [8]:
# Report train/test accuracy for every baseline model as one aligned table.
# A single loop replaces six hand-written prints; column alignment comes from an
# explicit width/precision in the format spec instead of from source indentation
# carried into the string by backslash line continuations.
baseline_models = {
    'Gradient': grbt,
    'RandomForest': forest,
    'Logistic': logistic,
    'DecisionTree': tree,
    'Kneighbors': knn,
    'SVM': svm,
}
label_width = max(len(label) for label in baseline_models)

for label, estimator in baseline_models.items():
    train_accuracy = estimator.score(trainInputs, trainTarget)
    test_accuracy = estimator.score(testInputs, testTarget)
    # Fixed two-decimal formatting keeps the columns lined up (0.80, not 0.8).
    print(f'{label:<{label_width}} : Train accuracy : {train_accuracy:.2f}    '
          f'Test accuracy : {test_accuracy:.2f}')


Gradient : Train accuracy : 0.81                   Test accuracy : 0.73
RandomForest : Train accuracy : 1.0                Test accuracy : 0.74
Logistic : Train accuracy : 0.8                    Test accuracy : 0.73
DecisionTree : Train accuracy : 1.0                Test accuracy : 0.7
Kneighbors : Train accuracy : 0.86                 Test accuracy : 0.7
SVM : Train accuracy : 0.67                        Test accuracy : 0.65


## Pipeline setup

In [9]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

In [10]:
# 1. LogisticRegression Pipeline :
LogisticPipeline = Pipeline([('scaler', MinMaxScaler()), ('pca', PCA(n_components = 3)), ('logistic', LogisticRegression())])

# 2. DecisionTree Pipeline :
TreePipeline = Pipeline([('scaler', MinMaxScaler()), ('pca', PCA(n_components = 3)), ('tree', DecisionTreeClassifier())])

# 3. RandomForest Classifier Pipeline
ForestPipeline = Pipeline([('scaler', MinMaxScaler()), ('pca', PCA(n_components = 3)), ('forest', RandomForestClassifier())])

# 4. KNeighbors Pipeline :
KnnPipeline = Pipeline([('scaler', MinMaxScaler()), ('pca', PCA(n_components = 3)), ('knn', KNeighborsClassifier())])


## Model training & evaluation

In [11]:
# ---------------------- Defining pipeline in a list ----------------------------
# ---------------------- Candidate pipelines, in evaluation order ----------------------
mypipeline = [
    LogisticPipeline,
    TreePipeline,
    ForestPipeline,
    KnnPipeline,
]

# ------------------ Variables for choosing the best model ---------------------------
# accuracy: best score seen so far; classifier: index of that pipeline in
# `mypipeline`; pipeline: placeholder until a winning pipeline is assigned.
accuracy, classifier, pipeline = .0, 0, ''

In [12]:
# ---------- Creating dictionary of pipelines and training model ----------------
# Display names for the pipelines, keyed by their position in `mypipeline`.
pipeline_labels = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'Knn']
pipelineDict = dict(enumerate(pipeline_labels))

# Train every candidate pipeline on the training split.
for candidate in mypipeline:
    candidate.fit(trainInputs, trainTarget)

In [13]:
# ------------ getting the accuracy score for all models --------------

# Print the hold-out accuracy of each fitted pipeline, labelled via pipelineDict.
for position in range(len(mypipeline)):
    test_score = round(mypipeline[position].score(testInputs, testTarget), 2)
    print(f"{pipelineDict[position]} -> Test accuracy : {test_score}")

Logistic Regression -> Test accuracy : 0.72
Decision Tree -> Test accuracy : 0.69
Random Forest -> Test accuracy : 0.71
Knn -> Test accuracy : 0.7


In [17]:
# ------------ Choosing best model for the given data ----------------
# Score every pipeline exactly once and recompute the winner from scratch, so
# re-running this cell never compares against an `accuracy` value left over from
# an earlier run.
# NOTE(review): the winner is picked on the test split, so its test accuracy is an
# optimistic estimate — consider selecting via cross-validation on the training
# split instead.
test_scores = [model.score(testInputs, testTarget) for model in mypipeline]

# max() returns the first index holding the top score, i.e. ties go to the
# earliest pipeline in `mypipeline`.
classifier = max(range(len(test_scores)), key=test_scores.__getitem__)
accuracy = test_scores[classifier]
pipeline = mypipeline[classifier]
print('Classifier with best accuracy : {}'.format(pipelineDict[classifier]))

Classifier with best accuracy : Logistic Regression
