### Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv('diabetes.csv')

In [3]:
df.shape

(768, 9)

### Preprocessing

#### Missing Value Treatment

In [4]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(df)

SimpleImputer()

In [5]:
imp.transform(df)

array([[  6.   , 148.   ,  72.   , ...,   0.627,  50.   ,   1.   ],
       [  1.   ,  85.   ,  66.   , ...,   0.351,  31.   ,   0.   ],
       [  8.   , 183.   ,  64.   , ...,   0.672,  32.   ,   1.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,   0.245,  30.   ,   0.   ],
       [  1.   , 126.   ,  60.   , ...,   0.349,  47.   ,   1.   ],
       [  1.   ,  93.   ,  70.   , ...,   0.315,  23.   ,   0.   ]])

#### Select X & Y

In [6]:
X = df.drop(columns=['diabetes'])
X.head()

Unnamed: 0,gnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [7]:
y = df['diabetes']
y[0:5]

0    1
1    0
2    1
3    0
4    1
Name: diabetes, dtype: int64

#### Scale Dataset

In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#### Data Selection

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=1, stratify=y)

### KNN Classifier

In [10]:
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=8)
knn_model.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=8)

### Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression()
log_model.fit(X_train, y_train)

LogisticRegression()

### Random Forest Classifier

In [12]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(X_train,y_train)

RandomForestClassifier()

### SVM Classifier

In [13]:
from sklearn import svm
svm_model = svm.SVC(kernel='linear')
svm_model.fit(X_train, y_train)

SVC(kernel='linear')

### Voting Classifier

In [14]:
from sklearn.ensemble import VotingClassifier
estimators=[('knn', knn_model), ('log_reg', log_model), ('rf', rf_model), ('svm', svm_model)]
ensemble = VotingClassifier(estimators, voting='hard')
ensemble.fit(X_train, y_train)

VotingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors=8)),
                             ('log_reg', LogisticRegression()),
                             ('rf', RandomForestClassifier()),
                             ('svm', SVC(kernel='linear'))])

In [15]:
ensemble.score(X_test, y_test)

0.7316017316017316