# load Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Load Dataset

In [2]:
df=pd.read_csv("diabetes.csv")

In [3]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [4]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [5]:
df.keys()

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [6]:
X=df.drop('Outcome',axis=1)
y=df[['Outcome']]

# Feature Scaling

In [7]:
sc=StandardScaler()
X=pd.DataFrame(sc.fit_transform(X),columns=['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'])
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.639947,0.848324,0.149641,0.907270,-0.692891,0.204013,0.468492,1.425995
1,-0.844885,-1.123396,-0.160546,0.530902,-0.692891,-0.684422,-0.365061,-0.190672
2,1.233880,1.943724,-0.263941,-1.288212,-0.692891,-1.103255,0.604397,-0.105584
3,-0.844885,-0.998208,-0.160546,0.154533,0.123302,-0.494043,-0.920763,-1.041549
4,-1.141852,0.504055,-1.504687,0.907270,0.765836,1.409746,5.484909,-0.020496
...,...,...,...,...,...,...,...,...
763,1.827813,-0.622642,0.356432,1.722735,0.870031,0.115169,-0.908682,2.532136
764,-0.547919,0.034598,0.046245,0.405445,-0.692891,0.610154,-0.398282,-0.531023
765,0.342981,0.003301,0.149641,0.154533,0.279594,-0.735190,-0.685193,-0.275760
766,-0.844885,0.159787,-0.470732,-1.288212,-0.692891,-0.240205,-0.371101,1.170732


# Splitting the dataset into training and testing sets

In [8]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((576, 8), (192, 8), (576, 1), (192, 1))

# Model Fitting

In [9]:
# Model libraries
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

#Accuracy measuring library
from sklearn.metrics import accuracy_score

In [10]:
#1. Logistic Regression - used because this model is best suited for binary classification
LR_model = LogisticRegression()

#Fitting training set to the model
LR_model.fit(X_train,y_train)

#Predicting the test set results based on the model
# accuracy on training data
lr_y_pred = LR_model.predict(X_train)
score = accuracy_score(y_train,lr_y_pred)
print('Training Accuracy of LR model is ', score)

# accuracy on testing data
lr_y_pred_test = LR_model.predict(X_test)
score = accuracy_score(y_test,lr_y_pred_test)
print('Testing Accuracy of LR model is ', score)

Training Accuracy of LR model is  0.7777777777777778
Testing Accuracy of LR model is  0.7708333333333334


In [11]:
#2. Linear Regression - used because this model is best suited for Linear classification
Linear_model = LinearRegression()

#Fitting training set to the model
Linear_model.fit(X_train,y_train)

#Predicting the test set results based on the model
# accuracy on training data
linear_y_pred = Linear_model.predict(X_train)
score = accuracy_score(y_train,lr_y_pred)
print('Training Accuracy of LR model is ', score)

# accuracy on testing data
linear_y_pred_test = Linear_model.predict(X_test)
score = accuracy_score(y_test,lr_y_pred_test)
print('Testing Accuracy of LR model is ', score)

Training Accuracy of LR model is  0.7777777777777778
Testing Accuracy of LR model is  0.7708333333333334


In [12]:
#3. Support Vector Machine(SVM) 
svm_model = SVC(kernel='linear')

#Fitting training set to the model
svm_model.fit(X_train,y_train)

#Predicting the test set results based on the model
# accuracy on training data
train_svm_y_pred = svm_model.predict(X_train)
score = accuracy_score(y_train,train_svm_y_pred)
print('Training Accuracy of SVM model is ', score)

# accuracy on testing data
test_svm_y_pred = svm_model.predict(X_test)
score = accuracy_score(y_test,test_svm_y_pred)
print('Testing Accuracy of SVM model is ', score)

Training Accuracy of SVM model is  0.7881944444444444
Testing Accuracy of SVM model is  0.7760416666666666


# Creating the SVM model

In [13]:
#rbf Kernal use
model_r=SVC(kernel='rbf')
model_r.fit(X_train,y_train)
model_r.score(X_test,y_test)

0.7604166666666666

In [14]:
#Linear Kernal use
model_l=SVC(kernel='linear')
model_l.fit(X_train,y_train)
model_l.score(X_test,y_test)

0.7760416666666666

In [15]:
#poly Kernal use
model_p=SVC(kernel='poly')
model_p.fit(X_train,y_train)
model_p.score(X_test,y_test)

0.7239583333333334

In [16]:
#sigmoid Kernal use
model_s=SVC(kernel='sigmoid')
model_s.fit(X_train,y_train)
model_s.score(X_test,y_test)

0.6822916666666666

Linear Kernal give high Accuracy so it is used

In [17]:
#Linear Kernal use
model_l=SVC(kernel='linear')
model_l.fit(X_train,y_train)
model_l.score(X_test,y_test)

0.7760416666666666

In [18]:
y_predict=model_l.predict(X_test)
y_predict[:10],y_test[:10]

(array([1, 0, 0, 0, 0, 0, 0, 1, 0, 1], dtype=int64),
      Outcome
 663        1
 278        0
 273        0
 694        0
 59         0
 717        0
 615        0
 40         0
 107        0
 378        1)

# RMS Error

In [19]:
from sklearn.metrics import mean_squared_error
mse=mean_squared_error(y_test,y_predict)
RMSE=np.sqrt(mse)
RMSE

0.4732423621500228