In [918]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

Collecting Data and Analysis

In [919]:
# loading diabetes dataset

diabetes_data = pd.read_csv('Diabetes.csv')
print(diabetes_data.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [920]:
# number of rows and columns
diabetes_data.shape

(768, 9)

In [921]:
# statistical values of data
diabetes_data.describe()
diabetes_data['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

Diabetecs - 1

Non diabetecs - 0

In [922]:
diabetes_data.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [923]:
# separate data and labels
X = diabetes_data.drop(columns = 'Outcome', axis = 1)
Y = diabetes_data['Outcome']

print("X values - ", X.head())
print("Y values - ", Y.head())

X values -     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  
0                     0.627   50  
1                     0.351   31  
2                     0.672   32  
3                     0.167   21  
4                     2.288   33  
Y values -  0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64


Data Standarization

In [924]:
def standard_normalization(data):
  mean = np.mean(data)
  std = np.std(data)
  scaled_data = (data - mean) / std
  return scaled_data

In [925]:
standarized_data = standard_normalization(X)
print(standarized_data)

     Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
0       0.639947  0.848324       0.149641       0.907270 -0.692891  0.204013   
1      -0.844885 -1.123396      -0.160546       0.530902 -0.692891 -0.684422   
2       1.233880  1.943724      -0.263941      -1.288212 -0.692891 -1.103255   
3      -0.844885 -0.998208      -0.160546       0.154533  0.123302 -0.494043   
4      -1.141852  0.504055      -1.504687       0.907270  0.765836  1.409746   
..           ...       ...            ...            ...       ...       ...   
763     1.827813 -0.622642       0.356432       1.722735  0.870031  0.115169   
764    -0.547919  0.034598       0.046245       0.405445 -0.692891  0.610154   
765     0.342981  0.003301       0.149641       0.154533  0.279594 -0.735190   
766    -0.844885  0.159787      -0.470732      -1.288212 -0.692891 -0.240205   
767    -0.844885 -0.873019       0.046245       0.656358 -0.692891 -0.202129   

     DiabetesPedigreeFunction       Age

In [926]:
X = standarized_data
Y = diabetes_data["Outcome"]

Spliting Data

In [927]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = 2)
print(X.shape, x_test.shape, x_train.shape)

(768, 8) (154, 8) (614, 8)


Traning the Model

In [928]:
model = SVC(kernel='linear')

# fitting the support vrctor machine clasifier
model.fit(x_train, y_train)

SVC(kernel='linear')

Model Evaluation

In [929]:
# accuracy score on traning data
x_train_predictions = model.predict(x_train)
traning_data_accuracy = accuracy_score(x_train_predictions, y_train)

print("Accuracy score of traning data - ", traning_data_accuracy)

Accuracy score of traning data -  0.7866449511400652


In [930]:
# accuracy score on test data
x_test_predictions = model.predict(x_test)
testing_data_accuracy = accuracy_score(x_test_predictions, y_test)

print("Accuracy score of testing data - ", testing_data_accuracy)

Accuracy score of testing data -  0.7727272727272727


Making predictive system

In [931]:
input_data = (0,179,90,27,0,44.1,0.686,23)

# making numpy array
data_as_array = np.asarray(input_data)

input_data_reshaped = data_as_array.reshape(1,-1)

#standerize input data
std_data = standard_normalization(input_data_reshaped)
print(std_data)

prediction = model.predict(std_data)
print(prediction)

if prediction[0] == 1:
    print("The person is diabetic")

else:
    print("The person is not diabetic")






[[-0.78563246  2.3069156   0.76927997 -0.31915873 -0.78563246 -0.02372537
  -0.77378058 -0.38826595]]
[1]
The person is diabetic


Saving the train model

In [932]:
import pickle

filename = 'diabetecs_model.sav'
pickle.dump(model, open(filename, 'wb'))

#load the save model
loaded_model = pickle.load(open('diabetecs_model.sav', 'rb'))

In [933]:

import numpy as np
input_data = (0,179,90,27,0,44.1,0.686,23)

# making numpy array
data_as_array = np.asarray(input_data)


input_data_reshaped = data_as_array.reshape(1,-1)

#standerize input data

std_data = standard_normalization(input_data_reshaped)
print(std_data)

prediction = model.predict(std_data)
print(prediction)

if prediction[0] == 1:
    print("The person is diabetic")

else:
    print("The person is not diabetic")  


[[-0.78563246  2.3069156   0.76927997 -0.31915873 -0.78563246 -0.02372537
  -0.77378058 -0.38826595]]
[1]
The person is diabetic
