In [152]:
pip install scikit-learn


Note: you may need to restart the kernel to use updated packages.


Now we are importing the important packages for the project

In [153]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import svm 
from sklearn.metrics import accuracy_score


Data Collection and analysis

In [154]:
#Loading the dataset into a pnadas dataframe
diabetes_dataset = pd.read_csv('../Datasets/diabetes.csv')

In [155]:
# Separate the dataset into two groups based on the 'Outcome' column
outcome_0 = diabetes_dataset[diabetes_dataset['Outcome'] == 0]
outcome_1 = diabetes_dataset[diabetes_dataset['Outcome'] == 1]

# Undersample the groups
outcome_0_undersampled = outcome_0.sample(n=300, random_state=42)
outcome_1_undersampled = outcome_1.sample(n=268, random_state=42)

# Combine the undersampled groups into a new DataFrame
undersampled_diabetes_dataset = pd.concat([outcome_0_undersampled, outcome_1_undersampled])

# Shuffle the combined DataFrame
undersampled_diabetes_dataset = undersampled_diabetes_dataset.sample(frac=1, random_state=42).reset_index(drop=True)

# Display the new DataFrame
print(undersampled_diabetes_dataset['Outcome'].value_counts())

Outcome
0    300
1    268
Name: count, dtype: int64


In [156]:
#printing the first 5 rows of the dataset
undersampled_diabetes_dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,164,82,43,67,32.8,0.341,50,0
1,5,126,78,27,22,29.6,0.439,40,0
2,6,129,90,7,326,19.6,0.582,60,0
3,2,92,52,0,0,30.1,0.141,22,0
4,3,176,86,27,156,33.3,1.154,52,1


In [157]:
#getting the rows and columns of the dataset
undersampled_diabetes_dataset.shape

(568, 9)

In [158]:
#getting the statistical information of the dataset
undersampled_diabetes_dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,568.0,568.0,568.0,568.0,568.0,568.0,568.0,568.0,568.0
mean,4.049296,124.227113,69.482394,20.875,80.056338,32.56338,0.496188,33.991197,0.471831
std,3.443835,32.919969,19.362404,16.200579,115.266964,7.809805,0.351743,11.632903,0.499646
min,0.0,0.0,0.0,0.0,0.0,0.0,0.084,21.0,0.0
25%,1.0,101.75,64.0,0.0,0.0,28.0,0.254,25.0,0.0
50%,3.0,120.0,72.0,24.0,20.0,32.6,0.3945,31.0,0.0
75%,6.0,145.25,80.0,33.0,130.5,36.8,0.6585,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [159]:
undersampled_diabetes_dataset['Outcome'].value_counts()

Outcome
0    300
1    268
Name: count, dtype: int64

In [160]:
undersampled_diabetes_dataset.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.32,109.013333,68.283333,19.723333,61.94,30.259333,0.44767,31.243333
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [161]:
#seperating data and the labels
x = undersampled_diabetes_dataset.drop(columns='Outcome', axis=1)
y = undersampled_diabetes_dataset['Outcome'] 

In [162]:
x.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,1,164,82,43,67,32.8,0.341,50
1,5,126,78,27,22,29.6,0.439,40
2,6,129,90,7,326,19.6,0.582,60
3,2,92,52,0,0,30.1,0.141,22
4,3,176,86,27,156,33.3,1.154,52


In [163]:
y.head()

0    0
1    0
2    0
3    0
4    1
Name: Outcome, dtype: int64

Train Test Split

In [164]:
x_train, x_test, y_train ,y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=2)

In [165]:
print(x.shape,x_train.shape, x_test.shape)

(568, 8) (454, 8) (114, 8)


In [166]:
x_train.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
175,11,138,74,26,144,36.1,0.557,50
529,0,84,82,31,125,38.2,0.233,23
27,1,128,82,17,183,27.5,0.115,22
565,6,151,62,31,120,35.5,0.692,28
481,9,156,86,0,0,24.8,0.23,53


Training the Model

In [167]:
model = svm.SVC(kernel='linear')

In [168]:
model.fit(x_train, y_train)

Model Evaluation and Accuracy Score

In [169]:
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(x_train_prediction, y_train)
print('Accuracy on training data : ', training_data_accuracy)

Accuracy on training data :  0.7577092511013216


In [170]:
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(x_test_prediction, y_test)
print('Accuracy on test data : ', test_data_accuracy)

Accuracy on test data :  0.7543859649122807


Making a Predective system

In [171]:
input_data = (1, 85, 66, 29, 0, 26.6, 0.351, 31)
#converting the input data into numpy array
input_data_as_numpy_array = np.asarray(input_data)
#reshaping the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
  print('The person is not diabetic')
else:
  print('The person is diabetic')


[0]
The person is not diabetic




Saving the trained model

In [172]:
pip install pickle

[31mERROR: Could not find a version that satisfies the requirement pickle (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for pickle[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [173]:
import pickle
filename = 'diabetes_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [174]:
#loading the saved model 
loaded_model = pickle.load(open('diabetes_model.sav', 'rb'))

In [175]:
input_data = (1, 85, 66, 29, 0, 26.6, 0.351, 31)
#converting the input data into numpy array
input_data_as_numpy_array = np.asarray(input_data)
#reshaping the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = loaded_model.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
  print('The person is not diabetic')
else:
  print('The person is diabetic')

[0]
The person is not diabetic




In [176]:
#the data fields of the dataset x
for column in x.columns:
  print(column)

Pregnancies
Glucose
BloodPressure
SkinThickness
Insulin
BMI
DiabetesPedigreeFunction
Age
