In [4]:
# Importing the Dependencies
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import *
import warnings
warnings.filterwarnings('ignore')

# svm --> Support Vector Machine

In [5]:
# The dataset consists of several medical predictor (independent) variables and one target (dependent) variable, Outcome.
# Independent variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.

In [6]:
# Load the PIMA Indians Diabetes dataset from CSV into a DataFrame and display it.
# NOTE(review): the path is absolute and looks like a Colab upload location — adjust when running elsewhere.
DATA_PATH = "/content/diabetes (2).csv"
diabetes = pd.read_csv(DATA_PATH)
diabetes

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


# New Section

In [7]:
# Number of rows and columns, returned as a (rows, columns) tuple
diabetes.shape

(768, 9)

In [8]:
# Summarise the dataset: column names, non-null counts, dtypes and memory usage
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [9]:
# Count the missing (null) entries in each column
diabetes.isnull().sum(axis=0)

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [10]:
# Checking for duplicate values: duplicated() flags repeated rows and sum() counts them
diabetes.duplicated().sum()

0

In [11]:
# Dataset column names, stored in `col` and displayed as the cell output
col=diabetes.columns
col

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
diabetes.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [13]:
# Class balance of the target: number of rows for each Outcome value
diabetes['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [14]:
# Outcome labels: 0 --> Non-diabetic, 1 --> Diabetic

In [15]:
# Mean of every feature within each Outcome class
diabetes.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [16]:
# Separate the independent (feature) columns from the dependent (target) column.
# `columns=` already names the axis being dropped, so no `axis` argument is needed.
x = diabetes.drop(columns='Outcome')
y = diabetes['Outcome']

In [17]:
# Display the feature matrix (every column except Outcome)
x

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [18]:
# Display the target labels (the Outcome column)
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [19]:
# Keep 80% of the rows for training and the rest for testing; the fixed
# random_state makes the shuffle (and therefore the split) repeatable.
# NOTE(review): the split is not stratified although the classes are imbalanced — confirm this is intended.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=2,
)

In [20]:
# Display the training features
x_train

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
602,1,124,74,36,0,27.8,0.100,30
429,1,95,82,25,180,35.0,0.233,43
623,0,94,70,27,115,43.5,0.347,21
209,7,184,84,33,0,35.5,0.355,41
589,0,73,0,0,0,21.1,0.342,25
...,...,...,...,...,...,...,...,...
534,1,77,56,30,56,33.3,1.251,24
584,8,124,76,24,600,28.7,0.687,52
493,4,125,70,18,122,28.9,1.144,45
527,3,116,74,15,105,26.3,0.107,24


In [21]:
# Display the training labels
y_train

602    0
429    1
623    0
209    1
589    0
      ..
534    0
584    1
493    1
527    0
168    0
Name: Outcome, Length: 614, dtype: int64

In [22]:
# Display the held-out test features
x_test

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
158,2,88,74,19,53,29.0,0.229,22
251,2,129,84,0,0,28.0,0.284,27
631,0,102,78,40,90,34.5,0.238,24
757,0,123,72,0,0,36.3,0.258,52
689,1,144,82,46,180,46.1,0.335,46
...,...,...,...,...,...,...,...,...
733,2,106,56,27,165,29.0,0.426,22
441,2,83,66,23,50,32.2,0.497,22
627,0,132,78,0,0,32.4,0.393,21
84,5,137,108,0,0,48.8,0.227,37


In [23]:
# Display the held-out test labels
y_test

158    0
251    0
631    0
757    1
689    1
      ..
733    0
441    0
627    0
84     1
55     0
Name: Outcome, Length: 154, dtype: int64

In [24]:
# Shapes of the full, training and test feature sets, printed on one line
print(*(frame.shape for frame in (x, x_train, x_test)))

(768, 8) (614, 8) (154, 8)


In [25]:
# Shapes of the full, training and test label vectors, printed on one line
print(*(labels.shape for labels in (y, y_train, y_test)))

(768,) (614,) (154,)


In [26]:
# Training Model

In [27]:
# Build a support vector classifier with a linear kernel and fit it on the training split.
# NOTE(review): StandardScaler is imported at the top of the notebook but the features
# are passed in unscaled here — confirm that is intentional.
svm_model = svm.SVC(kernel='linear')
svm_model.fit(X=x_train, y=y_train)

In [28]:
# Evaluation

In [29]:
# Accuracy score on the training dataset
train_pred = svm_model.predict(x_train)
# accuracy_score is documented as accuracy_score(y_true, y_pred): pass the
# ground-truth labels first and the model's predictions second.
training_accuracy = accuracy_score(y_train, train_pred)

In [30]:
# Training accuracy expressed as a percentage
training_accuracy*100

77.19869706840392

In [31]:
# Accuracy score on the test dataset
test_pred = svm_model.predict(x_test)
# accuracy_score is documented as accuracy_score(y_true, y_pred): pass the
# ground-truth labels first and the model's predictions second.
test_accuracy = accuracy_score(y_test, test_pred)

In [32]:
# Test accuracy expressed as a percentage
test_accuracy*100

75.32467532467533

In [33]:
# The dataset is small (only 768 rows), which likely limits the accuracy the model can reach.

In [34]:
# Making a Predictive System

In [36]:
# One patient's measurements, in the same column order as the feature matrix `x`
input_data = (2, 197, 70, 45, 543, 30.5, 0.158, 53)

# Changing the input_data to numpy array
input_data_as_numpy_array = np.array(input_data)

# Reshape the array as we are predicting for one instance (1 row, n features)
input_data_reshape = input_data_as_numpy_array.reshape(1, -1)

# The model was fitted on a DataFrame with named columns. Wrapping the sample
# in a one-row DataFrame with the same column names keeps the feature names
# consistent between fit and predict instead of passing a bare, unnamed array.
input_features = pd.DataFrame(input_data_reshape, columns=x.columns)

prediction = svm_model.predict(input_features)
print(prediction)

# Outcome label 0 is the non-diabetic class; anything else is reported as diabetic
if prediction[0] == 0:
    print("Person is not Diabetes")
else:
    print("Person is Diabetes")

[1]
Person is Diabetes


In [38]:
# Saving the Trained Model

In [37]:
import pickle

In [39]:
# Serialise the fitted model to disk with pickle.
filename = 'trained_model.sav'
# The context manager guarantees the file is flushed and closed even if
# pickling raises, instead of leaving an open handle behind.
with open(filename, 'wb') as model_file:
    pickle.dump(svm_model, model_file)

In [40]:
# Loading the saved model
# NOTE: pickle.load can execute arbitrary code from the file — only load
# model files that you created yourself or otherwise trust.
# The context manager closes the file handle once the model is read.
with open('trained_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [41]:
# One patient's measurements, in the same column order as the feature matrix `x`
input_data = (2, 197, 70, 45, 543, 30.5, 0.158, 53)

# Changing the input_data to numpy array
input_data_as_numpy_array = np.array(input_data)

# Reshape the array as we are predicting for one instance (1 row, n features)
input_data_reshape = input_data_as_numpy_array.reshape(1, -1)

# The pickled model was fitted on a DataFrame with named columns. Wrapping the
# sample in a one-row DataFrame with the same column names keeps the feature
# names consistent between fit and predict instead of passing a bare array.
input_features = pd.DataFrame(input_data_reshape, columns=x.columns)

# Predict with the model restored from disk to confirm it survived the round trip
prediction = loaded_model.predict(input_features)
print(prediction)

# Outcome label 0 is the non-diabetic class; anything else is reported as diabetic
if prediction[0] == 0:
    print("Person is not Diabetes")
else:
    print("Person is Diabetes")

[1]
Person is Diabetes
