In [28]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
import pickle
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

In [29]:
# Read the diabetes CSV (path is relative to the current working directory)
# into a pandas DataFrame.
diabetes_dataset = pd.read_csv(filepath_or_buffer="Diabetespred.csv")

In [30]:
# Preview the first 5 rows of the dataset (rendered as the cell's output).
diabetes_dataset.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [31]:
# number of rows and columns in this dataset, as a (rows, columns) tuple
diabetes_dataset.shape

(499, 9)

In [32]:
# getting the statistical measures of the data
# (count, mean, std, min, quartiles and max for each numeric column)
# NOTE(review): the recorded output shows a minimum of 0 for Glucose,
# BloodPressure, SkinThickness, Insulin and BMI - presumably placeholders for
# missing measurements; confirm and handle them before modelling.
diabetes_dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,499.0,499.0,499.0,499.0,499.0,499.0,499.0,499.0,499.0
mean,3.803607,121.354709,68.743487,20.57515,80.390782,31.984569,0.485377,33.086172,0.364729
std,3.345786,32.441489,19.452608,15.72019,119.774561,8.210358,0.345546,11.636849,0.481837
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,100.0,64.0,0.0,0.0,27.05,0.252,24.0,0.0
50%,3.0,117.0,70.0,23.0,36.0,32.0,0.383,29.0,0.0
75%,6.0,142.0,80.0,33.0,122.0,36.6,0.6335,39.5,1.0
max,17.0,197.0,122.0,63.0,846.0,67.1,2.42,81.0,1.0


In [33]:
# Class balance: number of rows per Outcome label (0 and 1).
diabetes_dataset["Outcome"].value_counts()

Outcome
0    317
1    182
Name: count, dtype: int64

In [34]:
# Mean of every feature column, computed separately for each Outcome label.
diabetes_dataset.groupby(by="Outcome").mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.242902,110.369085,68.179811,19.921136,67.73817,30.067823,0.43841,31.258675
1,4.78022,140.489011,69.725275,21.714286,102.428571,35.323077,0.567181,36.269231


In [35]:
# separating the data and labels
# X: every column except the target; Y: the 'Outcome' target column.
# `columns=` already names the axis, so the extra `axis=1` was redundant.
X = diabetes_dataset.drop(columns='Outcome')
Y = diabetes_dataset['Outcome']

In [36]:
# plain-text dump of the feature frame (pandas truncates the middle rows)
print(X)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
494            3       80              0              0        0   0.0   
495            6      166             74              0        0  26.6   
496            5      110             68              0        0  26.0   
497            2       81             72             15       76  30.1   
498            7      195             70             33      145  25.1   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1                       0.351   31  


In [37]:
# plain-text dump of the Outcome labels
print(Y)

0      1
1      0
2      1
3      0
4      1
      ..
494    0
495    0
496    0
497    0
498    1
Name: Outcome, Length: 499, dtype: int64


In [38]:
# column names, non-null counts and dtypes of the loaded frame
diabetes_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499 entries, 0 to 498
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               499 non-null    int64  
 1   Glucose                   499 non-null    int64  
 2   BloodPressure             499 non-null    int64  
 3   SkinThickness             499 non-null    int64  
 4   Insulin                   499 non-null    int64  
 5   BMI                       499 non-null    float64
 6   DiabetesPedigreeFunction  499 non-null    float64
 7   Age                       499 non-null    int64  
 8   Outcome                   499 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 35.2 KB


In [39]:
# Data Standardization
# Create the scaler; it is not fitted in this cell.
scaler = StandardScaler()

In [40]:
# Fit the scaler on the feature frame X.
# NOTE(review): X here is the full dataset and the train/test split happens in
# a later cell, so the held-out rows also contribute to the fitted statistics
# (data leakage). Consider splitting first and fitting on the training rows only.
scaler.fit(X)

In [41]:
# apply the fitted scaling to every row of X
standardized_data = scaler.transform(X)

In [42]:
# show the scaled feature matrix (NumPy elides the middle of large arrays)
print(standardized_data)

[[ 0.65712418  0.82215798  0.16757553 ...  0.19695267  0.4102653
   1.45493002]
 [-0.83879263 -1.12174849 -0.14117594 ... -0.65648446 -0.38927309
  -0.17945286]
 [ 1.2554909   1.90210602 -0.24409309 ... -1.05881911  0.54062482
  -0.09343271]
 ...
 [ 0.35794082 -0.35035703 -0.03825878 ... -0.72963622 -0.56018891
  -0.26547301]
 [-0.53960927 -1.24517113  0.16757553 ... -0.2297659   0.17851504
  -0.69557377]
 [ 0.95630754  2.27237392  0.06465837 ... -0.83936385 -0.9338862
   1.88503078]]


In [43]:
# From here on X is the standardized feature matrix and Y the Outcome labels.
# NOTE(review): X is rebound to the scaled array, so re-running the earlier
# `scaler.fit(X)` cell after this one would fit on already-scaled data.
X, Y = standardized_data, diabetes_dataset['Outcome']

In [44]:
# Show the scaled features followed by the labels, each starting on its own line.
print(X, Y, sep="\n")

[[ 0.65712418  0.82215798  0.16757553 ...  0.19695267  0.4102653
   1.45493002]
 [-0.83879263 -1.12174849 -0.14117594 ... -0.65648446 -0.38927309
  -0.17945286]
 [ 1.2554909   1.90210602 -0.24409309 ... -1.05881911  0.54062482
  -0.09343271]
 ...
 [ 0.35794082 -0.35035703 -0.03825878 ... -0.72963622 -0.56018891
  -0.26547301]
 [-0.53960927 -1.24517113  0.16757553 ... -0.2297659   0.17851504
  -0.69557377]
 [ 0.95630754  2.27237392  0.06465837 ... -0.83936385 -0.9338862
   1.88503078]]
0      1
1      0
2      1
3      0
4      1
      ..
494    0
495    0
496    0
497    0
498    1
Name: Outcome, Length: 499, dtype: int64


In [45]:
# Train/test split: hold out 20% of the rows, stratified on Y.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,  # fixed seed so the split is repeatable
)

In [46]:
# Shapes of the full, training and test feature matrices, in that order.
print(*(features.shape for features in (X, X_train, X_test)))

(499, 8) (399, 8) (100, 8)


In [47]:
## creating the model: a support vector classifier with a linear kernel
## (it is only instantiated here; fitting happens in the next cell)
classifier = svm.SVC(kernel='linear')

In [48]:
# training the support vector machine classifier on the training split
classifier.fit(X_train, Y_train)

In [49]:
# accuracy score on the training data
# accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions
# second. Accuracy is the same either way, but the documented order keeps the
# call correct if it is later swapped for an asymmetric metric.
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [50]:
# report the fraction of training rows classified correctly
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.7644110275689223


In [51]:
# accuracy score on the test data
# accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions
# second. Accuracy is the same either way, but the documented order keeps the
# call correct if it is later swapped for an asymmetric metric.
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [52]:
# report the fraction of held-out test rows classified correctly
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.78


In [67]:
##Making a Predictive System

# One record to classify. Values must follow the order of the feature columns
# used to fit the scaler (every dataset column except 'Outcome').
input_data = (7,133,84,0,0,40.2,0.696,37)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# The scaler was fitted on a DataFrame, so label the single row with the same
# feature columns before transforming. This avoids scikit-learn's feature-name
# mismatch warning and raises immediately if the number of values is wrong.
feature_columns = diabetes_dataset.drop(columns='Outcome').columns
input_frame = pd.DataFrame(input_data_reshaped, columns=feature_columns)

# standardize the input data
std_data = scaler.transform(input_frame)
print(std_data)

prediction = classifier.predict(std_data)
print(prediction)

# label 0 is reported as non-diabetic, any other label as diabetic
if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')

[[ 0.95630754  0.35932311  0.78507845 -1.31014945 -0.67185765  1.00162197
   0.6101499   0.33666805]]
[1]
The person is diabetic


