# Importing the Libraries

In [1]:
import pandas as pd
import numpy as np

#for data Standardization
from sklearn.preprocessing import StandardScaler

#for train test split
from sklearn.model_selection import train_test_split

#training model
from sklearn import svm
from sklearn.metrics import accuracy_score

# Data Collection and Analysis

### Loading Dataset
We need to load `diabetes.csv` into a pandas DataFrame so that we can work with it in Python.

In [2]:
# Mount Google Drive at /content/drive so files stored on Drive can be read
# from this notebook. Requires the google.colab package (a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the dataset into a pandas DataFrame.
# The absolute path under the Drive mount point ('/content/drive') is used so
# the read does not depend on the notebook's current working directory.
df_diabetes = pd.read_csv('/content/drive/My Drive/diabetes.csv')

We can view the first 5 rows of a dataframe using the `head` method. Similarly, if you wanted to see the last 3, you can use `tail(3)`

In [4]:
# Preview the first five rows of the dataset
df_diabetes.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## The dataset contains data for female patients only

**Pregnancies**: the number of times the patient has been pregnant

**Glucose**: Glucose level of female

**Blood Pressure**: Blood pressure level

**Skin Thickness**: Measured at the triceps (a large, thick muscle on the dorsal part of the upper arm); it indicates how much fat is present at that site

**Insulin**: Insulin Level of female

**Body Mass Index**: Calculated by dividing the weight in kilograms by the square of the height in meters

**Diabetes Pedigree Function**: Calculates diabetes likelihood depending on the subject's age and her diabetic family history

**Age**: Age of female

**Outcome**: female has diabetes or not. 1 indicates patient has diabetes and 0 indicates no diabetes

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
df_diabetes.shape

(768, 9)

There are 768 rows (one per person) and 9 columns: 8 feature columns plus a final `Outcome` column, which holds the label needed for prediction.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
df_diabetes.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [6]:
# Count the missing (NaN) entries in each column.
# NOTE(review): the describe() output shows a minimum of 0 for Glucose,
# BloodPressure, SkinThickness, Insulin and BMI - presumably zeros stand in
# for missing measurements there, which this check does not detect; confirm.
df_diabetes.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [None]:
# Class balance: number of rows per Outcome label (0 = non-diabetic, 1 = diabetic)
df_diabetes['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

There are 500 people that are non-diabetic (label 0) and 268 people that have diabetes (label 1)

In [None]:
# Mean of every feature, computed separately for each Outcome class
df_diabetes.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


Get the mean value for all the features on the basis of diabetic and non-diabetic people

In [None]:
# Separate the features from the label.
# `columns=` already identifies what to drop, so the redundant `axis=1` is omitted.
X = df_diabetes.drop(columns='Outcome')  # every column except the label
Y = df_diabetes['Outcome']               # the label column only

`drop` is used to exclude specific columns or rows from a dataset. To drop a column, pass `columns=` (equivalent to `axis=1`); to drop a row, pass `index=` (equivalent to `axis=0`).

The variable X holds all the values from the data except the Outcome (label) column,
and the variable Y stores all the values of the Outcome (label) column.

In [None]:
# Show the feature matrix (every column except Outcome)
print(X)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1                       0.351   31  


In [None]:
# Show the label vector (the Outcome column)
print(Y)

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


# Data Standardization
Standardize the features to a common scale (zero mean, unit variance), which helps the machine learning model make better predictions.



In [None]:
# Create the scaler used to standardize the features
scaler = StandardScaler()

In [None]:
# Compute the per-feature mean and std to be used for scaling.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed later in this notebook, so statistics of the test rows
# influence the scaling. Consider fitting on the training split only.
scaler.fit(X)

In [None]:
# Transform the data: apply the fitted scaling to every feature column
std_data = scaler.transform(X)

Here we fit the standard scaler on the data and then, based on that fit, transform all the features to a common scale.

In [None]:
# Inspect the standardized feature values
print(std_data)

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]


In [None]:
# For simplicity, assign the standardized data back to the variable X.
# NOTE(review): X now refers to the scaled values instead of the original
# DataFrame, so re-running the scaler.fit(X) cell after this one would fit
# the scaler on already-scaled data.
X = std_data

In [None]:
# Confirm that X now holds the standardized features
print(X)

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]


In [None]:
# Show the labels again; Y was not modified by the standardization step
print(Y)

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


# Train Test Split

In [None]:
# Hold out 20% of the rows for testing. Stratifying on Y and fixing the
# random seed are both requested through the keyword arguments below.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

Here we use four variables, X_train, X_test, Y_train and Y_test; the split divides the data into two parts. The training part is used to train the machine learning model, and once the model is trained we evaluate it on the test data (unseen data) to estimate the accuracy of the model.

The train_test_split function is used to separate the test and training data. test_size indicates the fraction of the data reserved for testing; here it is 0.2, which means 20% of the whole dataset is used as test data.

In [None]:
# Compare the shapes of the full, training and test feature sets
print(X.shape, X_train.shape, X_test.shape)

(768, 8) (614, 8) (154, 8)


There are 768 rows of original data, of which 614 are used as training data and 154 as test data.

# Training the Model

In [None]:
# Create a support vector machine classifier with a linear kernel
classifier = svm.SVC(kernel='linear')

In [None]:
# Fit the classifier to the training features and labels,
# i.e. train the support vector machine classifier
classifier.fit(X_train, Y_train)

# Model Evaluation

## Accuracy Score
### Finding accuracy score on training data

In [None]:
# Predict on the data the model was trained on and measure the accuracy.
# accuracy_score is documented as (y_true, y_pred), so the true labels go first.
X_train_pred = classifier.predict(X_train)
train_data_acc = accuracy_score(Y_train, X_train_pred)

In [None]:
# Report the accuracy obtained on the training data
print( 'Accuracy Score of the training data:', train_data_acc)

Accuracy Score of the training data: 0.7866449511400652


The model predicts about 79% of the training samples correctly

### Finding accuracy score on test data

In [None]:
# Predict on the held-out test data and measure the accuracy.
# accuracy_score is documented as (y_true, y_pred), so the true labels go first.
X_test_pred = classifier.predict(X_test)
test_data_acc = accuracy_score(Y_test, X_test_pred)

In [None]:
# Report the accuracy obtained on the test data
print( 'Accuracy Score of the test data:', test_data_acc)

Accuracy Score of the test data: 0.7727272727272727


The model predicts about 77% of the test samples correctly, which is close to the training accuracy (about 79%), so the model does not appear to be overtrained (overfitted)

# Predictive System

In [None]:
# Feature values for a single person, listed in the dataset's column order
inp_data = (
    8,      # Pregnancies
    120,    # Glucose
    0,      # BloodPressure
    0,      # SkinThickness
    0,      # Insulin
    30,     # BMI
    0.183,  # DiabetesPedigreeFunction
    38,     # Age
)

In [None]:
# Convert the input tuple to a NumPy array so it can be reshaped
np_ary = np.asarray(inp_data)

In [None]:
# Reshape to a single row (1, n_features) because we are predicting for
# one instance
reshape_inp_data = np_ary.reshape(1, -1)

In [None]:
# Standardize the input data with the scaler fitted earlier in the notebook
# NOTE(review): the scaler was fitted on a DataFrame but receives a plain array
# here; recent scikit-learn versions may warn about missing feature names - confirm.
stand_data = scaler.transform(reshape_inp_data)



In [None]:
# Classify the standardized input and report the result in plain words
Prediction = classifier.predict(stand_data)

# Label 0 means non-diabetic; any other predicted label is reported as diabetic
if Prediction[0] != 0:
    print('The person is Diabetic')
else:
    print('The person is non_Diabetic')


The person is Diabetic
