Import the essential libraries

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics, svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

**Import / Collect the Data**

In [9]:
# Load the diabetes dataset into a pandas DataFrame.
# The path is a plain string literal: it has no placeholders, so no f-string is needed.
diabetes_data = pd.read_csv("/content/diabetes.csv")

# Preview the first 5 rows of the dataset
diabetes_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Insights from the data:
All the columns have numerical values, so we do not have to do text-preprocessing.


**Exploratory Data Analysis**

In [10]:
# Show the dataset dimensions as a (rows, columns) tuple
diabetes_data.shape

(768, 9)

In [11]:
# Summarise every column (non-null count and dtype) along with the frame's memory usage
diabetes_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


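From the above output, we can see that we do not have any NaN values, so we do not have to delete any rows or impute explicit missing values.
Also, all the values are of numerical types.
Note, however, that this only rules out explicit NaNs: Glucose, BloodPressure, SkinThickness, Insulin and BMI all have a minimum of 0 (see the `describe()` output below), which may encode missing measurements.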

In [12]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for every column
diabetes_data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [13]:
# Check the class balance of the target: count how many rows fall under each Outcome value
diabetes_data['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

We see 500 are non-diabetic and 268 are diabetic.

In [14]:
# Compare the two classes: mean of every feature, grouped by Outcome
diabetes_data.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


Check for missing values

In [15]:
# Element-wise missing-value mask: True wherever a cell is NaN.
# NOTE(review): this renders a boolean frame as large as the dataset itself;
# an aggregated view (per column or per row) is usually easier to read.
diabetes_data.isna()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
763,False,False,False,False,False,False,False,False,False
764,False,False,False,False,False,False,False,False,False
765,False,False,False,False,False,False,False,False,False
766,False,False,False,False,False,False,False,False,False


In [16]:
# Per-column check: True for every column that contains at least one missing value
diabetes_data.isna().any()

Pregnancies                 False
Glucose                     False
BloodPressure               False
SkinThickness               False
Insulin                     False
BMI                         False
DiabetesPedigreeFunction    False
Age                         False
Outcome                     False
dtype: bool

In [17]:
# Per-row check: True for every row that contains at least one missing value
diabetes_data.isna().any(axis=1)

0      False
1      False
2      False
3      False
4      False
       ...  
763    False
764    False
765    False
766    False
767    False
Length: 768, dtype: bool

In [18]:
# Pairwise correlation matrix across all columns, including the Outcome target
corr = diabetes_data.corr()
# Render the matrix as a table whose cell backgrounds follow the 'coolwarm' colour map
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,0.221898
Glucose,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.466581
BloodPressure,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.065068
SkinThickness,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,0.074752
Insulin,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.130548
BMI,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.292695
DiabetesPedigreeFunction,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.173844
Age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,0.238356
Outcome,0.221898,0.466581,0.065068,0.074752,0.130548,0.292695,0.173844,0.238356,1.0


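From the above correlation matrix, the features we treat as most relevant to the diabetes outcome are:
1) Glucose
2) Insulin
3) BMI
4) Age
5) Pregnancies

Note: correlation does not by itself show that a feature affects diabetes. By correlation with Outcome alone, DiabetesPedigreeFunction (0.17) ranks slightly above Insulin (0.13).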

In [19]:
# Keep only the five selected features plus the Outcome target in a separate frame.
# NOTE(review): the later cells visible in this notebook build X from diabetes_data,
# not from new_data — confirm whether this reduced frame is still needed.
new_data = diabetes_data[['Glucose','Insulin','BMI','Age','Pregnancies','Outcome']]
# Preview the first rows of the reduced frame
new_data.head()

Unnamed: 0,Glucose,Insulin,BMI,Age,Pregnancies,Outcome
0,148,0,33.6,50,6,1
1,85,0,26.6,31,1,0
2,183,0,23.3,32,8,1
3,89,94,28.1,21,1,0
4,137,168,43.1,33,0,1


Split the data set

In [20]:
# Train on all feature columns of diabetes_data because the dataset is small
X = diabetes_data.drop(columns = ['Outcome'])
# Keep a second reference to the feature DataFrame; X itself is rebound to the scaled array in a later cell
X_label = X
# Target vector: the Outcome column
y = diabetes_data['Outcome']
# Print the shapes of the feature matrix and the target vector
print("Shape of X is ", X.shape)
print("Shape of y is ", y.shape)

Shape of X is  (768, 8)
Shape of y is  (768,)


Let us Standardize the Data-set


In [21]:
# Create the scaler and fit it to the feature columns.
# NOTE(review): the scaler is fitted on the full feature set, before the train/test split
# performed further down, so test-set statistics influence the scaling (data leakage).
# Consider fitting on the training portion only.
scaler = StandardScaler()
scaler.fit(X)


In [22]:
# Apply the fitted scaling to every feature column
stand_data = scaler.transform(X)
# Print the scaled array for a quick visual check
print(stand_data)



[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]


In [23]:
# From here on, X refers to the standardized array instead of the original feature DataFrame
X = stand_data

Split the data into training and test sets

In [24]:
# Split into training (80%) and test partitions; random_state pins the shuffle
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=1
)

# Report the shape of each of the four partitions, one per line
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(614, 8)
(154, 8)
(614,)
(154,)


In [25]:
# Support vector classifier with a linear kernel
# (svm is imported in the notebook's import cell at the top).
clf = svm.SVC(kernel='linear')

# Fit the classifier on the training split
clf.fit(X_train, y_train)

# Predict labels for the held-out test split; these are scored in the test-evaluation cell
y_pred = clf.predict(X_test)


Model Evaluation for train data

In [26]:
# Predict on the same rows the model was trained on
# (metrics is imported in the notebook's import cell at the top).
y_train_pred = clf.predict(X_train)

# Training accuracy: fraction of training rows whose predicted label matches the true label
print("Accuracy:",metrics.accuracy_score(y_train, y_train_pred))

Accuracy: 0.7719869706840391


Model Evaluation for test Data

In [27]:
# Test accuracy: fraction of held-out rows whose predicted label (y_pred) matches the true label
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))


Accuracy: 0.7792207792207793


**Let Us make a predictive System**

In [28]:
# Raw feature values for one person, listed in the same column order as the training features
input_data = (0,137,40,35,168,43.1,2.288,33)

# Convert the tuple to a NumPy array
input_data_as_numpy_array = np.asarray(input_data)

# Reshape to a single row: one instance, all features
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# The scaler was fitted on a DataFrame, so hand it a one-row DataFrame with the same
# column names. Passing a bare array drops the feature names (scikit-learn >= 1.0 warns
# about this) and hides any column-order mistake.
input_frame = pd.DataFrame(input_data_reshaped, columns=X_label.columns)

# Standardize the sample with the already-fitted scaler
std_data = scaler.transform(input_frame)

# Predict the class for the standardized sample
prediction = clf.predict(std_data)

# Outcome 0 means non-diabetic; any other label is reported as diabetic
if prediction[0] == 0:
  print("The person does not have diabetes")
else:
  print("The person has diabetes")

The person has diabetes


