## Importing Libraries 

In [1]:
import numpy as np
import pandas as pd

#The StandardScaler is used for standardizing features by removing the mean and scaling to unit variance. 
#It's often applied to input data before feeding it into machine learning models.
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

# SVM is a popular algorithm for classification and regression tasks.
from sklearn import svm

#Compares the predicted labels with the true labels to measure how well the model performs.
from sklearn.metrics import accuracy_score

In [2]:
#loading the data to pandas dataframe
dataset=pd.read_csv("diabetes.csv")

In [3]:
# displaying dataset
dataset

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [4]:
# first 5 rows
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# last 5 rows
dataset.tail()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.34,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
767,1,93,70,31,0,30.4,0.315,23,0


In [6]:
#displays rows & columns in the dataset
dataset.shape

(768, 9)

In [7]:
#gives statistical measures of the dataset
dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [8]:
# Count the occurrences of values in the 'Outcome' column
dataset['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [9]:
# Count the occurrences of values in the 'Age' column
dataset['Age'].value_counts()

22    72
21    63
25    48
24    46
23    38
28    35
26    33
27    32
29    29
31    24
41    22
30    21
37    19
42    18
33    17
38    16
36    16
32    16
45    15
34    14
46    13
43    13
40    13
39    12
35    10
50     8
51     8
52     8
44     8
58     7
47     6
54     6
49     5
48     5
57     5
53     5
60     5
66     4
63     4
62     4
55     4
67     3
56     3
59     3
65     3
69     2
61     2
72     1
81     1
64     1
70     1
68     1
Name: Age, dtype: int64

In [10]:
# Count the occurrences of values in the 'BMI' column
dataset['BMI'].value_counts()

32.0    13
31.6    12
31.2    12
0.0     11
32.4    10
        ..
36.7     1
41.8     1
42.6     1
42.8     1
46.3     1
Name: BMI, Length: 248, dtype: int64

In [11]:
# Calculate the mean values for each column grouped by 'Outcome'
# This provides the average values of different features for each outcome class (0 or 1)
dataset.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [12]:
# Calculate the mean values for each column grouped by 'BMI'
# This provides the average values of different features for each outcome class (0 or 1)
dataset.groupby('BMI').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,DiabetesPedigreeFunction,Age,Outcome
BMI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,3.909091,104.272727,28.818182,4.181818,8.090909,0.433273,30.454545,0.181818
18.2,1.000000,92.333333,67.333333,11.333333,27.333333,0.356667,23.000000,0.000000
18.4,0.000000,104.000000,76.000000,0.000000,0.000000,0.582000,27.000000,0.000000
19.1,1.000000,80.000000,55.000000,0.000000,0.000000,0.258000,21.000000,0.000000
19.3,3.000000,99.000000,80.000000,11.000000,64.000000,0.284000,30.000000,0.000000
...,...,...,...,...,...,...,...,...
53.2,0.000000,162.000000,76.000000,56.000000,100.000000,0.759000,25.000000,1.000000
55.0,1.000000,88.000000,30.000000,42.000000,99.000000,0.496000,26.000000,1.000000
57.3,3.000000,123.000000,100.000000,35.000000,240.000000,0.880000,22.000000,0.000000
59.4,0.000000,180.000000,78.000000,63.000000,14.000000,2.420000,25.000000,1.000000


In [13]:
# Calculate the mean values for each column grouped by 'Age'
# This provides the average values of different features for each outcome class (0 or 1)
dataset.groupby('Age').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Outcome
Age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
21,1.079365,108.31746,65.936508,19.349206,73.634921,27.81746,0.433825,0.079365
22,1.555556,108.208333,63.722222,20.486111,74.486111,29.509722,0.430625,0.152778
23,1.578947,111.578947,64.315789,22.368421,118.026316,31.502632,0.438579,0.184211
24,1.891304,117.891304,64.956522,25.934783,88.021739,32.569565,0.393565,0.173913
25,1.770833,110.083333,59.666667,23.958333,82.895833,31.94375,0.6005,0.291667
26,1.969697,118.212121,64.181818,23.666667,90.878788,34.915152,0.413455,0.242424
27,2.5625,115.28125,73.5,18.375,63.125,31.95,0.47175,0.25
28,3.028571,119.914286,68.314286,23.628571,94.6,33.642857,0.459629,0.285714
29,3.310345,127.37931,68.241379,21.0,88.793103,33.541379,0.408897,0.448276
30,3.619048,122.285714,64.857143,18.904762,82.666667,30.033333,0.367238,0.285714


## Separating the data and labels

In [14]:
# Create a new DataFrame 'p' by dropping the 'Outcome' column from the original dataset
# Create a Series 'q' containing the 'Outcome' column from the original dataset

p = dataset.drop(columns = 'Outcome', axis=1)
q = dataset['Outcome']

p

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [15]:
q

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

# Data Standardization 

In [16]:
# Initialize a StandardScaler for data standardization
scale = StandardScaler()

In [17]:
scale.fit(p)

StandardScaler()

In [18]:
# Fit the StandardScaler on the features in 'p' and transform the data
data_standardization = scale.transform(p)
print(data_standardization)

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]


In [19]:
# Assign the standardized data back to 'p'
p = data_standardization
# Assign the standardized data back to 'p'
q = dataset['Outcome']
print(p)
print(q)

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]
0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


# Splitting Data into train, test data

In [20]:
# Split the data into training and testing sets using train_test_split
# Print the shapes of the original 'p', 'X_train', and other data subsets

X_train, X_test, Y_train, Y_test = train_test_split(p,q, test_size = 0.2, stratify=q, random_state=2)
print(p.shape)

(768, 8)


In [21]:
print(X_train.shape)

(614, 8)


In [22]:
print( X_test.shape)

(154, 8)


## Training The Model

In [23]:
classify = svm.SVC(kernel='linear')

#Fitting the SVM classifier to the training dataset for model training.
classify.fit(X_train, Y_train)

SVC(kernel='linear')

## Model Evaluation- Accuracy Score

In [24]:
# Predicting labels on the training data using the trained classifier
X_train_prediction = classify.predict(X_train)

# Calculating the accuracy score on the training data
Traindata_accuracy = accuracy_score(X_train_prediction, Y_train)

# Predicting labels on the test data using the trained classifier
X_test_prediction = classify.predict(X_test)

# Calculating the accuracy score on the test data
Testdata_accuracy = accuracy_score(X_test_prediction, Y_test)

# Printing the accuracy scores for both training and test data
print('Accuracy score (Training Data) --> {} '.format(Traindata_accuracy))
print('Accuracy score (Test Data) --> {}'.format(Testdata_accuracy))


Accuracy score (Training Data) --> 0.7866449511400652 
Accuracy score (Test Data) --> 0.7727272727272727


## Predictive System

In [29]:
# Define the input data as a tuple
Inputdata = (5, 166, 72, 19, 175, 25.8, 0.587, 51)

# Convert the input_data to a NumPy array
Input_data_array = np.asarray(Inputdata)

# Reshape the array to match the format expected by the model (one instance)
Input_data_reshape = Input_data_array.reshape(1, -1)

# Standardize the input data using a previously defined scaling transformation
std = scale.transform(Input_data_reshape)

# Print the standardized input data
print(std)


[[ 0.3429808   1.41167241  0.14964075 -0.09637905  0.82661621 -0.78595734
   0.34768723  1.51108316]]




In [31]:
# Predicting the diabetes label using the trained classifier on standardized input data
data_prediction = classify.predict(std)

# Printing the prediction result
print(data_prediction)

[1]


In [33]:
# Checking the prediction result and providing appropriate feedback
if (data_prediction[0] == 0):
    print('---------> Great News: No diabetes detected! ---------->')
else:
    print('---------> Attention: Diabetes has been detected! ---------->')

---------> Attention: Diabetes has been detected! ---------->
