<figure>
    <center><img src="./images/1.PNG"></center>
</figure>

## Import Libraries

In [3]:
import pickle

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

## Data Collection and Analysis
### PIMA Diabetes Dataset

In [4]:
# Load the PIMA diabetes data from a CSV path relative to the working directory,
# then preview the first rows to check that the columns parsed as expected.
diabetes_dataset = pd.read_csv("./Datasets/diabetes.csv")
diabetes_dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# (rows, columns) of the loaded frame
diabetes_dataset.shape

(768, 9)

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column of the dataset
diabetes_dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [7]:
# Column dtypes, non-null counts and memory usage
diabetes_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [8]:
# Count missing (NaN) entries per column.
# NOTE(review): describe() reports a minimum of 0 for Glucose, BloodPressure,
# SkinThickness, Insulin and BMI; those zeros presumably stand in for missing
# measurements and are not caught by isna() - confirm and handle them if so.
diabetes_dataset.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [9]:
# Class balance of the target column
diabetes_dataset["Outcome"].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

### 0 = Non-Diabetic
### 1 = Diabetic

In [10]:
# Per-class mean of every feature, to compare the two outcome groups on average
diabetes_dataset.groupby("Outcome").mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [11]:
# Split the frame into the label vector and the feature matrix
# (the features are every column except the label).
y = diabetes_dataset["Outcome"]
x = diabetes_dataset.drop(columns=["Outcome"])

## Imbalanced Data Handling

In [12]:
# Class counts before resampling
y.value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [13]:
# SMOTE draws random synthetic minority samples; pin the seed (same value as the
# train/test split) so the resampled data and all downstream metrics are reproducible.
smote = SMOTE(random_state=42)

In [14]:
# Oversample the minority class so both classes end up equally represented.
# NOTE(review): this runs on the full dataset, before the train/test split further
# down, so synthetic samples generated from rows that later land in the test set can
# end up in the training set (and vice versa). Consider splitting first and
# resampling only the training portion.
x_smote, y_smote = smote.fit_resample(x, y)

In [15]:
# Class counts after resampling
y_smote.value_counts()

1    500
0    500
Name: Outcome, dtype: int64

## Data Standardization

In [16]:
# Scaler used to standardize the features; it is reused later on new inputs
scaler = StandardScaler()

In [17]:
# Learn the per-feature scaling statistics from the resampled features.
# NOTE(review): fitting on all rows, before the train/test split, lets test-set
# statistics influence the scaling; consider fitting on the training split only.
scaler.fit(x_smote)

In [18]:
# Apply the fitted scaling to the resampled features
standardized_data = scaler.transform(x_smote)

In [19]:
# Show the standardized feature matrix (NumPy abbreviates the printout)
print(standardized_data)

[[ 0.58706569  0.67100038  0.11941564 ...  0.10512653  0.40105506
   1.41372435]
 [-0.89467507 -1.24189269 -0.1911582  ... -0.81236636 -0.41579346
  -0.27287383]
 [ 1.17976199  1.73371875 -0.29468281 ... -1.24489873  0.53423688
  -0.18410551]
 ...
 [ 1.17976199  0.61027362  0.6370387  ...  0.70234267  1.28509026
   1.14741937]
 [ 0.58706569 -0.84716872  0.32646487 ...  0.26493564 -0.09026864
   1.41372435]
 [-1.19102322  1.09608773 -3.60747041 ...  1.4368243  -0.09339689
   0.08219947]]


In [20]:
# From here on, x and y refer to the standardized, resampled data
# rather than the raw feature columns and labels.
x, y = standardized_data, y_smote

## Train Test Split

In [21]:
# Hold out 20% of the samples for testing; stratifying on y keeps the class ratio
# the same in both splits, and the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

## Train The Model

In [22]:
# Support vector classifier with a linear kernel
model = SVC(kernel="linear")

In [23]:
# Train the support vector classifier on the training split
model.fit(x_train, y_train)

In [24]:
# Predicted labels for the held-out test split; reused by the confusion-matrix
# and classification-report cells below.
y_pred = model.predict(x_test)

## Model Evaluation

### Accuracy Score

In [25]:
# Accuracy on the training split: the fraction of training samples whose predicted
# label matches the true label. (x_train_pred holds predicted labels, not features.)
x_train_pred = model.predict(x_train)
# scikit-learn's signature is accuracy_score(y_true, y_pred): ground truth goes first.
print("Training data accuracy:", accuracy_score(y_train, x_train_pred))

Training data accuracy: 0.76375


In [26]:
# Accuracy on the held-out test split: the fraction of test samples whose predicted
# label matches the true label. (x_test_pred holds predicted labels, not features.)
x_test_pred = model.predict(x_test)
# scikit-learn's signature is accuracy_score(y_true, y_pred): ground truth goes first.
print("Testing data accuracy:", accuracy_score(y_test, x_test_pred))

Testing data accuracy: 0.735


In [27]:
# Confusion matrix on the test split: rows are true classes, columns are predicted classes
confusion_matrix(y_test, y_pred)

array([[75, 25],
       [28, 72]], dtype=int64)

In [28]:
# Per-class precision, recall and F1 on the test split
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.75      0.74       100
           1       0.74      0.72      0.73       100

    accuracy                           0.73       200
   macro avg       0.74      0.73      0.73       200
weighted avg       0.74      0.73      0.73       200



## Making a Predictive System

In [29]:
# Eight feature values for one patient (expected in the dataset's feature-column order)
input_data = (4,146,66,28,155,34.3,1.189,42)

# A single instance must be passed as a 2-D array: one row, one column per feature
input_data_reshaped = np.asarray(input_data).reshape(1, -1)

# Standardize with the already-fitted scaler, then classify the standardized sample
std_data = scaler.transform(input_data_reshaped)
prediction = model.predict(std_data)

# Label 0 means non-diabetic; any other predicted label is reported as diabetic
is_diabetic = prediction[0] != 0
print("The person is diabetic" if is_diabetic else "The person is not diabetic")

The person is diabetic




## Save The Trained Model

In [30]:
# Persist the fitted classifier so it can be reused without retraining.
# The context manager guarantees the file is flushed and closed, even if pickling fails.
filename1 = "trained_model.sav"
with open(filename1, "wb") as model_file:
    pickle.dump(model, model_file)

In [32]:
# Persist the fitted scaler next to the model: new inputs must be standardized with
# the same statistics before prediction. The context manager closes the file reliably.
filename2 = "scaled_data.sav"
with open(filename2, "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

## Load The Saved Model

In [34]:
# Restore the persisted classifier; the file is closed as soon as the object is read
with open("trained_model.sav", "rb") as model_file:
    loaded_model = pickle.load(model_file)

# Restore the persisted scaler the same way
with open("scaled_data.sav", "rb") as scaler_file:
    loaded_scaler = pickle.load(scaler_file)

In [35]:
# Feature values for one patient, scored here with the artifacts restored from disk
input_data = (4,146,66,28,155,34.3,1.189,42)

# Shape the single instance as a one-row 2-D array before transforming and predicting
input_data_reshaped = np.asarray(input_data).reshape(1, -1)

# Standardize with the reloaded scaler, then classify with the reloaded model
std_data = loaded_scaler.transform(input_data_reshaped)
prediction = loaded_model.predict(std_data)

# Class 0 is non-diabetic; any other predicted label is reported as diabetic
verdict = "The person is not diabetic" if prediction[0] == 0 else "The person is diabetic"
print(verdict)

The person is diabetic


