# **Importing Libraries**

In [44]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# **Data Collection**

In [46]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/balanced_1500_diabetes_data.csv')

In [48]:
print(data.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [49]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               1500 non-null   int64  
 1   Glucose                   1500 non-null   int64  
 2   BloodPressure             1500 non-null   int64  
 3   SkinThickness             1500 non-null   int64  
 4   Insulin                   1500 non-null   int64  
 5   BMI                       1500 non-null   float64
 6   DiabetesPedigreeFunction  1500 non-null   float64
 7   Age                       1500 non-null   int64  
 8   Outcome                   1500 non-null   int64  
dtypes: float64(2), int64(7)
memory usage: 105.6 KB
None


In [50]:
# Statistical summary
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
mean,5.641333,130.093333,74.331333,27.098,133.46,32.815733,0.813857,38.608,0.5
std,4.661671,35.588758,16.81561,14.549141,126.164343,7.449731,0.632192,13.355245,0.500167
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,20.0,0.0
25%,2.0,102.0,66.0,20.0,0.75,28.0,0.30975,27.0,0.0
50%,5.0,127.0,75.0,29.0,110.0,33.1,0.6145,37.0,0.5
75%,9.0,158.0,85.0,38.0,217.0,38.0,1.102,48.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.496,81.0,1.0


In [51]:
# Count of null values
data.isnull().sum()

Unnamed: 0,0
Pregnancies,0
Glucose,0
BloodPressure,0
SkinThickness,0
Insulin,0
BMI,0
DiabetesPedigreeFunction,0
Age,0
Outcome,0


# **Data Preprocessing**

In [52]:
new_data = data

# Replacing zero values with NaN
new_data[["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]] = new_data[["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]].replace(0, np.nan)

# Count of NaN
new_data.isnull().sum()

Unnamed: 0,0
Pregnancies,0
Glucose,5
BloodPressure,35
SkinThickness,227
Insulin,375
BMI,11
DiabetesPedigreeFunction,0
Age,0
Outcome,0


## **Filling Missing Values**

In [53]:
cols_to_fill = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
for col in cols_to_fill:
    new_data[col] = new_data[col].fillna(new_data[col].mean())

print(new_data['Outcome'].value_counts())

Outcome
1    750
0    750
Name: count, dtype: int64


## **Feature Scaling**

In [54]:
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler(feature_range = (0, 1))
dataset_scaled = sc.fit_transform(new_data)

dataset_scaled = pd.DataFrame(dataset_scaled)
print(dataset_scaled)

X = new_data.drop('Outcome', axis=1)
y = new_data['Outcome']

             0         1         2         3         4         5         6  \
0     0.352941  0.670968  0.489796  0.304348  0.209404  0.317719  0.227047   
1     0.058824  0.264516  0.428571  0.239130  0.209404  0.175153  0.112903   
2     0.470588  0.896774  0.408163  0.270979  0.209404  0.107943  0.245658   
3     0.058824  0.290323  0.428571  0.173913  0.110059  0.205703  0.036807   
4     0.000000  0.600000  0.163265  0.304348  0.197633  0.511202  0.913978   
...        ...       ...       ...       ...       ...       ...       ...   
1495  0.588235  0.941935  0.438776  0.434783  0.272189  0.350305  0.343672   
1496  0.235294  0.890323  0.469388  0.293478  0.139645  0.291242  0.663358   
1497  0.176471  0.954839  0.489796  0.402174  0.144379  0.252546  0.328371   
1498  0.117647  0.903226  0.530612  0.173913  0.384615  0.441955  0.448718   
1499  0.823529  0.987097  0.438776  0.250000  0.298225  0.439919  0.881720   

             7    8  
0     0.491803  1.0  
1     0.180328  0.0

# **Feature Selection**

In [98]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

rf = RandomForestClassifier(n_estimators = 11, criterion = 'entropy')
rfe = RFE(estimator=rf, n_features_to_select=5)
rfe.fit(X, y)
selected_features = X.columns[rfe.support_]
print("Selected Features:", list(selected_features))

Selected Features: ['Glucose', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']


## **Split the train and test data**

In [59]:
from sklearn.model_selection import train_test_split

X_features = new_data[selected_features]

sc = MinMaxScaler(feature_range = (0, 1))
dataset_scaled = pd.DataFrame(sc.fit_transform(X_features), columns=selected_features)
print(dataset_scaled)

# Assuming X and y are your features and target
X_train, X_test, y_train, y_test = train_test_split(dataset_scaled, y, test_size=0.2, random_state=42)

print(X_train)
print(y_train)
print(X_features)


       Glucose   Insulin       BMI  DiabetesPedigreeFunction       Age
0     0.670968  0.209404  0.317719                  0.227047  0.491803
1     0.264516  0.209404  0.175153                  0.112903  0.180328
2     0.896774  0.209404  0.107943                  0.245658  0.196721
3     0.290323  0.110059  0.205703                  0.036807  0.016393
4     0.600000  0.197633  0.511202                  0.913978  0.213115
...        ...       ...       ...                       ...       ...
1495  0.941935  0.272189  0.350305                  0.343672  0.393443
1496  0.890323  0.139645  0.291242                  0.663358  0.196721
1497  0.954839  0.144379  0.252546                  0.328371  0.327869
1498  0.903226  0.384615  0.441955                  0.448718  0.639344
1499  0.987097  0.298225  0.439919                  0.881720  0.803279

[1500 rows x 5 columns]
       Glucose   Insulin       BMI  DiabetesPedigreeFunction       Age
382   0.419355  0.214201  0.150713                  

In [60]:
# Checking dimensions
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("Y_train shape:", y_train.shape)
print("Y_test shape:", y_test.shape)

X_train shape: (1200, 5)
X_test shape: (300, 5)
Y_train shape: (1200,)
Y_test shape: (300,)


# **Handling Imbalanced Data**

In [61]:
# from imblearn.over_sampling import SMOTE

# smote = SMOTE(random_state=42)
# X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# **Model Training**

In [113]:
# Logistic Regression Algorithm
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state = 42, class_weight='balanced')
lr.fit(X_train, y_train)

Y_pred_lr = lr.predict(X_test)

print(X_train)
print(y_train)

       Glucose   Insulin       BMI  DiabetesPedigreeFunction       Age
382   0.419355  0.214201  0.150713                  0.359388  0.016393
538   0.535484  0.247337  0.372709                  0.300248  0.049180
1493  0.567742  0.272189  0.287169                  0.568652  0.459016
1112  0.767742  0.373964  0.521385                  0.597601  0.426230
324   0.438710  0.209404  0.360489                  0.028950  0.016393
...        ...       ...       ...                       ...       ...
1130  0.632258  0.147929  0.305499                  0.545492  0.704918
1294  0.580645  0.407101  0.279022                  0.478908  0.393443
860   0.387097  0.020118  0.112016                  0.214640  0.196721
1459  0.529032  0.267456  0.287169                  0.798594  0.622951
1126  0.587097  0.434320  0.468432                  0.875103  0.803279

[1200 rows x 5 columns]
382     0
538     0
1493    1
1112    1
324     0
       ..
1130    1
1294    1
860     0
1459    1
1126    1
Name: Outcome

In [112]:
# K-nearest neighbors Algorithm
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 24)
knn.fit(X_train, y_train)

Y_pred_knn = knn.predict(X_test)

In [74]:
# Support Vector Classifier Algorithm
from sklearn.svm import SVC
svc = SVC(kernel = 'linear')
svc.fit(X_train, y_train)

Y_pred_svc = svc.predict(X_test)

In [73]:
# Decision tree Algorithm
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(criterion = 'entropy')
dt.fit(X_train, y_train)

Y_pred_dt = dt.predict(X_test)

In [63]:
# Random forest Algorithm
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 11, criterion = 'entropy')
rf.fit(X_train, y_train)

Y_pred_rf = rf.predict(X_test)

print(X_train)
print(y_train)
print(X_test)

       Glucose   Insulin       BMI  DiabetesPedigreeFunction       Age
382   0.419355  0.214201  0.150713                  0.359388  0.016393
538   0.535484  0.247337  0.372709                  0.300248  0.049180
1493  0.567742  0.272189  0.287169                  0.568652  0.459016
1112  0.767742  0.373964  0.521385                  0.597601  0.426230
324   0.438710  0.209404  0.360489                  0.028950  0.016393
...        ...       ...       ...                       ...       ...
1130  0.632258  0.147929  0.305499                  0.545492  0.704918
1294  0.580645  0.407101  0.279022                  0.478908  0.393443
860   0.387097  0.020118  0.112016                  0.214640  0.196721
1459  0.529032  0.267456  0.287169                  0.798594  0.622951
1126  0.587097  0.434320  0.468432                  0.875103  0.803279

[1200 rows x 5 columns]
382     0
538     0
1493    1
1112    1
324     0
       ..
1130    1
1294    1
860     0
1459    1
1126    1
Name: Outcome

## **Prediction**

In [140]:
import pandas as pd

# Suppose these were the feature columns used for training:
feature_names = ['Glucose', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

# Input data (replace with actual values)
input_data = [[155,495,34.0,0.543,46]]

# Wrap as a DataFrame
input_df = pd.DataFrame(input_data, columns=selected_features)
input_df_scaled = pd.DataFrame(sc.transform(input_df), columns=selected_features)

print(input_df_scaled)

probs = lr.predict_proba(input_df_scaled)[:, 1]
print("Probability of being diabetic (class 1):", probs[0])

# Decide final prediction based on threshold (you can adjust this)
threshold = 0.5
prediction = int(probs[0] >= threshold)

# Now predict
# prediction = lr.predict(input_df)
print("Prediction:", prediction)


    Glucose   Insulin       BMI  DiabetesPedigreeFunction      Age
0  0.716129  0.584615  0.325866                  0.192308  0.42623
Probability of being diabetic (class 1): 0.8822697468406133
Prediction: 1


# **Model Evaluation**

In [141]:
# Evaluating using accuracy_score metrics
from sklearn.metrics import accuracy_score

accuracy_lr = accuracy_score(y_test, Y_pred_lr)
accuracy_knn = accuracy_score(y_test, Y_pred_knn)
accuracy_svc = accuracy_score(y_test, Y_pred_svc)
accuracy_dt = accuracy_score(y_test, Y_pred_dt)
accuracy_rf = accuracy_score(y_test, Y_pred_rf )

### **Accuracy**

In [142]:
print("Logistic Regression Accuracy: " , accuracy_lr * 100)
print("K Nearest neighbors Accuracy: ", accuracy_knn * 100)
print("Support Vector Classifier Accuracy: ", accuracy_svc * 100)
print("Decision tree Accuracy: ", accuracy_dt * 100)
print("Random Forest Accuracy: ", accuracy_rf * 100)

Logistic Regression Accuracy:  92.0
K Nearest neighbors Accuracy:  93.33333333333333
Support Vector Classifier Accuracy:  92.66666666666666
Decision tree Accuracy:  86.0
Random Forest Accuracy:  89.33333333333333


In [None]:
# Confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, Y_pred_lr)
print(cm)

[[81 18]
 [20 35]]


### **Calculate Precision, Recall and F1-Score**

In [None]:
# Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, Y_pred_lr))

              precision    recall  f1-score   support

           0       0.80      0.82      0.81        99
           1       0.66      0.64      0.65        55

    accuracy                           0.75       154
   macro avg       0.73      0.73      0.73       154
weighted avg       0.75      0.75      0.75       154



In [145]:
import joblib
joblib.dump(lr, 'diabetes_model_lr.pkl')

['diabetes_model_lr.pkl']

In [147]:
from google.colab import files
files.download('diabetes_model_lr.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>