In [79]:
# Import necessary libraries for machine learning models
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier

import joblib
import numpy as np


# Import necessary libraries
import pandas as pd
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

In [80]:
# Read the raw records into a DataFrame.
# 'data.csv' is resolved relative to the current working directory.
df = pd.read_csv(filepath_or_buffer='data.csv')


In [81]:
# Partition the freshly loaded rows by the binary 'diabetes' label.
df_diabetic = df.loc[df['diabetes'] == 1]  # rows labelled 1 (diabetic)
df_non_diabetic = df.loc[df['diabetes'] == 0]  # rows labelled 0 (non-diabetic)

# Tally the rows per label and report the size of each class.
diabetic_count = df['diabetes'].value_counts()
print(f"Diabetic Count: {diabetic_count.loc[1]}")  # label 1 = diabetic
print(f"Non-Diabetic Count: {diabetic_count.loc[0]}")  # label 0 = non-diabetic


Diabetic Count: 8500
Non-Diabetic Count: 91500


In [82]:
# Draw as many non-diabetic rows as there are diabetic rows (seeded for repeatability).
df_non_diabetic_undersampled = df_non_diabetic.sample(n=len(df_diabetic), random_state=42)

# Stack both classes, shuffle them together, then discard the 'smoking_history' column.
df_balanced = shuffle(
    pd.concat([df_diabetic, df_non_diabetic_undersampled]),
    random_state=42,
).drop(columns=['smoking_history'])

# Confirm that both classes now contain the same number of rows.
balanced_count = df_balanced['diabetes'].value_counts()
print(f"Balanced Diabetic Count: {balanced_count.loc[1]}")
print(f"Balanced Non-Diabetic Count: {balanced_count.loc[0]}")

# Preview the first rows of the balanced frame.
print(df_balanced.head())

Balanced Diabetic Count: 8500
Balanced Non-Diabetic Count: 8500
       gender   age  hypertension  heart_disease    bmi  HbA1c_level  \
17328  Female  27.0             0              0  23.91          5.0   
60483  Female  67.0             0              0  29.93          6.2   
46998  Female  11.0             0              0  18.46          6.6   
26328    Male  41.0             0              0  37.10          8.8   
74525  Female  53.0             0              0  44.90          6.8   

       blood_glucose_level  diabetes  
17328                  160         0  
60483                  159         1  
46998                   80         0  
26328                  220         1  
74525                  300         1  


In [83]:
# Work on an independent copy: the following cells assign imputed, encoded and
# scaled columns back into `df`, and a plain alias would silently rewrite
# `df_balanced` as well, making this cell non-repeatable.
df = df_balanced.copy()
print(df.shape)

# Check the class balance of the frame that will be preprocessed
balanced_count = df['diabetes'].value_counts()
print(f"Balanced Diabetic Count: {balanced_count[1]}")
print(f"Balanced Non-Diabetic Count: {balanced_count[0]}")

(17000, 8)
Balanced Diabetic Count: 8500
Balanced Non-Diabetic Count: 8500


In [84]:
# 1. Handling missing values (missing entries are expected to be NaN)

# Numerical columns: replace each missing entry with that column's mean.
# NOTE(review): the means are learned from every row, i.e. before the
# train/test split performed further down, so test rows influence them.
num_cols = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']
imputer = SimpleImputer(strategy='mean').fit(df[num_cols])
df[num_cols] = imputer.transform(df[num_cols])

In [85]:
# Categorical columns: replace each missing entry with the column's most frequent value.
cat_cols = ['gender', 'hypertension', 'heart_disease']
imputer_cat = SimpleImputer(strategy='most_frequent')

# Imputing a string column together with integer columns yields one object-typed
# array, so assigning it back directly would turn 'hypertension' and
# 'heart_disease' into object columns. Rebuild a frame on the same index and let
# pandas re-infer the per-column dtypes before writing the values back.
imputed_cat = pd.DataFrame(
    imputer_cat.fit_transform(df[cat_cols]),
    columns=cat_cols,
    index=df.index,
)
df[cat_cols] = imputed_cat.infer_objects()

In [86]:
# 2. Encode the 'gender' strings as integer codes.
# LabelEncoder numbers the classes in sorted order, so 'Female' receives a
# lower code than 'Male'.
# NOTE(review): any other gender value present in the data also gets its own
# code — check label_enc.classes_ to confirm the mapping.
label_enc = LabelEncoder().fit(df['gender'])
df['gender'] = label_enc.transform(df['gender'])

print(df.columns)

Index(['gender', 'age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level',
       'blood_glucose_level', 'diabetes'],
      dtype='object')


In [87]:
# 3. Standardise the numeric features listed in num_cols
# ('age', 'bmi', 'HbA1c_level', 'blood_glucose_level').
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split performed further down, so test rows influence its statistics.
scaler = StandardScaler().fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])

df.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level',
       'blood_glucose_level', 'diabetes'],
      dtype='object')

In [88]:
# 4. Separate the label from the predictors
y = df['diabetes']  # target column
X = df.drop(columns=['diabetes'])  # every remaining column is a feature

In [89]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# evaluate_model below reads these four names directly.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [90]:
# Function to train, evaluate and display the results of models
def evaluate_model(model, model_name):
    """Fit ``model`` on the training split and print its test-set metrics.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    The estimator is fitted in place; nothing is returned.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
    model_name : str
        Label used as the heading of the printed report.
    """
    # Fit on the training rows, then predict the held-out rows.
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Compute every metric before printing anything.
    score = accuracy_score(y_test, predictions)
    matrix = confusion_matrix(y_test, predictions)
    summary = classification_report(y_test, predictions)

    # Report, followed by a separator line between models.
    separator = "=" * 60
    print(f"{model_name} Results:")
    print(f"Accuracy: {score * 100:.2f}%")
    print("Confusion Matrix:\n", matrix)
    print("Classification Report:\n", summary)
    print("\n" + separator + "\n")


In [91]:
# 1. Logistic Regression — allow up to 1000 solver iterations
log_reg = LogisticRegression(max_iter=1000)
evaluate_model(model=log_reg, model_name="Logistic Regression")


Logistic Regression Results:
Accuracy: 88.15%
Confusion Matrix:
 [[1530  172]
 [ 231 1467]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.90      0.88      1702
           1       0.90      0.86      0.88      1698

    accuracy                           0.88      3400
   macro avg       0.88      0.88      0.88      3400
weighted avg       0.88      0.88      0.88      3400





In [92]:
# 2. Support Vector Machine (SVM)
# Linear kernel for simplicity. probability=True enables predict_proba (needed
# for soft voting) but makes fitting stochastic, so the seed is pinned to the
# same value used for sampling and splitting to keep results reproducible.
svm_model = SVC(kernel='linear', probability=True, random_state=42)
evaluate_model(svm_model, "Support Vector Machine (SVM)")

Support Vector Machine (SVM) Results:
Accuracy: 88.47%
Confusion Matrix:
 [[1523  179]
 [ 213 1485]]
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.89      0.89      1702
           1       0.89      0.87      0.88      1698

    accuracy                           0.88      3400
   macro avg       0.88      0.88      0.88      3400
weighted avg       0.88      0.88      0.88      3400





In [93]:

# 3. k-Nearest Neighbors (k-NN) — votes among the 5 closest training rows; n_neighbors is tunable
knn_model = KNeighborsClassifier(n_neighbors=5)
evaluate_model(model=knn_model, model_name="k-Nearest Neighbors (k-NN)")

k-Nearest Neighbors (k-NN) Results:
Accuracy: 88.24%
Confusion Matrix:
 [[1507  195]
 [ 205 1493]]
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.89      0.88      1702
           1       0.88      0.88      0.88      1698

    accuracy                           0.88      3400
   macro avg       0.88      0.88      0.88      3400
weighted avg       0.88      0.88      0.88      3400





In [94]:
# 4. Gaussian Naive Bayes with its default settings
nb_model = GaussianNB()
evaluate_model(model=nb_model, model_name="Naive Bayes (GaussianNB)")


Naive Bayes (GaussianNB) Results:
Accuracy: 83.35%
Confusion Matrix:
 [[1542  160]
 [ 406 1292]]
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.91      0.84      1702
           1       0.89      0.76      0.82      1698

    accuracy                           0.83      3400
   macro avg       0.84      0.83      0.83      3400
weighted avg       0.84      0.83      0.83      3400





In [95]:
# 5. Multi-layer Perceptron (MLP - Neural Network)
# One hidden layer of 100 units (tunable). Weight initialisation and batch
# shuffling are random, so the seed is pinned to the same value used for
# sampling and splitting; otherwise the reported accuracy changes on every run.
mlp_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
evaluate_model(mlp_model, "Multi-Layer Perceptron (MLP)")

Multi-Layer Perceptron (MLP) Results:
Accuracy: 91.12%
Confusion Matrix:
 [[1530  172]
 [ 130 1568]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.90      0.91      1702
           1       0.90      0.92      0.91      1698

    accuracy                           0.91      3400
   macro avg       0.91      0.91      0.91      3400
weighted avg       0.91      0.91      0.91      3400





In [96]:

# Ensemble of the five classifiers defined above.
# voting='soft' averages the members' predicted probabilities
# ('hard' would take a majority vote over predicted labels instead).
voting_clf = VotingClassifier(
    estimators=list(zip(
        ('log_reg', 'svm', 'knn', 'nb', 'mlp'),
        (log_reg, svm_model, knn_model, nb_model, mlp_model),
    )),
    voting='soft',
)

# Fit the ensemble on the training split and print its test-set metrics.
evaluate_model(model=voting_clf, model_name="Ensemble Model (Voting Classifier)")

Ensemble Model (Voting Classifier) Results:
Accuracy: 89.21%
Confusion Matrix:
 [[1564  138]
 [ 229 1469]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.92      0.89      1702
           1       0.91      0.87      0.89      1698

    accuracy                           0.89      3400
   macro avg       0.89      0.89      0.89      3400
weighted avg       0.89      0.89      0.89      3400





In [97]:
def predict_manual_input(model, input_data, fitted_scaler=None, scaled_positions=(1, 4, 5, 6)):
    """Scale one hand-written record and print the model's prediction for it.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` (and optionally ``predict_proba``)
    input_data : sequence of numbers
        One record of already-encoded feature values, in training column order.
    fitted_scaler : fitted transformer, optional
        Scaler applied to the numeric positions. Defaults to the notebook-level
        ``scaler``.
    scaled_positions : sequence of int, optional
        Positions in ``input_data`` that the scaler was fitted on. The default
        matches age, bmi, HbA1c_level and blood_glucose_level in the training
        column order.

    Raises
    ------
    ValueError
        If ``model`` exposes ``n_features_in_`` and the record has a different
        number of values.
    """
    if fitted_scaler is None:
        fitted_scaler = scaler  # notebook-level scaler fitted during preprocessing

    # Force a float array: an all-integer record would otherwise produce an
    # integer array and the scaled values would be truncated on assignment.
    features = np.asarray(input_data, dtype=float).reshape(1, -1)
    positions = list(scaled_positions)

    # Fail early with a readable message when the record has the wrong length.
    expected_width = getattr(model, "n_features_in_", None)
    if expected_width is not None and features.shape[1] != expected_width:
        raise ValueError(
            f"Expected {expected_width} feature values, got {features.shape[1]}"
        )

    # Scale the numeric positions. When the scaler was fitted on named columns,
    # pass the same names so it receives the input layout it was fitted on.
    to_scale = features[:, positions]
    scaler_columns = getattr(fitted_scaler, "feature_names_in_", None)
    if scaler_columns is not None and len(scaler_columns) == len(positions):
        to_scale = pd.DataFrame(to_scale, columns=list(scaler_columns))
    features[:, positions] = np.asarray(fitted_scaler.transform(to_scale))

    # Likewise hand the model a named frame when it was fitted on one.
    model_columns = getattr(model, "feature_names_in_", None)
    if model_columns is not None and len(model_columns) == features.shape[1]:
        model_input = pd.DataFrame(features, columns=list(model_columns))
    else:
        model_input = features

    # Predict the class and, when the model supports it, the class probabilities
    prediction = model.predict(model_input)
    pred_proba = model.predict_proba(model_input) if hasattr(model, "predict_proba") else None

    # Print the result
    print(f"Predicted Class: {'Diabetic' if prediction[0] == 1 else 'Non-Diabetic'}")
    if pred_proba is not None:
        print(f"Predicted Probabilities: {pred_proba}")


In [98]:
# Try the ensemble on one hand-written record.
# Feature order (smoking_history was dropped before training, so it is omitted):
# [gender, age, hypertension, heart_disease, bmi, HbA1c_level, blood_glucose_level]
# Source row: Male,67.0,0,1,not current,27.32,6.5,200,1
manual_input = [1, 67.0, 0, 1, 27.32, 6.5, 200]

predict_manual_input(model=voting_clf, input_data=manual_input)

Predicted Class: Diabetic
Predicted Probabilities: [[0.13104315 0.86895685]]




In [103]:

# Second hand-written record, in the same seven-feature order (no smoking_history):
# [gender, age, hypertension, heart_disease, bmi, HbA1c_level, blood_glucose_level]
manual_input = [0, 42.0, 0, 0, 27.32, 5.7, 80]
predict_manual_input(model=voting_clf, input_data=manual_input)

Predicted Class: Non-Diabetic
Predicted Probabilities: [[0.99304777 0.00695223]]




In [100]:
# Persist the fitted ensemble, and the fitted scaler alongside it so that new
# inputs can be preprocessed the same way.
# NOTE(review): the gender LabelEncoder and the imputers are not written out in
# this cell — confirm whether inference code needs them.
joblib.dump(value=voting_clf, filename='diabetes_voting_model.pkl')
joblib.dump(value=scaler, filename='scaler.pkl')

['scaler.pkl']