- https://www.sciencedirect.com/science/article/pii/S1877050920300557 The paper presents a diabetes prediction model using machine learning to improve diagnostic accuracy. It implements algorithms like Logistic Regression, and others, with data preprocessing, clustering, and model evaluation.
The dataset consists of 800 records with 10 attributes, including features like Number of Pregnancies, Glucose Level, Blood Pressure, BMI, Age, and Job Type.

- https://www.sciencedirect.com/science/article/pii/S2405959521000205 

  ## Approach
  Dataset taken from kaggle https://www.kaggle.com/datasets/saurabh00007/diabetescsv

Handles missing values, especially for attributes that cannot be zero (e.g., Glucose Level, Blood Pressure). The dataset is scaled to normalize values for better model performance.

In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load the data
file_path = 'diabetes.csv'
data = pd.read_csv(file_path)

# Replace zeros with NaN and fill with median values
columns_with_missing_values = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for col in columns_with_missing_values:
    data[col] = data[col].replace(0, np.nan)  # Replace 0 with NaN
    data[col] = data[col].fillna(data[col].median())  # Fill NaN with median value

# Scale the features
scaler = StandardScaler()
scaled_columns = data.columns.drop('Outcome') 
data[scaled_columns] = scaler.fit_transform(data[scaled_columns])

# Save the cleaned and scaled data to a new CSV file
cleaned_file_path = 'cleaned_diabetes_data.csv'
data.to_csv(cleaned_file_path, index=False)

print("Data preprocessing completed. Cleaned data saved to", cleaned_file_path)


Data preprocessing completed. Cleaned data saved to cleaned_diabetes_data.csv


### Using a LogisticRegression

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Load the cleaned dataset
file_path = 'cleaned_diabetes_data.csv'
data = pd.read_csv(file_path)

# Define features (X) and target variable (y)
X = data.drop('Outcome', axis=1)  # Features (drop the 'Outcome' column)
y = data['Outcome']  # Target variable

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Generate the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the confusion matrix and the evaluation metrics
print("Confusion Matrix:")
print(conf_matrix)

print(f"\nAccuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")


Confusion Matrix:
[[82 17]
 [21 34]]

Accuracy: 75.32%
Precision: 66.67%
Recall: 61.82%
F1 Score: 64.15%


### Decision Tree

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Load the cleaned dataset
file_path = 'cleaned_diabetes_data.csv'
data = pd.read_csv(file_path)

# Ensure no missing values (security measure for cleaner data)
if data.isnull().values.any():
    data = data.dropna()

# Define features (X) and target variable (y)
X = data.drop('Outcome', axis=1)  # Features (drop the 'Outcome' column)
y = data['Outcome']  # Target variable

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize and train the Decision Tree model
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Generate the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the confusion matrix and the evaluation metrics
print("Confusion Matrix:")
print(conf_matrix)

print(f"\nAccuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")


Confusion Matrix:
[[79 21]
 [28 26]]

Accuracy: 68.18%
Precision: 55.32%
Recall: 48.15%
F1 Score: 51.49%


### Random Forest

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Load the cleaned dataset
file_path = 'cleaned_diabetes_data.csv'
data = pd.read_csv(file_path)

# Ensure no missing values (security measure for cleaner data)
if data.isnull().values.any():
    data = data.dropna()

# Define features (X) and target variable (y)
X = data.drop('Outcome', axis=1)  # Features (drop the 'Outcome' column)
y = data['Outcome']  # Target variable

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize and train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Generate the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the confusion matrix and the evaluation metrics
print("Confusion Matrix:")
print(conf_matrix)

print(f"\nAccuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")


Confusion Matrix:
[[88 12]
 [22 32]]

Accuracy: 77.92%
Precision: 72.73%
Recall: 59.26%
F1 Score: 65.31%


### SVM

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Load the cleaned dataset
file_path = 'cleaned_diabetes_data.csv'
data = pd.read_csv(file_path)

# Define features (X) and target variable (y)
X = data.drop('Outcome', axis=1)  # Features (drop the 'Outcome' column)
y = data['Outcome']  # Target variable

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create and train the SVM model
svm_model = SVC(kernel='linear')  # Using a linear kernel
svm_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the confusion matrix and the evaluation metrics
print("Confusion Matrix:\n", conf_matrix)
print(f"\nAccuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")


Confusion Matrix:
 [[82 18]
 [28 26]]

Accuracy: 70.13%
Precision: 59.09%
Recall: 48.15%
F1 Score: 53.06%


### KNN

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Load the cleaned dataset
file_path = 'cleaned_diabetes_data.csv'
data = pd.read_csv(file_path)

# Define features (X) and target variable (y)
X = data.drop('Outcome', axis=1)  # Features (drop the 'Outcome' column)
y = data['Outcome']  # Target variable

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create and train the KNN model
knn_model = KNeighborsClassifier(n_neighbors=5)  # Using 5 neighbors (you can adjust this)
knn_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the confusion matrix and the evaluation metrics
print("Confusion Matrix:\n", conf_matrix)
print(f"\nAccuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")


Confusion Matrix:
 [[83 17]
 [21 33]]

Accuracy: 75.32%
Precision: 66.00%
Recall: 61.11%
F1 Score: 63.46%


### Naive Bayes

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Load the cleaned dataset
file_path = 'cleaned_diabetes_data.csv'
data = pd.read_csv(file_path)

# Define features (X) and target variable (y)
X = data.drop('Outcome', axis=1)  # Features (drop the 'Outcome' column)
y = data['Outcome']  # Target variable

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create and train the Naive Bayes model
nb_model = GaussianNB()  # Using Gaussian Naive Bayes
nb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = nb_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the confusion matrix and the evaluation metrics
print("Confusion Matrix:\n", conf_matrix)
print(f"\nAccuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")


Confusion Matrix:
 [[74 26]
 [20 34]]

Accuracy: 70.13%
Precision: 56.67%
Recall: 62.96%
F1 Score: 59.65%


### Gradient Boosting

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Load the cleaned dataset
file_path = 'cleaned_diabetes_data.csv'
data = pd.read_csv(file_path)

# Define features (X) and target variable (y)
X = data.drop('Outcome', axis=1)  # Features (drop the 'Outcome' column)
y = data['Outcome']  # Target variable

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create and train the Gradient Boosting model
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = gb_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the confusion matrix and the evaluation metrics
print("Confusion Matrix:\n", conf_matrix)
print(f"\nAccuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")


Confusion Matrix:
 [[86 14]
 [23 31]]

Accuracy: 75.97%
Precision: 68.89%
Recall: 57.41%
F1 Score: 62.63%


In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load the cleaned dataset
file_path = 'cleaned_diabetes_data.csv'
data = pd.read_csv(file_path)

# Define features (X) and target variable (y)
X = data.drop('Outcome', axis=1)  # Features (drop the 'Outcome' column)
y = data['Outcome']  # Target variable

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create the model
model = Sequential()
model.add(Dense(10, input_dim=X_train.shape[1], activation='relu'))  # First layer with 10 neurons
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=200, batch_size=10, verbose=0)

# Evaluate the model
y_pred = (model.predict(X_test) > 0.5).astype("int32")  # Convert probabilities to binary predictions

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the confusion matrix and the evaluation metrics
print("Confusion Matrix:\n", conf_matrix)
print(f"\nAccuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Confusion Matrix:
 [[84 16]
 [25 29]]

Accuracy: 73.38%
Precision: 64.44%
Recall: 53.70%
F1 Score: 58.59%


### LSTM

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Load the cleaned dataset
file_path = 'cleaned_diabetes_data.csv'
data = pd.read_csv(file_path)

# Define features (X) and target variable (y)
X = data.drop('Outcome', axis=1).values  # Features (convert to numpy array)
y = data['Outcome'].values  # Target variable (convert to numpy array)

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Reshape input to be [samples, time steps, features]
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))  # Reshaping for LSTM
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))      # Reshaping for LSTM

# Create the LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))  # LSTM layer
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=200, batch_size=10, verbose=0)

# Evaluate the model
y_pred = (model.predict(X_test) > 0.5).astype("int32")  # Convert probabilities to binary predictions

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the confusion matrix and the evaluation metrics
print("Confusion Matrix:\n", conf_matrix)
print(f"\nAccuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")


  super().__init__(**kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Confusion Matrix:
 [[79 21]
 [24 30]]

Accuracy: 70.78%
Precision: 58.82%
Recall: 55.56%
F1 Score: 57.14%


### BILSTM

In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Bidirectional

# Load the cleaned dataset
file_path = 'cleaned_diabetes_data.csv'
data = pd.read_csv(file_path)

# Define features (X) and target variable (y)
X = data.drop('Outcome', axis=1).values  # Features (convert to numpy array)
y = data['Outcome'].values  # Target variable (convert to numpy array)

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Reshape input to be [samples, time steps, features]
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))  # Reshaping for LSTM
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))      # Reshaping for LSTM

# Create the BiLSTM model
model = Sequential()
model.add(Bidirectional(LSTM(50, activation='relu'), input_shape=(X_train.shape[1], X_train.shape[2])))  # Bidirectional LSTM layer
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=200, batch_size=10, verbose=0)

# Evaluate the model
y_pred = (model.predict(X_test) > 0.5).astype("int32")  # Convert probabilities to binary predictions

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the confusion matrix and the evaluation metrics
print("Confusion Matrix:\n", conf_matrix)
print(f"\nAccuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")


  super().__init__(**kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Confusion Matrix:
 [[79 21]
 [26 28]]

Accuracy: 69.48%
Precision: 57.14%
Recall: 51.85%
F1 Score: 54.37%


### Voting Classifier

In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Load the cleaned dataset
file_path = 'cleaned_diabetes_data.csv'
data = pd.read_csv(file_path)

# Define features (X) and target variable (y)
X = data.drop('Outcome', axis=1)  # Features
y = data['Outcome']  # Target variable

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create individual classifiers
log_clf = LogisticRegression()
tree_clf = DecisionTreeClassifier()
forest_clf = RandomForestClassifier()

# Create the Voting Classifier
voting_clf = VotingClassifier(
    estimators=[
        ('logistic', log_clf),
        ('decision_tree', tree_clf),
        ('random_forest', forest_clf)
    ],
    voting='hard'  # Use 'soft' for probability-based voting
)

# Train the Voting Classifier
voting_clf.fit(X_train, y_train)

# Evaluate the model
y_pred = voting_clf.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the confusion matrix and the evaluation metrics
print("Confusion Matrix:\n", conf_matrix)
print(f"\nAccuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")


Confusion Matrix:
 [[83 17]
 [25 29]]

Accuracy: 72.73%
Precision: 63.04%
Recall: 53.70%
F1 Score: 58.00%
