1. Bayesian Decision Tree

In [None]:
'''
Using Git Bash:
Run the following commands:
    git clone https://github.com/SubramanyaJ/diabetes_detection.git
    cd diabetes_detection/src/testing
    python3 bayesianDecisionTree.py
    (pandas, numpy, scikit-learn, matplotlib and seaborn need to be installed)

Using Google Colab:
Copy this code into a Python notebook.
Upload data.csv OR main.csv, which is in this directory, to the runtime
(use the file icon in the left panel).
Run the notebook.
'''

# Accuracy : 0.8729576525508503 

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset (expects data.csv in the current working directory)
data = pd.read_csv('data.csv')

# Separate the features and target variable
X = data.drop(columns=['Target'])
y = data['Target']

# Split the data into training and testing sets (33.3% held out, fixed seed
# so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.333, random_state=42)

# Standardize the data. The scaler is fitted on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create individual classifiers
naive_bayes = GaussianNB()
decision_tree = DecisionTreeClassifier(random_state=40)

# Set up parameter grids for GridSearchCV.
# Each list holds the candidate values to try; most are pinned to a single
# value, so the search is effectively a 10-fold evaluation of one setup.
param_grid_nb = {
    'var_smoothing': [1e-8]  # Single fixed value
}

param_grid_dt = {
    'criterion': ['entropy'],
    'splitter': ['best'],
    'max_depth': [9],
    'min_samples_split': [20],
    'min_samples_leaf': [10],
    # Only values accepted by DecisionTreeClassifier belong here
    # (None, "sqrt", "log2", an int or a float). The former extra entry
    # "best split" is not one of them: every CV fit using it failed and was
    # scored NaN, so removing it keeps the selected model the same while
    # avoiding the failed fits and their warnings.
    'max_features': [None]
}

# Create the voting classifier
voting_clf = VotingClassifier(
    estimators=[('naive_bayes', naive_bayes), ('decision_tree', decision_tree)],
    voting='soft',  # Use 'soft' for probability-based voting
    weights=[1, 5]  # The decision tree's probabilities count 5x the NB ones
)

# Set up the parameter grid for the voting classifier.
# Keys use the "<estimator name>__<parameter>" form so GridSearchCV can route
# each value to the matching sub-estimator.
param_grid_voting = {
    'naive_bayes__var_smoothing': param_grid_nb['var_smoothing'],
    'decision_tree__criterion': param_grid_dt['criterion'],
    'decision_tree__splitter': param_grid_dt['splitter'],
    'decision_tree__max_depth': param_grid_dt['max_depth'],
    'decision_tree__min_samples_split': param_grid_dt['min_samples_split'],
    'decision_tree__min_samples_leaf': param_grid_dt['min_samples_leaf'],
    'decision_tree__max_features': param_grid_dt['max_features']
}

# Set up GridSearchCV for the voting classifier (10-fold CV, accuracy score)
grid_search = GridSearchCV(estimator=voting_clf, param_grid=param_grid_voting, cv=10, scoring='accuracy')

# Train the model using GridSearchCV
grid_search.fit(X_train_scaled, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions with the best model
y_pred = best_model.predict(X_test_scaled)

# Calculate relevant statistics on the held-out test split
accuracy = np.mean(y_pred == y_test)
class_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print statistics
print(f"\nBest Hyperparameters: {grid_search.best_params_}")
print(f"\nAccuracy: {accuracy}")
print("\nClassification Report:")
print(class_report)

# Plot confusion matrix (rows: actual labels, columns: predicted labels)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()



2. Basic Neural Network

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

# Read the dataset from the working directory
data = pd.read_csv('data.csv')

# Labels come from the 'Target' column, shifted down by one so that they
# start at 0 as sparse categorical cross-entropy expects
# (assumes the raw labels are 1..5 -- TODO confirm against data.csv).
y = data['Target'] - 1

# Everything except the 'Target' column is used as a feature
X = data.drop('Target', axis=1)

# Keep 30% of the rows aside for testing; the seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build Neural Network Model
def build_nn_model(hidden_layers=(64, 32), learning_rate=0.0008, dropout_rate=0.0,
                   input_dim=None, num_classes=5):
    """Build and compile a fully connected softmax classifier.

    Args:
        hidden_layers: Sequence with the number of units of each hidden
            layer, in order. Must contain at least one entry.
        learning_rate: Learning rate passed to the Adam optimizer.
        dropout_rate: When greater than 0, a Dropout layer with this rate is
            added after every hidden layer except the first one.
        input_dim: Number of input features. When None, it is taken from the
            module-level ``X_train_scaled`` array (the original behavior).
        num_classes: Width of the softmax output layer.

    Returns:
        A compiled Keras ``Sequential`` model using sparse categorical
        cross-entropy loss and tracking accuracy.

    Raises:
        ValueError: If ``hidden_layers`` is empty.
    """
    if len(hidden_layers) == 0:
        raise ValueError("hidden_layers must contain at least one layer size")

    if input_dim is None:
        # Fall back to the training data prepared earlier in this script.
        input_dim = X_train_scaled.shape[1]

    model = Sequential()
    model.add(Dense(hidden_layers[0], input_shape=(input_dim,), activation='relu'))

    for units in hidden_layers[1:]:
        model.add(Dense(units, activation='relu'))
        if dropout_rate > 0:  # Add dropout if specified
            model.add(Dropout(dropout_rate))

    model.add(Dense(num_classes, activation='softmax'))

    optimizer = Adam(learning_rate=learning_rate)
    # The sparse loss expects integer class ids, not one-hot vectors.
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

# Local import keeps this cell self-contained; product() replaces the five
# nested loops while visiting the combinations in exactly the same order.
from itertools import product

# Define hyperparameter grid
hidden_layers_options = [[128, 64]]
learning_rates = [0.0008]
batch_sizes = [4, 8]
dropout_rates = [0.0]
epochs = [20, 22]

# Initialize variables to track the best model
best_model = None
best_accuracy = 0
best_params = {}
best_history = None  # Training history that belongs to best_model

# NOTE(review): the winning combination is chosen by test-set accuracy, so
# the reported "Best Test Accuracy" is an optimistic estimate. Selecting on
# a validation split would give an unbiased test figure.

# Loop through hyperparameter combinations
for hidden_layers, learning_rate, batch_size, dropout_rate, epoch in product(
        hidden_layers_options, learning_rates, batch_sizes, dropout_rates, epochs):
    # Print the current hyperparameter combination
    print(f"\nTraining model with hidden_layers={hidden_layers}, learning_rate={learning_rate}, "
          f"batch_size={batch_size}, dropout_rate={dropout_rate}, epochs={epoch}")

    # Build the neural network model with the current hyperparameters
    model = build_nn_model(hidden_layers=hidden_layers, learning_rate=learning_rate, dropout_rate=dropout_rate)

    # Train the model and capture the training history
    # (10% of the training rows are held back by Keras for validation)
    history = model.fit(X_train_scaled, y_train, validation_split=0.1,
                        epochs=epoch, batch_size=batch_size, verbose=0)

    # Evaluate the model on the test set
    test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test, verbose=0)

    # Print the test accuracy for the current model
    print(f"Test Accuracy: {test_accuracy:.4f}")

    # Check if this model's accuracy is better than the best found so far.
    # The first model is always accepted so best_model is never left as None.
    if best_model is None or test_accuracy > best_accuracy:
        best_accuracy = test_accuracy
        best_model = model
        best_history = history  # Keep the curves of the winning run
        best_params = {
            'hidden_layers': hidden_layers,
            'learning_rate': learning_rate,
            'batch_size': batch_size,
            'dropout_rate': dropout_rate,
            'epochs': epoch
        }
        print(f"New best model found with accuracy: {best_accuracy:.4f}")

# Print the best parameters and accuracy
print(f"\nBest Parameters: {best_params}")
print(f"Best Test Accuracy: {best_accuracy:.4f}")

# Make predictions with the best model
y_pred = best_model.predict(X_test_scaled)
# Each row holds one probability per class; pick the most likely class id
y_pred_classes = np.argmax(y_pred, axis=1)

# Calculate relevant statistics
class_report = classification_report(y_test, y_pred_classes)
conf_matrix = confusion_matrix(y_test, y_pred_classes)

# Print classification report
print("\nClassification Report:")
print(class_report)

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# Plot training and validation accuracy over epochs for the best model.
# best_history is used instead of history: after the loop, history refers to
# the last model trained, which is not necessarily the best one.
plt.figure(figsize=(10, 6))
plt.plot(best_history.history['accuracy'], label='Training Accuracy')
plt.plot(best_history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(loc='upper left')
plt.show()

# Plot training and validation loss over epochs for the best model
plt.figure(figsize=(10, 6))
plt.plot(best_history.history['loss'], label='Training Loss')
plt.plot(best_history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(loc='upper left')
plt.show()

3. Random Forest (Combination of Decision Trees)

In [None]:
# Accuracy : 0.8720684672668667

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Read the dataset from the working directory
data = pd.read_csv('data.csv')

# The 'Target' column holds the labels; every other column is a feature
y = data['Target']
X = data.drop('Target', axis=1)

# Keep 33.3% of the rows aside for testing; the seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.333,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base learners that will be combined by the ensemble
naive_bayes = GaussianNB()
random_forest = RandomForestClassifier(random_state=40)

# Candidate values per base learner; only min_samples_split has more than one
# option, so the search compares two configurations
param_grid_nb = dict(var_smoothing=[1e-8])

param_grid_rf = dict(
    n_estimators=[50],
    criterion=['entropy'],
    max_depth=[10],
    min_samples_split=[2, 5],
    min_samples_leaf=[1],
    max_features=[None],
)

# Soft-voting ensemble: class probabilities are averaged, with the random
# forest weighted five times as much as naive Bayes
voting_clf = VotingClassifier(
    estimators=[('naive_bayes', naive_bayes), ('random_forest', random_forest)],
    voting='soft',
    weights=[1, 5],
)

# Prefix every parameter with "<estimator name>__" so GridSearchCV can route
# the values to the right member of the ensemble
param_grid_voting = {
    **{f'naive_bayes__{name}': values for name, values in param_grid_nb.items()},
    **{f'random_forest__{name}': values for name, values in param_grid_rf.items()},
}

# 5-fold cross-validated search over the ensemble, scored by accuracy
grid_search = GridSearchCV(voting_clf, param_grid_voting, scoring='accuracy', cv=5)
grid_search.fit(X_train_scaled, y_train)

# Best configuration found by the search
best_model = grid_search.best_estimator_

# Score the chosen model on the held-out test rows
y_pred = best_model.predict(X_test_scaled)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
accuracy = np.mean(y_pred == y_test)

# Report the results
print(f"\nBest Hyperparameters: {grid_search.best_params_}")
print(f"\nAccuracy: {accuracy}")
print("\nClassification Report:", class_report, sep="\n")

# Draw the confusion matrix (rows: actual labels, columns: predicted labels)
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
heatmap_ax.set_title('Confusion Matrix')
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_xlabel('Predicted')
plt.show()

