In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Load the dataset
diabetes_dataset = pd.read_csv("diabetesdataset.csv")

# Handling Missing Values
# In these measurement columns a stored 0 is treated as "missing": it is
# turned into NaN and then filled with the column median.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
diabetes_dataset[zero_as_missing] = diabetes_dataset[zero_as_missing].replace(0, np.nan)

# numeric_only=True keeps the imputation working even if the CSV carries a
# non-numeric column.
# NOTE(review): the medians are still computed over all rows (train + test).
diabetes_dataset.fillna(diabetes_dataset.median(numeric_only=True), inplace=True)

# Removing duplicate values
diabetes_dataset.drop_duplicates(inplace=True)

# Feature Selection
selected_features = ['Glucose', 'BloodPressure', 'Insulin', 'BMI', 'Age', 'DiabetesPedigreeFunction']
X = diabetes_dataset[selected_features]
Y = diabetes_dataset["Outcome"]

# Splitting data into train and test sets
# The split happens BEFORE scaling so that the scaler never sees test rows.
# The selected rows are the same as when splitting the scaled matrix, because
# the shuffle depends only on the row count and random_state.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Data Normalization
# Fit the mean/std on the training rows only, then apply that same
# transformation to the test rows (avoids test-set leakage).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Keep X as a scaled array, as before, for any later code that reads it.
X = scaler.transform(X)

# Define models
# One shared seed for the stochastic estimators so the accuracy table below is
# reproducible between runs; 100 matches the seed the hybrid models in this
# script already use. KNN takes no seed, and LogisticRegression is left on its
# defaults.
BASE_MODEL_SEED = 100
models = {
    "KNN": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=BASE_MODEL_SEED),
    "Random Forest": RandomForestClassifier(random_state=BASE_MODEL_SEED),
    "SVM": SVC(probability=True, random_state=BASE_MODEL_SEED),
}

# Train and evaluate base models
# Each model is fitted on the training split and scored (plain accuracy) on
# both the training and the test split; one row per model is collected.
results = {"Model": [], "Training Accuracy": [], "Testing Accuracy": []}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    train_accuracy = accuracy_score(y_train, model.predict(X_train))
    test_accuracy = accuracy_score(y_test, model.predict(X_test))

    results["Model"].append(model_name)
    results["Training Accuracy"].append(train_accuracy)
    results["Testing Accuracy"].append(test_accuracy)

# Convert results to DataFrame
results_df = pd.DataFrame(results)
print(results_df)

# Hybrid Model - Random Forest + Naive Bayes
# Stacking: the Random Forest's class probabilities become the input features
# of a Gaussian Naive Bayes meta-classifier.
rf = RandomForestClassifier(random_state=100)

# Out-of-fold probabilities for the training rows: every row is scored by a
# forest that was fitted without it. Feeding the meta-classifier in-sample
# probabilities instead would hand it near-perfect features (the forest has
# memorised those rows) that it will not get at test time.
train_rf_pred = cross_val_predict(rf, X_train, y_train, cv=5, method="predict_proba")

# The test rows are scored by a forest fitted on the whole training split.
rf.fit(X_train, y_train)
test_rf_pred = rf.predict_proba(X_test)

# One column per class, named after the fitted class labels
# ('Prob_0', 'Prob_1' for 0/1 outcomes).
rf_prob_columns = [f"Prob_{label}" for label in rf.classes_]
X_train_hybrid_rf_nb = pd.DataFrame(train_rf_pred, columns=rf_prob_columns)
X_test_hybrid_rf_nb = pd.DataFrame(test_rf_pred, columns=rf_prob_columns)

nb = GaussianNB()
nb.fit(X_train_hybrid_rf_nb, y_train)

y_train_pred_rf_nb = nb.predict(X_train_hybrid_rf_nb)
y_test_pred_rf_nb = nb.predict(X_test_hybrid_rf_nb)

# Training accuracy is measured on the out-of-fold features above.
train_accuracy_rf_nb = accuracy_score(y_train, y_train_pred_rf_nb)
test_accuracy_rf_nb = accuracy_score(y_test, y_test_pred_rf_nb)

print(f"Hybrid Model - Random Forest + Naive Bayes - Training Accuracy: {train_accuracy_rf_nb}")
print(f"Hybrid Model - Random Forest + Naive Bayes - Testing Accuracy: {test_accuracy_rf_nb}")

# Hybrid Model - SVM + Random Forest
# Stacking: the SVM's class probabilities become the input features of a
# Random Forest meta-classifier.
svm = SVC(probability=True, random_state=100)

# Out-of-fold probabilities for the training rows, so the meta-classifier is
# trained on the same kind of features it will receive for unseen rows.
train_svm_pred = cross_val_predict(svm, X_train, y_train, cv=5, method="predict_proba")

# The test rows are scored by an SVM fitted on the whole training split.
svm.fit(X_train, y_train)
test_svm_pred = svm.predict_proba(X_test)

# One column per class, named after the fitted class labels
# ('Prob_0', 'Prob_1' for 0/1 outcomes).
svm_prob_columns = [f"Prob_{label}" for label in svm.classes_]
X_train_hybrid_svm_rf = pd.DataFrame(train_svm_pred, columns=svm_prob_columns)
X_test_hybrid_svm_rf = pd.DataFrame(test_svm_pred, columns=svm_prob_columns)

# A dedicated meta forest: re-fitting the shared `rf` object here would
# overwrite the forest fitted earlier on the original features.
rf_svm_meta = RandomForestClassifier(random_state=100)
rf_svm_meta.fit(X_train_hybrid_svm_rf, y_train)

y_train_pred_svm_rf = rf_svm_meta.predict(X_train_hybrid_svm_rf)
y_test_pred_svm_rf = rf_svm_meta.predict(X_test_hybrid_svm_rf)

# Training accuracy is measured on the out-of-fold features above.
train_accuracy_svm_rf = accuracy_score(y_train, y_train_pred_svm_rf)
test_accuracy_svm_rf = accuracy_score(y_test, y_test_pred_svm_rf)

print(f"Hybrid Model - SVM + Random Forest - Training Accuracy: {train_accuracy_svm_rf}")
print(f"Hybrid Model - SVM + Random Forest - Testing Accuracy: {test_accuracy_svm_rf}")

# Confusion Matrix for Base Models
# Build every test-set matrix first (keyed by model name), then print them in
# the order the models were defined.
base_confusion = {
    name: confusion_matrix(y_test, estimator.predict(X_test))
    for name, estimator in models.items()
}
for name, matrix in base_confusion.items():
    print(f"Confusion Matrix for {name}:\n{matrix}\n")

# Confusion Matrix for Hybrid Models
# Both hybrids are compared against the same y_test using the test
# predictions computed in their own sections.
cm_rf_nb, cm_svm_rf = (
    confusion_matrix(y_test, predictions)
    for predictions in (y_test_pred_rf_nb, y_test_pred_svm_rf)
)

print("Confusion Matrix - Random Forest + Naive Bayes:\n", cm_rf_nb)
print("Confusion Matrix - SVM + Random Forest:\n", cm_svm_rf)
