In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

In [None]:
# Load the raw dataset; the path is resolved relative to the current working directory
diabetes_df = pd.read_csv(filepath_or_buffer='diabetes_prediction_dataset.csv')

## Exploration of Data

In [None]:
diabetes_df.head()

In [None]:
diabetes_df.info() #Displays the information of the dataset 

In [None]:
diabetes_df.describe() #Gives statistical data

In [None]:
diabetes_df.isnull().sum() #checks for missing/null values

In [None]:
diabetes_df.isnull().sum() #checks for missing/null values

In [None]:
#Encode categorical variables (gender, smoking history)
from sklearn.preprocessing import LabelEncoder

categorical_columns = ['gender', 'smoking_history']

# Keep one fitted encoder per column so the integer codes can be mapped back to
# the original labels later (label_encoders[column].classes_ / inverse_transform).
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    diabetes_df[column] = encoder.fit_transform(diabetes_df[column])
    label_encoders[column] = encoder

In [None]:
#TODO Find/show anomalies (we have to find which columns to investigate or investigate all of them)

In [None]:
#Correlation Matrix
# Pairwise correlations between all columns, drawn as an annotated heatmap
# with the colour scale fixed to the full [-1, 1] range.
correlation_matrix = diabetes_df.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sb.heatmap(correlation_matrix, annot=True, vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
#Display class imbalance

# Count how many rows fall into each class of the last (target) column
counts = diabetes_df.iloc[:, -1].value_counts()

# Bar chart of the per-class frequencies
fig, ax = plt.subplots(figsize=(8, 6))
sb.barplot(x=counts.index, y=counts.values, ax=ax)
ax.set_title('Imbalance of Values in Dataset (Target) Column')
ax.set_xlabel('Class')
ax.set_ylabel('Frequency')
plt.xticks(ticks=[0, 1], labels=['No Diabetes', 'Has Diabetes'])
plt.show()

In [None]:
#Explore the data using visualization TODO add more visualizations

#Pairplot
# Colour the points by the target (last) column so the plot actually shows
# what the title promises; the hue column itself is left out of the grid.
target_column = diabetes_df.columns[-1]
sb.pairplot(diabetes_df, hue=target_column)
plt.suptitle(f'Pairplot of Features Colored by {target_column}', y=1.02)
plt.show()

In [None]:
#Histograms
# One panel per feature: the columns live on very different scales, so
# overlaying them on a single axes makes every distribution unreadable.
feature_columns = diabetes_df.columns[:-1]  # Exclude the target column
n_cols = 3
n_rows = max(1, (len(feature_columns) + n_cols - 1) // n_cols)  # ceiling division

fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 4 * n_rows), squeeze=False)
for ax, column in zip(axes.flat, feature_columns):
    sb.histplot(diabetes_df[column], ax=ax)
    ax.set_title(column)

# Hide the panels left over when the grid has more slots than features
for ax in axes.flat[len(feature_columns):]:
    ax.set_visible(False)

fig.suptitle('Histogram of Numerical Features')
fig.tight_layout()
plt.show()

In [None]:
#Boxplot
# Box-and-whisker summary of every column except the 'diabetes' target
feature_frame = diabetes_df.drop(columns=['diabetes'])
fig, ax = plt.subplots(figsize=(12, 8))
sb.boxplot(data=feature_frame, ax=ax)
ax.set_title('Boxplot of Numerical Features')
plt.xticks(rotation=45)
plt.show()

## Preprocessing of Data

In [None]:
# Flag every row that repeats an earlier one, then keep only the unflagged rows
# (i.e. the first occurrence of each distinct row survives).
is_repeat = diabetes_df.duplicated(keep='first')
diabetes_df = diabetes_df.loc[~is_repeat]

# Number of duplicate rows still present after the clean-up
diabetes_df.duplicated().sum()

In [None]:
#TODO Remove anomalies if any

In [None]:
#splitting target from data
# The last column is the target; every column before it is a feature.
data = diabetes_df.iloc[:, :-1]
target = diabetes_df.iloc[:, -1]

In [None]:
from sklearn.preprocessing import StandardScaler 

# Initializes StandardScaler
scaler = StandardScaler()

# Fit and transform the data 
scaled_data = scaler.fit_transform(data)

In [None]:
#Balance classes with SMOTE
import imblearn
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
x,y = sm.fit_resample(scaled_data, target)

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

## Training the Models

In [None]:
#Logistic regression
#Naive Bayes
#KNN
#Random forest
#Decision tree
#Bagging
#AdaBoost
#XGBoost
#Voting
#SVM
#Neural Network
#Deep Neural Network

#Top Models we will run
#GridSearchCV
#RFE feature selection

### Standalone Models

In [None]:
#Logistic Regression - Vance
from sklearn.linear_model import LogisticRegression

# Build the classifier, fit it on the training split, then predict the test split
Logistic_regression_model = LogisticRegression(random_state=42)
Logistic_regression_model.fit(X_train, y_train)

pred_log_reg = Logistic_regression_model.predict(X_test)

In [None]:
#Naive Bayes - Jesus
from sklearn.naive_bayes import GaussianNB

# fit() returns the fitted estimator, so construction and training can be chained
nb_model = GaussianNB().fit(X_train, y_train)

pred_nb = nb_model.predict(X_test)

In [None]:
#KNN - Joshua

In [None]:
#Random Forest - Joshua

In [None]:
#Decision Tree - 

In [None]:
#SVM - Jesus

In [None]:
#Neural Network

### Combination Models

In [None]:
#Bagging - 

In [None]:
#AdaBoost - 

In [None]:
#XGBoost - 

In [None]:
#Voting - 

In [None]:
#Deep Neural Network - Vance

## Comparing the Models

Reference: [scikit-learn classification metrics](https://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, class_likelihood_ratios, matthews_corrcoef

def evaluate(y_pred, y_score=None, y_true=None):
    """Compute a set of binary-classification metrics for one model's predictions.

    Parameters
    ----------
    y_pred : array-like
        Hard class predictions (0/1) for the evaluation rows.
    y_score : array-like, optional
        Continuous scores for the positive class (e.g. ``predict_proba(X)[:, 1]``
        or ``decision_function(X)``). When given, it is used for the AUC so the
        value reflects the model's ranking quality. When omitted, the AUC falls
        back to ``y_pred`` (the original behaviour), which only measures a
        single operating point.
    y_true : array-like, optional
        Ground-truth labels. Defaults to the notebook-level ``y_test``.

    Returns
    -------
    dict
        Keys: 'accuracy', 'precision', 'recall', 'f1', 'auc',
        'matthews_corrcoef', and 'class likelihood' (the value returned by
        ``class_likelihood_ratios``).
    """
    if y_true is None:
        y_true = y_test

    # roc_auc_score is defined on scores; only use hard labels as a fallback
    auc_input = y_pred if y_score is None else y_score

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, auc_input)
    clr = class_likelihood_ratios(y_true, y_pred)
    mtc = matthews_corrcoef(y_true, y_pred)

    results = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': auc,
        'matthews_corrcoef': mtc,
        'class likelihood': clr,
    }

    return results

def display(y_pred, model_name=None, y_true=None):
    """Plot the confusion matrix for one model's predictions.

    Parameters
    ----------
    y_pred : array-like
        Hard class predictions (0/1) for the evaluation rows.
    model_name : str, optional
        Appended to the plot title so figures from different models can be
        told apart. When omitted the title is just 'Confusion Matrix'.
    y_true : array-like, optional
        Ground-truth labels. Defaults to the notebook-level ``y_test``.
    """
    # NOTE(review): this function name shadows IPython's display() helper in the
    # notebook namespace; kept as-is so existing calls keep working.
    if y_true is None:
        y_true = y_test

    # Pin the label order explicitly: row/column 0 is class 0, row/column 1 is class 1
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    # display_labels must follow the same order as the matrix rows/columns
    disp = ConfusionMatrixDisplay(
        confusion_matrix=cm,
        display_labels=['No Diabetes (0)', 'Diabetes (1)'],
    )
    disp.plot()

    title = 'Confusion Matrix' if model_name is None else f'Confusion Matrix - {model_name}'
    plt.title(title)
    plt.show()


## Feature Selection & GridSearchCV on the Top Models

## Recomparing the Models

## Final Observations