# Diabetes predictor

#### Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
import joblib

#### Data Loading

In [None]:
# Load the dataset from the working directory and preview its first rows.
dataset_path = "diabetes.csv"
data = pd.read_csv(dataset_path)
data.head()

#### Data preprocessing

In [None]:
# Count rows that are exact duplicates of an earlier row.
duplicate_mask = data.duplicated()
duplicate_mask.sum()

In [None]:
# Per-column count of missing values (isna is the canonical name for isnull).
data.isna().sum()

In [None]:
# Class distribution of the target column.
data["Outcome"].value_counts()

#### Feature Selection

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
correlation_matrix = data.corr()
sns.heatmap(correlation_matrix, annot=True, cmap="Blues")

In [None]:
# Print the correlation of every feature column with the target.
# Columns are selected by name rather than by hard-coded position so the
# cell keeps working if columns are added, removed or reordered.
target = "Outcome"
for feature in data.columns.drop(target):
    cor = data[feature].corr(data[target])
    print(f"Correlation b/w {feature} and {target} = {cor: .4f}")

In [None]:
# Print the estimated mutual information between each feature and the target.
# mutual_info_classif adds small random noise to continuous features, so the
# estimator is seeded to make the printed scores reproducible between runs.
for col in data.columns.drop("Outcome"):
    info = mutual_info_classif(data[[col]], data["Outcome"], random_state=42)
    print(f"MI b/w {col} and Outcome = {info[0]:.4f}")

In [None]:
# Separate the target column from the feature matrix.
Y = data["Outcome"]
X = data.drop(columns="Outcome")

#### Data Splitting

In [None]:
# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=42)
Xtrain, Xtest, Ytrain, Ytest = split_parts

#### Data Scaling

In [None]:
# Learn the min/max of each feature on the training split only, then apply
# the same transformation to both splits so no test information leaks in.
scaler = MinMaxScaler().fit(Xtrain)
Xtrain_scaled = scaler.transform(Xtrain)
Xtest_scaled = scaler.transform(Xtest)

#### Data Balancing using SMOTE

In [None]:
# Oversample the training split with SMOTE; the test split is left untouched.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(Xtrain_scaled, Ytrain)
Xtrain_resampled, Ytrain_resampled = resampled

#### Models

In [None]:
# Candidate classifiers keyed by display name.
# The decision tree is seeded like the random forest so that its reported
# metrics are reproducible from run to run.
models = {
    "LogisticRegression": LogisticRegression(),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=10),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
}

#### Training and evaluation

In [None]:
# Fit every candidate on the balanced training data, then report its metrics
# and confusion matrix on the scaled (non-resampled) test split.
for name, model in models.items():
    # Model training and prediction
    model.fit(Xtrain_resampled, Ytrain_resampled)
    Ypred = model.predict(Xtest_scaled)

    # Metrics
    cm = confusion_matrix(Ytest, Ypred)
    acc = accuracy_score(Ytest, Ypred)
    prec = precision_score(Ytest, Ypred)
    rec = recall_score(Ytest, Ypred)
    f1 = f1_score(Ytest, Ypred)
    print(f"{name} Metrics:")
    print(f"\tAccuracy : {acc: .2f}")
    print(f"\tPrecision : {prec: .2f}")
    print(f"\tRecall : {rec: .2f}")
    print(f"\tF1 : {f1: .2f}")

    # Confusion matrix plot. fmt="d" prints the cell counts as integers; the
    # default ".2g" format would render counts of 100 or more as "1e+02".
    class_labels = ["No diabetes", "diabetes"]
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    plt.title(f"Confusion matrix of {name}")
    plt.xlabel("Predicted values")
    plt.ylabel("Actual values")
    plt.show()

#### Saving Model and scaler

In [None]:
# Persist the trained random forest. The filename is reproduced exactly as
# in the original cell, since other code may load the file by this name.
best_model = models["RandomForest"]
with open("Diabetes pedictor model.pkl", "wb") as model_file:
    joblib.dump(best_model, model_file)

In [None]:
# Persist the fitted scaler so the same transformation can be applied to
# inputs at prediction time.
with open("Scaler.pkl", "wb") as scaler_file:
    joblib.dump(scaler, scaler_file)