In [None]:
import warnings
warnings.filterwarnings('ignore')

# Import necessary libraries
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder


#Import Model
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

#Import Sampler libraries
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as imbPipeline

# Render floats with two decimal places whenever pandas displays them
pd.set_option("display.float_format", "{:.2f}".format)


In [None]:
# Read the dataset from a CSV file given by a relative path
df = pd.read_csv(filepath_or_buffer="diabetes_prediction_dataset.csv")


In [None]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.89,0.07,0.04,27.32,5.53,138.06,0.09
std,22.52,0.26,0.19,6.64,1.07,40.71,0.28
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [None]:
# Keep the first occurrence of every row and discard exact repeats
is_repeat = df.duplicated()
df = df[~is_repeat]

In [None]:
# Count the missing entries in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64


In [None]:
# Remove the rows whose gender is 'Other' (the original note puts them at
# 0.00195% of the data; that figure is not re-verified here).
# .copy() makes the filtered frame an independent object, so the column
# assignment in the smoking_history cell below does not trigger pandas'
# SettingWithCopyWarning.
df = df[df['gender'] != 'Other'].copy()

In [None]:
# Collapse the raw smoking_history labels into three broader groups
def recategorize_smoking(smoking_status):
    """Map a raw smoking_history label to its consolidated category.

    Parameters
    ----------
    smoking_status
        One raw label from the 'smoking_history' column.

    Returns
    -------
    str or None
        'non-smoker' for 'never' / 'No Info', 'current' for 'current',
        'past_smoker' for 'ever' / 'former' / 'not current', and None for
        any value that is not one of those labels.
    """
    label_groups = (
        ('non-smoker', ('never', 'No Info')),
        ('current', ('current',)),
        ('past_smoker', ('ever', 'former', 'not current')),
    )
    for new_label, raw_labels in label_groups:
        if smoking_status in raw_labels:
            return new_label
    # Unrecognised labels fall through without a category.
    return None

# Replace every raw 'smoking_history' label with its consolidated category
regrouped_smoking = df['smoking_history'].apply(recategorize_smoking)
df['smoking_history'] = regrouped_smoking

# Show how many rows ended up in each category
smoking_counts = df['smoking_history'].value_counts()
print(smoking_counts)

smoking_history
non-smoker     67276
past_smoker    19655
current         9197
Name: count, dtype: int64


In [None]:
# Columns routed to each preprocessing branch
scaled_columns = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level', 'hypertension', 'heart_disease']
encoded_columns = ['gender', 'smoking_history']

# Standardize the first group of columns and one-hot encode the second
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), scaled_columns),
    ('cat', OneHotEncoder(), encoded_columns),
])

# The target is the 'diabetes' column; every other column is a feature
y = df['diabetes']
X = df.drop(columns='diabetes')

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified on y — consider whether it should be.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
from sklearn.svm import SVC

# Support-vector classifier with default hyperparameters, chained behind the
# shared preprocessing step
svm_model = SVC()
svm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('svm', svm_model),
])

# Train on the training split, then label the held-out split
svm_pipeline.fit(X_train, y_train)
svm_predictions = svm_pipeline.predict(X_test)

# Share of held-out rows whose predicted label matches the true label
svm_accuracy = accuracy_score(y_true=y_test, y_pred=svm_predictions)
print("SVM Accuracy:", svm_accuracy)



SVM Accuracy: 0.960366170810361


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Define the Decision Tree model.
# random_state is pinned (same seed as the train/test split) so that
# re-running the notebook rebuilds the same tree and reports the same accuracy;
# without it the fitted tree can differ between runs.
dt_model = DecisionTreeClassifier(random_state=42)

# Create a pipeline with preprocessor and Decision Tree model
dt_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('decision_tree', dt_model)
])

# Fit the pipeline on the training data
dt_pipeline.fit(X_train, y_train)

# Predict on the testing data
dt_predictions = dt_pipeline.predict(X_test)

# Evaluate the Decision Tree model: share of test rows predicted correctly
dt_accuracy = accuracy_score(y_test, dt_predictions)
print("Decision Tree Accuracy:", dt_accuracy)


Decision Tree Accuracy: 0.9488713200873816


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with default hyperparameters
knn_model = KNeighborsClassifier()

# Preprocessing first, classifier second
knn_steps = [
    ('preprocessor', preprocessor),
    ('knn', knn_model),
]
knn_pipeline = Pipeline(steps=knn_steps)

# Train on the training split, then label the held-out split
knn_pipeline.fit(X_train, y_train)
knn_predictions = knn_pipeline.predict(X_test)

# Share of held-out rows whose predicted label matches the true label
knn_accuracy = accuracy_score(y_true=y_test, y_pred=knn_predictions)
print("KNN Accuracy:", knn_accuracy)



KNN Accuracy: 0.960106106314366


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default hyperparameters
log_reg_model = LogisticRegression()

# Preprocessing first, classifier second
log_reg_steps = [
    ('preprocessor', preprocessor),
    ('log_reg', log_reg_model),
]
log_reg_pipeline = Pipeline(steps=log_reg_steps)

# Train on the training split, then label the held-out split
log_reg_pipeline.fit(X_train, y_train)
log_reg_predictions = log_reg_pipeline.predict(X_test)

# Share of held-out rows whose predicted label matches the true label
log_reg_accuracy = accuracy_score(y_true=y_test, y_pred=log_reg_predictions)
print("Logistic Regression Accuracy:", log_reg_accuracy)



Logistic Regression Accuracy: 0.9574014355560179


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Define the Random Forest model.
# random_state is pinned (same seed as the train/test split) so that
# re-running the notebook grows the same forest and reports the same accuracy;
# without it every run trains a different forest.
rf_model = RandomForestClassifier(random_state=42)

# Create a pipeline with preprocessor and Random Forest model
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('random_forest', rf_model)
])

# Fit the pipeline on the training data
rf_pipeline.fit(X_train, y_train)

# Predict on the testing data
rf_predictions = rf_pipeline.predict(X_test)

# Evaluate the Random Forest model: share of test rows predicted correctly
rf_accuracy = accuracy_score(y_test, rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)



Random Forest Accuracy: 0.9656194736294601


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Define the GBM model.
# random_state is pinned (same seed as the train/test split) so that
# re-running the notebook fits the same ensemble and reports the same accuracy.
gbm_model = GradientBoostingClassifier(random_state=42)

# Create a pipeline with preprocessor and GBM model
gbm_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('gbm', gbm_model)
])

# Fit the pipeline on the training data
gbm_pipeline.fit(X_train, y_train)

# Predict on the testing data
gbm_predictions = gbm_pipeline.predict(X_test)

# Evaluate the GBM model: share of test rows predicted correctly
gbm_accuracy = accuracy_score(y_test, gbm_predictions)
print("GBM Accuracy:", gbm_accuracy)


GBM Accuracy: 0.9695204410693852
