In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

import warnings
# NOTE(review): this silences every Python warning for the rest of the session,
# including deprecation warnings from the libraries imported above. Consider
# narrowing the filter to specific categories once the notebook is stable.
warnings.filterwarnings("ignore")

In [2]:
# Load the raw dataset; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='../Datasets/diabetes_prediction_dataset.csv')

In [3]:
# Remove exact duplicate rows first, then discard any row that still contains
# a missing value. The original row index is kept (not reset).
data = data.drop_duplicates().dropna()

In [4]:
# Resamplers used to balance the target classes; both are seeded so that
# repeated runs produce the same resampled data.
over = SMOTE(
    random_state=42,
    sampling_strategy='minority',
)
under = RandomUnderSampler(
    random_state=42,
    sampling_strategy='majority',
)

In [6]:
# Collapse the raw smoking-history categories into three coarser groups
def recategorize_smoking(smoking_status):
    """Map a raw smoking-history label onto a coarser category.

    Returns 'non-smoker' for 'never' / 'No Info', 'current' for 'current',
    and 'past_smoker' for 'ever' / 'former' / 'not current'. Any other value
    (including missing values) yields None.
    """
    category_members = (
        ('non-smoker', ('never', 'No Info')),
        ('current', ('current',)),
        ('past_smoker', ('ever', 'former', 'not current')),
    )
    for category, raw_labels in category_members:
        if smoking_status in raw_labels:
            return category
    # Unrecognised labels fall through without a category.
    return None

# Replace each raw 'smoking_history' label with its coarser category.
# Note: running this cell a second time feeds already-recategorized labels
# back through the function, which returns None for labels it does not know.
data['smoking_history'] = data['smoking_history'].map(recategorize_smoking)

In [7]:
def perform_one_hot_encoding(df, column_name):
    """Return a new frame in which ``column_name`` is replaced by indicator columns.

    One indicator column is produced per distinct value of ``df[column_name]``,
    named ``<column_name>_<value>``. The indicator columns are appended after
    the remaining columns; the input frame itself is left unmodified.
    """
    indicator_columns = pd.get_dummies(df[column_name], prefix=column_name)
    remaining_columns = df.drop(columns=column_name)
    return pd.concat([remaining_columns, indicator_columns], axis=1)

# One-hot encode the gender variable first, then the smoking history variable.
# After this cell the raw 'gender' and 'smoking_history' columns no longer exist
# in `data`, so the cell cannot be re-run without reloading the data.
data = (
    data
    .pipe(perform_one_hot_encoding, 'gender')
    .pipe(perform_one_hot_encoding, 'smoking_history')
)

In [8]:
# Feature groups handled by the preprocessing step.
numeric_features = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level', 'hypertension', 'heart_disease']
categorical_features = ['gender', 'smoking_history']

# Split the frame into the feature matrix and the target column.
X = data.drop('diabetes', axis=1)
Y = data['diabetes']

# `data` has already been one-hot encoded earlier in the notebook, so the raw
# 'gender' / 'smoking_history' columns are gone and selecting them by name would
# make ColumnTransformer fail when it is fitted. Select whichever representation
# is actually present in X so the preprocessor always matches the frame.
raw_categorical = [col for col in categorical_features if col in X.columns]
dummy_prefixes = tuple(f'{col}_' for col in categorical_features)
encoded_categorical = [col for col in X.columns if str(col).startswith(dummy_prefixes)]

transformers = [('num', StandardScaler(), numeric_features)]
if raw_categorical:
    # Raw string categories still need to be encoded inside the pipeline.
    transformers.append(('cat', OneHotEncoder(), raw_categorical))
if encoded_categorical:
    # Indicator columns are already numeric: forward them unchanged rather than
    # letting ColumnTransformer's default remainder='drop' discard them.
    passthrough_name = 'cat_encoded' if raw_categorical else 'cat'
    transformers.append((passthrough_name, 'passthrough', encoded_categorical))

preprocessor = ColumnTransformer(transformers=transformers)

In [9]:
# Display the feature matrix to check the encoded columns
# (pandas truncates the middle rows of a long frame).
X

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,gender_Female,gender_Male,gender_Other,smoking_history_current,smoking_history_non-smoker,smoking_history_past_smoker
0,80.0,0,1,25.19,6.6,140,True,False,False,False,True,False
1,54.0,0,0,27.32,6.6,80,True,False,False,False,True,False
2,28.0,0,0,27.32,5.7,158,False,True,False,False,True,False
3,36.0,0,0,23.45,5.0,155,True,False,False,True,False,False
4,76.0,1,1,20.14,4.8,155,False,True,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
99994,36.0,0,0,24.60,4.8,145,True,False,False,False,True,False
99996,2.0,0,0,17.37,6.5,100,True,False,False,False,True,False
99997,66.0,0,0,27.83,5.7,155,False,True,False,False,False,True
99998,24.0,0,0,35.42,4.0,100,True,False,False,False,True,False
