In [1]:
import pandas as pd

In [3]:
# Read the diabetes prediction dataset and print a structural summary
# (row count, column dtypes, non-null counts).
csv_path = "datasheets/diabetes_prediction_dataset.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


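## Installing ydata-profiling (formerly pandas-profiling)
https://pypi.org/project/ydata-profiling/ (the package was previously published as https://pypi.org/project/pandas-profiling/)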

In [4]:
!pip install ydata-profiling



In [7]:
from ydata_profiling import ProfileReport

# Build the profiling report for the dataset as loaded ...
report_title = "Profile Report"
report_path = "Diabetes Dataset Report.html"
profile = ProfileReport(df, title=report_title)
# ... and export it as a standalone HTML file.
profile.to_file(output_file=report_path)




Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## Solve dataset imbalance issues using SMOTE

In [9]:
# Count rows per target class to show how imbalanced the dataset is.
df["diabetes"].value_counts()

diabetes
0    91500
1     8500
Name: count, dtype: int64

## Label Encoding

In [10]:
# Preview the first rows, transposed so that each column reads as a row.
df.head().transpose()

Unnamed: 0,0,1,2,3,4
gender,Female,Female,Male,Female,Male
age,80.0,54.0,28.0,36.0,76.0
hypertension,0,0,0,0,1
heart_disease,1,0,0,0,1
smoking_history,never,No Info,never,current,current
bmi,25.19,27.32,27.32,23.45,20.14
HbA1c_level,6.6,6.6,5.7,5.0,4.8
blood_glucose_level,140,80,158,155,155
diabetes,0,0,0,0,0


In [11]:
from sklearn.preprocessing import LabelEncoder

# String-valued columns that are replaced by integer codes.
cat = ['gender','smoking_history']

# Keep every fitted encoder so the integer codes can be translated back to
# the original labels later, e.g.
#   label_encoders['gender'].inverse_transform(df['gender'])
# or inspected via label_encoders['gender'].classes_.
label_encoders = {}

for col in cat:
    encoder = LabelEncoder()
    # NOTE: this overwrites the column of `df` in place; the original strings
    # are only recoverable through the stored encoder.
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder


In [12]:
# Preview the same rows again to confirm the categorical columns now hold codes.
df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
gender,0.0,0.0,1.0,0.0,1.0
age,80.0,54.0,28.0,36.0,76.0
hypertension,0.0,0.0,0.0,0.0,1.0
heart_disease,1.0,0.0,0.0,0.0,1.0
smoking_history,4.0,0.0,4.0,1.0,1.0
bmi,25.19,27.32,27.32,23.45,20.14
HbA1c_level,6.6,6.6,5.7,5.0,4.8
blood_glucose_level,140.0,80.0,158.0,155.0,155.0
diabetes,0.0,0.0,0.0,0.0,0.0


## SMOTE (Synthetic Minority Over-sampling Technique)

In [13]:
!pip install imbalanced-learn



In [15]:
from sklearn.model_selection import train_test_split

# Separate the predictor columns (x) from the target column (y).
target_col = "diabetes"
x = df.drop(columns=target_col)
y = df[target_col]

In [16]:
from imblearn.over_sampling import SMOTENC

# Columns that hold category codes or binary flags rather than measurements.
# Plain SMOTE builds synthetic rows by interpolating between neighbours, which
# produces meaningless in-between values for such columns (and those values
# are then cast back to integers, yielding arbitrary category codes).
# SMOTENC interpolates only the continuous columns and gives each synthetic
# row the most frequent category among its nearest neighbours.
categorical_cols = ['gender', 'hypertension', 'heart_disease', 'smoking_history']
categorical_idx = [x.columns.get_loc(name) for name in categorical_cols]

# Create the over-sampler; 'auto' resamples every class except the majority.
smote = SMOTENC(
    categorical_features=categorical_idx,
    sampling_strategy='auto',
    random_state=42,
)
# NOTE(review): the whole dataset is resampled here. If the result is later
# used to train and evaluate a model, resample only the training split so
# synthetic rows cannot leak into the evaluation data.
x_resample, y_resample = smote.fit_resample(x, y)
# Convert back to pandas types
x_resample_df = pd.DataFrame(x_resample, columns=x.columns)
y_resample_df = pd.Series(y_resample, name=y.name)


In [17]:
# Join the resampled features and target column-wise into a single frame.
resampled_parts = [x_resample_df, y_resample_df]
resample_df = pd.concat(objs=resampled_parts, axis=1)

In [20]:
# Count rows per target class in the resampled frame.
resample_df["diabetes"].value_counts()

diabetes
0    91500
1    91500
Name: count, dtype: int64

In [24]:
# Count rows per target class in the original frame, for comparison with resample_df.
df["diabetes"].value_counts()

diabetes
0    91500
1     8500
Name: count, dtype: int64

In [21]:
# Preview the first rows of the resampled frame, transposed for readability.
resample_df.head().transpose()

Unnamed: 0,0,1,2,3,4
gender,0.0,0.0,1.0,0.0,1.0
age,80.0,54.0,28.0,36.0,76.0
hypertension,0.0,0.0,0.0,0.0,1.0
heart_disease,1.0,0.0,0.0,0.0,1.0
smoking_history,4.0,0.0,4.0,1.0,1.0
bmi,25.19,27.32,27.32,23.45,20.14
HbA1c_level,6.6,6.6,5.7,5.0,4.8
blood_glucose_level,140.0,80.0,158.0,155.0,155.0
diabetes,0.0,0.0,0.0,0.0,0.0


In [22]:
# Print the structural summary (row count, dtypes, non-null counts) of the resampled frame.
resample_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183000 entries, 0 to 182999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               183000 non-null  int32  
 1   age                  183000 non-null  float64
 2   hypertension         183000 non-null  int64  
 3   heart_disease        183000 non-null  int64  
 4   smoking_history      183000 non-null  int32  
 5   bmi                  183000 non-null  float64
 6   HbA1c_level          183000 non-null  float64
 7   blood_glucose_level  183000 non-null  int64  
 8   diabetes             183000 non-null  int64  
dtypes: float64(3), int32(2), int64(4)
memory usage: 11.2 MB


In [23]:
# Print the same structural summary for the original frame, for comparison with resample_df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  int32  
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  int32  
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int32(2), int64(4)
memory usage: 6.1 MB


In [25]:
# Profile the resampled frame and export the report as a standalone HTML file.
# ProfileReport is imported in the earlier profiling cell.
resampled_report_path = "Resample Diabetes Dataset Report.html"
profile = ProfileReport(resample_df, title="Profile Report")
profile.to_file(output_file=resampled_report_path)



Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]