In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE, ADASYN
from collections import Counter


In [3]:
# Load dataset
df_original = pd.read_csv('./GDM_dataset.csv')
# View the first 10 rows
df_original.head(10)

Unnamed: 0,AgeAtStartOfSpell,Ethnicity,IMD Decile,Body Mass Index at Booking,Obese?,Parity,Gravida,No_Of_previous_Csections,FolicAcidDose,GlucoseToleranceTest,Gestational Diabetes
0,2.000001,0,2.000001,2.742637,1.0,1e-06,0.500001,0.0,0,0,0.0
1,1.875001,1,1.666668,2.51473,1.0,1e-06,0.500001,0.0,0,0,0.0
2,1.000001,0,0.333334,1.392249,0.0,1e-06,0.500001,0.0,0,1,0.0
3,2.125001,2,0.666668,0.482172,0.0,1e-06,0.500001,0.0,0,2,0.0
4,1.250001,0,1.333334,0.679071,0.0,1e-06,0.500001,0.0,0,2,0.0
5,2.750001,3,1.666668,1.159691,0.0,1e-06,1.000001,0.0,0,0,0.0
6,3.125001,0,2.000001,0.790699,0.0,1e-06,0.500001,0.0,0,0,0.0
7,2.125001,4,0.333334,2.23721,0.0,2.000001,1.500001,0.0,0,0,0.0
8,1.750001,1,0.666668,1.227908,0.0,1.000001,1.000001,1.0,0,0,0.0
9,0.750001,5,1.333334,3.218606,1.0,1e-06,1.000001,0.0,0,0,0.0


# SMOTE

In [5]:
target = "Gestational Diabetes"
# Split dataset into features (X) and target (y)
X = df_original.drop(columns=[target])
y = df_original[target]

# Apply SMOTE to balance the training set
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Check class distribution after SMOTE
print("Class distribution after SMOTE:", Counter(y_smote))

Class distribution after SMOTE: Counter({0.0: 21569, 1.0: 21569})


In [7]:
# Construct the new dataset after data augmenttion with SMOTE
df = X_smote
df[target] = y_smote
df.head()

Unnamed: 0,AgeAtStartOfSpell,Ethnicity,IMD Decile,Body Mass Index at Booking,Obese?,Parity,Gravida,No_Of_previous_Csections,FolicAcidDose,GlucoseToleranceTest,Gestational Diabetes
0,2.000001,0,2.000001,2.742637,1.0,1e-06,0.500001,0.0,0,0,0.0
1,1.875001,1,1.666668,2.51473,1.0,1e-06,0.500001,0.0,0,0,0.0
2,1.000001,0,0.333334,1.392249,0.0,1e-06,0.500001,0.0,0,1,0.0
3,2.125001,2,0.666668,0.482172,0.0,1e-06,0.500001,0.0,0,2,0.0
4,1.250001,0,1.333334,0.679071,0.0,1e-06,0.500001,0.0,0,2,0.0


In [13]:
# Check for missing values
df.isnull().sum()

AgeAtStartOfSpell             0
Ethnicity                     0
IMD Decile                    0
Body Mass Index at Booking    0
Obese?                        0
Parity                        0
Gravida                       0
No_Of_previous_Csections      0
FolicAcidDose                 0
GlucoseToleranceTest          0
Gestational Diabetes          0
dtype: int64

In [14]:
# Save the SMOTE dataset
df.to_csv("./processed_datasets/dataset_preprocessed_smote.csv", index=False)

# ADASYN

In [15]:
# Apply ADASYN to balance the training set
ada = ADASYN(random_state=42)
adaX, aday = ada.fit_resample(X, y)


# Check class distribution after ADASYN
print("Class distribution after ADASYN:", Counter(adaX))

Class distribution after ADASYN: Counter({'AgeAtStartOfSpell': 1, 'Ethnicity': 1, 'IMD Decile': 1, 'Body Mass Index at Booking': 1, 'Obese?': 1, 'Parity': 1, 'Gravida': 1, 'No_Of_previous_Csections': 1, 'FolicAcidDose': 1, 'GlucoseToleranceTest': 1})


In [16]:
# Construct the new dataset after data augmenttion with ADASYN
df = adaX
df[target] = aday
df.head()

Unnamed: 0,AgeAtStartOfSpell,Ethnicity,IMD Decile,Body Mass Index at Booking,Obese?,Parity,Gravida,No_Of_previous_Csections,FolicAcidDose,GlucoseToleranceTest,Gestational Diabetes
0,2.000001,0,2.000001,2.742637,1.0,1e-06,0.500001,0.0,0,0,0.0
1,1.875001,1,1.666668,2.51473,1.0,1e-06,0.500001,0.0,0,0,0.0
2,1.000001,0,0.333334,1.392249,0.0,1e-06,0.500001,0.0,0,1,0.0
3,2.125001,2,0.666668,0.482172,0.0,1e-06,0.500001,0.0,0,2,0.0
4,1.250001,0,1.333334,0.679071,0.0,1e-06,0.500001,0.0,0,2,0.0


In [17]:
# Check for missing values
df.isnull().sum()

AgeAtStartOfSpell             0
Ethnicity                     0
IMD Decile                    0
Body Mass Index at Booking    0
Obese?                        0
Parity                        0
Gravida                       0
No_Of_previous_Csections      0
FolicAcidDose                 0
GlucoseToleranceTest          0
Gestational Diabetes          0
dtype: int64

In [18]:
# Save the ADASYN dataset
df.to_csv("./processed_datasets/dataset_preprocessed_adasyn.csv", index=False)