Method 1: SMOTE

In [None]:
import os
import sys
sys.path.append("./../src/")
from utilities import REPO_PATH, DATA_PATH, RESPONSE_COL_NAME, get_feature_corr_with_response

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import random
# Single seed shared by every stochastic step in this notebook.
seed = 0
random.seed(seed)
# The GAN cells draw noise and smoothed labels from np.random, which the
# stdlib `random` seed above does not control, so seed NumPy as well.
np.random.seed(seed)

from imblearn.over_sampling import SMOTE
from collections import Counter

from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.metrics import accuracy_score

In [None]:
import torch
from torch import nn
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
import torch.nn.init as init
from torch.utils.data import Dataset

In [None]:
# Read the three pre-split CSV files (train / test / validation) in that order.
csv_paths = (
    f"{DATA_PATH}/train.csv",
    f"{DATA_PATH}/test.csv",
    f"{DATA_PATH}/val.csv",
)
train, test, val = map(pd.read_csv, csv_paths)

In [None]:
# Separate the response column from the predictor columns for both splits.
X_train = train.drop(columns=[RESPONSE_COL_NAME])
y_train = train[RESPONSE_COL_NAME]
X_test = test.drop(columns=[RESPONSE_COL_NAME])
y_test = test[RESPONSE_COL_NAME]

In [None]:
# Tally how many training rows fall into each response class before any
# resampling, as a baseline for the "After:" counts printed by the SMOTE sweep.
beforeCounter = Counter(y_train)
print("Before:", beforeCounter)

In [None]:
# Candidate values passed to SMOTE's `sampling_strategy`: 0.1 up to (but not
# including) 1.1 in steps of 0.1.
# NOTE(review): float `np.arange` end points are subject to rounding; confirm
# the last value never exceeds 1.0 on the target platform.
sample_ratios = np.arange(0.1,1.1,0.1)

In [None]:
# Sweep the SMOTE sampling ratio: oversample the training set, fit a
# Bernoulli naive Bayes classifier, and record its accuracy on the test split.
# NOTE(review): ratios are compared on the test split while `val` is loaded
# but unused here; confirm that is the intended protocol.
accuracy_scores_by_ratio = []
for ratio in sample_ratios:
    # Pin random_state so the synthetic samples, and therefore the scores,
    # are reproducible; seeding Python's `random` does not affect SMOTE.
    smt = SMOTE(sampling_strategy=ratio, random_state=seed)
    X_train_sm, y_train_sm = smt.fit_resample(X_train, y_train)
    afterCounter = Counter(y_train_sm)
    print("After:", afterCounter)

    clf = BernoulliNB()
    clf.fit(X_train_sm, y_train_sm)
    y_preds = clf.predict(X_test)
    # scikit-learn's signature is accuracy_score(y_true, y_pred).
    accuracy = accuracy_score(y_test, y_preds)
    accuracy_scores_by_ratio.append(accuracy)
    print("Accuracy:", accuracy)
    print(" ")
    

In [None]:
# Plot test accuracy against the SMOTE sampling ratio; label the figure so it
# can be read on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.plot(sample_ratios, accuracy_scores_by_ratio, marker="o")
ax.set(
    xlabel="SMOTE sampling_strategy (ratio)",
    ylabel="Accuracy on test split",
    title="BernoulliNB accuracy vs. SMOTE sampling ratio",
)
plt.show()

Building a GAN to generate Synthetic Data

In [88]:
class Data(Dataset):
    """Dataset adapter that serves rows of a feature table and their labels.

    Both `X` and `y` are read through their `.values` attribute and cast to
    float32 once at construction. Each item is a dict with an 'input' tensor
    (the row at that index) and a 'label' tensor (the matching label).
    """

    def __init__(self, X, y):
        features, targets = X.values, y.values
        self.data = features.astype(np.float32)
        self.labels = targets.astype(np.float32)

    def __len__(self):
        # Number of rows in the feature array.
        return self.data.shape[0]

    def __getitem__(self, idx):
        # torch.tensor copies, so the returned tensors do not alias the arrays.
        return dict(
            input=torch.tensor(self.data[idx]),
            label=torch.tensor(self.labels[idx]),
        )

In [89]:
# Generator building block
def FC_Layer_blockGen(input_dim, output_dim):
    """Return a Linear(input_dim -> output_dim) layer followed by ReLU."""
    layers = [nn.Linear(input_dim, output_dim), nn.ReLU()]
    return nn.Sequential(*layers)

# Discriminator building block
def FC_Layer_BlockDisc(input_dim, output_dim):
    """Return Linear(input_dim -> output_dim), then ReLU, then Dropout(p=0.4)."""
    linear = nn.Linear(input_dim, output_dim)
    activation = nn.ReLU()
    dropout = nn.Dropout(0.4)
    return nn.Sequential(linear, activation, dropout)

In [90]:
# Generator
class Generator(nn.Module):
    """Fully connected generator mapping a latent vector to a synthetic sample.

    Architecture: latent_dim -> 256 -> 512 -> 512 -> output_dim, with ReLU
    after each hidden layer and Tanh on the output, so values lie in [-1, 1].
    """

    def __init__(self, latent_dim, output_dim):
        super().__init__()
        hidden_widths = [latent_dim, 256, 512, 512]
        layers = []
        # Hidden stack: one Linear + ReLU per consecutive pair of widths.
        for fan_in, fan_out in zip(hidden_widths[:-1], hidden_widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # Output projection squashed by Tanh.
        layers.append(nn.Linear(hidden_widths[-1], output_dim))
        layers.append(nn.Tanh())
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run the latent batch `x` through the network."""
        return self.model(x)
    
# Discriminator
class Discriminator(nn.Module):
    """Fully connected discriminator producing one probability per sample.

    Architecture: input_dim -> 512 -> 512 -> 256 -> 1. Each hidden layer is
    followed by ReLU and Dropout(0.4); the output passes through a Sigmoid.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_widths = [input_dim, 512, 512, 256]
        layers = []
        # Hidden stack: Linear + ReLU + Dropout per consecutive pair of widths.
        for fan_in, fan_out in zip(hidden_widths[:-1], hidden_widths[1:]):
            layers.extend([nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(0.4)])
        # Single-unit head with Sigmoid so the output lies in [0, 1].
        layers.extend([nn.Linear(hidden_widths[-1], 1), nn.Sigmoid()])
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Score the batch `x`; returns a tensor with a trailing dimension of 1."""
        return self.model(x)


In [91]:
def weights_init(m):
    """Initialise a single module in place; intended for `Module.apply`.

    nn.Linear modules get Xavier-uniform weights and, when a bias exists,
    a zero bias. Every other module type is left untouched.
    """
    if not isinstance(m, nn.Linear):
        return
    init.xavier_uniform_(m.weight)
    if m.bias is None:
        return
    init.constant_(m.bias, 0)

In [92]:
# Define training parameters
batch_size = 256
num_epochs = 50
lr = 0.0002
# Width of one training sample. Derived from the training frame (previously a
# hard-coded 62) so the generator output and discriminator input always match
# the data that is fed to them.
num_features = X_train.shape[1]
latent_dim = 20

In [93]:
# Define data dimensions
# Size of the generator's noise input. Tied to `latent_dim` rather than a
# second literal 20 so the two names cannot drift apart.
noise_dim = latent_dim

In [94]:
# Seed PyTorch's global RNG before any weights are created so parameter
# initialisation and DataLoader shuffling are repeatable across runs.
torch.manual_seed(seed)

# MODEL INITIALIZATION
generator = Generator(noise_dim, num_features)
discriminator = Discriminator(num_features)

# LOSS FUNCTION AND OPTIMIZERS
# BCELoss expects probabilities, which the discriminator's Sigmoid provides.
criterion = nn.BCELoss()
gen_optimizer = torch.optim.Adam(generator.parameters(), lr=lr)
disc_optimizer = torch.optim.Adam(discriminator.parameters(), lr=lr)

In [95]:
# Wrap the training features/labels and serve them in shuffled mini-batches.
dataset = Data(X_train, y_train)
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    # Drop the final short batch so every batch has exactly batch_size rows.
    drop_last=True,
)

In [96]:
# Re-initialise every Linear layer of both networks. Module.apply works in
# place and returns the module itself, so the names need no rebinding.
for _net in (generator, discriminator):
    _net.apply(weights_init)

In [97]:
model_save_freq = 100  # NOTE(review): not read anywhere in this cell

# The noise width must equal the input size the generator was built with.
latent_dim = noise_dim
for epoch in range(num_epochs):
    for batch in dataloader:
        real_data_batch = batch['input']
        # Train discriminator on real data
        # NOTE(review): the targets here are the dataset's response column,
        # not a constant "real = 1" label as in a standard GAN. Confirm this
        # is intentional (e.g. to steer the generator toward one class).
        real_labels = batch['label']
        disc_optimizer.zero_grad()
        # Flatten (N, 1) -> (N,) to match the label shape without hard-coding
        # the batch size.
        output_real = discriminator(real_data_batch).reshape(-1)
        loss_real = criterion(output_real, real_labels)
        loss_real.backward()

        # Train discriminator on generated data
        # Smoothed "fake" targets in [0, 0.1) instead of hard zeros.
        fake_labels = torch.FloatTensor(np.random.uniform(0, 0.1, (batch_size, 1)))
        noise = torch.FloatTensor(np.random.normal(0, 1, (batch_size, latent_dim)))
        generated_data = generator(noise)
        # detach(): this step must not push gradients into the generator.
        output_fake = discriminator(generated_data.detach())
        loss_fake = criterion(output_fake, fake_labels)
        loss_fake.backward()

        # Real and fake gradients were accumulated; apply them in one step.
        disc_optimizer.step()

        # Train generator
        # Smoothed "valid" targets in [0.9, 1.0): the generator is rewarded
        # when the discriminator scores its samples as real.
        valid_labels = torch.FloatTensor(np.random.uniform(0.9, 1.0, (batch_size, 1)))
        gen_optimizer.zero_grad()
        output_g = discriminator(generated_data)
        loss_g = criterion(output_g, valid_labels)
        loss_g.backward()
        gen_optimizer.step()

    # Print progress (losses are those of the last batch in the epoch)
    print(f"Epoch {epoch}, D Loss Real: {loss_real.item()}, D Loss Fake: {loss_fake.item()}, G Loss: {loss_g.item()}")

Epoch 0, D Loss Real: 41.20402908325195, D Loss Fake: 0.7985919117927551, G Loss: 0.6787103414535522
Epoch 1, D Loss Real: 39.453125, D Loss Fake: 0.7337930202484131, G Loss: 0.6753849983215332
Epoch 2, D Loss Real: 36.109901428222656, D Loss Fake: 0.4838208258152008, G Loss: 1.146087646484375
Epoch 3, D Loss Real: 42.1875, D Loss Fake: 0.3636218011379242, G Loss: 1.5590752363204956
Epoch 4, D Loss Real: 25.390625, D Loss Fake: 0.39902687072753906, G Loss: 1.7323626279830933
Epoch 5, D Loss Real: 28.125, D Loss Fake: 0.2967832088470459, G Loss: 2.0962817668914795
Epoch 6, D Loss Real: 28.125, D Loss Fake: 0.23845021426677704, G Loss: 2.6493773460388184
Epoch 7, D Loss Real: 21.559921264648438, D Loss Fake: 0.2420194149017334, G Loss: 2.662781238555908
Epoch 8, D Loss Real: 12.890625, D Loss Fake: 0.2813032269477844, G Loss: 2.8898332118988037
Epoch 9, D Loss Real: 14.84375, D Loss Fake: 0.23095735907554626, G Loss: 3.3222343921661377
Epoch 10, D Loss Real: 19.140625, D Loss Fake: 0.231

: 