## Imports

In [1]:
import pandas as pd
import numpy as np
import os  
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data_utils

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
torch.manual_seed(42)

<torch._C.Generator at 0x224f14e0130>

## Preprocessing

In [2]:
def filepath(f):
    d = os.path.join(os.path.dirname(os.getcwd()), 'processed_data', f)
    return d

In [3]:
# Import training and test datasets
train_data = pd.read_csv(filepath("final_training_set.csv"))
test_data = pd.read_csv(filepath("final_test_set.csv"))

In [4]:
uid = train_data["ClaimID"]
train_data = train_data.drop(
    ["ClaimID"],
    axis = 1
)

In [5]:
def minmax_encode(df, col):
    """
    Return dataset including the minmax encoded column and excluding the original column

    Constraints:
    - col must be a String
    - df must be a Pandas Dataframe
    - df[col] must be numeric
    """

    maxx = df[col].max()
    minx = df[col].min()
    out = list(map(lambda x: (x-minx)/(maxx-minx), df[col]))
    new_colname = col + "_minmax"
    df[new_colname] = out
    return df.drop(
        [col],
        axis = 1
    )

def one_hot_encode(df, col):
    """
    Returns the dataset including the one hot encoded columns and excluding the original column

    Constraints:
    - col must be a String
    - df must be a Pandas Dataframe
    - df[col] must be a Series that represents a categorical variable
    """
    ohe_cols = pd.get_dummies(df[col], prefix = col)
    output = pd.concat(
        [df, ohe_cols],
        axis = 1,
    ).drop(
        [col],
        axis = 1
    )
    return output

def frequency_encode(df, col):
    """
    Returns the dataset including the frequency encoded column and excluding the original column

    Constraints:
    - col must be a String
    - df must be a Pandas Dataframe
    - df[col] must be a Series that represents a categorical variable with high cardinality
    """
    val_counts = df[col].value_counts().to_dict()
    total = len(df[col])
    out = []
    for x in df[col]:
        out.append(val_counts[x]/total)
    new_colname = col + '_freq'
    df[new_colname] = out
    df.drop(
        [col],
        axis = 1,
        inplace = True
    )

    return df

In [6]:
freq_encoded_cols = []
ohe_cols = []
num_cols = []
unique_threshold = 30

for col in train_data.columns:
    if train_data[col].nunique() == 1:
        print(col,"has been removed as it is constant")
        train_data.drop([col], axis=1, inplace=True)
    elif train_data[col].nunique() == 2: # Binary columns
        continue
    elif train_data[col].dtype in ['int64','float64']:
        train_data[col] = train_data[col].fillna(train_data[col].median())
        num_cols.append(col)
    elif train_data[col].nunique() > unique_threshold:
        freq_encoded_cols.append(col)
    elif 2 < train_data[col].nunique() <= unique_threshold:
        ohe_cols.append(col)

procedure_1 has been removed as it is constant
procedure_2 has been removed as it is constant
procedure_3 has been removed as it is constant


In [7]:
for col in train_data.columns:
    if col in num_cols:
        train_data = minmax_encode(train_data, col)
    elif col in ohe_cols:
        train_data = one_hot_encode(train_data, col)
    elif col in freq_encoded_cols:
        try:
            train_data = frequency_encode(train_data, col)
        except:
            print(col)
    elif train_data[col].nunique() == 1:
        train_data.drop(
            [col],
            axis = 1,
            inplace = True
        )
    else:
        train_data[col] = train_data[col].astype('bool')

In [8]:
y = train_data["PotentialFraud"]
train_data.drop(
    ["PotentialFraud"],
    axis = 1,
    inplace = True
)

In [9]:
X_train, X_valid, y_train, y_valid = train_test_split(train_data, y, test_size=0.2, random_state=42)

## GAN Model

In [10]:
# Define GAN model

class Generator(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.model = nn.Sequential(
            nn.Linear(self.input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, self.output_dim),
            nn.Sigmoid() # must be between 0 and 1
        )

    def forward(self, x):
        return self.model(x)

class Discriminator(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.model = nn.Sequential(
            nn.Linear(self.input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, 1024),
            nn.ReLU(),
            nn.Linear(1024, self.output_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

In [11]:
device = ""
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [12]:
from random import choice, random, randint
def generate_one_random_data():
    categorical = [randint(0, 1) for _ in range(49)]
    numerical = [random() for _ in range(11)]
    return [*categorical, *numerical]

In [13]:
gen_input_dim = 30
generator = Generator(gen_input_dim, X_train.shape[1]).to(device=device)
discriminator = Discriminator(X_train.shape[1], 1).to(device=device)

In [14]:
batch_size = 1024
lr = 0.0001
num_epochs = 10
loss_function = nn.BCELoss()

optimizer_discriminator = torch.optim.Adam(discriminator.parameters(), lr=lr)
optimizer_generator = torch.optim.Adam(generator.parameters(), lr=lr)

In [15]:
train = data_utils.TensorDataset(torch.from_numpy(X_train.values.astype(np.float64)).float(), torch.from_numpy(y_train.values.astype(np.float64)).float())
train_loader = data_utils.DataLoader(train, batch_size=batch_size, shuffle=True)

In [16]:
for epoch in range(num_epochs):
    for n, (real_samples, _) in enumerate(train_loader):
        real_samples = real_samples.to(device=device)
        real_samples_labels = torch.ones((batch_size, 1)).to(
            device=device
        )
        latent_space_samples = torch.randn((batch_size, gen_input_dim)).to(
            device=device
        )
        generated_samples = generator(latent_space_samples)
        generated_samples_labels = torch.zeros((batch_size, 1)).to(
            device=device
        )
        all_samples = torch.cat((real_samples, generated_samples))
        all_samples_labels = torch.cat(
            (real_samples_labels, generated_samples_labels)
        )

        discriminator.zero_grad()
        output_discriminator = discriminator(all_samples)
        loss_discriminator = loss_function(
            output_discriminator, all_samples_labels
        )
        loss_discriminator.backward()
        optimizer_discriminator.step()

        latent_space_samples = torch.randn((batch_size, gen_input_dim)).to(
            device=device
        )

        generator.zero_grad()
        generated_samples = generator(latent_space_samples)
        output_discriminator_generated = discriminator(generated_samples)
        loss_generator = loss_function(
            output_discriminator_generated, real_samples_labels
        )
        loss_generator.backward()
        optimizer_generator.step()

        if n == X_train.shape[0]//batch_size - 1:
            print(f"Epoch: {epoch}\tLoss D.: {loss_discriminator}\tLoss G.: {loss_generator}")
            break

Epoch: 0	Loss D.: 0.020049110054969788	Loss G.: 5.391129016876221
Epoch: 1	Loss D.: 0.0541888065636158	Loss G.: 6.2767839431762695
Epoch: 2	Loss D.: 0.09054140746593475	Loss G.: 10.610733032226562
Epoch: 3	Loss D.: 0.025492262095212936	Loss G.: 7.796809196472168
Epoch: 4	Loss D.: 0.08897071331739426	Loss G.: 5.00631046295166
Epoch: 5	Loss D.: 0.0001852016430348158	Loss G.: 8.411347389221191
Epoch: 6	Loss D.: 0.003181146690621972	Loss G.: 5.727866172790527
Epoch: 7	Loss D.: 0.0005961034912616014	Loss G.: 8.99804401397705
Epoch: 8	Loss D.: 0.006344071589410305	Loss G.: 6.71708869934082
Epoch: 9	Loss D.: 0.00237495475448668	Loss G.: 5.706980228424072


In [17]:
xt = torch.from_numpy(X_train.values.astype(np.float64)).float()
dxt = discriminator(xt)

In [18]:
# Accuracy of discriminator classifying a real training sample as real
y_pred = [round(dxt[i][0].item()) for i in range(dxt.shape[0])]
print(sum(y_pred)/len(y_pred))

0.9986228301176976


In [19]:
xt = torch.from_numpy(X_valid.values.astype(np.float64)).float()
dxt = discriminator(xt)

In [20]:
# Accuracy of discriminator classifying a real validation sample as real
y_pred = [round(dxt[i][0].item()) for i in range(dxt.shape[0])]
print(sum(y_pred)/len(y_pred))

0.9983966751162187


In [21]:
# Accuracy of discriminator classifying a fake validation sample as fake
generated_samples = generator(torch.randn((100_000, gen_input_dim)).to(device=device)).detach()
dxt = discriminator(generated_samples)
y_pred = [1-round(dxt[i][0].item()) for i in range(dxt.shape[0])]
print(sum(y_pred)/len(y_pred))

1.0


In [22]:
generated_samples[0]

tensor([1.0000e+00, 1.0000e+00, 6.8942e-09, 1.5068e-05, 1.0000e+00, 1.0000e+00,
        8.1041e-12, 7.1588e-07, 1.0000e+00, 1.0000e+00, 1.0000e+00, 3.6914e-08,
        1.4657e-07, 9.9994e-01, 1.0000e+00, 6.4534e-09, 1.2569e-09, 9.9997e-01,
        9.0677e-07, 1.9429e-03, 9.9999e-01, 1.0000e+00, 2.0189e-09, 3.8514e-13,
        1.7070e-07, 1.3709e-29, 2.1276e-07, 6.7969e-08, 1.5389e-19, 1.3059e-29,
        1.0700e-06, 9.9923e-01, 1.1826e-10, 7.3786e-08, 1.2279e-29, 1.5390e-21,
        9.9945e-01, 3.5383e-08, 1.5852e-26, 9.9998e-01, 2.6171e-13, 9.3826e-10,
        2.1972e-25, 1.1349e-30, 2.7508e-24, 1.0446e-30, 1.5430e-16, 4.7654e-13,
        1.2049e-10, 1.3134e-28, 1.0037e-15, 9.9978e-01, 8.5892e-10, 5.8571e-08,
        9.6560e-09, 1.4167e-18, 5.8883e-18, 2.3362e-06, 1.1612e-12, 1.1260e-01])