## Imports

In [1]:
import pandas as pd
import numpy as np
import os  
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

## Preprocessing

In [2]:
def filepath(f):
    """
    Return the path of file `f` inside the 'processed_data' folder located in
    the parent directory of the current working directory.

    Constraints:
    - f must be a String (a file name such as "final_training_set.csv")
    """
    parent_dir = os.path.dirname(os.getcwd())
    data_dir = os.path.join(parent_dir, 'processed_data')
    return os.path.join(data_dir, f)

In [3]:
# Read the training and test CSV files from the processed_data folder
# (training set first, then test set).
train_data, test_data = (
    pd.read_csv(filepath(csv_name))
    for csv_name in ("final_training_set.csv", "final_test_set.csv")
)

In [4]:
# Set the ClaimID column aside in `uid` and continue with a frame that no
# longer contains it. Both right-hand sides are evaluated on the original
# frame before either name is rebound.
uid, train_data = train_data["ClaimID"], train_data.drop(columns=["ClaimID"])

In [5]:
def minmax_encode(df, col):
    """
    Return dataset including the minmax encoded column and excluding the original column

    The new column is named `col + "_minmax"`, is appended as the last column
    and holds (x - min) / (max - min) for every value x of df[col].
    Missing values are ignored when computing min/max and stay missing in the
    result. A column whose max equals its min is encoded as 0.0 everywhere
    instead of dividing by zero.

    The input dataframe is left untouched; a new dataframe is returned.

    Constraints:
    - col must be a String
    - df must be a Pandas Dataframe
    - df[col] must be numeric
    """
    values = df[col]
    minx = values.min()
    span = values.max() - minx
    shifted = values - minx

    if span == 0:
        # Constant column: avoid 0/0. Multiplying (rather than building a
        # fresh zero vector) keeps missing values missing.
        scaled = shifted * 0.0
    else:
        scaled = shifted / span

    out = df.drop(columns=[col])
    # Assign positionally so the result does not depend on index alignment
    out[col + "_minmax"] = scaled.to_numpy()
    return out

def one_hot_encode(df, col):
    """
    Returns the dataset including the one hot encoded columns and excluding the original column

    One indicator column per distinct value of df[col] is appended after the
    remaining columns; each is named `<col>_<value>`. The input dataframe is
    not modified.

    Constraints:
    - col must be a String
    - df must be a Pandas Dataframe
    - df[col] must be a Series that represents a categorical variable
    """
    indicators = pd.get_dummies(df[col], prefix=col)
    remaining = df.drop(columns=[col])
    return pd.concat([remaining, indicators], axis=1)

def frequency_encode(df, col):
    """
    Returns the dataset including the frequency encoded column and excluding the original column

    Every value of df[col] is replaced by its relative frequency, i.e. the
    number of rows holding that value divided by the total number of rows.
    Missing values are treated as one category of their own, so they receive
    (number of missing rows) / (total rows) instead of raising a KeyError.
    The result is stored in a new last column named `col + "_freq"`.

    NOTE: df is modified in place (new column added, original column dropped)
    and the same object is returned.

    Constraints:
    - col must be a String
    - df must be a Pandas Dataframe
    - df[col] must be a Series that represents a categorical variable with high cardinality
    """
    column = df[col]
    # Denominator is the number of rows, not the length of the column name
    total = len(column)

    # value_counts() skips missing values, so they map to NaN here ...
    counts = column.value_counts()
    freqs = column.map(counts).astype("float64") / total

    # ... and are filled in with the share of missing rows.
    missing = column.isna()
    if missing.any():
        freqs = freqs.mask(missing, missing.sum() / total)

    # Assign positionally so the result does not depend on index alignment
    df[col + '_freq'] = freqs.to_numpy()
    df.drop(
        [col],
        axis = 1,
        inplace = True
    )

    return df

In [6]:
# Column names grouped by the encoding they will receive later on
freq_encoded_cols, ohe_cols, num_cols = [], [], []
# Non-numeric columns with more distinct values than this are frequency
# encoded; those with fewer (but more than two) are one-hot encoded.
unique_threshold = 30

# Iterate over a snapshot of the labels, since constant columns are dropped
# from train_data inside the loop.
for col in list(train_data.columns):
    n_unique = train_data[col].nunique()

    if n_unique == 1:
        print(col,"has been removed as it is constant")
        train_data.drop(columns=[col], inplace=True)
        continue

    if n_unique == 2:
        # Binary columns are left for the encoding step
        continue

    if train_data[col].dtype in ['int64','float64']:
        # Numeric column: impute missing values with the median
        col_median = train_data[col].median()
        train_data[col] = train_data[col].fillna(col_median)
        num_cols.append(col)
        continue

    if n_unique > unique_threshold:
        freq_encoded_cols.append(col)
    elif 2 < n_unique <= unique_threshold:
        ohe_cols.append(col)

procedure_1 has been removed as it is constant
procedure_2 has been removed as it is constant
procedure_3 has been removed as it is constant


In [7]:
def _binary_to_bool(series):
    """
    Encode a two-valued column as booleans.

    A plain astype('bool') turns every non-zero number and every non-empty
    string into True, so a column coded e.g. 1/2 or "No"/"Yes" would collapse
    to a constant True column. Instead, the larger of the two distinct values
    becomes True and everything else (including missing values) False. For
    columns that are already boolean or coded 0/1 this gives the same result
    as astype('bool').

    Columns that do not have exactly two distinct, comparable values fall
    back to astype('bool').
    """
    if series.dtype == bool:
        return series.astype('bool')
    levels = series.dropna().unique()
    if len(levels) != 2:
        return series.astype('bool')
    try:
        positive = max(levels)
    except TypeError:
        # Levels of mixed, non-comparable types
        return series.astype('bool')
    return series == positive


# Apply the encoding chosen for each column. The loop walks the column labels
# that exist when it starts; columns created by the encoders are not visited.
for col in train_data.columns:
    if col in num_cols:
        train_data = minmax_encode(train_data, col)
    elif col in ohe_cols:
        train_data = one_hot_encode(train_data, col)
    elif col in freq_encoded_cols:
        try:
            train_data = frequency_encode(train_data, col)
        except Exception as exc:
            # Stay best-effort, but say why the column was skipped: it remains
            # in train_data in its original, unencoded form.
            print(f"{col}: frequency encoding failed ({exc!r}); column left unencoded")
    elif train_data[col].nunique() == 1:
        train_data.drop(
            [col],
            axis = 1,
            inplace = True
        )
    else:
        train_data[col] = _binary_to_bool(train_data[col])

In [8]:
# Separate the target: pop() returns the PotentialFraud column and removes it
# from train_data in place, leaving only the feature columns.
y = train_data.pop("PotentialFraud")

In [9]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_data,
    y,
    test_size=0.2,
    random_state=42,
)

## GAN Model

In [10]:
# Define GAN model

class Generator(nn.Module):
    """
    Fully connected network input_dim -> 256 -> 512 -> output_dim.

    ReLU follows the first two linear layers and tanh is applied to the
    output, so every output value lies in [-1, 1].

    NOTE(review): features produced by minmax_encode lie in [0, 1] while this
    network emits values in [-1, 1] -- confirm the ranges are meant to differ.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.input_dim, self.output_dim = input_dim, output_dim
        # Layers are created in fc1, fc2, fc3 order
        self.fc1 = nn.Linear(input_dim, 256)
        self.fc2 = nn.Linear(256, 512)
        self.fc3 = nn.Linear(512, output_dim)
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

    def forward(self, x):
        """Run x of shape (..., input_dim) through the network."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = self.relu(layer(hidden))
        return self.tanh(self.fc3(hidden))

class Discriminator(nn.Module):
    """
    Fully connected network input_dim -> 512 -> 256 -> output_dim.

    ReLU follows the first two linear layers and a sigmoid is applied to the
    output, so every output value lies in [0, 1].
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.input_dim, self.output_dim = input_dim, output_dim
        # Layers are created in fc1, fc2, fc3 order
        self.fc1 = nn.Linear(input_dim, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, output_dim)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Run x of shape (..., input_dim) through the network."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = self.relu(layer(hidden))
        return self.sigmoid(self.fc3(hidden))

In [11]:
# Display the encoded training features (last expression of the cell is rendered)
X_train

Unnamed: 0,inpatient,Gender,RenalDiseaseIndicator,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,...,InscClaimAmtReimbursed_minmax,DeductibleAmtPaid_minmax,Race_minmax,IPAnnualReimbursementAmt_minmax,IPAnnualDeductibleAmt_minmax,OPAnnualReimbursementAmt_minmax,OPAnnualDeductibleAmt_minmax,age_minmax,claim_duration_minmax,time_under_care_minmax
359776,False,True,False,False,True,False,False,False,False,True,...,0.00080,0.0,0.00,0.047206,0.000000,0.060856,0.044075,0.750859,0.000000,0.000000
148383,False,True,False,False,False,True,False,False,True,False,...,0.00400,0.0,0.00,0.083319,0.027906,0.028438,0.074422,0.585081,0.000000,0.000000
213438,False,True,False,False,False,False,False,False,False,True,...,0.00024,0.0,0.25,0.047206,0.000000,0.018441,0.015896,0.536182,0.000000,0.000000
508770,False,True,False,False,True,False,False,False,False,True,...,0.00480,0.0,0.00,0.047206,0.000000,0.006503,0.002168,0.561728,0.138889,0.000000
364927,False,True,False,False,False,False,False,False,True,False,...,0.00240,0.0,0.00,0.047206,0.000000,0.039018,0.011561,0.647431,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110268,False,True,False,False,False,True,False,False,False,True,...,0.00008,0.0,0.00,0.047206,0.000000,0.004465,0.012283,0.658541,0.388889,0.000000
259178,False,True,False,False,True,False,False,False,False,False,...,0.00080,0.0,0.00,0.047206,0.000000,0.012424,0.105491,0.555113,0.000000,0.000000
365838,True,True,True,True,False,True,False,False,True,False,...,0.06400,1.0,0.00,0.095120,0.027906,0.012424,0.059971,0.721914,0.166667,0.171429
131932,False,True,False,True,True,False,False,True,False,True,...,0.00008,0.0,0.00,0.047206,0.000000,0.003494,0.002168,0.619618,0.000000,0.000000


In [12]:
# Display the training target (PotentialFraud) for the same rows as X_train
y_train

359776    False
148383    False
213438    False
508770    False
364927     True
          ...  
110268    False
259178    False
365838     True
131932    False
121958    False
Name: PotentialFraud, Length: 446568, dtype: bool