In [97]:
import pandas as pd
import numpy as np
from itertools import chain
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

import torch
from torch import Tensor
from torch.nn import Module, Sigmoid, Linear, ReLU, Softmax

from torch.utils.data import Dataset, random_split, DataLoader
from torch.optim import SGD
from torch.nn.init import kaiming_uniform_, xavier_uniform_
from torch.nn import CrossEntropyLoss, BCELoss


In [8]:
# Read the cleaned CSV and keep only the age, area and application_outcome columns
filepath = '/Users/root947/Desktop/Oodle/df_cleaned.csv'
df = pd.read_csv(filepath).loc[:, ['age', 'area', 'application_outcome']]

# Preview the first rows
df.head()

Unnamed: 0,age,area,application_outcome
0,61,rural,approved
1,42,rural,approved
2,57,urban,approved
3,32,rural,declined
4,29,urban,declined


In [34]:
# Preprocessing the data - making it ALL numerical

# Encode the categorical `area` column as integer codes.
# It gets its own encoder so the area mapping (`area_le.classes_`,
# `area_le.inverse_transform`) is still available after the outcome is encoded.
area_le = LabelEncoder()
df['area'] = area_le.fit_transform(df['area'].values)

# Scale the age column to between 0 and 1.
# MinMaxScaler needs a 2-D input, hence the reshape; ravel() flattens the
# (n, 1) result back into a single column.
scaler = MinMaxScaler(feature_range=(0, 1))
age_t = scaler.fit_transform(df['age'].values.reshape(-1, 1))
df['age'] = age_t.ravel()

# Encode the outcome variable as integer class labels.
# `le` ends up fitted on the outcome column, exactly as before; LabelEncoder
# numbers the classes in sorted order, so inspect `le.classes_` for the mapping.
le = LabelEncoder()
df['application_outcome'] = le.fit_transform(df['application_outcome'])

In [77]:
#dataset definition - using Dataset

class CSVDataset(Dataset):
    """In-memory tabular dataset: all columns but the last are features, the last is the label.

    Args:
        frame: optional pandas DataFrame whose columns are all numeric, with the
            label in the last column. When omitted, the notebook-level ``df`` is
            used, so ``CSVDataset()`` behaves as it always has.
    """

    def __init__(self, frame=None):
        # Fall back to the notebook-global frame to stay backward-compatible.
        source = df if frame is None else frame
        values = source.values

        # store the input and output features as float32
        self.X = values[:, :-1].astype('float32')  # all rows, every column except the last
        self.y = values[:, -1].astype('float32')   # all rows, last column only

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.X)

    def __getitem__(self, index):
        """Return ``[features, label]`` for the row at ``index``."""
        return [self.X[index], self.y[index]]

    def get_splits(self, split_ratio=0.2, seed=None):
        """Randomly split the dataset into train and test subsets.

        Args:
            split_ratio: fraction of rows assigned to the test subset (0..1).
            seed: optional integer; when given, the split is reproducible.

        Returns:
            ``[train_subset, test_subset]`` as produced by ``random_split``.

        Raises:
            ValueError: if ``split_ratio`` is outside ``[0, 1]``.
        """
        if not 0 <= split_ratio <= 1:
            raise ValueError(f"split_ratio must be within [0, 1], got {split_ratio!r}")

        test_size = round(split_ratio * len(self.X))
        train_size = len(self.X) - test_size
        lengths = [train_size, test_size]

        if seed is None:
            # Unseeded: draws from torch's global RNG, same as the original behaviour.
            return random_split(self, lengths)

        # A dedicated generator keeps the split reproducible without touching global RNG state.
        generator = torch.Generator().manual_seed(seed)
        return random_split(self, lengths, generator=generator)

In [90]:
# Model definition - using Module

class myMLPNetwork(Module):
    """Small multilayer perceptron for two-class classification.

    Architecture: ``n_inputs -> 8 -> 16 -> 2`` with ReLU after the first two
    linear layers.

    ``forward`` returns unnormalised class scores (logits). ``CrossEntropyLoss``
    applies log-softmax internally, so feeding it already-softmaxed values
    normalises twice and flattens the gradients. Use ``predict_proba`` when
    actual probabilities are needed; ``argmax`` gives the same class either way.

    Args:
        n_inputs: number of input features per sample.
    """

    def __init__(self, n_inputs):
        # calling constructor of parent class
        super().__init__()

        # first hidden layer: He (Kaiming) uniform init, suited to ReLU
        self.hid1 = Linear(n_inputs, 8)  # equivalent to keras's dense layer
        kaiming_uniform_(self.hid1.weight, nonlinearity='relu')
        self.act1 = ReLU()

        # second hidden layer
        self.hid2 = Linear(8, 16)
        kaiming_uniform_(self.hid2.weight, nonlinearity='relu')
        self.act2 = ReLU()

        # output layer: one score per class, Xavier uniform init
        self.hid3 = Linear(16, 2)
        xavier_uniform_(self.hid3.weight)
        # Softmax is kept as an attribute but only applied in predict_proba.
        self.act3 = Softmax(dim=1)

    def forward(self, X):
        """Return class logits of shape ``(batch, 2)`` for a ``(batch, n_inputs)`` input."""
        X = self.act1(self.hid1(X))
        X = self.act2(self.hid2(X))
        return self.hid3(X)

    def predict_proba(self, X):
        """Return class probabilities (each row sums to 1) for the given inputs."""
        return self.act3(self.forward(X))
        

In [91]:
# Build the dataset and its batch loaders ahead of training

# wrap the preprocessed frame and split it with the default ratio
dataset = CSVDataset()
train, test = dataset.get_splits()

# Batches of 32 for both subsets. Training batches are reshuffled on every pass;
# test batches keep their order so the labels line up for a confusion matrix later.
train_dl = DataLoader(dataset=train, batch_size=32, shuffle=True)
test_dl = DataLoader(dataset=test, batch_size=32, shuffle=False)

In [92]:
# Training the model

# define the network
model = myMLPNetwork(2)  # 2 because we only have 2 input features

# define the number of epochs
epochs = 10

# define the optimizer - SGD with momentum
optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)

# define the loss function
# NOTE: CrossEntropyLoss applies log-softmax itself and expects integer class
# indices as targets, so the model output passed to it should be raw scores.
criterion = CrossEntropyLoss()

# iterate through all the epochs
for epoch in range(epochs):
    # go through all the batches generated by the dataloader
    for inputs, targets in train_dl:
        # clear the gradients left over from the previous step
        optimizer.zero_grad()
        # compute the model output
        yhat = model(inputs)
        # calculate loss; .long() casts the labels to int64 on whatever device they live on
        loss = criterion(yhat, targets.long())
        # credit assignment
        loss.backward()
        # update model weights
        optimizer.step()
    

In [236]:
# Evaluate the model
# Collect one predicted class and one true class per test row, as flat Python lists.
predictions, actuals = [], []

# Inference only: switch to eval mode and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    # loop over all batches in the test set
    for inputs, targets in test_dl:
        # pass input to the model -> one score per class for every row
        y_pred = model(inputs)
        # pick the index of the highest score as the predicted class
        predictions.extend(y_pred.argmax(dim=1).tolist())
        # labels are stored as float32 in the dataset; cast back to integer class ids
        actuals.extend(targets.long().tolist())

    

In [237]:
# Evaluating how good the model is: fraction of test rows classified correctly
from sklearn.metrics import accuracy_score

accuracy_score(y_true=actuals, y_pred=predictions)

0.7495