In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.autograd import Variable
from sklearn.preprocessing import StandardScaler

%matplotlib inline                               


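Download the data from https://drive.google.com/file/d/1rT7jbjLd4vGw6WKD3IZxeNRipe-Uu3ZC/view?usp=sharing. The next cell expects `train_family_relation.pkl`, `valid_family_relation.pkl` and `test_family_relation.pkl` in the notebook's working directory.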

In [None]:
# Read the train / validation / test splits from the working directory.
# NOTE: unpickling can execute arbitrary code -- only load copies of these
# files that come from a source you trust.
train_df, valid_df, test_df = map(
    pd.read_pickle,
    (
        "./train_family_relation.pkl",
        "./valid_family_relation.pkl",
        "./test_family_relation.pkl",
    ),
)


In [None]:
## Family DataSet
def _feature_matrix(frame):
    """Return every column except 'label' and 'class' as a float64 array."""
    return frame.drop(['label', 'class'], axis=1).values.astype(np.float64)


def _class_vector(frame):
    """Return the 'class' column as an int64 array."""
    return frame['class'].values.astype(np.int64)


def _to_dataset(features, classes):
    """Pair float32 feature tensors with their class tensors."""
    return TensorDataset(torch.from_numpy(features).float(), torch.from_numpy(classes))


# Standardise the features; the scaler is fitted on the training split only
# and then re-used unchanged for the validation and test splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(_feature_matrix(train_df))
X_valid = scaler.transform(_feature_matrix(valid_df))
X_test = scaler.transform(_feature_matrix(test_df))

# Targets come from the 'class' column of each split.
y_train_class = _class_vector(train_df)
y_valid_class = _class_vector(valid_df)
y_test_class = _class_vector(test_df)

# Wrap each split as a TensorDataset of (features, class) pairs.
train_dataset = _to_dataset(X_train, y_train_class)
valid_dataset = _to_dataset(X_valid, y_valid_class)
test_dataset = _to_dataset(X_test, y_test_class)

# Mini-batch loaders keyed by split name; only the training split is shuffled.
family_loaders = {
    'train': DataLoader(train_dataset, batch_size=200, shuffle=True),
    'valid': DataLoader(valid_dataset, batch_size=200),
    'test': DataLoader(test_dataset, batch_size=200),
}


## Family Model Development

In [None]:
class Net(nn.Module):
    """Fully connected classifier that returns per-class log-probabilities.

    Four hidden linear layers (700, 500, 200 and 50 units) are each followed
    by ReLU and dropout; a final linear layer feeds a log-softmax over
    dimension 1, so the output is suitable for ``nn.NLLLoss``.

    Args:
        in_features: width of each input row (default 1024).
        num_classes: number of output classes (default 2).
        dropout_p: dropout probability used after every hidden layer
            (default 0.5).
    """

    def __init__(self, in_features=1024, num_classes=2, dropout_p=0.5):
        super().__init__()
        # Attribute names and creation order are kept stable: the names become
        # the state_dict keys that saved checkpoints are matched against.
        self.fc1 = nn.Linear(in_features, 700)
        self.fc2 = nn.Linear(700, 500)
        self.fc3 = nn.Linear(500, 200)
        self.fc4 = nn.Linear(200, 50)
        self.output = nn.Linear(50, num_classes)

        self.dropout = nn.Dropout(p=dropout_p)
        self.logSoftMax = nn.LogSoftmax(dim=1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map a (batch, in_features) tensor to (batch, num_classes) log-probabilities."""
        # Every hidden layer shares the same linear -> ReLU -> dropout pattern.
        for hidden in (self.fc1, self.fc2, self.fc3, self.fc4):
            x = self.dropout(self.relu(hidden(x)))
        return self.logSoftMax(self.output(x))

# Build the network and, when CUDA is available, move its parameters to the GPU.
use_cuda = torch.cuda.is_available()
model = Net()
if use_cuda:
    model = model.cuda()

In [None]:
# Adam updates every model parameter with a learning rate of 0.005; the
# negative log-likelihood loss consumes the network's log-softmax output.
optimizer = optim.Adam(params=model.parameters(), lr=5e-3)
criterion = nn.NLLLoss()

In [None]:
"""returns trained model"""
# Train for a fixed number of epochs and checkpoint the weights every time the
# validation loss reaches a new minimum.
n_epochs = 200
print_every = 10  # epochs between progress reports

# Lowest validation loss seen so far. float('inf') is used instead of np.Inf,
# which no longer exists in NumPy 2.0.
valid_loss_min = float('inf')

for epoch in range(n_epochs):
    # running means of the per-batch losses for this epoch
    train_loss = 0.0
    valid_loss = 0.0

    ###################
    # train the model #
    ###################
    model.train()
    for batch_idx, (data, target) in enumerate(family_loaders['train']):
        # move to GPU
        if use_cuda:
            data, target = data.cuda(), target.cuda()
        optimizer.zero_grad()
        # forward pass, loss, backward pass, parameter update
        pred = model(data)
        loss = criterion(pred, target)
        loss.backward()
        optimizer.step()
        # incremental mean over batches; .item() keeps the tracker a plain
        # Python float instead of a (possibly CUDA) tensor
        train_loss += (1 / (batch_idx + 1)) * (loss.item() - train_loss)

    ######################
    # validate the model #
    ######################
    model.eval()
    # no gradients are needed for validation, so skip building autograd graphs
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(family_loaders['valid']):
            # move to GPU
            if use_cuda:
                data, target = data.cuda(), target.cuda()
            pred = model(data)
            loss = criterion(pred, target)
            valid_loss += (1 / (batch_idx + 1)) * (loss.item() - valid_loss)

    if epoch % print_every == 0:
        # print training/validation statistics
        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
            ))

    # Save the weights whenever the validation loss improves. The message is
    # only printed on reporting epochs, but the checkpoint is written on every
    # improvement.
    if valid_loss < valid_loss_min:
        if epoch % print_every == 0:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            valid_loss_min,
            valid_loss))
        torch.save(model.state_dict(), "checkpoint.cpt")
        valid_loss_min = valid_loss

In [None]:
# Evaluate the checkpointed weights on the test split.
# map_location='cpu' lets a checkpoint written on a GPU machine be read on any
# machine; load_state_dict then copies the values into the model's own
# parameters, wherever they live.
model.load_state_dict(torch.load('checkpoint.cpt', map_location='cpu'))

# running mean of the per-batch test loss, plus accuracy counters
test_loss = 0.
correct = 0
total = 0

model.eval()
# inference only: no autograd graphs are needed
with torch.no_grad():
    for batch_idx, (data, target) in enumerate(family_loaders['test']):
        # move to GPU
        if use_cuda:
            data, target = data.cuda(), target.cuda()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, target)
        # update average test loss
        test_loss += (1 / (batch_idx + 1)) * (loss.item() - test_loss)
        # predicted class = index of the largest log-probability in each row
        pred = output.argmax(dim=1)
        # count predictions that match the true class
        correct += (pred == target).sum().item()
        total += data.size(0)

print('Test Loss: {:.6f}\n'.format(test_loss))

# guard against an empty test loader, which would otherwise divide by zero
accuracy = 100. * correct / total if total else 0.
print('\nTest Accuracy: %2d%% (%2d/%2d)' % (
    accuracy, correct, total))