In [0]:
### This file is written to run on Google Colab; it can also run locally with modifications.
### Import PyTorch and other relevant packages
import torch
import torch.nn as nn
### Import MNIST dataset 
from torchvision.datasets import MNIST
### Load Numpy and Matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.colors import Normalize
import numpy as np
import pandas as pd

import seaborn as sns

import torch.optim as optim
from torch.utils.data import TensorDataset
import torch.nn.functional as F

from graphviz import Digraph

from DataProcessor import DataInstance, DataProcesser

import sklearn.metrics as mec

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_samples, silhouette_score, adjusted_rand_score

import csv

import os
import math
import struct

from imblearn.over_sampling import RandomOverSampler

from sklearn.metrics import confusion_matrix

from google.colab import files

# Print tensors in plain decimal notation rather than scientific notation.
torch.set_printoptions(sci_mode=False)

In [0]:
# List the current working directory (os.listdir() without an argument uses ".");
# e.g. to check that the data file referenced by a relative path is present.
os.listdir()

In [0]:
# Load the dataset through the project helper DataProcesser.load
# (the on-disk format is not visible from this notebook).
ProcessedSet = DataProcesser.load('./processed-fasttext-raw.bin') # fasttext processed data
# Seed shared by the oversampler, the NumPy shuffle in Splitter() and torch.manual_seed().
seed = 0
# Passed to Splitter() as `split`: the fraction of instances that goes to the
# second (validation) subset; the training subset receives 1 - TrainingSplit.
TrainingSplit = .5

In [0]:
def NavieSampler(Processor):
  """Randomly oversample the instances of `Processor`.

  Every instance is wrapped in a one-element list so that it can be passed
  to imblearn's RandomOverSampler as a single-feature row, resampled against
  `Processor.labels()`, and unwrapped again afterwards.

  The sampler is seeded with the module-level `seed`. Returns a new
  DataProcesser built from the resampled instances.
  """
  sampler = RandomOverSampler(random_state=seed)

  wrapped = [[instance] for instance in Processor.instances]
  targets = Processor.labels()

  # Only the resampled rows are needed; the resampled labels are discarded.
  resampled, _ = sampler.fit_resample(wrapped, targets)

  return DataProcesser.FromInstances([row[0] for row in resampled])


def Splitter(Processor, split = .2):
  """Randomly partition the instances of `Processor` into two DataProcessers.

  Processor -- holds all the data; its `.instances` are partitioned.
  split     -- split ratio: the fraction of instances assigned to the second
               returned set. The first set receives the remaining 1 - split.

  NumPy's global RNG is re-seeded with the module-level `seed` on every call,
  so the same shuffle is produced each time.
  """
  instances = Processor.instances
  total = len(instances)

  np.random.seed(seed)
  # Drawing `total` indices without replacement gives a random permutation.
  order = np.random.choice(total, total, replace=False).astype(int)

  # Number of instances that end up in the first set.
  cut = round((1.0 - split) * total)

  pool = np.array(instances)
  first = pool[order[:cut]]
  second = pool[order[cut:]]

  return DataProcesser.FromInstances(first),  DataProcesser.FromInstances(second)


# Shuffle the full dataset and split it into a training and a validation subset.
Train, Valid = Splitter(ProcessedSet, TrainingSplit)


# Model input/output sizes, read from the first training example.
# assumes preds()/labels() yield one vector per example -- TODO confirm
dimX = Train.preds()[0].shape[0]
dimY = Train.labels()[0].shape[0]


# Oversample the training subset only (Valid stays as split), then call
# Train.cuda() (presumably moves its tensors to the GPU -- verify in DataProcessor).
Train = NavieSampler(Train)
Train.cuda()

# (input, label) pairs that the DataLoader in the training cell iterates over.
TrainingSet = TensorDataset(Train.preds(), Train.labels())

# Instance counts: full set, oversampled training set, validation set.
print(len(ProcessedSet.instances))
print(len(Train.instances))
print(len(Valid.instances))


In [0]:
### BaseLine class
class BaseLine(nn.Module):
    """Baseline classifier: a single linear layer followed by a sigmoid.

    Args:
        in_features: size of each input vector. When omitted, the
            notebook-level `dimX` is used.
        out_features: number of outputs per example. When omitted, the
            notebook-level `dimY` is used.
    """

    def __init__(self, in_features=None, out_features=None):
        super(BaseLine, self).__init__()
        # Fall back to the notebook globals so the existing `BaseLine()` call
        # keeps working; explicit sizes make the class usable on its own.
        if in_features is None:
            in_features = dimX
        if out_features is None:
            out_features = dimY
        ### Linear layer producing one score per output
        self.out = nn.Linear(in_features, out_features)
        ### Element-wise sigmoid applied to the scores
        self.logit = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid(linear(x)); every element lies in [0, 1]."""
        scores = self.out(x)
        return self.logit(scores)

def evaluate(Processer, model):
  """Score `model` on the data held by `Processer`.

  The model is run on `Processer.preds()` with gradient tracking disabled.
  The processer and the model outputs are then moved to the CPU, the outputs
  are scaled row-wise by their maximum and rounded, and the result is
  compared with `Processer.labels()`.

  Returns `(report, (true, rounded))`, where `report` is the text produced by
  sklearn's classification_report.

  Side effect: `Processer.cpu()` is called and the processer is not moved back.
  """
  def scale_and_round(scores):
    # Divide every row by its largest entry, then round element-wise.
    values = scores.numpy()
    row_max = values.max(axis=1)
    return (values.T / row_max).T.round()

  with torch.no_grad():
    outputs = model(Processer.preds())

  Processer.cpu()
  outputs = outputs.cpu()

  true = Processer.labels()
  rounded = scale_and_round(outputs.squeeze(1))

  report = mec.classification_report(true, rounded)
  return report, (true, rounded)


def plotTrainingSession(loss_train, loss_valid):
  """Plot the training and validation loss sequences on a single figure.

  loss_train -- sequence of training loss values (one point per entry).
  loss_valid -- sequence of validation loss values (one point per entry).
  """
  plt.figure(figsize=(10, 5))

  curves = ((loss_train, 'Training loss'), (loss_valid, 'Validation loss'))
  for values, name in curves:
    plt.plot(values, label=name)

  plt.xlabel('Epoch')
  plt.ylabel('BCE Batch Loss')
  plt.legend(loc="upper right")
  plt.show()

def _row_normalized_confusion(labels, rounded):
  """Sum the rounded predictions per true class and scale each row to sum to 1."""
  rounded = np.asarray(rounded)
  # Size the matrix from the prediction width instead of a hard-coded class count.
  n_classes = rounded.shape[-1]
  counts = np.zeros((n_classes, n_classes))

  for predicted, target in zip(rounded, labels):
    # The first position equal to 1 in the label vector is taken as the true class.
    true_class = np.where(np.asarray(target) == 1.0)[0][0]
    counts[true_class] += predicted

  totals = counts.sum(axis=1, keepdims=True)
  # Rows without any accumulated prediction stay all-zero instead of 0/0 = NaN.
  return np.divide(counts, totals, out=np.zeros_like(counts), where=totals != 0)


def ConfusionPlot(Processed, model):
  """Plot a row-normalised confusion matrix of `model` on `Processed`.

  Processed -- data container; it is moved with `.cuda()` before evaluation
               and left on the CPU (`.cpu()`) afterwards.
  model     -- model handed to `evaluate()`.

  Matrix rows are true classes and columns are predicted classes; each row is
  divided by its sum. The image is drawn untransposed so that the y-axis is
  the true label and the x-axis the predicted label, matching the axis titles.
  """
  Processed.cuda()
  _, (labels, rounded) = evaluate(Processed , model)
  Processed.cpu()

  Map = _row_normalized_confusion(labels, rounded)
  classes = range(Map.shape[0])

  plt.figure(figsize = (20,20))
  # imshow puts array rows on the y-axis and columns on the x-axis, so plotting
  # Map directly gives y = true class, x = predicted class.
  plt.imshow(Map, interpolation='nearest')
  plt.clim(0.0,1.0)
  plt.colorbar()
  tick_marks = np.arange(len(classes))
  plt.xticks(tick_marks, classes, rotation=45)
  plt.yticks(tick_marks, classes)
  plt.tight_layout()
  plt.ylabel('True label')
  plt.xlabel('Predicted label')
  plt.show()


# Visible confirmation that this cell ran through to the end.
print("Loaded")

In [0]:
torch.manual_seed(seed) # set fixed random seed for reproducibility

model = BaseLine().cuda()

lr = 0.01

optimizer = optim.Adam(model.parameters(), lr=lr)
# Binary cross-entropy on the sigmoid outputs of the model.
loss_function = nn.BCELoss()

# Per-epoch loss history, consumed by plotTrainingSession().
trainLossEpochs = []
validLossEpochs = []

epochs = 25

train_loader = torch.utils.data.DataLoader(TrainingSet,
    batch_size=20, shuffle=True)
# len(DataLoader) is the number of batches per epoch. Averaging over this
# count (not the last batch index, which is one less) gives the true mean
# batch loss and cannot divide by zero when there is a single batch.
num_batches = max(len(train_loader), 1)

Valid.cuda()
for epoch in range(1, epochs + 1):
    model.train()
    running_loss = 0
    for inputs, labels in train_loader:
        optimizer.zero_grad()

        outputs = model(inputs.float())

        loss = loss_function(outputs, labels.float())
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    average_loss = running_loss / num_batches
    print('====> Epoch: {} Total Loss: {:.4f} Average Batch Loss {:.4f}\r'.format(
          epoch, running_loss, average_loss))
    trainLossEpochs.append(average_loss)

    # Validation on the whole validation set in one pass, without gradients.
    model.eval()
    with torch.no_grad():
      # Cast like the training inputs so both paths feed the model the same dtype.
      out = model(Valid.preds().float())
      validLoss = loss_function(out, Valid.labels().float()).item()
      validLossEpochs.append(validLoss)
      print('===> Validation Loss {}'.format(validLoss))
    

In [0]:
# The model lives on the GPU and evaluate() feeds it `preds()` directly, while
# evaluate() itself leaves its processer on the CPU. Put all three sets back on
# the GPU before the evaluation cell below.
Train.cuda()
Valid.cuda()
ProcessedSet.cuda()

In [0]:
# Loss curves first, then one classification report per data subset.
print("Loss plot")
plotTrainingSession(trainLossEpochs, validLossEpochs)

report_targets = (("Training", Train), ("Validation", Valid), ("Over All", ProcessedSet))
for position, (heading, subset) in enumerate(report_targets):
  if position > 0:
    # Separator between consecutive reports.
    print("--"*20)
  print(heading)
  evalText, matrix = evaluate(subset, model)
  print(evalText)

In [0]:
# One confusion plot per data subset, separated by a dashed line.
plot_targets = (("Training", Train), ("Validation", Valid), ("Over All", ProcessedSet))
for position, (heading, subset) in enumerate(plot_targets):
  if position > 0:
    print("--"*20)
  print(heading)
  ConfusionPlot(subset, model)