In [0]:
# Install the Hugging Face `transformers` package that the import cell below relies on.
!pip install transformers
# imbalanced-learn provides the RandomOverSampler imported below; uncomment if it is not already available.
#!pip install imbalanced-learn

In [0]:
### This File is made to work on google colab, but can work with modifications locally

# Transformer networks
from transformers import BertTokenizer, BertModel, BertForMaskedLM, BertForPreTraining

# Pytorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset

### Load Numpy and Matplotlib
import numpy as np
import pandas as pd

# Metrics
import sklearn.metrics as mec
from mlxtend.evaluate import confusion_matrix 
from mlxtend.plotting import plot_confusion_matrix

# Plotting
import matplotlib.pyplot as plt
from matplotlib import collections as matcoll
from graphviz import Digraph

# Data handling
from DataProcessor import DataInstance, DataProcesser

# Utils
import itertools
import time
import pickle
from imblearn.over_sampling import RandomOverSampler

# Printing Options
torch.set_printoptions(precision=5) # Printing accuracy
np.set_printoptions(suppress=True)

from google.colab import files # Colab specific

print("Done")

In [0]:
# Hyper parameters
NumberOfLabels = 18 # Number of participants (one classifier output per participant)
BatchSize = 20 # Mini-batch size used by the training DataLoader
NumberOfEpochs = 9 # 9 epochs with this setup seems to be the best balance between the validation and training sets
TrainingSplit = .5 # Passed to Splitter as `split`: the fraction of the data that ends up in the validation set
seed = 0 # Shared seed for numpy, torch and the random over-sampler
lr = 0.00001 # Learning rate
ModelDirectory = 'TurkuNLP/wikibert-base-da-cased' # Model identifier handed to from_pretrained
DataFile = './processed-set-final.bin' # Path of the pre-processed dataset read by DataProcesser.load
print("Done")

In [0]:
# Load the pre-processed dataset from DataFile; later cells read its `.instances`, `.ids()`, `.masks()`, `.types()` and `.labels()`.
ProcessedSet = DataProcesser.load(DataFile) # Loading data

In [0]:
# Give every instance an empty `preds` tensor so that all instances carry the
# attribute in the same form, whether or not predictions existed before.
for inst in ProcessedSet.instances:
    inst.preds = torch.tensor([]) # Placeholder for the not-yet-existing predictions

In [0]:
def NavieSampler(Processor):
    """Return a new DataProcesser whose instances are a random over-sample of Processor's.

    Every instance is wrapped in a one-element row so the sampler can treat it
    as a single-feature sample; ``Processor.labels()`` supplies the targets.
    The sampler is seeded with the notebook-level ``seed``.
    """
    wrapped = [[instance] for instance in Processor.instances]
    targets = Processor.labels()

    sampler = RandomOverSampler(random_state=seed)
    resampled, _ = sampler.fit_resample(wrapped, targets)

    # Unwrap the one-element rows back into plain instances.
    return DataProcesser.FromInstances([row[0] for row in resampled])


def Splitter(Processor, split = .2):
    """Shuffle Processor's instances and cut them into two DataProcessers.

    ``split`` is the fraction of instances that ends up in the second returned
    set; the first set receives the remaining ``1 - split``. The global numpy
    RNG is re-seeded with the notebook-level ``seed`` so the shuffle is
    repeatable.
    """
    np.random.seed(seed)
    instances = Processor.instances
    total = len(instances)

    # Drawing `total` indices without replacement yields a random ordering of all indices.
    order = np.random.choice(total, total, replace=False).astype(int)

    # Number of instances assigned to the first set.
    cut = round((1.0 - split) * total)

    pool = np.array(instances)
    first = pool[order[:cut]]
    second = pool[order[cut:]]

    return DataProcesser.FromInstances(first),  DataProcesser.FromInstances(second)


# Shuffle and split the full dataset; TrainingSplit is the fraction that goes to Valid.
Train, Valid = Splitter(ProcessedSet, TrainingSplit)

# Over-sample the training split only; Valid keeps its original instances.
Train = NavieSampler(Train)
# Call .cuda() on every set (the model is likewise created with .cuda() further down).
Train.cuda()
Valid.cuda()
ProcessedSet.cuda()


# Tensor order is (ids, types, masks, labels); the training loop unpacks each batch in this same order.
TrainingSet = TensorDataset(Train.ids(), Train.types(), Train.masks(), Train.labels())

# Final split
print(len(ProcessedSet.instances))
print(len(Train.instances))
print(len(Valid.instances))


In [0]:
class MultiLableModel(BertForPreTraining):
    """BERT encoder followed by a sigmoid multi-label classification head.

    The second element of the encoder output (the pooled output) is passed
    through dropout, a linear layer with ``numLabels`` outputs and a sigmoid,
    so ``forward`` returns one value in (0, 1) per label.
    """

    def __init__(self, config, numLabels=2):
        """Build the encoder and the classification head.

        Args:
            config: BERT configuration; ``config.hidden_size`` sets the
                input width of the classifier.
            numLabels: number of independent output labels.
        """
        super(MultiLableModel, self).__init__(config)

        self.numLabels = numLabels
        self.bert = BertModel(config)

        self.classifier = nn.Linear(config.hidden_size, numLabels)
        self.drop = nn.Dropout(0.5)
        self.out = nn.Sigmoid()

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        """Return per-label sigmoid scores for a batch.

        NOTE: the positional order here is (input_ids, token_type_ids,
        attention_mask), which is not the order the encoder itself is called
        with below. Pass ``token_type_ids`` and ``attention_mask`` by keyword
        to avoid swapping them.

        Args:
            input_ids: token id tensor fed to the encoder.
            token_type_ids: optional segment ids forwarded to the encoder.
            attention_mask: optional attention mask forwarded to the encoder.

        Returns:
            Tensor with ``numLabels`` sigmoid scores per input row.
        """
        encoded = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Index instead of unpacking into exactly two names: this works both
        # when the encoder returns a plain tuple and when it returns an output
        # object, and it tolerates extra trailing elements.
        pooled = encoded[1]

        scores = self.classifier(self.drop(pooled))
        return self.out(scores)

# Accuracy etc. measurements
def acc(Processor, model):
    """Run ``model`` on all of ``Processor`` and return its output as a numpy array.

    The masks and token types are passed by keyword. ``model``'s forward
    signature is ``(input_ids, token_type_ids, attention_mask)``, so the
    previous positional call ``model(ids, masks, types)`` delivered the masks
    as token types and the token types as the attention mask.

    Args:
        Processor: object exposing ``ids()``, ``masks()`` and ``types()``.
        model: callable accepting ``attention_mask`` and ``token_type_ids``
            keyword arguments and returning a tensor.

    Returns:
        numpy.ndarray with the model output, computed without gradients.
    """
    with torch.no_grad():
        preds = model(
            Processor.ids(),
            attention_mask=Processor.masks(),
            token_type_ids=Processor.types(),
        )

    # .cpu() returns the tensor unchanged when it already lives on the CPU.
    return preds.cpu().numpy()

# Evaluates the model on a data processer set
def eval(Processor, model):
    """Return a classification report (as text) for ``model`` on ``Processor``.

    Each label column of the predictions is divided by its maximum over the
    set and rounded to the nearest integer before being compared with
    ``Processor.labels()``.
    """
    scores = acc(Processor, model)
    column_peaks = scores.max(axis=0)
    binarized = np.round(scores / column_peaks)

    targets = Processor.labels().cpu()
    return mec.classification_report(targets, binarized)


def plotTrainingSession(loss_train, loss_valid):
    """Draw the training and validation loss curves in one 10x5 figure and show it."""
    fig, ax = plt.subplots(figsize=(10, 5))

    curves = ((loss_train, 'Training loss'), (loss_valid, 'Validation loss'))
    for series, name in curves:
        ax.plot(series, label=name)

    ax.legend(loc="upper right")
    ax.set_ylabel('BCE Batch Loss')
    ax.set_xlabel('Epoch')
    plt.show()


def ConfusionPlot(Processed, model):
    """Plot a row-normalised confusion map of ``model``'s predictions on ``Processed``.

    Predictions are binarised the same way as in ``eval`` (each label column
    divided by its maximum, then rounded). Row ``i`` of the map accumulates
    the binarised prediction vectors of all instances whose true label is
    ``i`` and is then scaled to sum to 1.

    Args:
        Processed: data set exposing ``labels()`` plus whatever ``acc`` needs.
        model: model handed to ``acc``.
    """
    preds = acc(Processed, model)
    rounded = (preds/preds.max(axis=0)).round()
    labels = Processed.labels().cpu()

    # Size the map from the predictions instead of hard-coding the label count.
    numLabels = rounded.shape[1]
    Map = np.zeros((numLabels, numLabels))

    for x, y in zip(rounded, labels):
        # Index of the first entry equal to 1 in the label row.
        label = np.where(y==1.0)[0][0]
        Map[label] += x

    # Normalise every row; rows that received nothing stay at zero instead of
    # becoming NaN through a division by zero.
    rowSums = Map.sum(axis=1, keepdims=True)
    Map = np.divide(Map, rowSums, out=np.zeros_like(Map), where=rowSums > 0)

    plt.figure(figsize = (20,20))
    classes = range(numLabels)
    # Rows of Map are true labels and columns are predicted labels. Showing
    # Map untransposed puts true labels on the y axis and predictions on the
    # x axis, which is what the axis labels below say.
    plt.imshow(Map, interpolation='nearest')
    plt.clim(0.0,1.0)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

# Visible marker that the definitions in this cell have been executed.
print("Done") 

In [0]:
# Loading of the model
torch.manual_seed(seed) # set fixed random seed for reproducibility

print("Initializeing Model")
model = MultiLableModel.from_pretrained(ModelDirectory, numLabels = NumberOfLabels).cuda()
print("Model Loaded")

optimizer = optim.Adam(model.parameters(), lr=lr)
loss_function = nn.BCELoss()

train_loader = torch.utils.data.DataLoader(TrainingSet,
    batch_size=BatchSize, shuffle=True)

# Per-epoch history, consumed by plotTrainingSession.
trainLossEpochs = []
validLossEpochs = []

print("Starting training")
for epoch in range(1, NumberOfEpochs + 1):
    model.train()
    totalBatchLoss = 0
    # TrainingSet was built as (ids, types, masks, labels).
    for ids, types, masks, labels in train_loader:
        optimizer.zero_grad()
        # Keywords keep masks/types from being swapped: the model's forward
        # takes (input_ids, token_type_ids, attention_mask) positionally.
        outputs = model(ids, attention_mask=masks, token_type_ids=types)

        loss = loss_function(outputs, labels.float())

        loss.backward()
        optimizer.step()
        totalBatchLoss += loss.item()

    # Average over the number of batches. The last enumerate index is one
    # less than that and is zero when there is only a single batch.
    averageBatchLoss = totalBatchLoss / len(train_loader)

    print('====> Epoch: {} Total Loss: {:.4f} Average batch Loss: {:.4f}\r'.format(
          epoch, totalBatchLoss, averageBatchLoss))

    trainLossEpochs.append(averageBatchLoss)

    # Disable dropout while measuring the validation loss; model.train() at
    # the top of the loop re-enables it for the next epoch.
    model.eval()
    with torch.no_grad():
        out = model(Valid.ids(), attention_mask=Valid.masks(), token_type_ids=Valid.types())
        labels = Valid.labels().float()

        validLoss = loss_function(out, labels).item()
        validLossEpochs.append(validLoss)
        print('===> Validation Loss {}'.format(validLoss))
    

In [0]:
# Plot the per-epoch training and validation losses collected by the training loop.
print("Loss plot")
plotTrainingSession(trainLossEpochs, validLossEpochs)

In [0]:
# Checkpoint the trained weights to ./Temp.
torch.save(model.state_dict(), "./Temp") # If GPU memory runs short, save the model here, restart the notebook, and load it back in the next cell

In [0]:
# Rebuild the model and restore the weights that the previous cell saved to ./Temp.
torch.manual_seed(seed) # set fixed random seed for reproducibility
print("Initializeing Model")
model = MultiLableModel.from_pretrained(ModelDirectory, numLabels = NumberOfLabels).cuda()
print("Model Loaded")

# Replace the freshly created weights with the checkpointed ones.
model.load_state_dict(torch.load("./Temp"))

In [0]:
# Switch the model to evaluation mode (turns off its dropout layer) before computing metrics.
model.eval()

In [0]:
# Print one classification report per data set, each followed by a dashed separator.
for title, dataset in (("Training", Train), ("Validation", Valid), ("Over All", ProcessedSet)):
    print(title)
    evalText = eval(dataset, model)
    print(evalText)
    print("--"*20)

In [0]:
# Confusion map over the complete data set (training and validation instances together).
print("Over all")
ConfusionPlot(ProcessedSet, model)

In [0]:
# Confusion map over the over-sampled training set.
print("Training")
ConfusionPlot(Train, model)

In [0]:
# Confusion map over the held-out validation set.
print("Validation")
ConfusionPlot(Valid, model)

In [0]:
def lineScatter(x, y, fileName = None):
    """Draw a stem-style plot: a dot at every (x, y) with a vertical line down to y = 0.

    Args:
        x: sequence of x positions; also used as the tick locations.
        y: sequence of values, one per x position. The y axis is fixed to [0, 1].
        fileName: optional path; when given, the figure is also saved there.
    """
    # One vertical segment per point, from the x axis up to the value.
    lines = [[(xi, 0), (xi, yi)] for xi, yi in zip(x, y)]

    fig, ax = plt.subplots()
    ax.add_collection(matcoll.LineCollection(lines))

    ax.scatter(x, y)
    ax.set_xlabel("Participants")
    ax.set_ylabel("Sum probability")
    ax.set_xticks(list(x))
    ax.set_ylim(0, 1)

    # Save before showing, so the file is written regardless of what the
    # backend does with the figure once show() returns.
    if fileName is not None:
        fig.savefig(fileName)

    plt.show()
    # The function is called once per participant; release each figure when done.
    plt.close(fig)

In [0]:
# Per true participant (row), accumulate the raw, un-rounded prediction vectors
# of that participant's instances. Sized from the NumberOfLabels hyper-parameter
# instead of repeating the literal 18.
Map = np.zeros((NumberOfLabels, NumberOfLabels))

preds = acc(ProcessedSet, model)
labels = ProcessedSet.labels().cpu()

for x, y in zip(preds, labels):
    # Index of the first entry equal to 1 in the label row.
    label = np.where(y==1.0)[0][0]
    Map[label] += x

# Scale every row to sum to 1; rows that received nothing stay at zero instead
# of turning into NaN through a division by zero.
rowSums = Map.sum(axis=1, keepdims=True)
Map = np.divide(Map, rowSums, out=np.zeros_like(Map), where=rowSums > 0)

In [0]:
!mkdir out

In [0]:
# Plot and save one prediction distribution per participant (one row of Map each).
# Each row is divided by its own sum again here before plotting.
for i, dist in enumerate(Map):
    print("Particitpant {}".format(i))
    lineScatter(range(18), dist/sum(dist), "./out/par_{}_dist".format(i))

In [0]:
# Bundle the saved figures in ./out into a single archive for download.
!zip -r ./file.zip ./out

In [0]:
# Colab specific: `files` comes from google.colab (see the import cell); hands the archive to the browser.
files.download("./file.zip")