# Starter pack, imports

##Libraries

In [0]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.utils.data.sampler import SubsetRandomSampler, WeightedRandomSampler

from sklearn.metrics import confusion_matrix

## Files import

Please ** import the corresponding files into the google collab by selecting "files" -> "import"**. The execution of further cells won't ork otherwise. All these files can be found on the git repository in the /data folder.

List of files to import :


*  "corpus_tweets.txt"
*  "corpus_tweets_subtask_b.txt"
*  "corpus_tweets_subtask_c.txt"
*  "labels_1.txt"
*  "labels_subtask_b.txt"
*  "labels_subtask_c.txt"
*  "emb_dic.txt"



Tokenized cleaned corpus for each task

In [0]:
#  <-------------- Tokenized corpus, specific to each task -------------> #

# Get the cleaned tokenized corpus back for a preprocessed .txt file

# Whole corpus : for task A
with open("corpus_tweets.txt", "r") as file:
    tmp = file.read().splitlines()

tokenized_corpus = [[token for token in sentence.split(' ')][:-1] for sentence in tmp]

# Task B
with open("corpus_tweets_subtask_b.txt", "r") as file:
    tmp_b = file.read().splitlines()

tokenized_corpus_b = [[token for token in sentence.split(' ')][:-1] for sentence in tmp_b]

# Task C
with open("corpus_tweets_subtask_c.txt", "r") as file:
    tmp_c = file.read().splitlines()

tokenized_corpus_c = [[token for token in sentence.split(' ')][:-1] for sentence in tmp_c]

Labels transformed to integers for each task

In [0]:
#  <--------- Labels transformed to int, specific to each task --------> #

# Task A : 0 = "NOT" , 1 = "OFF"
with open("labels_1.txt", "r") as file:
    label1 = file.read().splitlines()
label1 = [float(i) for i in label1]

# Task B : 0 = "UNT" , 1 = "TIN"
with open("labels_subtask_b.txt", "r") as file:
    label_b = file.read().splitlines()
label_b = [float(i) for i in label_b]

# Task C : 0 = "OTH" , 1 = "IND", 2 = "GRP"
with open("labels_subtask_c.txt", "r") as file:
    label_c = file.read().splitlines()
label_c = [float(i) for i in label_c]


Embedding dictionnary

In [0]:
# This function creates a dictionnary of all embeddings present in a .txt file
def extract_dic(path):
    dic = {}
    glove = open(embedding_path)
    for line in glove:
        values = line.split()
        word = values[0]
        try:
            vector = np.asarray(values[1:], dtype='float32')
            if len(vector) != 100:
                print(word, len(vector))
            dic[word] = vector
        except:
            print("Parsing problem on word ", word, " discarding it")
    glove.close()
    return dic

In [7]:
embedding_path = 'emb_dic.txt'

emb_dict = extract_dic(embedding_path)
print(len(emb_dict['should']))

100


## Device selection : run on GPU

In [8]:
GPU = True
device_idx = 0
if GPU:
    device = torch.device("cuda:" + str(device_idx) if torch.cuda.is_available() else "cpu")
else:
    device = torch.device("cpu")
print(device)
cpu = torch.device("cpu")

cuda:0


In [9]:
# We set a random seed to ensure that your results are reproducible.
if torch.cuda.is_available():
    torch.backends.cudnn.deterministic = True
torch.manual_seed(0)

<torch._C.Generator at 0x7f476bb6cfd0>

# Word embedding from GloVe

We define a function that builds the input for our neural networks from the embedding dictionnary and our corpus of tweets: it operates the translation of each string in each tweets to its embedding representation.

There are two versions of this corpus embedding. In the first one, the embedding of a sentence is the mean of the embedding of its tokens. Thus, the shape of each sentence embedding is [1;100], the same as the word embedding

In [0]:
def embed_corpus(emb_dict, corpus):
    # Prepare container for tweet embeddings
    inputs_ = torch.zeros((len(corpus), 100))

    # Counter for debugging purposes
    count_not_found = 0.
    total_count = 0.

    # We loop over all the tweets in the corpus
    for idx, sentence in enumerate(corpus):
        sentence_length = len(sentence)
        mean_embedding = torch.zeros(100)
        for word in sentence:
            total_count += 1
            if word in emb_dict.keys():
                mean_embedding += torch.Tensor(emb_dict[word])
            else:
                count_not_found += 1

        # We average the word embedding over the sentence
        mean_embedding /= sentence_length

        # We add the embedded sentence to the inputs tensor
        inputs_[idx] = mean_embedding
    ratio = (count_not_found / total_count) * 100

    print("Percentage of not recognised words (those we do not have an embedding for) : %.2f" % ratio, "%")
    # We return the embedded corpus
    return inputs_

In the second version, that will be useful for models operating on a sequence of tokens, we do not average over the tokens of a sentence, but concatenate the embeddings of the tokens. Because our models need the same input size for every input, we must match the size of the longuest tweet of the corpus (105 tokens), and pad the shorter ones. This leads to a sparse matrix for the short tweets, but this model has the advantage of being simple.

In [0]:
def embed_corpus_2(emb_dict, corpus):

    tweet_lengths = [len(tweet) for tweet in corpus]
    max_len = np.max(np.array(tweet_lengths))

    # Prepare container for tweet embeddings
    inputs_ = torch.zeros((len(corpus), max_len, 100))

    # Counter for debugging purposes
    count_not_found = 0.
    total_count = 0.

    # We loop over all the tweets in the corpus
    for idx, tweet in enumerate(corpus):
        # and over all the words in a tweet
        for idx2, word in enumerate(tweet):
            total_count += 1
            if word in emb_dict.keys():
                inputs_[idx, idx2] = torch.Tensor(emb_dict[word])
            else:
                count_not_found += 1
    ratio = (count_not_found / total_count) * 100

    print("Percentage of not recognised words (those we do not have an embedding for) : %.2f" % ratio, "%")
    # We return the embedded corpus
    return inputs_

# Data loaders

We define a function data_loader that takes as inputs the embedded tweets and the corresponding labels for a given task, the batch size, the weight of the potential validation and test size, as well as the balancing option. It returns 3 pytorch Dataloader objects on which we will be able to train our models.

In [0]:
def data_loader(emb_corpus, labels, batch_size, random_seed, valid_size=0.1, 
                test_size=0.1, balancing=True):

    
    labels_ = torch.Tensor(labels)
    
    # Sending everything to GPU
    labels_.to(device)
    emb_corpus.to(device)
    
    # Create dataset pytorch object
    dataset_ = torch.utils.data.TensorDataset(emb_corpus, labels_)

    # Split to train / valid / test dataset
    size_dataset = len(labels)
    indices = list(range(size_dataset))
    valid_split = int(np.floor(valid_size * size_dataset))
    test_split = int(np.floor(test_size * size_dataset))

    np.random.seed(random_seed)
    np.random.shuffle(indices)

    # Create lists of train/valid/test indices, to be provided to SubsetSampler 
    train_valid_idx, test_idx = indices[test_split:], indices[:test_split]
    train_idx, valid_idx = train_valid_idx[valid_split:], train_valid_idx[:valid_split]

    # Instantiate SubsetSamplers from list of indices.
    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)
    test_sampler = SubsetRandomSampler(test_idx)

    # Valid and test loaders are not balanced
    valid_loader = torch.utils.data.DataLoader(
        dataset_, batch_size=batch_size, sampler=valid_sampler)
    test_loader = torch.utils.data.DataLoader(
        dataset_, batch_size=batch_size, sampler=test_sampler)

    # If balancing set to False, we just create a data loader from the train indices
    if not balancing:
        train_loader = torch.utils.data.DataLoader(
            dataset_, batch_size=batch_size, sampler=train_sampler)

        return train_loader, valid_loader, test_loader

    # If there is balancing to do, we first extract the training samples 
    # according to the predefined indices, before using a weighted sampler
    if balancing:
        train_loader_unbalanced = torch.utils.data.DataLoader(
            dataset_, batch_size=len(labels), sampler=train_sampler)
        
        # Get back training data from sampler
        training_data, training_labels = next(iter(train_loader_unbalanced))
        
        train_dataset = torch.utils.data.TensorDataset(training_data, training_labels)

        # WeightedSampler takes the list of weights as input
        class_sample_count = np.array([len(np.where(training_labels == t)[0]) 
                                       for t in np.unique(training_labels)])
        
        weight = 1. / class_sample_count
        samples_weight = np.array([weight[int(t)] for t in training_labels])

        samples_weight = torch.from_numpy(samples_weight)
        samples_weight = samples_weight.double()
        balance_sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

        train_loader_balanced = DataLoader(train_dataset, batch_size=batch_size, 
                                           num_workers=1, sampler=balance_sampler)

        return train_loader_balanced, valid_loader, test_loader

# Some utils functions

Accuracy : output and target expected to be lists of label integers

In [0]:
def accuracy(output, target):
    correct = (output == target)
    acc = float(float(correct.sum()) /len(output) ) * 100
    return acc

Recall and precision, computed from the confusion matrix

In [0]:
def recall_precision(cm):
    nb_classes = cm.shape[0]
    recall = np.zeros(nb_classes)
    precision = np.zeros(nb_classes)
    for idx in range(nb_classes):
        if (idx + 1) < nb_classes:
            false_negative = np.concatenate((cm[idx, :idx], cm[idx, (idx + 1):]))
            false_positive = np.concatenate((cm[:idx, idx], cm[(idx + 1):, idx]))
        else:
            false_negative = cm[idx, :idx]
            false_positive = cm[:idx, idx]
        
        true_positive = cm[idx, idx]
        
        recall[idx] = true_positive / (true_positive + false_negative.sum())
        precision[idx] = true_positive / (true_positive + false_positive.sum())
    
    return recall, precision

Compute F1-measure from recall and precision. Output is a list containing F1 measure from each class label

In [0]:
def f1_measure(recall, precision):
    f1 = 2 * (recall * precision) / (recall + precision)
    return f1

Metrics function that wraps all the metrics defined above. The Confusion matrix is obtained with the sklearn corresponding function

In [0]:
def metrics(prediction, target):
    
    # Number of correct predictions
    acc = accuracy(prediction, target)
    
    # Confusion matrix
    cm = confusion_matrix(target, prediction)
    
    # Recall and precision
    recall, precision = recall_precision(cm)
    f1 = f1_measure(recall, precision)
    
    return acc, cm, recall, precision, f1

We define an function that evaluates the performance of a model on a given dataloader. This will be usefull to evaluate a model on the validation and test sets. It outputs the metrics seen above.

In [0]:
def eval(model, dataloader):

  predictions = np.array(0)
  targets = np.array(0)
  
  for batch_idx, (embedding, target) in enumerate(dataloader):    
    
    # If the model is on the GPU, don't forget to send the input "embedding" to 
    # the GPU too before computing the prediction
    prediction = model(embedding.to(device)).detach()
    
    # We store the rounded values (integers) of the model output 
    predictions = np.append(predictions, np.round_(prediction.cpu().numpy()).astype(int))
    targets = np.append(targets, target.numpy().astype(int))
  
  return metrics(predictions, targets)

# Model definitions

## First model : a 3 hidden layers fully connected feed forward network

In [0]:
class FFNN(nn.Module):

    def __init__(self):
        super().__init__()
        embedding_dim = 100
        h_dim1 = 512
        h_dim2 = 256
        h_dim3 = 64
        num_classes = 1

        # hidden layers
        self.layer1 = nn.Sequential(
            nn.Linear(in_features=embedding_dim, out_features=h_dim1, bias=True),
            nn.ReLU()
        )

        self.layer2 = nn.Sequential(
            nn.Linear(in_features=h_dim1, out_features=h_dim2, bias=True),
            nn.ReLU()
        )

        self.layer3 = nn.Sequential(
            nn.Linear(in_features=h_dim2, out_features=h_dim3, bias=True),
            nn.ReLU()
        )

        # output layer
        self.layer4 = nn.Sequential(
            nn.Linear(in_features=h_dim3, out_features=num_classes, bias=True),
            nn.Sigmoid()
        )

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        return out

### Training

In [20]:
#  <----------- Global variables for the NN and the training --------------->

# we will train for N epochs (N times the model will see all the data)
epochs = 100

#  <----------- END Global variables for the NN and the training --------------->

emb_corpus = embed_corpus(emb_dict, tokenized_corpus)

train_loader, valid_loader, test_loader = data_loader(emb_corpus, label1, 32, 1)

# Instantiate the model
model_FFNN = FFNN().to(device)

# we use the stochastic gradient descent (SGD) optimizer
optimizer = optim.SGD(model_FFNN.parameters(), lr=0.5)


for epoch in range(1, epochs + 1):
    loss_history = []
    acc_history = []
    for batch_idx, (embedding, target) in enumerate(train_loader):

        model_FFNN.train()

        # we zero the gradients as they are not removed automatically
        optimizer.zero_grad()

        # squeeze is needed as the predictions are initially size (batch size, 1) 
        # and we need to remove the dimension of size 1
        predictions = model_FFNN(embedding.to(device)).squeeze(1)
        loss = nn.BCELoss()(predictions, target.to(device))
        
        # For log purposes
        loss_history.append(float(loss))
        predictions = predictions.detach()
        target = target.detach()
        acc_history.append(accuracy(np.round_(predictions.cpu().numpy()).astype(int), 
                                    target.cpu().numpy().astype(int)))

        # calculate the gradient of each parameter
        loss.backward()

        # update the parameters using the gradients and optimizer algorithm
        optimizer.step()

    epoch_loss = np.array(loss_history).mean()
    epoch_acc = np.array(acc_history).mean()

    val_acc, cm, recall, precision, f1 = eval(model_FFNN, valid_loader)
    print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.3f} | Train Acc: {epoch_acc:.2f}%')
    print("Valid accuracy :", val_acc)

Percentage of not recognised words (those we do not have an embedding for) : 3.35 %


  self.weights = torch.tensor(weights, dtype=torch.double)


| Epoch: 01 | Train Loss: 0.675 | Train Acc: 56.91%
Valid accuracy : 57.056603773584904
| Epoch: 02 | Train Loss: 0.636 | Train Acc: 63.57%
Valid accuracy : 73.28301886792453
| Epoch: 03 | Train Loss: 0.618 | Train Acc: 65.92%
Valid accuracy : 72.0754716981132
| Epoch: 04 | Train Loss: 0.603 | Train Acc: 67.75%
Valid accuracy : 72.15094339622642
| Epoch: 05 | Train Loss: 0.595 | Train Acc: 67.56%
Valid accuracy : 63.924528301886795
| Epoch: 06 | Train Loss: 0.583 | Train Acc: 68.56%
Valid accuracy : 74.33962264150942
| Epoch: 07 | Train Loss: 0.579 | Train Acc: 70.00%
Valid accuracy : 69.35849056603773
| Epoch: 08 | Train Loss: 0.573 | Train Acc: 69.32%
Valid accuracy : 69.13207547169812
| Epoch: 09 | Train Loss: 0.563 | Train Acc: 70.35%
Valid accuracy : 54.943396226415096
| Epoch: 10 | Train Loss: 0.567 | Train Acc: 70.34%
Valid accuracy : 69.0566037735849
| Epoch: 11 | Train Loss: 0.569 | Train Acc: 69.90%
Valid accuracy : 63.54716981132076
| Epoch: 12 | Train Loss: 0.554 | Train Ac

### Test

In [21]:
acc_test, cm_test, recall_test, precision_test, f1_test = eval(model_FFNN, test_loader)
print("Accuracy on test dataset : %.2f" % acc_test, "%")
print("F1-measure on test dataset : ", f1_test)

Accuracy on test dataset : 71.25 %
F1-measure on test dataset :  [0.76923077 0.61861862]


## A second model : CNN

As we can see above, the FFNN detects offensive tweets with an accuracy of roughly 71%, which is not too bad for this naive model. However, this came after an important amount of preprocessing. In this part, I'll try to improve this detection accuracy by using a different kind of neural network : a CNN.

In [0]:
def embed_corpus_2(emb_dict, corpus):

    tweet_lengths = [len(tweet) for tweet in corpus]
    max_len = np.max(np.array(tweet_lengths))

    # Prepare container for tweet embeddings
    inputs_ = torch.zeros((len(corpus), max_len, 100))

    # Counter for debugging purposes
    count_not_found = 0.
    total_count = 0.

    # We loop over all the tweets in the corpus
    for idx, tweet in enumerate(corpus):
        # and over all the words in a tweet
        for idx2, word in enumerate(tweet):
            total_count += 1
            if word in emb_dict.keys():
                inputs_[idx, idx2] = torch.Tensor(emb_dict[word])
            else:
                count_not_found += 1
    ratio = (count_not_found / total_count) * 100

    print("Percentage of not recognised words (those we do not have an embedding for) : %.2f" % ratio, "%")
    # We return the embedded corpus
    return inputs_

In [0]:
class CNN(nn.Module):

    def __init__(self, embedding_dim, out_channels, window_size, output_dim, dropout):
        super(CNN, self).__init__()

        # in_channels -- 1 text channel
        # out_channels -- the number of output channels
        # kernel_size is (window size x embedding dim)

        self.conv = nn.Conv2d(in_channels=1, out_channels=out_channels, kernel_size=(window_size, embedding_dim))

        # the dropout layer
        self.dropout = nn.Dropout(dropout)

        # the output layer
        self.fc = nn.Linear(out_channels, output_dim)

    def forward(self, x):
        # (batch size, max sent length, embedding dim)

        # We unsqueeze one dimension to give space to the coming convolution channels
        embedded = x.unsqueeze(1)

        # (batch size, 1, max sent length, embedding dim)

        feature_maps = self.conv(embedded)

        # (batch size, n filters, max input length - window size +1)

        feature_maps = feature_maps.squeeze(3)

        feature_maps = F.relu(feature_maps)

        # the max pooling layer
        pooled = F.max_pool1d(feature_maps, feature_maps.shape[2])

        pooled = pooled.squeeze(2)

        # (batch size, n_filters)

        dropped = self.dropout(pooled)

        preds = self.fc(dropped)

        preds = torch.sigmoid(preds)
        
        return preds

### Training

In [23]:
epochs=100

EMBEDDING_DIM = 100
OUTPUT_DIM = 1

#the hyperparameters specific to CNN

# we define the number of filters
N_OUT_CHANNELS = 100

# we define the window size
WINDOW_SIZE = 1

# we apply the dropout with the probability 0.5
DROPOUT = 0.5

model_CNN = CNN(EMBEDDING_DIM, N_OUT_CHANNELS, WINDOW_SIZE, OUTPUT_DIM, DROPOUT).to(device)

optimizer = optim.SGD(model_CNN.parameters(), lr=0.01)
loss_fn = nn.BCELoss()

emb_corpus = embed_corpus_2(emb_dict, tokenized_corpus)
train_loader, valid_loader, test_loader = data_loader(emb_corpus, label1, 32, 1)

for epoch in range(1, epochs + 1):
    loss_history = []
    acc_history = []
    for batch_idx, (embedding, target) in enumerate(train_loader):

        model_CNN.train()

        # we zero the gradients as they are not removed automatically
        optimizer.zero_grad()

        # squeeze is needed as the predictions are initially size (batch size, 1) 
        # and we need to remove the dimension of size 1
        predictions = model_CNN(embedding.to(device)).squeeze(1)   
        loss = nn.BCELoss()(predictions, target.to(device))
        loss_history.append(float(loss))
        
        predictions = predictions.detach()
        target = target.detach()
        acc_history.append(accuracy(np.round_(predictions.cpu().numpy()).astype(int), 
                                    target.cpu().numpy().astype(int)))
        
        # calculate the gradient of each parameter
        loss.backward()

        # update the parameters using the gradients and optimizer algorithm
        optimizer.step()

    epoch_loss = np.array(loss_history).mean()
    epoch_acc = np.array(acc_history).mean()

    val_acc, cm, recall, precision, f1 = eval(model_CNN, valid_loader)
    print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.3f} | Train Acc: {epoch_acc:.2f}%')
    print(f'---> Valid accuracy : {val_acc:.2f}%')

Percentage of not recognised words (those we do not have an embedding for) : 3.35 %


  self.weights = torch.tensor(weights, dtype=torch.double)


| Epoch: 01 | Train Loss: 0.697 | Train Acc: 51.98%
---> Valid accuracy : 43.25%
| Epoch: 02 | Train Loss: 0.684 | Train Acc: 55.95%
---> Valid accuracy : 58.11%
| Epoch: 03 | Train Loss: 0.672 | Train Acc: 59.25%
---> Valid accuracy : 63.55%
| Epoch: 04 | Train Loss: 0.654 | Train Acc: 62.98%
---> Valid accuracy : 64.68%
| Epoch: 05 | Train Loss: 0.619 | Train Acc: 66.58%
---> Valid accuracy : 67.70%
| Epoch: 06 | Train Loss: 0.584 | Train Acc: 70.42%
---> Valid accuracy : 70.19%
| Epoch: 07 | Train Loss: 0.569 | Train Acc: 70.86%
---> Valid accuracy : 73.28%
| Epoch: 08 | Train Loss: 0.558 | Train Acc: 71.71%
---> Valid accuracy : 73.43%
| Epoch: 09 | Train Loss: 0.558 | Train Acc: 71.71%
---> Valid accuracy : 72.83%
| Epoch: 10 | Train Loss: 0.547 | Train Acc: 72.60%
---> Valid accuracy : 75.17%
| Epoch: 11 | Train Loss: 0.534 | Train Acc: 73.17%
---> Valid accuracy : 73.58%
| Epoch: 12 | Train Loss: 0.539 | Train Acc: 73.23%
---> Valid accuracy : 74.49%
| Epoch: 13 | Train Loss: 0.

### Test

In [24]:
acc_test, cm_test, recall_test, precision_test, f1_test = eval(model_CNN, test_loader)
print("Accuracy on test dataset : %.2f" % acc_test, "%")
print("F1-measure on test dataset : ", f1_test)

Accuracy on test dataset : 75.55 %
F1-measure on test dataset :  [0.81008206 0.65677966]


Conclusion : with this simple CNN model, we improved :
 

*   The detection accuracy from 74% (FFNN) to roughly **76%**.
*   The F1 values also improved quite well : from [0.79 ; 0.62] to **[0.81 ; 0.65]**




## A third model : bidirectional LSTM

Let's now try a RNN . Because of the architecture of the LSTM, we need to redefine a little bit the evaluation function, to discard potential batches that would not have the same size as the configured batch_size.

In [0]:
def eval_lstm(model, dataloader, batch_size):
    
    predictions = np.array(0)
    targets = np.array(0)
    
    for batch_idx, (embedding, target) in enumerate(dataloader):
        # With LSTM, we will have troubles if the batch size changes 
        # (for example on the last batch. We discard it)
        if embedding.shape[0] != batch_size:
            continue
        
        embedding = embedding.to(device)
        target = target.to(device)
        
        prediction = model(embedding).detach().cpu()
        
        predictions = np.append(predictions, np.round_(prediction.cpu().numpy()).astype(int))
        targets = np.append(targets, target.cpu().numpy().astype(int))
    
    return metrics(predictions, targets)

In [0]:
class BiLSTM(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, output_dim, dropout, seq_len):
        super(BiLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.seq_len = seq_len
        self.batch_size = 32

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim * 2.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                            dropout=dropout, batch_first=True, bidirectional=True)

        # The linear layer that maps from hidden state space to tag space
        self.linear = nn.Linear(seq_len * hidden_dim * 2, output_dim)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers * num_directions, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(2, self.batch_size, self.hidden_dim).to(device)),
                autograd.Variable(torch.zeros(2, self.batch_size, self.hidden_dim).to(device)))

    def forward(self, x):
        if x.shape[0] != self.batch_size:
            pad = torch.zeros((self.batch_size - x.shape[0], x.shape[1], x.shape[2])).to(device)
            x = torch.cat((x, pad), dim=0)
        # Shape of x  torch.Size([32, 105, 100])
        # Shape of LSTM out  torch.Size([32, 105, 40])
        
        # lstm out should be (seq_len, batch, num_directions * hidden_size)
        # elements of self.hidden should be (num_layers * num_directions, batch, hidden_size)
        
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        lstm_out = lstm_out.contiguous()
        lstm_out = lstm_out.view(-1, self.seq_len * 2 * self.hidden_dim)

        tag_space = self.linear(lstm_out)
        tag_scores = torch.sigmoid(tag_space)
        return tag_scores

### Training

In [29]:
batch_size = 32
epochs=10

EMBEDDING_DIM = 100
HIDDEN_DIM = 10
OUTPUT_DIM = 1
DROPOUT = 0.5
SEQ_LEN = 105

model_BiLSTM = BiLSTM(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, SEQ_LEN).to(device)

optimizer = optim.Adam(model_BiLSTM.parameters())
loss_fn = nn.BCELoss()

emb_corpus = embed_corpus_2(emb_dict, tokenized_corpus)
train_loader, valid_loader, test_loader = data_loader(emb_corpus, label1, batch_size, 1)

for epoch in range(1, epochs + 1):
    loss_history = []
    acc_history = []
    for batch_idx, (embedding, target) in enumerate(train_loader):
        
        model_BiLSTM.train()

        # we zero the gradients as they are not removed automatically
        optimizer.zero_grad()
        
        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model_BiLSTM.hidden = model_BiLSTM.init_hidden()

        # Send input data to GPU
        embedding = embedding.to(device)
        
        # squeeze is needed as the predictions are initially size (batch size, 1) 
        # and we need to remove the dimension of size 1
        predictions = model_BiLSTM(embedding).squeeze(1)
        loss = nn.BCELoss()(predictions, target.to(device))
        loss_history.append(float(loss))
        
        predictions = predictions.detach()
        target = target.detach()
        acc_history.append(accuracy(np.round_(predictions.cpu().numpy()).astype(int), 
                                    target.cpu().numpy().astype(int)))       
        
        # calculate the gradient of each parameter
        loss.backward()

        # update the parameters using the gradients and optimizer algorithm
        optimizer.step()

    epoch_loss = np.array(loss_history).mean()
    epoch_acc = np.array(acc_history).mean()
    
    val_acc, cm, recall, precision, f1 = eval_lstm(model_BiLSTM, valid_loader, batch_size)
    print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.3f} | Train Acc: {epoch_acc:.2f}%')
    print(f'---> Valid accuracy : {val_acc:.2f}%')

  "num_layers={}".format(dropout, num_layers))


Percentage of not recognised words (those we do not have an embedding for) : 3.35 %


  self.weights = torch.tensor(weights, dtype=torch.double)


| Epoch: 01 | Train Loss: 0.633 | Train Acc: 62.83%
---> Valid accuracy : 71.21%
| Epoch: 02 | Train Loss: 0.523 | Train Acc: 73.95%
---> Valid accuracy : 73.34%
| Epoch: 03 | Train Loss: 0.492 | Train Acc: 76.03%
---> Valid accuracy : 75.78%
| Epoch: 04 | Train Loss: 0.462 | Train Acc: 78.69%
---> Valid accuracy : 73.88%
| Epoch: 05 | Train Loss: 0.448 | Train Acc: 79.31%
---> Valid accuracy : 76.16%
| Epoch: 06 | Train Loss: 0.437 | Train Acc: 79.80%
---> Valid accuracy : 73.80%
| Epoch: 07 | Train Loss: 0.416 | Train Acc: 81.22%
---> Valid accuracy : 77.30%
| Epoch: 08 | Train Loss: 0.397 | Train Acc: 83.02%
---> Valid accuracy : 74.03%
| Epoch: 09 | Train Loss: 0.385 | Train Acc: 83.18%
---> Valid accuracy : 75.40%
| Epoch: 10 | Train Loss: 0.371 | Train Acc: 84.18%
---> Valid accuracy : 76.16%


In [30]:
acc_test, cm_test, recall_test, precision_test, f1_test = eval_lstm(model_BiLSTM, test_loader, batch_size)
print("Accuracy on test dataset : %.2f" % acc_test, "%")
print("F1-measure on test dataset : ", f1_test)

Accuracy on test dataset : 75.02 %
F1-measure on test dataset :  [0.80751174 0.64425163]


Conclusion : on only 10 epochs with a naive dense layer output, this model achieves almost the same accuracy as the CNN model. We also see that this model overfits very quickly. Let's try to combine Bi-LSTM and convolutions now.

## Last model : Bi-directional LSTM + convolutions

In [0]:
class BiLSTMConv(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, output_dim, dropout, seq_len, 
                 channels, window_size, batch_size):
        super(BiLSTMConv, self).__init__()
        self.hidden_dim = hidden_dim
        self.seq_len = seq_len
        self.batch_size = batch_size

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                            dropout=dropout, batch_first=True, bidirectional=True)

        # The linear layer that maps from hidden state space to tag space
        self.hidden = self.init_hidden()
        
        self.conv = nn.Conv2d(in_channels=1, out_channels=channels, kernel_size=(window_size, 2 * hidden_dim))
        
        # the dropout layer
        self.dropout = nn.Dropout(dropout)
        
        self.linear = nn.Linear(channels, output_dim)
        
    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers * num_directions, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(2, self.batch_size, self.hidden_dim).to(device)),
                autograd.Variable(torch.zeros(2, self.batch_size, self.hidden_dim).to(device)))

    def forward(self, x):
        if x.shape[0] != self.batch_size:
            pad = torch.zeros((self.batch_size - x.shape[0], x.shape[1], x.shape[2])).to(device)
            x = torch.cat((x, pad), dim=0)
        # Shape of x  torch.Size([32, 105, 100])
        # Shape of LSTM out  torch.Size([32, 105, 40])
        
        # lstm out should be (seq_len, batch, num_directions * hidden_size)
        # elements of self.hidden should be (num_layers * num_directions, batch, hidden_size)
        
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        
        #lstm_out = lstm_out.contiguous()
        #lstm_out = lstm_out.view(-1, self.seq_len * 2 * self.hidden_dim)

        # make space for convolution channels
        lstm_out = lstm_out.unsqueeze(1)
        lstm_out = F.relu(lstm_out)
        
        
        conv_out = self.conv(lstm_out)
        
        conv_out = conv_out.squeeze(3)
        
        pooled = F.max_pool1d(conv_out, conv_out.shape[2])
        
        pooled = pooled.squeeze(2)
        
        # (batch size, n_filters)
        dropped = self.dropout(pooled)
        
        preds = self.linear(dropped)
        preds = torch.sigmoid(preds)

        return preds

### Training

In [34]:
batch_size = 32
epochs=10

EMBEDDING_DIM = 100
HIDDEN_DIM = 10
OUTPUT_DIM = 1
DROPOUT = 0.5
SEQ_LEN = 105
CHANNELS = 16
WINDOW_SIZE = 1

model_BiLSTMConv = BiLSTMConv(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, 
                   SEQ_LEN, CHANNELS, WINDOW_SIZE, batch_size).to(device)

optimizer = optim.Adam(model_BiLSTMConv.parameters())
loss_fn = nn.BCELoss()

emb_corpus = embed_corpus_2(emb_dict, tokenized_corpus)
train_loader, valid_loader, test_loader = data_loader(emb_corpus, label1, batch_size, 1)

for epoch in range(1, epochs + 1):
    loss_history = []
    acc_history = []
    for batch_idx, (embedding, target) in enumerate(train_loader):
        
        model_BiLSTMConv.train()

        # we zero the gradients as they are not removed automatically
        optimizer.zero_grad()
        
        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model_BiLSTMConv.hidden = model_BiLSTMConv.init_hidden()

        # Send input data to GPU
        embedding = embedding.to(device)
        
        # squeeze is needed as the predictions are initially size (batch size, 1) and we need to remove the dimension of size 1
        predictions = model_BiLSTMConv(embedding).squeeze(1)
        loss = nn.BCELoss()(predictions, target.to(device))
        loss_history.append(float(loss))
        
        predictions = predictions.detach()
        target = target.detach()
        acc_history.append(accuracy(np.round_(predictions.cpu().numpy()).astype(int), 
                                    target.cpu().numpy().astype(int)))     

        # calculate the gradient of each parameter
        loss.backward()

        # update the parameters using the gradients and optimizer algorithm
        optimizer.step()

    epoch_loss = np.array(loss_history).mean()
    epoch_acc = np.array(acc_history).mean()
    
    val_acc, cm, recall, precision, f1 = eval_lstm(model_BiLSTMConv, valid_loader, batch_size)
    print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.3f} | Train Acc: {epoch_acc:.2f}%')
    print(f'---> Valid accuracy : {val_acc:.2f}%')

  "num_layers={}".format(dropout, num_layers))


Percentage of not recognised words (those we do not have an embedding for) : 3.35 %


  self.weights = torch.tensor(weights, dtype=torch.double)


| Epoch: 01 | Train Loss: 0.636 | Train Acc: 62.70%
---> Valid accuracy : 76.69%
| Epoch: 02 | Train Loss: 0.527 | Train Acc: 74.53%
---> Valid accuracy : 73.80%
| Epoch: 03 | Train Loss: 0.505 | Train Acc: 76.50%
---> Valid accuracy : 75.55%
| Epoch: 04 | Train Loss: 0.499 | Train Acc: 76.70%
---> Valid accuracy : 76.01%
| Epoch: 05 | Train Loss: 0.474 | Train Acc: 78.79%
---> Valid accuracy : 76.09%
| Epoch: 06 | Train Loss: 0.482 | Train Acc: 78.08%
---> Valid accuracy : 77.30%
| Epoch: 07 | Train Loss: 0.469 | Train Acc: 78.97%
---> Valid accuracy : 76.62%
| Epoch: 08 | Train Loss: 0.450 | Train Acc: 80.08%
---> Valid accuracy : 77.38%
| Epoch: 09 | Train Loss: 0.442 | Train Acc: 80.35%
---> Valid accuracy : 77.08%
| Epoch: 10 | Train Loss: 0.439 | Train Acc: 80.81%
---> Valid accuracy : 77.46%


In [35]:
acc_test, cm_test, recall_test, precision_test, f1_test = eval_lstm(model_BiLSTMConv, test_loader, batch_size)
print("Accuracy on test dataset : %.2f" % acc_test, "%")
print("F1-measure on test dataset : ", f1_test)

Accuracy on test dataset : 75.48 %
F1-measure on test dataset :  [0.81169591 0.64847162]


These are the best results obtained so far, we will thus tuse this model to generate predictions for the challenge!

# Sub task a

## Test prediction generation

Because the dataset are new for the challenge prediction (different from the training), we need to import the specific tokenized corpus as well as the new embedding dictionnaries. Indeed, in the new datasets, there may be some new vocabulary word that we have not encountered before, so we need their embedding from the GloVe file.

A pre requisite for the execution of further cells is the import of a few .txt file that have been previsouly generated from the given dataset. Please import :

*   "test_corpus_tweets_a.txt"
*   "test_corpus_tweets_b.txt"
*   "test_corpus_tweets_c.txt"
*   "emb_dic_a.txt"
*   "emb_dic_b.txt"
*   "emb_dic_c.txt"





### Some data management before predictions

In [0]:
# Get the cleaned tokenized corpus back
with open("test_corpus_tweets_a.txt", "r") as file:
    tmp_a = file.read().splitlines()


tokenized_corpus_a = [[token for token in sentence.split(' ')][:-1] for sentence in tmp_a]

In [0]:
embedding_path = 'emb_dic_a.txt'
emb_dict_a = extract_dic(embedding_path)

In [42]:
emb_corpus_a = embed_corpus_2(emb_dict_a, tokenized_corpus_a)

Percentage of not recognised words (those we do not have an embedding for) : 17.55 %


In [43]:
print(emb_corpus_a.shape)

# We need to make this a multiple of batch_size = 32. We padd with zeros. 
# We will discard the results after.

sent_pad = torch.zeros((4, 66, 100))

emb_corpus_a = torch.cat((emb_corpus_a, sent_pad), dim=0)

print(emb_corpus_a.shape)

torch.Size([860, 66, 100])
torch.Size([864, 66, 100])


###  We now prepare the input to the model and make predictions

In [0]:
# Make batches of 32 : 

input_a = emb_corpus_a.view(-1, 32, 66, 100)

# Preparing a container for the results
prediction_a = torch.zeros((int(864/32),32))

for batch in range(26):
    output = model_BiLSTMConv(torch.Tensor(input_a[batch]).to(device))
    prediction_a[batch] = output.cpu().detach().squeeze(1)

In [45]:
# reshape prediction_a and dicard the last elements corresponding to the padding
prediction_a = prediction_a.view(-1)[:-4]
print(prediction_a.shape)

# Getting back the np.array, and round the result to have 0 or 1
prediction_a = prediction_a.numpy()
prediction_a = np.round_(prediction_a).astype(int)
print(prediction_a)

torch.Size([860])
[0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0
 0 1 1 1 0 1 1 0 0 1 1 0 1 0 1 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 0 0 1 0 1 0
 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 1
 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0
 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0
 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0
 1 0 0 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 0 1 0 1 0 0 1 0 1
 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 1 1 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 1 0 0 1 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0
 0 1 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 1
 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 1 1 0 0 0 1 1 1 0 0 0 1
 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0
 0 0 0 

We now have to translate our integer vector shaped prediction to a csv file composed of "OFF" and "NOT" labels. To do this, we need to get back the id of the tweets from the original file. Please import the file "testset-taska.tsv".

In [0]:
prediction_a_letters = ["OFF" if element == 1 else "NOT" for element in prediction_a]

In [0]:
dataframe_a = pd.read_csv('testset-taska.tsv', sep="\t", header=0)
id_a = dataframe_a["id"]

In [0]:
pred_dataframe = pd.DataFrame(prediction_a_letters, index=id_a)
pred_dataframe.to_csv(path_or_buf ="pred_a.csv", header=False)

The prediction is now available in the files section (refresh it to see it), and ready to be downloaded.

# Sub task b

Our models have been trained on the whole dataset, that is to say for sub task A. Now we want to train for sub task B, thus we need a subdataset containing the relevant tweets for this specific task. The corresponding files have already been imported in the beginning of this notebook

Because we are dealing with the training data, we get back are original embedding dictionnary

In [0]:
embedding_path = 'emb_dic.txt'
emb_dict = extract_dic(embedding_path)

In [53]:
emb_corpus_b = embed_corpus_2(emb_dict, tokenized_corpus_b)

Percentage of not recognised words (those we do not have an embedding for) : 2.84 %


## Training

In [57]:
batch_size = 16
epochs=12

EMBEDDING_DIM = 100
HIDDEN_DIM = 10
OUTPUT_DIM = 1
DROPOUT = 0.5
SEQ_LEN = 105
CHANNELS = 16
WINDOW_SIZE = 1

model_BiLSTMConv = BiLSTMConv(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, 
                   SEQ_LEN, CHANNELS, WINDOW_SIZE, batch_size).to(device)

optimizer = optim.Adam(model_BiLSTMConv.parameters())
loss_fn = nn.BCELoss()

train_loader, valid_loader, test_loader = data_loader(emb_corpus_b, label_b, batch_size, 1, valid_size=0.1, test_size=0.1)

for epoch in range(1, epochs + 1):
    loss_history = []
    acc_history = []
    for batch_idx, (embedding, target) in enumerate(train_loader):
        
        model_BiLSTMConv.train()

        # we zero the gradients as they are not removed automatically
        optimizer.zero_grad()
        
        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model_BiLSTMConv.hidden = model_BiLSTMConv.init_hidden()

        # Send input data to GPU
        embedding = embedding.to(device)
        
        # squeeze is needed as the predictions are initially size (batch size, 1) and we need to remove the dimension of size 1
        # Have to transpose batch and sequence dimensions for nn.LSTM
        predictions = model_BiLSTMConv(embedding).squeeze(1)
        loss = nn.BCELoss()(predictions, target.to(device))
        loss_history.append(float(loss))
        
        predictions = predictions.detach()
        target = target.detach()
        acc_history.append(accuracy(np.round_(predictions.cpu().numpy()).astype(int), 
                                    target.cpu().numpy().astype(int)))     

        # calculate the gradient of each parameter
        loss.backward()

        # update the parameters using the gradients and optimizer algorithm
        optimizer.step()

    epoch_loss = np.array(loss_history).mean()
    epoch_acc = np.array(acc_history).mean()

    val_acc, cm, recall, precision, f1 = eval_lstm(model_BiLSTMConv, valid_loader, batch_size)
    print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.3f} | Train Acc: {epoch_acc:.2f}%')
    print(f'---> Valid accuracy : {val_acc:.2f}%')

  "num_layers={}".format(dropout, num_layers))
  self.weights = torch.tensor(weights, dtype=torch.double)


| Epoch: 01 | Train Loss: 0.688 | Train Acc: 54.03%
---> Valid accuracy : 72.52%
| Epoch: 02 | Train Loss: 0.664 | Train Acc: 62.05%
---> Valid accuracy : 73.21%
| Epoch: 03 | Train Loss: 0.626 | Train Acc: 65.99%
---> Valid accuracy : 69.98%
| Epoch: 04 | Train Loss: 0.592 | Train Acc: 69.80%
---> Valid accuracy : 61.66%
| Epoch: 05 | Train Loss: 0.572 | Train Acc: 71.22%
---> Valid accuracy : 65.82%
| Epoch: 06 | Train Loss: 0.536 | Train Acc: 74.06%
---> Valid accuracy : 68.13%
| Epoch: 07 | Train Loss: 0.535 | Train Acc: 74.40%
---> Valid accuracy : 66.74%
| Epoch: 08 | Train Loss: 0.486 | Train Acc: 77.56%
---> Valid accuracy : 77.37%
| Epoch: 09 | Train Loss: 0.439 | Train Acc: 81.16%
---> Valid accuracy : 67.21%
| Epoch: 10 | Train Loss: 0.403 | Train Acc: 84.43%
---> Valid accuracy : 71.82%
| Epoch: 11 | Train Loss: 0.348 | Train Acc: 87.44%
---> Valid accuracy : 75.06%
| Epoch: 12 | Train Loss: 0.291 | Train Acc: 90.34%
---> Valid accuracy : 75.06%


In [60]:
acc_test, cm_test, recall_test, precision_test, f1_test = eval_lstm(model_BiLSTMConv, test_loader, batch_size)
print("Accuracy on test dataset : %.2f" % acc_test, "%")
print("F1-measure on test dataset : ", f1_test)

Accuracy on test dataset : 76.91 %
F1-measure on test dataset :  [0.28571429 0.86225895]


## Test sub_task_b

Now that the model is trained for task b, we can predict the output for the challenge sub task b, the same way as for task a.

In [0]:
# Get the cleaned tokenized corpus back
with open("test_corpus_tweets_b.txt", "r") as file:
    tmp_b = file.read().splitlines()

tokenized_corpus_b = [[token for token in sentence.split(' ')][:-1] for sentence in tmp_b]

In [0]:
embedding_path = 'emb_dic_b.txt'
emb_dict_b = extract_dic(embedding_path)

In [61]:
emb_corpus_b = embed_corpus_2(emb_dict_b, tokenized_corpus_b)

Percentage of not recognised words (those we do not have an embedding for) : 6.25 %


In [62]:
print(emb_corpus_b.shape)

# It is already a mutliple of batchsize 16

torch.Size([240, 59, 100])


In [0]:
# Make batches of 16 : 

input_b = emb_corpus_b.view(-1, 16, 59, 100)

# Preparing a container for the results
prediction_b = torch.zeros((int(240/16),16))

for batch in range(15):
    output = model_BiLSTMConv(torch.Tensor(input_b[batch]).to(device))
    prediction_b[batch] = output.cpu().detach().squeeze(1)

In [64]:
# reshape prediction_a and dicard the last elements corresponding to the padding
prediction_b = prediction_b.view(-1)
print(prediction_b.shape)

# Getting back the np.array, and round the result to have 0 or 1
prediction_b = prediction_b.numpy()
prediction_b = np.round_(prediction_b).astype(int)
print(prediction_b)

torch.Size([240])
[1 1 0 1 0 1 1 1 0 1 1 0 0 1 0 1 0 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 1
 0 0 1 1 1 1 0 0 1 1 0 0 1 0 1 0 1 1 1 0 0 0 1 1 1 1 1 1 0 0 1 1 1 0 1 0 0
 1 1 0 0 0 0 1 1 1 1 0 0 1 1 0 0 1 1 0 1 1 1 1 1 1 0 0 0 1 1 1 0 0 0 1 1 0
 0 0 0 0 1 1 0 1 0 1 0 0 0 1 1 0 0 0 1 0 1 1 0 1 0 1 1 1 1 1 1 0 1 1 0 1 0
 0 1 1 1 0 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 1 0 0 0 1 0 1
 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 0 1 1 1 1 1 0 1 0 0 0 1 1 1 0 0 0 1 0 1
 1 0 1 1 0 0 1 0 0 1 1 1 0 1 1 0 0 1]


In [0]:
prediction_b_letters = ["TIN" if element == 1 else "UNT" for element in prediction_b]

In [0]:
dataframe_b = pd.read_csv('testset-taskb.tsv', sep="\t", header=0)
id_b = dataframe_b["id"]

In [0]:
pred_dataframe = pd.DataFrame(prediction_b_letters, index=id_b)
pred_dataframe.to_csv(path_or_buf ="pred_b.csv", header=False)

The predictions for task b can be found in the "files"!

# Sub task C : a little modification of architecture

In [0]:
embedding_path = 'emb_dic.txt'
emb_dict = extract_dic(embedding_path)

In [78]:
emb_corpus_c = embed_corpus_2(emb_dict, tokenized_corpus_c)

Percentage of not recognised words (those we do not have an embedding for) : 2.79 %


We will operate a slight modification to the network to have 4 neurons in the output layer instead of 1. We will have to change the activation function (from ReLU() to Softmax()) and adapt the Loss function too.

In [0]:
class BiLSTMConv4(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, output_dim, dropout, seq_len, 
                 channels, window_size, batch_size):
        super(BiLSTMConv, self).__init__()
        self.hidden_dim = hidden_dim
        self.seq_len = seq_len
        self.batch_size = batch_size

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                            dropout=dropout, batch_first=True, bidirectional=True)

        # The linear layer that maps from hidden state space to tag space
        self.hidden = self.init_hidden()
        
        self.conv = nn.Conv2d(in_channels=1, out_channels=channels, kernel_size=(window_size, 2 * hidden_dim))
        
        # the dropout layer
        self.dropout = nn.Dropout(dropout)
        
        self.linear = nn.Linear(channels, output_dim)
        
    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers * num_directions, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(2, self.batch_size, self.hidden_dim).to(device)),
                autograd.Variable(torch.zeros(2, self.batch_size, self.hidden_dim).to(device)))

    def forward(self, x):
        if x.shape[0] != self.batch_size:
            pad = torch.zeros((self.batch_size - x.shape[0], x.shape[1], x.shape[2])).to(device)
            x = torch.cat((x, pad), dim=0)
        # Shape of x  torch.Size([32, 105, 100])
        # Shape of LSTM out  torch.Size([32, 105, 40])
        
        # lstm out should be (seq_len, batch, num_directions * hidden_size)
        # elements of self.hidden should be (num_layers * num_directions, batch, hidden_size)
        
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        
        #lstm_out = lstm_out.contiguous()
        #lstm_out = lstm_out.view(-1, self.seq_len * 2 * self.hidden_dim)

        # make space for convolution channels
        lstm_out = lstm_out.unsqueeze(1)
        lstm_out = F.relu(lstm_out)
        
        
        conv_out = self.conv(lstm_out)
        
        conv_out = conv_out.squeeze(3)
        
        pooled = F.max_pool1d(conv_out, conv_out.shape[2])
        
        pooled = pooled.squeeze(2)
        
        # (batch size, n_filters)
        dropped = self.dropout(pooled)
        
        preds = self.linear(dropped)

        return preds

Because we now output 4 values instead of one, the evaluation function has to be adjusted.

In [0]:
def eval_lstm2(model, dataloader, batch_size):
    
    predictions = np.array(0)
    targets = np.array(0)
    
    for batch_idx, (embedding, target) in enumerate(dataloader):
        # With LSTM, we will have troubles if the batch size changes 
        # (for example on the last batch. We discard it)
        if embedding.shape[0] != batch_size:
            continue
        
        embedding = embedding.to(device)
        target = target.to(device)
        
        # Here we have to take the softmax of the predictions first
        prediction = torch.softmax(model(embedding).detach().cpu(), dim=1)
        prediction = prediction.cpu().numpy()
        prediction = np.argmax(prediction, axis=1)
        
        predictions = np.append(predictions, prediction.astype(int))
        targets = np.append(targets, target.cpu().numpy().astype(int))
    
    return metrics(predictions, targets)

In [83]:
batch_size = 16
epochs=10

EMBEDDING_DIM = 100
HIDDEN_DIM = 10
OUTPUT_DIM = 3
DROPOUT = 0.5
SEQ_LEN = 105
CHANNELS = 16
WINDOW_SIZE = 1

model_BiLSTMConv = BiLSTMConv(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, 
                   SEQ_LEN, CHANNELS, WINDOW_SIZE, batch_size).to(device)

optimizer = optim.Adam(model_BiLSTMConv.parameters())
loss_fn = nn.CrossEntropyLoss()

train_loader, valid_loader, test_loader = data_loader(emb_corpus_c, label_c, batch_size, 1)

for epoch in range(1, epochs + 1):
    loss_history = []
    acc_history = []
    for batch_idx, (embedding, target) in enumerate(train_loader):
        
        model_BiLSTMConv.train()

        # we zero the gradients as they are not removed automatically
        optimizer.zero_grad()
        
        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model_BiLSTMConv.hidden = model_BiLSTMConv.init_hidden()

        # Send input data to GPU
        embedding = embedding.to(device)
        
        if target.shape[0] != batch_size:
            pad = torch.zeros(batch_size - target.shape[0])
            target = torch.cat((target, pad), dim=0)
            
        # squeeze is needed as the predictions are initially size (batch size, 1) and we need to remove the dimension of size 1
        # Have to transpose batch and sequence dimensions for nn.LSTM
        predictions = model_BiLSTMConv(embedding).squeeze(1)
        loss = nn.CrossEntropyLoss()(predictions, target.long().to(device))
        loss_history.append(float(loss))
        
        predictions = torch.softmax(predictions.detach(), dim=1)
        predictions = np.argmax(predictions.cpu().numpy(), axis=1)
        target = target.detach()
        acc_history.append(accuracy(predictions.astype(int), 
                                    target.cpu().numpy().astype(int)))     

        # calculate the gradient of each parameter
        loss.backward()

        # update the parameters using the gradients and optimizer algorithm
        optimizer.step()

    epoch_loss = np.array(loss_history).mean()
    epoch_acc = np.array(acc_history).mean()
    
    
    val_acc, cm, recall, precision, f1 = eval_lstm2(model_BiLSTMConv, valid_loader, batch_size)
    print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.3f} | Train Acc: {epoch_acc:.2f}%' )
    print(f'---> Valid accuracy : {val_acc:.2f}%')

  "num_layers={}".format(dropout, num_layers))
  self.weights = torch.tensor(weights, dtype=torch.double)


| Epoch: 01 | Train Loss: 1.096 | Train Acc: 36.82%
---> Valid accuracy : 32.73%
| Epoch: 02 | Train Loss: 1.065 | Train Acc: 46.52%
---> Valid accuracy : 56.62%
| Epoch: 03 | Train Loss: 1.013 | Train Acc: 51.03%
---> Valid accuracy : 60.78%
| Epoch: 04 | Train Loss: 0.977 | Train Acc: 53.16%
---> Valid accuracy : 65.45%
| Epoch: 05 | Train Loss: 0.966 | Train Acc: 54.19%
---> Valid accuracy : 66.23%
| Epoch: 06 | Train Loss: 0.956 | Train Acc: 54.83%
---> Valid accuracy : 69.09%
| Epoch: 07 | Train Loss: 0.947 | Train Acc: 58.51%
---> Valid accuracy : 70.13%
| Epoch: 08 | Train Loss: 0.937 | Train Acc: 57.60%
---> Valid accuracy : 67.01%
| Epoch: 09 | Train Loss: 0.925 | Train Acc: 59.99%
---> Valid accuracy : 71.95%
| Epoch: 10 | Train Loss: 0.911 | Train Acc: 60.70%
---> Valid accuracy : 64.94%


In [84]:
acc_test, cm_test, recall_test, precision_test, f1_test = eval_lstm2(model_BiLSTMConv, test_loader, batch_size)
print("Accuracy on test dataset : %.2f" % acc_test, "%")
print("F1-measure on test dataset : ", f1_test)

Accuracy on test dataset : 67.79 %
F1-measure on test dataset :  [0.21978022 0.81263158 0.56862745]


## Test task C

In [0]:
# Get the cleaned tokenized corpus back
with open("test_corpus_tweets_c.txt", "r") as file:
    tmp_c = file.read().splitlines()

tokenized_corpus_c = [[token for token in sentence.split(' ')][:-1] for sentence in tmp_c]

In [0]:
embedding_path = 'emb_dic_c.txt'
emb_dict_c = extract_dic(embedding_path)

In [88]:
emb_corpus_c = embed_corpus_2(emb_dict_c, tokenized_corpus_c)

Percentage of not recognised words (those we do not have an embedding for) : 6.04 %


In [89]:
print(emb_corpus_c.shape)

# We need to make it a multiple of the batchsize:

pad = torch.zeros((11, 58, 100))
emb_corpus_c = torch.cat((emb_corpus_c, pad), dim=0)
print(emb_corpus_c.shape)
# It is already a mutliple of batchsize 16

torch.Size([213, 58, 100])
torch.Size([224, 58, 100])


In [0]:
# Make batches of 16 : 

input_c = emb_corpus_c.view(-1, 16, 58, 100)

# Preparing a container for the results
prediction_c = np.zeros((int(224/16),16))

for batch in range(14):
    output = model_BiLSTMConv(torch.Tensor(input_c[batch]).to(device))
    output = torch.softmax(output, dim=1)
    output = np.argmax(output.cpu().detach().numpy(), axis =1)
    prediction_c[batch] = output

prediction_c = torch.Tensor(prediction_c)

In [91]:
# reshape prediction_a and dicard the last elements corresponding to the padding
prediction_c = prediction_c.view(-1)[:-11]
print(prediction_c.shape)

# Getting back the np.array, and round the result to have 0 or 1
prediction_c = prediction_c.numpy()
prediction_c = prediction_c.astype(int)
print(prediction_c)

torch.Size([213])
[0 0 1 1 1 1 1 2 1 1 2 2 2 0 0 1 2 2 2 0 0 2 1 1 1 1 2 2 0 1 2 1 1 1 2 2 1
 1 1 2 1 1 2 2 2 1 2 0 2 2 0 2 0 2 1 1 2 1 1 2 1 1 2 1 1 1 1 0 2 1 1 2 2 0
 1 1 1 1 2 2 1 2 1 1 0 1 2 1 1 1 1 2 1 2 0 2 2 1 2 2 0 1 2 1 1 1 0 1 0 2 1
 2 1 2 1 1 1 1 1 2 1 0 0 2 1 1 2 1 0 2 1 1 2 1 2 2 1 1 0 0 0 1 1 2 2 1 0 2
 0 1 2 1 1 1 0 1 2 2 1 2 2 1 2 1 0 2 1 2 2 1 1 1 1 1 1 1 0 1 1 0 1 1 1 2 1
 0 1 2 2 1 1 0 0 1 2 0 1 2 1 2 1 1 1 1 1 1 0 0 1 1 1 1 0]


In [0]:
prediction_c_letters = ["IND" if element == 1 else "GRP" if element == 2 
                        else "OTH" for element in prediction_c]

In [0]:
dataframe_c = pd.read_csv('test_set_taskc.tsv', sep="\t", header=0)
id_c = dataframe_c["id"]

In [0]:
pred_dataframe = pd.DataFrame(prediction_c_letters, index=id_c)
pred_dataframe.to_csv(path_or_buf ="pred_c.csv", header=False)

Predictions for task C can now be found in "Files".