In [0]:
import pandas as pd
import numpy as np
#from corpus import *
#from utils import *
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.utils.data.sampler import SubsetRandomSampler, WeightedRandomSampler

from sklearn.metrics import confusion_matrix
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve

## Device selection

In [28]:
GPU = True
device_idx = 0
if GPU:
    device = torch.device("cuda:" + str(device_idx) if torch.cuda.is_available() else "cpu")
else:
    device = torch.device("cpu")
print(device)
cpu = torch.device("cpu")

cuda:0


In [29]:
# We set a random seed to ensure that your results are reproducible.
if torch.cuda.is_available():
    torch.backends.cudnn.deterministic = True
torch.manual_seed(0)

<torch._C.Generator at 0x7f442041c190>

## Data management

### Corpus management

In [0]:
#  <-------------- Dataset management -------------> #
# Get the vocabulary back
with open("vocabulary.txt", "r") as file:
    vocabulary = file.read().splitlines()

# Get the cleaned tokenized corpus back
with open("corpus_tweets.txt", "r") as file:
    tmp = file.read().splitlines()

# In each sentence, we get rid of the last token, which is '\n'
clean_corpus = [sentence[:-1] for sentence in tmp]

tokenized_corpus = [[token for token in sentence.split(' ')][:-1] for sentence in tmp]

# Get the labels
with open("labels_1.txt", "r") as file:
    label1 = file.read().splitlines()
with open("labels_2.txt", "r") as file:
    label2 = file.read().splitlines()
with open("labels_3.txt", "r") as file:
    label3 = file.read().splitlines()

label1 = [float(i) for i in label1]
label2 = [float(i) for i in label2]
label3 = [float(i) for i in label3]

#  <-------------- END Dataset management -------------> #


### Word embedding from GloVe

In [0]:
embedding_path = 'emb_dic.txt'
emb_dict = {}
glove = open(embedding_path)
for line in glove:
    values = line.split()
    word = values[0]
    try:
        vector = np.asarray(values[1:], dtype='float32')
        if len(vector) != 100:
          print(word, len(vector))
        emb_dict[word] = vector
    except:
        print("Parsing problem on word ", word, " discarding it")
glove.close()

In [196]:
print(len(emb_dict['should']))

100


### Data loaders

In [0]:
def data_loader(emb_corpus, labels, batch_size, random_seed, valid_size=0.1, test_size=0.1, balancing=True):

    # One hot encoding of labels, function in utils
    #labels_ = one_hot_encoding(np.array(labels))
    labels_ = torch.Tensor(labels)
    # Sending everything to GPU
    labels_.to(device)
    emb_corpus.to(device)
    
    
    # Create dataset
    dataset_ = torch.utils.data.TensorDataset(emb_corpus, labels_)

    # Split to train / valid / test dataset
    size_dataset = len(labels)
    indices = list(range(size_dataset))
    valid_split = int(np.floor(valid_size * size_dataset))
    test_split = int(np.floor(test_size * size_dataset))

    np.random.seed(random_seed)
    np.random.shuffle(indices)

    train_valid_idx, test_idx = indices[test_split:], indices[:test_split]
    train_idx, valid_idx = train_valid_idx[valid_split:], train_valid_idx[:valid_split]

    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)
    test_sampler = SubsetRandomSampler(test_idx)

    valid_loader = torch.utils.data.DataLoader(
        dataset_, batch_size=batch_size, sampler=valid_sampler)
    test_loader = torch.utils.data.DataLoader(
        dataset_, batch_size=batch_size, sampler=test_sampler)

    # We balancing set to False, we just create a data loader from the train indices
    if not balancing:
        train_loader = torch.utils.data.DataLoader(
            dataset_, batch_size=batch_size, sampler=train_sampler)

        return train_loader, valid_loader, test_loader

    # If there is balancing to do, we first extract the training samples according to the
    # predefined indices, before using a weighted sampler
    if balancing:
        train_loader_unbalanced = torch.utils.data.DataLoader(
            dataset_, batch_size=len(labels), sampler=train_sampler)
        
        # Get back training data from sampler
        training_data, training_labels = next(iter(train_loader_unbalanced))
        # We get back classes from one hot encoding
        #training_labels_int = training_labels.argmax(dim = 1, keepdim=True)
        train_dataset = torch.utils.data.TensorDataset(training_data, training_labels)

        # WeightedSampler takes the list of weights as input
        class_sample_count = np.array([len(np.where(training_labels == t)[0]) for t in np.unique(training_labels)])
        weight = 1. / class_sample_count
        samples_weight = np.array([weight[int(t)] for t in training_labels])

        samples_weight = torch.from_numpy(samples_weight)
        samples_weight = samples_weight.double()
        balance_sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

        train_loader_balanced = DataLoader(train_dataset, batch_size=batch_size, num_workers=1, sampler=balance_sampler)

        return train_loader_balanced, valid_loader, test_loader

## First model : a 3 hidden layers fully connected feed forward network

In [0]:
class FFNN(nn.Module):

    def __init__(self):
        super().__init__()
        embedding_dim = 100
        h_dim1 = 512
        h_dim2 = 256
        h_dim3 = 64
        num_classes = 1

        # hidden layers
        self.layer1 = nn.Sequential(
            nn.Linear(in_features=embedding_dim, out_features=h_dim1, bias=True),
            nn.ReLU()
        )

        self.layer2 = nn.Sequential(
            nn.Linear(in_features=h_dim1, out_features=h_dim2, bias=True),
            nn.ReLU()
        )

        self.layer3 = nn.Sequential(
            nn.Linear(in_features=h_dim2, out_features=h_dim3, bias=True),
            nn.ReLU()
        )

        # output layer
        self.layer4 = nn.Sequential(
            nn.Linear(in_features=h_dim3, out_features=num_classes, bias=True),
            nn.Sigmoid()
        )

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        return out

## Some utils functions

In [0]:
def accuracy(output, target):
    correct = (output == target)
    acc = float(float(correct.sum()) /len(output) ) * 100
    return acc

In [0]:
def recall_precision(cm):
    nb_classes = cm.shape[0]
    recall = np.zeros(nb_classes)
    precision = np.zeros(nb_classes)
    for idx in range(nb_classes):
        if (idx + 1) < nb_classes:
            false_negative = np.concatenate((cm[idx, :idx], cm[idx, (idx + 1):]))
            false_positive = np.concatenate((cm[:idx, idx], cm[(idx + 1):, idx]))
        else:
            false_negative = cm[idx, :idx]
            false_positive = cm[:idx, idx]
        true_positive = cm[idx, idx]
        recall[idx] = true_positive / (true_positive + false_negative.sum())
        precision[idx] = true_positive / (true_positive + false_positive.sum())
    return recall, precision

In [0]:
def eval(model, dataloader):

  predictions = np.array(0)
  targets = np.array(0)
  
  for batch_idx, (embedding, target) in enumerate(dataloader):    
  
    prediction = model(embedding.to(device)).detach()
    predictions = np.append(predictions, np.round_(prediction.cpu().numpy()).astype(int))
    
    targets = np.append(targets, target.numpy().astype(int))
  
  return metrics(predictions, targets)

In [0]:
def f1_measure(recall, precision):
    f1 = 2 * (recall * precision) / (recall + precision)
    return f1

In [0]:
def metrics(prediction, target):
    
    # Number of correct predictions
    acc = accuracy(prediction, target)
    
    # Confusion matrix
    cm = confusion_matrix(target, prediction)
    
    # Recall and precision
    recall, precision = recall_precision(cm)
    f1 = f1_measure(recall, precision)
    
    return acc, cm, recall, precision, f1

In [0]:
def embed_corpus(emb_dict, corpus):
    # Prepare container for tweet embeddings
    inputs_ = torch.zeros((len(corpus), 100))

    # Counter for debugging purposes
    count_not_found = 0.
    total_count = 0.

    # We loop over all the tweets in the corpus
    for idx, sentence in enumerate(corpus):
        sentence_length = len(sentence)
        mean_embedding = torch.zeros(100)
        for word in sentence:
            total_count += 1
            if word in emb_dict.keys():
                mean_embedding += torch.Tensor(emb_dict[word])
            else:
                count_not_found += 1

        # We average the word embedding over the sentence
        mean_embedding /= sentence_length

        # We add the embedded sentence to the inputs tensor
        inputs_[idx] = mean_embedding
    ratio = (count_not_found / total_count) * 100

    print("Percentage of not recognised words (those we do not have an embedding for) : %.2f" % ratio, "%")
    # We return the embedded corpus
    return inputs_

def one_hot_encoding(labels):
    labels_ = torch.zeros(len(labels), int(np.amax(labels) + 1))
    for idx, label in enumerate(labels):
        labels_[idx, int(label)] = 1.
    return labels_

### Training

In [216]:
#  <----------- Global variables for the NN and the training --------------->

# we will train for N epochs (N times the model will see all the data)
epochs = 100

#  <----------- END Global variables for the NN and the training --------------->

emb_corpus = embed_corpus(emb_dict, tokenized_corpus)

train_loader, valid_loader, test_loader = data_loader(emb_corpus, label1, 32, 1)

# Instantiate the model
model_FFNN = FFNN().to(device)

# we use the stochastic gradient descent (SGD) optimizer
optimizer = optim.SGD(model_FFNN.parameters(), lr=0.5)


for epoch in range(1, epochs + 1):
    loss_history = []
    acc_history = []
    for batch_idx, (embedding, target) in enumerate(train_loader):

        model_FFNN.train()

        # we zero the gradients as they are not removed automatically
        optimizer.zero_grad()

        # squeeze is needed as the predictions are initially size (batch size, 1) and we need to remove the dimension of size 1
        predictions = model_FFNN(embedding.to(device)).squeeze(1)
        loss = nn.BCELoss()(predictions, target.to(device))
        
        # For log purposes
        loss_history.append(float(loss))
        predictions = predictions.detach()
        target = target.detach()
        acc_history.append(accuracy(np.round_(predictions.cpu().numpy()).astype(int), 
                                    target.cpu().numpy().astype(int)))

        # calculate the gradient of each parameter
        loss.backward()

        # update the parameters using the gradients and optimizer algorithm
        optimizer.step()

    epoch_loss = np.array(loss_history).mean()
    epoch_acc = np.array(acc_history).mean()

    val_acc, cm, recall, precision, f1 = eval(model_FFNN, valid_loader)
    print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.3f} | Train Acc: {epoch_acc:.2f}%')
    print("Valid accuracy :", val_acc)

Percentage of not recognised words (those we do not have an embedding for) : 3.35 %


  self.weights = torch.tensor(weights, dtype=torch.double)


| Epoch: 01 | Train Loss: 0.673 | Train Acc: 58.52%
Valid accuracy : 42.49056603773585
| Epoch: 02 | Train Loss: 0.636 | Train Acc: 64.00%
Valid accuracy : 68.90566037735849
| Epoch: 03 | Train Loss: 0.612 | Train Acc: 66.36%
Valid accuracy : 68.90566037735849
| Epoch: 04 | Train Loss: 0.604 | Train Acc: 67.22%
Valid accuracy : 65.81132075471699
| Epoch: 05 | Train Loss: 0.581 | Train Acc: 69.35%
Valid accuracy : 74.18867924528301
| Epoch: 06 | Train Loss: 0.584 | Train Acc: 68.76%
Valid accuracy : 74.11320754716981
| Epoch: 07 | Train Loss: 0.561 | Train Acc: 70.23%
Valid accuracy : 63.62264150943396
| Epoch: 08 | Train Loss: 0.569 | Train Acc: 69.97%
Valid accuracy : 66.79245283018868
| Epoch: 09 | Train Loss: 0.568 | Train Acc: 69.70%
Valid accuracy : 70.79245283018868
| Epoch: 10 | Train Loss: 0.564 | Train Acc: 70.42%
Valid accuracy : 69.13207547169812
| Epoch: 11 | Train Loss: 0.560 | Train Acc: 71.32%
Valid accuracy : 74.64150943396226
| Epoch: 12 | Train Loss: 0.555 | Train Acc

### Test data

In [220]:
acc_test, cm_test, recall_test, precision_test, f1_test = eval(model_FFNN, test_loader)
print("Accuracy on test dataset : %.2f" % acc_test, "%")
print("F1-measure on test dataset : ", f1_test)

Accuracy on test dataset : 73.28 %
F1-measure on test dataset :  [0.79225352 0.62579281]


## A second model : CNN

As we can see above, the FFNN detects offensive tweets with an accuracy of roughly 73%, which is not too bad for this naive model. However, this came after an important amount of preprocessing. In this part, I'll try to improve this detection accuracy by using a different kind of neural network : a CNN.

In [0]:
def embed_corpus_2(emb_dict, corpus):

    tweet_lengths = [len(tweet) for tweet in corpus]
    max_len = np.max(np.array(tweet_lengths))

    # Prepare container for tweet embeddings
    inputs_ = torch.zeros((len(corpus), max_len, 100))

    # Counter for debugging purposes
    count_not_found = 0.
    total_count = 0.

    # We loop over all the tweets in the corpus
    for idx, tweet in enumerate(corpus):
        # and over all the words in a tweet
        for idx2, word in enumerate(tweet):
            total_count += 1
            if word in emb_dict.keys():
                inputs_[idx, idx2] = torch.Tensor(emb_dict[word])
            else:
                count_not_found += 1
    ratio = (count_not_found / total_count) * 100

    print("Percentage of not recognised words (those we do not have an embedding for) : %.2f" % ratio, "%")
    # We return the embedded corpus
    return inputs_

In [0]:
class CNN(nn.Module):

    def __init__(self, embedding_dim, out_channels, window_size, output_dim, dropout):
        super(CNN, self).__init__()

        # in_channels -- 1 text channel
        # out_channels -- the number of output channels
        # kernel_size is (window size x embedding dim)

        self.conv = nn.Conv2d(in_channels=1, out_channels=out_channels, kernel_size=(window_size, embedding_dim))

        # the dropout layer
        self.dropout = nn.Dropout(dropout)

        # the output layer
        self.fc = nn.Linear(out_channels, output_dim)

    def forward(self, x):
        # (batch size, max sent length, embedding dim)

        # We unsqueeze one dimension to give space to the coming convolution channels
        embedded = x.unsqueeze(1)

        # (batch size, 1, max sent length, embedding dim)

        feature_maps = self.conv(embedded)

        # (batch size, n filters, max input length - window size +1)

        feature_maps = feature_maps.squeeze(3)

        feature_maps = F.relu(feature_maps)

        # the max pooling layer
        pooled = F.max_pool1d(feature_maps, feature_maps.shape[2])

        pooled = pooled.squeeze(2)

        # (batch size, n_filters)

        dropped = self.dropout(pooled)

        preds = self.fc(dropped)

        preds = torch.sigmoid(preds)
        
        return preds

## Training

In [228]:
epochs=100

EMBEDDING_DIM = 100
OUTPUT_DIM = 1

#the hyperparameters specific to CNN

# we define the number of filters
N_OUT_CHANNELS = 100

# we define the window size
WINDOW_SIZE = 1

# we apply the dropout with the probability 0.5
DROPOUT = 0.5

model_CNN = CNN(EMBEDDING_DIM, N_OUT_CHANNELS, WINDOW_SIZE, OUTPUT_DIM, DROPOUT).to(device)

optimizer = optim.SGD(model_CNN.parameters(), lr=0.01)
loss_fn = nn.BCELoss()

emb_corpus = embed_corpus_2(emb_dict, tokenized_corpus)
train_loader, valid_loader, test_loader = data_loader(emb_corpus, label1, 32, 1)

for epoch in range(1, epochs + 1):
    loss_history = []
    acc_history = []
    for batch_idx, (embedding, target) in enumerate(train_loader):

        model_CNN.train()

        # we zero the gradients as they are not removed automatically
        optimizer.zero_grad()

        # squeeze is needed as the predictions are initially size (batch size, 1) and we need to remove the dimension of size 1
        predictions = model_CNN(embedding.to(device)).squeeze(1)   
        loss = nn.BCELoss()(predictions, target.to(device))
        loss_history.append(float(loss))
        
        predictions = predictions.detach()
        target = target.detach()
        acc_history.append(accuracy(np.round_(predictions.cpu().numpy()).astype(int), 
                                    target.cpu().numpy().astype(int)))
        
        # calculate the gradient of each parameter
        loss.backward()

        # update the parameters using the gradients and optimizer algorithm
        optimizer.step()

    epoch_loss = np.array(loss_history).mean()
    epoch_acc = np.array(acc_history).mean()

    val_acc, cm, recall, precision, f1 = eval(model_CNN, valid_loader)
    print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.3f} | Train Acc: {epoch_acc:.2f}%')
    print(f'---> Valid accuracy : {val_acc:.2f}%')

Percentage of not recognised words (those we do not have an embedding for) : 3.35 %


  self.weights = torch.tensor(weights, dtype=torch.double)


| Epoch: 01 | Train Loss: 0.695 | Train Acc: 52.02%
---> Valid accuracy : 52.91%
| Epoch: 02 | Train Loss: 0.672 | Train Acc: 60.11%
---> Valid accuracy : 63.02%
| Epoch: 03 | Train Loss: 0.643 | Train Acc: 64.77%
---> Valid accuracy : 67.25%
| Epoch: 04 | Train Loss: 0.613 | Train Acc: 67.67%
---> Valid accuracy : 69.43%
| Epoch: 05 | Train Loss: 0.584 | Train Acc: 69.89%
---> Valid accuracy : 69.51%
| Epoch: 06 | Train Loss: 0.571 | Train Acc: 70.48%
---> Valid accuracy : 75.55%
| Epoch: 07 | Train Loss: 0.560 | Train Acc: 71.28%
---> Valid accuracy : 73.96%
| Epoch: 08 | Train Loss: 0.550 | Train Acc: 72.56%
---> Valid accuracy : 73.89%
| Epoch: 09 | Train Loss: 0.540 | Train Acc: 73.02%
---> Valid accuracy : 74.19%
| Epoch: 10 | Train Loss: 0.543 | Train Acc: 72.83%
---> Valid accuracy : 74.11%
| Epoch: 11 | Train Loss: 0.546 | Train Acc: 73.15%
---> Valid accuracy : 74.11%
| Epoch: 12 | Train Loss: 0.543 | Train Acc: 72.82%
---> Valid accuracy : 74.04%
| Epoch: 13 | Train Loss: 0.

In [229]:
acc_test, cm_test, recall_test, precision_test, f1_test = eval(model_CNN, test_loader)
print("Accuracy on test dataset : %.2f" % acc_test, "%")
print("F1-measure on test dataset : ", f1_test)

Accuracy on test dataset : 76.08 %
F1-measure on test dataset :  [0.81708021 0.65430752]


Conclusion : with this simple CNN model, we improved :
 

*   The detection accuracy from 74% (FFNN) to roughly **76%**.
*   The F1 values also improved quite well : from [0.79 ; 0.62] to **[0.81 ; 0.65]**




## A third model : bidirectional LSTM

In [0]:
class BiLSTM(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, output_dim, dropout, seq_len):
        super(BiLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.seq_len = seq_len
        self.batch_size = 32

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                            dropout=dropout, batch_first=True, bidirectional=True)

        # The linear layer that maps from hidden state space to tag space
        self.linear = nn.Linear(seq_len * hidden_dim * 2, output_dim)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers * num_directions, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(2, self.batch_size, self.hidden_dim).to(device)),
                autograd.Variable(torch.zeros(2, self.batch_size, self.hidden_dim).to(device)))

    def forward(self, x):
        if x.shape[0] != self.batch_size:
            pad = torch.zeros((self.batch_size - x.shape[0], x.shape[1], x.shape[2])).to(device)
            x = torch.cat((x, pad), dim=0)
        # Shape of x  torch.Size([32, 105, 100])
        # Shape of LSTM out  torch.Size([32, 105, 40])
        
        # lstm out should be (seq_len, batch, num_directions * hidden_size)
        # elements of self.hidden should be (num_layers * num_directions, batch, hidden_size)
        
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        lstm_out = lstm_out.contiguous()
        lstm_out = lstm_out.view(-1, self.seq_len * 2 * self.hidden_dim)

        tag_space = self.linear(lstm_out)
        tag_scores = torch.sigmoid(tag_space)
        return tag_scores

In [0]:
def eval_lstm(model, dataloader, batch_size):
    
    predictions = np.array(0)
    targets = np.array(0)
    
    for batch_idx, (embedding, target) in enumerate(dataloader):
        # With LSTM, we will have troubles if the batch size changes 
        # (for example on the last batch. We discard it)
        if embedding.shape[0] != batch_size:
            continue
        
        embedding = embedding.to(device)
        target = target.to(device)
        
        prediction = model(embedding).detach().cpu()
        
        predictions = np.append(predictions, np.round_(prediction.cpu().numpy()).astype(int))
        targets = np.append(targets, target.cpu().numpy().astype(int))
    
    return metrics(predictions, targets)

### Training

In [248]:
batch_size = 32
epochs=10

EMBEDDING_DIM = 100
HIDDEN_DIM = 10
OUTPUT_DIM = 1
DROPOUT = 0.5
SEQ_LEN = 105

model_BiLSTM = BiLSTM(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, SEQ_LEN).to(device)

optimizer = optim.Adam(model_BiLSTM.parameters())
loss_fn = nn.BCELoss()

emb_corpus = embed_corpus_2(emb_dict, tokenized_corpus)
train_loader, valid_loader, test_loader = data_loader(emb_corpus, label1, batch_size, 1)

for epoch in range(1, epochs + 1):
    loss_history = []
    acc_history = []
    for batch_idx, (embedding, target) in enumerate(train_loader):
        
        model_BiLSTM.train()

        # we zero the gradients as they are not removed automatically
        optimizer.zero_grad()
        
        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model_BiLSTM.hidden = model_BiLSTM.init_hidden()

        # Send input data to GPU
        embedding = embedding.to(device)
        
        # squeeze is needed as the predictions are initially size (batch size, 1) and we need to remove the dimension of size 1
        # Have to transpose batch and sequence dimensions for nn.LSTM
        predictions = model_BiLSTM(embedding).squeeze(1)
        loss = nn.BCELoss()(predictions, target.to(device))
        loss_history.append(float(loss))
        
        predictions = predictions.detach()
        target = target.detach()
        acc_history.append(accuracy(np.round_(predictions.cpu().numpy()).astype(int), 
                                    target.cpu().numpy().astype(int)))       
        
        # calculate the gradient of each parameter
        loss.backward()

        # update the parameters using the gradients and optimizer algorithm
        optimizer.step()

    epoch_loss = np.array(loss_history).mean()
    epoch_acc = np.array(acc_history).mean()
    
    val_acc, cm, recall, precision, f1 = eval_lstm(model_BiLSTM, valid_loader, batch_size)
    print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.3f} | Train Acc: {epoch_acc:.2f}%')
    print(f'---> Valid accuracy : {val_acc:.2f}%')

  "num_layers={}".format(dropout, num_layers))


Percentage of not recognised words (those we do not have an embedding for) : 3.35 %


  self.weights = torch.tensor(weights, dtype=torch.double)


| Epoch: 01 | Train Loss: 0.637 | Train Acc: 62.68%
---> Valid accuracy : 71.97%
| Epoch: 02 | Train Loss: 0.532 | Train Acc: 73.46%
---> Valid accuracy : 73.12%
| Epoch: 03 | Train Loss: 0.490 | Train Acc: 76.73%
---> Valid accuracy : 74.41%
| Epoch: 04 | Train Loss: 0.469 | Train Acc: 78.17%
---> Valid accuracy : 76.54%
| Epoch: 05 | Train Loss: 0.457 | Train Acc: 79.26%
---> Valid accuracy : 74.94%
| Epoch: 06 | Train Loss: 0.427 | Train Acc: 80.75%
---> Valid accuracy : 76.16%
| Epoch: 07 | Train Loss: 0.424 | Train Acc: 80.89%
---> Valid accuracy : 78.07%
| Epoch: 08 | Train Loss: 0.402 | Train Acc: 82.39%
---> Valid accuracy : 76.47%
| Epoch: 09 | Train Loss: 0.387 | Train Acc: 83.36%
---> Valid accuracy : 71.44%
| Epoch: 10 | Train Loss: 0.372 | Train Acc: 84.35%
---> Valid accuracy : 74.71%


In [249]:
acc_test, cm_test, recall_test, precision_test, f1_test = eval_lstm(model_BiLSTM, test_loader, batch_size)
print("Accuracy on test dataset : %.2f" % acc_test, "%")
print("F1-measure on test dataset : ", f1_test)

Accuracy on test dataset : 75.78 %
F1-measure on test dataset :  [0.81554524 0.64745011]


Conclusion : on only 10 epochs with a naive dense layer output, this model achieves almost the same accuracy as the CNN model. We also see that this model overfits very quickly. Let's try to combine Bi-LSTM and convolutions now.

## Last model : Bi-directional LSTM + convolutions

In [0]:
class BiLSTMConv(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, output_dim, dropout, seq_len, 
                 channels, window_size, batch_size):
        super(BiLSTMConv, self).__init__()
        self.hidden_dim = hidden_dim
        self.seq_len = seq_len
        self.batch_size = batch_size

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                            dropout=dropout, batch_first=True, bidirectional=True)

        # The linear layer that maps from hidden state space to tag space
        self.hidden = self.init_hidden()
        
        self.conv = nn.Conv2d(in_channels=1, out_channels=channels, kernel_size=(window_size, 2 * hidden_dim))
        
        # the dropout layer
        self.dropout = nn.Dropout(dropout)
        
        self.linear = nn.Linear(channels, output_dim)
        
    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers * num_directions, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(2, self.batch_size, self.hidden_dim).to(device)),
                autograd.Variable(torch.zeros(2, self.batch_size, self.hidden_dim).to(device)))

    def forward(self, x):
        if x.shape[0] != self.batch_size:
            pad = torch.zeros((self.batch_size - x.shape[0], x.shape[1], x.shape[2])).to(device)
            x = torch.cat((x, pad), dim=0)
        # Shape of x  torch.Size([32, 105, 100])
        # Shape of LSTM out  torch.Size([32, 105, 40])
        
        # lstm out should be (seq_len, batch, num_directions * hidden_size)
        # elements of self.hidden should be (num_layers * num_directions, batch, hidden_size)
        
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        
        #lstm_out = lstm_out.contiguous()
        #lstm_out = lstm_out.view(-1, self.seq_len * 2 * self.hidden_dim)

        # make space for convolution channels
        lstm_out = lstm_out.unsqueeze(1)
        lstm_out = F.relu(lstm_out)
        
        
        conv_out = self.conv(lstm_out)
        
        conv_out = conv_out.squeeze(3)
        
        pooled = F.max_pool1d(conv_out, conv_out.shape[2])
        
        pooled = pooled.squeeze(2)
        
        # (batch size, n_filters)
        dropped = self.dropout(pooled)
        
        preds = self.linear(dropped)
        preds = torch.sigmoid(preds)

        return preds

## Training

In [254]:
batch_size = 32
epochs=10

EMBEDDING_DIM = 100
HIDDEN_DIM = 10
OUTPUT_DIM = 1
DROPOUT = 0.5
SEQ_LEN = 105
CHANNELS = 16
WINDOW_SIZE = 1

model_BiLSTMConv = BiLSTMConv(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, 
                   SEQ_LEN, CHANNELS, WINDOW_SIZE, batch_size).to(device)

optimizer = optim.Adam(model_BiLSTMConv.parameters())
loss_fn = nn.BCELoss()

emb_corpus = embed_corpus_2(emb_dict, tokenized_corpus)
train_loader, valid_loader, test_loader = data_loader(emb_corpus, label1, batch_size, 1)

for epoch in range(1, epochs + 1):
    loss_history = []
    acc_history = []
    for batch_idx, (embedding, target) in enumerate(train_loader):
        
        model_BiLSTMConv.train()

        # we zero the gradients as they are not removed automatically
        optimizer.zero_grad()
        
        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model_BiLSTMConv.hidden = model_BiLSTMConv.init_hidden()

        # Send input data to GPU
        embedding = embedding.to(device)
        
        # squeeze is needed as the predictions are initially size (batch size, 1) and we need to remove the dimension of size 1
        # Have to transpose batch and sequence dimensions for nn.LSTM
        predictions = model_BiLSTMConv(embedding).squeeze(1)
        loss = nn.BCELoss()(predictions, target.to(device))
        loss_history.append(float(loss))
        
        predictions = predictions.detach()
        target = target.detach()
        acc_history.append(accuracy(np.round_(predictions.cpu().numpy()).astype(int), 
                                    target.cpu().numpy().astype(int)))     

        # calculate the gradient of each parameter
        loss.backward()

        # update the parameters using the gradients and optimizer algorithm
        optimizer.step()

    epoch_loss = np.array(loss_history).mean()
    epoch_acc = np.array(acc_history).mean()
    
    val_acc, cm, recall, precision, f1 = eval_lstm(model_BiLSTMConv, valid_loader, batch_size)
    print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.3f} | Train Acc: {epoch_acc:.2f}%')
    print(f'---> Valid accuracy : {val_acc:.2f}%')

  "num_layers={}".format(dropout, num_layers))


Percentage of not recognised words (those we do not have an embedding for) : 3.35 %


  self.weights = torch.tensor(weights, dtype=torch.double)


| Epoch: 01 | Train Loss: 0.675 | Train Acc: 57.21%
---> Valid accuracy : 60.55%
| Epoch: 02 | Train Loss: 0.554 | Train Acc: 73.45%
---> Valid accuracy : 74.41%
| Epoch: 03 | Train Loss: 0.510 | Train Acc: 76.03%
---> Valid accuracy : 75.55%
| Epoch: 04 | Train Loss: 0.499 | Train Acc: 76.62%
---> Valid accuracy : 76.69%
| Epoch: 05 | Train Loss: 0.482 | Train Acc: 77.80%
---> Valid accuracy : 75.55%
| Epoch: 06 | Train Loss: 0.471 | Train Acc: 79.14%
---> Valid accuracy : 76.85%
| Epoch: 07 | Train Loss: 0.470 | Train Acc: 78.64%
---> Valid accuracy : 75.48%
| Epoch: 08 | Train Loss: 0.467 | Train Acc: 78.02%
---> Valid accuracy : 77.91%
| Epoch: 09 | Train Loss: 0.448 | Train Acc: 79.63%
---> Valid accuracy : 76.62%
| Epoch: 10 | Train Loss: 0.442 | Train Acc: 80.16%
---> Valid accuracy : 77.30%


In [255]:
acc_test, cm_test, recall_test, precision_test, f1_test = eval_lstm(model_BiLSTMConv, test_loader, batch_size)
print("Accuracy on test dataset : %.2f" % acc_test, "%")
print("F1-measure on test dataset : ", f1_test)

Accuracy on test dataset : 76.92 %
F1-measure on test dataset :  [0.82455124 0.66295884]


# Test generation

In [0]:
#  <-------------- Dataset management -------------> #
# Get the vocabulary back
with open("test_vocabulary_a.txt", "r") as file:
    vocabulary_a = file.read().splitlines()

# Get the cleaned tokenized corpus back
with open("test_corpus_tweets_a.txt", "r") as file:
    tmp_a = file.read().splitlines()

# In each sentence, we get rid of the last token, which is '\n'
clean_corpus_a = [sentence[:-1] for sentence in tmp_a]

tokenized_corpus_a = [[token for token in sentence.split(' ')][:-1] for sentence in tmp_a]

#  <-------------- END Dataset management -------------> #

In [0]:
embedding_path = 'emb_dic_a.txt'
emb_dict_a = {}
glove = open(embedding_path)
for line in glove:
    values = line.split()
    word = values[0]
    try:
        vector = np.asarray(values[1:], dtype='float32')
        if len(vector) != 100:
          print(word, len(vector))
        emb_dict_a[word] = vector
    except:
        print("Parsing problem on word ", word, " discarding it")
glove.close()

In [265]:
emb_corpus_a = embed_corpus_2(emb_dict_a, tokenized_corpus_a)

Percentage of not recognised words (those we do not have an embedding for) : 7.47 %


In [267]:
print(emb_corpus_a.shape)

# We need to make this a multiple of batch_size = 32. We padd with zeros. 
# We will discard the results after.

sent_pad = torch.zeros((4, 66, 100))

emb_corpus_a = torch.cat((emb_corpus_a, sent_pad), dim=0)

print(emb_corpus_a.shape)

torch.Size([860, 66, 100])
torch.Size([864, 66, 100])


In [0]:
# Make batches of 32 : 

input_a = emb_corpus_a.view(-1, 32, 66, 100)

# Preparing a container for the results
prediction_a = torch.zeros((int(864/32),32))

for batch in range(26):
    output = model_BiLSTMConv(torch.Tensor(input_a[batch]).to(device))
    prediction_a[batch] = output.cpu().detach().squeeze(1)

In [282]:
# reshape prediction_a and dicard the last elements corresponding to the padding
prediction_a = prediction_a.view(-1)[:-4]
print(prediction_a.shape)

# Getting back the np.array, and round the result to have 0 or 1
prediction_a = prediction_a.numpy()
prediction_a = np.round_(prediction_a).astype(int)
print(prediction_a)

torch.Size([860])
[1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0
 0 1 1 1 1 1 1 0 0 1 1 1 0 0 1 0 0 0 0 0 0 1 1 1 1 0 1 0 0 0 1 0 0 1 0 1 0
 1 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 1 0 0 1
 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1
 0 0 0 0 0 1 1 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 1 0 1 0 0
 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 1
 1 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 1
 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0
 1 1 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0 1 0 0 0 0
 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1
 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 1 1 0 1 0 0 0 1 1 0 0 0 1 0 1 0 0 0 1
 1 1 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 1 0 1
 0 0 0 

In [0]:
prediction_a_letters = ["OFF" if element == 1 else "NOT" for element in prediction_a]

In [0]:
dataframe_a = pd.read_csv('testset-taska.tsv', sep="\t", header=0)
id_a = dataframe_a["id"]

In [0]:
pred_dataframe = pd.DataFrame(prediction_a_letters, index=id_a)
pred_dataframe.to_csv(path_or_buf ="pred_a.csv", header=False)

# Sub task b

In [0]:
#  <-------------- Dataset management -------------> #
# Get the vocabulary back
with open("vocabulary.txt", "r") as file:
    vocabulary_a = file.read().splitlines()

# Get the cleaned tokenized corpus back
with open("corpus_tweets_subtask_b.txt", "r") as file:
    tmp_b = file.read().splitlines()

# In each sentence, we get rid of the last token, which is '\n'
clean_corpus_b = [sentence[:-1] for sentence in tmp_b]

tokenized_corpus_b = [[token for token in sentence.split(' ')][:-1] for sentence in tmp_b]

#  <-------------- END Dataset management -------------> #

In [0]:
# Get the labels
with open("labels_subtask_b.txt", "r") as file:
    label_b = file.read().splitlines()


label_b = [float(i) for i in label_b]

In [0]:
embedding_path = 'emb_dic.txt'
emb_dict = {}
glove = open(embedding_path)
for line in glove:
    values = line.split()
    word = values[0]
    try:
        vector = np.asarray(values[1:], dtype='float32')
        if len(vector) != 100:
          print(word, len(vector))
        emb_dict[word] = vector
    except:
        print("Parsing problem on word ", word, " discarding it")
glove.close()

In [67]:
emb_corpus_b = embed_corpus_2(emb_dict, tokenized_corpus_b)

Percentage of not recognised words (those we do not have an embedding for) : 2.84 %


## Training

In [71]:
batch_size = 16
epochs=15

EMBEDDING_DIM = 100
HIDDEN_DIM = 10
OUTPUT_DIM = 1
DROPOUT = 0.5
SEQ_LEN = 105
CHANNELS = 16
WINDOW_SIZE = 1

model_BiLSTMConv = BiLSTMConv(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, 
                   SEQ_LEN, CHANNELS, WINDOW_SIZE, batch_size).to(device)

optimizer = optim.Adam(model_BiLSTMConv.parameters())
loss_fn = nn.BCELoss()

train_loader, valid_loader, test_loader = data_loader(emb_corpus_b, label_b, batch_size, 1, valid_size=0, test_size=0)

for epoch in range(1, epochs + 1):
    loss_history = []
    acc_history = []
    for batch_idx, (embedding, target) in enumerate(train_loader):
        
        model_BiLSTMConv.train()

        # we zero the gradients as they are not removed automatically
        optimizer.zero_grad()
        
        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model_BiLSTMConv.hidden = model_BiLSTMConv.init_hidden()

        # Send input data to GPU
        embedding = embedding.to(device)
        
        # squeeze is needed as the predictions are initially size (batch size, 1) and we need to remove the dimension of size 1
        # Have to transpose batch and sequence dimensions for nn.LSTM
        predictions = model_BiLSTMConv(embedding).squeeze(1)
        loss = nn.BCELoss()(predictions, target.to(device))
        loss_history.append(float(loss))
        
        predictions = predictions.detach()
        target = target.detach()
        acc_history.append(accuracy(np.round_(predictions.cpu().numpy()).astype(int), 
                                    target.cpu().numpy().astype(int)))     

        # calculate the gradient of each parameter
        loss.backward()

        # update the parameters using the gradients and optimizer algorithm
        optimizer.step()

    epoch_loss = np.array(loss_history).mean()
    epoch_acc = np.array(acc_history).mean()

    #val_acc, cm, recall, precision, f1 = eval_lstm(model_BiLSTMConv, valid_loader, batch_size)
    print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.3f} | Train Acc: {epoch_acc:.2f}%')
    #print(f'---> Valid accuracy : {val_acc:.2f}%')

  "num_layers={}".format(dropout, num_layers))
  self.weights = torch.tensor(weights, dtype=torch.double)


| Epoch: 01 | Train Loss: 0.692 | Train Acc: 52.89%
| Epoch: 02 | Train Loss: 0.658 | Train Acc: 61.93%
| Epoch: 03 | Train Loss: 0.623 | Train Acc: 65.75%
| Epoch: 04 | Train Loss: 0.586 | Train Acc: 71.25%
| Epoch: 05 | Train Loss: 0.549 | Train Acc: 74.14%
| Epoch: 06 | Train Loss: 0.524 | Train Acc: 75.27%
| Epoch: 07 | Train Loss: 0.482 | Train Acc: 78.43%
| Epoch: 08 | Train Loss: 0.457 | Train Acc: 80.00%
| Epoch: 09 | Train Loss: 0.417 | Train Acc: 82.00%
| Epoch: 10 | Train Loss: 0.364 | Train Acc: 85.59%
| Epoch: 11 | Train Loss: 0.330 | Train Acc: 87.30%
| Epoch: 12 | Train Loss: 0.295 | Train Acc: 89.89%
| Epoch: 13 | Train Loss: 0.265 | Train Acc: 90.57%
| Epoch: 14 | Train Loss: 0.230 | Train Acc: 92.34%
| Epoch: 15 | Train Loss: 0.197 | Train Acc: 93.50%


In [51]:
acc_test, cm_test, recall_test, precision_test, f1_test = eval_lstm(model_BiLSTMConv, test_loader, batch_size)
print("Accuracy on test dataset : %.2f" % acc_test, "%")
print("F1-measure on test dataset : ", f1_test)

Accuracy on test dataset : 79.21 %
F1-measure on test dataset :  [0.28571429 0.87837838]


# Test sub_task_b

In [0]:
#  <-------------- Dataset management -------------> #
# Get the vocabulary back
with open("test_vocabulary_b.txt", "r") as file:
    vocabulary_b = file.read().splitlines()

# Get the cleaned tokenized corpus back
with open("test_corpus_tweets_b.txt", "r") as file:
    tmp_b = file.read().splitlines()

# In each sentence, we get rid of the last token, which is '\n'
clean_corpus_b = [sentence[:-1] for sentence in tmp_b]

tokenized_corpus_b = [[token for token in sentence.split(' ')][:-1] for sentence in tmp_b]

#  <-------------- END Dataset management -------------> #

In [0]:
embedding_path = 'emb_dic_b.txt'
emb_dict_b = {}
glove = open(embedding_path)
for line in glove:
    values = line.split()
    word = values[0]
    try:
        vector = np.asarray(values[1:], dtype='float32')
        if len(vector) != 100:
          print(word, len(vector))
        emb_dict_b[word] = vector
    except:
        print("Parsing problem on word ", word, " discarding it")
glove.close()

In [74]:
emb_corpus_b = embed_corpus_2(emb_dict_b, tokenized_corpus_b)

Percentage of not recognised words (those we do not have an embedding for) : 6.37 %


In [75]:
print(emb_corpus_b.shape)

# It is already a mutliple of batchsize 16

torch.Size([240, 59, 100])


In [0]:
# Make batches of 16 : 

input_b = emb_corpus_b.view(-1, 16, 59, 100)

# Preparing a container for the results
prediction_b = torch.zeros((int(240/16),16))

for batch in range(15):
    output = model_BiLSTMConv(torch.Tensor(input_b[batch]).to(device))
    prediction_b[batch] = output.cpu().detach().squeeze(1)

In [77]:
# reshape prediction_a and dicard the last elements corresponding to the padding
prediction_b = prediction_b.view(-1)
print(prediction_b.shape)

# Getting back the np.array, and round the result to have 0 or 1
prediction_b = prediction_b.numpy()
prediction_b = np.round_(prediction_b).astype(int)
print(prediction_b)

torch.Size([240])
[1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 0
 0 1 1 0 1 0 1 1 1 1 1 0 1 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1
 1 1 0 0 1 1 1 1 1 1 1 0 0 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0
 1 1 0 0 1 1 1 1 1 0 0 1 0 1 1 1 1 1 1 0 1 1 0 1 0 1 1 1 1 1 1 0 1 1 0 1 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0 1 1 1
 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 1 0 1 1 1 0 1
 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1]


In [0]:
prediction_b_letters = ["TIN" if element == 1 else "UNT" for element in prediction_b]

In [0]:
dataframe_b = pd.read_csv('testset-taskb.tsv', sep="\t", header=0)
id_b = dataframe_b["id"]

In [0]:
pred_dataframe = pd.DataFrame(prediction_b_letters, index=id_b)
pred_dataframe.to_csv(path_or_buf ="pred_b.csv", header=False)

# Sub task C

We will operate a slight modification to the network to have 4 neurons in the output layer instead of 1. We will have to change the activation function (from ReLU() to Softmax()) and adapt the Loss function too.

In [0]:
#  <-------------- Dataset management -------------> #
# Get the vocabulary back
with open("vocabulary.txt", "r") as file:
    vocabulary_a = file.read().splitlines()

# Get the cleaned tokenized corpus back
with open("corpus_tweets_subtask_c.txt", "r") as file:
    tmp_c = file.read().splitlines()

# In each sentence, we get rid of the last token, which is '\n'
clean_corpus_c = [sentence[:-1] for sentence in tmp_c]

tokenized_corpus_c = [[token for token in sentence.split(' ')][:-1] for sentence in tmp_c]

#  <-------------- END Dataset management -------------> #

In [0]:
# Get the labels
with open("labels_subtask_c.txt", "r") as file:
    label_c = file.read().splitlines()

label_c = [float(i) for i in label_c]

In [0]:
embedding_path = 'emb_dic.txt'
emb_dict = {}
glove = open(embedding_path)
for line in glove:
    values = line.split()
    word = values[0]
    try:
        vector = np.asarray(values[1:], dtype='float32')
        if len(vector) != 100:
          print(word, len(vector))
        emb_dict[word] = vector
    except:
        print("Parsing problem on word ", word, " discarding it")
glove.close()

In [97]:
emb_corpus_c = embed_corpus_2(emb_dict, tokenized_corpus_c)

Percentage of not recognised words (those we do not have an embedding for) : 2.79 %


In [0]:
class BiLSTMConv4(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, output_dim, dropout, seq_len, 
                 channels, window_size, batch_size):
        super(BiLSTMConv, self).__init__()
        self.hidden_dim = hidden_dim
        self.seq_len = seq_len
        self.batch_size = batch_size

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                            dropout=dropout, batch_first=True, bidirectional=True)

        # The linear layer that maps from hidden state space to tag space
        self.hidden = self.init_hidden()
        
        self.conv = nn.Conv2d(in_channels=1, out_channels=channels, kernel_size=(window_size, 2 * hidden_dim))
        
        # the dropout layer
        self.dropout = nn.Dropout(dropout)
        
        self.linear = nn.Linear(channels, output_dim)
        
    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers * num_directions, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(2, self.batch_size, self.hidden_dim).to(device)),
                autograd.Variable(torch.zeros(2, self.batch_size, self.hidden_dim).to(device)))

    def forward(self, x):
        if x.shape[0] != self.batch_size:
            pad = torch.zeros((self.batch_size - x.shape[0], x.shape[1], x.shape[2])).to(device)
            x = torch.cat((x, pad), dim=0)
        # Shape of x  torch.Size([32, 105, 100])
        # Shape of LSTM out  torch.Size([32, 105, 40])
        
        # lstm out should be (seq_len, batch, num_directions * hidden_size)
        # elements of self.hidden should be (num_layers * num_directions, batch, hidden_size)
        
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        
        #lstm_out = lstm_out.contiguous()
        #lstm_out = lstm_out.view(-1, self.seq_len * 2 * self.hidden_dim)

        # make space for convolution channels
        lstm_out = lstm_out.unsqueeze(1)
        lstm_out = F.relu(lstm_out)
        
        
        conv_out = self.conv(lstm_out)
        
        conv_out = conv_out.squeeze(3)
        
        pooled = F.max_pool1d(conv_out, conv_out.shape[2])
        
        pooled = pooled.squeeze(2)
        
        # (batch size, n_filters)
        dropped = self.dropout(pooled)
        
        preds = self.linear(dropped)

        return preds

In [0]:
def eval_lstm2(model, dataloader, batch_size):
    
    predictions = np.array(0)
    targets = np.array(0)
    
    for batch_idx, (embedding, target) in enumerate(dataloader):
        # With LSTM, we will have troubles if the batch size changes 
        # (for example on the last batch. We discard it)
        if embedding.shape[0] != batch_size:
            continue
        
        embedding = embedding.to(device)
        target = target.to(device)
        
        # Here we have to take the softmax of the predictions first
        prediction = torch.softmax(model(embedding).detach().cpu(), dim=1)
        prediction = prediction.cpu().numpy()
        prediction = np.argmax(prediction, axis=1)
        
        predictions = np.append(predictions, prediction.astype(int))
        targets = np.append(targets, target.cpu().numpy().astype(int))
    
    return metrics(predictions, targets)

In [132]:
batch_size = 16
epochs=15

EMBEDDING_DIM = 100
HIDDEN_DIM = 10
OUTPUT_DIM = 3
DROPOUT = 0.5
SEQ_LEN = 105
CHANNELS = 16
WINDOW_SIZE = 1

model_BiLSTMConv = BiLSTMConv(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, 
                   SEQ_LEN, CHANNELS, WINDOW_SIZE, batch_size).to(device)

optimizer = optim.Adam(model_BiLSTMConv.parameters())
loss_fn = nn.CrossEntropyLoss()

train_loader, valid_loader, test_loader = data_loader(emb_corpus_c, label_c, batch_size, 1)

for epoch in range(1, epochs + 1):
    loss_history = []
    acc_history = []
    for batch_idx, (embedding, target) in enumerate(train_loader):
        
        model_BiLSTMConv.train()

        # we zero the gradients as they are not removed automatically
        optimizer.zero_grad()
        
        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model_BiLSTMConv.hidden = model_BiLSTMConv.init_hidden()

        # Send input data to GPU
        embedding = embedding.to(device)
        
        if target.shape[0] != batch_size:
            pad = torch.zeros(batch_size - target.shape[0])
            target = torch.cat((target, pad), dim=0)
            
        # squeeze is needed as the predictions are initially size (batch size, 1) and we need to remove the dimension of size 1
        # Have to transpose batch and sequence dimensions for nn.LSTM
        predictions = model_BiLSTMConv(embedding).squeeze(1)
        loss = nn.CrossEntropyLoss()(predictions, target.long().to(device))
        loss_history.append(float(loss))
        
        predictions = torch.softmax(predictions.detach(), dim=1)
        predictions = np.argmax(predictions.cpu().numpy(), axis=1)
        target = target.detach()
        acc_history.append(accuracy(predictions.astype(int), 
                                    target.cpu().numpy().astype(int)))     

        # calculate the gradient of each parameter
        loss.backward()

        # update the parameters using the gradients and optimizer algorithm
        optimizer.step()

    epoch_loss = np.array(loss_history).mean()
    epoch_acc = np.array(acc_history).mean()
    
    
    val_acc, cm, recall, precision, f1 = eval_lstm2(model_BiLSTMConv, valid_loader, batch_size)
    print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.3f} | Train Acc: {epoch_acc:.2f}%' )
    print(f'---> Valid accuracy : {val_acc:.2f}%')

  "num_layers={}".format(dropout, num_layers))
  self.weights = torch.tensor(weights, dtype=torch.double)


| Epoch: 01 | Train Loss: 1.095 | Train Acc: 38.79%
---> Valid accuracy : 57.14%
| Epoch: 02 | Train Loss: 1.047 | Train Acc: 48.94%
---> Valid accuracy : 63.64%
| Epoch: 03 | Train Loss: 0.986 | Train Acc: 54.67%
---> Valid accuracy : 68.31%
| Epoch: 04 | Train Loss: 0.968 | Train Acc: 56.19%
---> Valid accuracy : 69.87%
| Epoch: 05 | Train Loss: 0.962 | Train Acc: 56.38%
---> Valid accuracy : 61.56%
| Epoch: 06 | Train Loss: 0.964 | Train Acc: 56.06%
---> Valid accuracy : 64.68%
| Epoch: 07 | Train Loss: 0.936 | Train Acc: 61.15%
---> Valid accuracy : 66.23%
| Epoch: 08 | Train Loss: 0.934 | Train Acc: 60.08%
---> Valid accuracy : 67.27%
| Epoch: 09 | Train Loss: 0.907 | Train Acc: 63.56%
---> Valid accuracy : 71.17%
| Epoch: 10 | Train Loss: 0.899 | Train Acc: 64.59%
---> Valid accuracy : 70.65%
| Epoch: 11 | Train Loss: 0.902 | Train Acc: 65.30%
---> Valid accuracy : 67.01%
| Epoch: 12 | Train Loss: 0.881 | Train Acc: 67.53%
---> Valid accuracy : 67.79%
| Epoch: 13 | Train Loss: 0.

In [133]:
acc_test, cm_test, recall_test, precision_test, f1_test = eval_lstm2(model_BiLSTMConv, test_loader, batch_size)
print("Accuracy on test dataset : %.2f" % acc_test, "%")
print("F1-measure on test dataset : ", f1_test)

Accuracy on test dataset : 70.39 %
F1-measure on test dataset :  [0.22222222 0.82452431 0.62037037]


# Test task C

In [0]:
#  <-------------- Dataset management -------------> #
# Get the vocabulary back
with open("test_vocabulary_c.txt", "r") as file:
    vocabulary_c = file.read().splitlines()

# Get the cleaned tokenized corpus back
with open("test_corpus_tweets_c.txt", "r") as file:
    tmp_c = file.read().splitlines()

# In each sentence, we get rid of the last token, which is '\n'
clean_corpus_c = [sentence[:-1] for sentence in tmp_c]

tokenized_corpus_c = [[token for token in sentence.split(' ')][:-1] for sentence in tmp_c]

#  <-------------- END Dataset management -------------> #

In [0]:
embedding_path = 'emb_dic_c.txt'
emb_dict_c = {}
glove = open(embedding_path)
for line in glove:
    values = line.split()
    word = values[0]
    try:
        vector = np.asarray(values[1:], dtype='float32')
        if len(vector) != 100:
          print(word, len(vector))
        emb_dict_c[word] = vector
    except:
        print("Parsing problem on word ", word, " discarding it")
glove.close()

In [148]:
emb_corpus_c = embed_corpus_2(emb_dict_c, tokenized_corpus_c)

Percentage of not recognised words (those we do not have an embedding for) : 6.04 %


In [149]:
print(emb_corpus_c.shape)

# We need to make it a multiple of the batchsize:

pad = torch.zeros((11, 58, 100))
emb_corpus_c = torch.cat((emb_corpus_c, pad), dim=0)
print(emb_corpus_c.shape)
# It is already a mutliple of batchsize 16

torch.Size([213, 58, 100])
torch.Size([224, 58, 100])


In [0]:
# Make batches of 16 : 

input_c = emb_corpus_c.view(-1, 16, 58, 100)

# Preparing a container for the results
prediction_c = np.zeros((int(224/16),16))

for batch in range(14):
    output = model_BiLSTMConv(torch.Tensor(input_c[batch]).to(device))
    output = torch.softmax(output, dim=1)
    output = np.argmax(output.cpu().detach().numpy(), axis =1)
    prediction_c[batch] = output

prediction_c = torch.Tensor(prediction_c)

In [157]:
# reshape prediction_a and dicard the last elements corresponding to the padding
prediction_c = prediction_c.view(-1)[:-11]
print(prediction_c.shape)

# Getting back the np.array, and round the result to have 0 or 1
prediction_c = prediction_c.numpy()
prediction_c = prediction_c.astype(int)
print(prediction_c)

torch.Size([213])
[2 2 1 1 1 2 1 1 1 1 2 2 2 2 0 1 2 0 2 1 0 2 1 1 1 1 2 0 2 1 2 1 1 1 2 2 1
 1 1 2 1 1 2 2 2 1 2 0 2 0 1 2 1 2 2 1 2 1 1 2 2 1 2 1 1 2 1 1 2 1 1 2 2 2
 1 1 1 1 2 2 0 0 1 1 2 1 2 1 1 1 1 1 1 2 2 2 2 1 2 2 1 1 2 1 1 1 2 1 0 2 1
 0 1 2 1 1 1 1 1 2 1 0 2 2 1 1 2 1 2 2 1 1 2 2 2 2 1 1 2 0 2 1 1 2 2 1 1 2
 0 1 0 1 1 1 0 1 2 2 1 2 2 1 2 1 1 2 1 2 2 1 1 1 1 1 1 1 0 1 1 2 1 1 1 2 1
 1 0 2 2 2 1 2 2 1 2 0 1 2 1 2 1 1 2 1 1 1 2 1 1 2 1 1 1]


In [0]:
prediction_c_letters = ["IND" if element == 1 else "GRP" if element == 2 
                        else "OTH" for element in prediction_c]

In [0]:
dataframe_c = pd.read_csv('test_set_taskc.tsv', sep="\t", header=0)
id_c = dataframe_c["id"]

In [0]:
pred_dataframe = pd.DataFrame(prediction_c_letters, index=id_c)
pred_dataframe.to_csv(path_or_buf ="pred_c.csv", header=False)