#### Download or import nlp models

In [117]:
import gensim.downloader
from gensim.models.word2vec import KeyedVectors

# Download word2vec and save it in pretrained_models
# word2vec_model = gensim.downloader.load('word2vec-google-news-300')
# word2vec_model.save('pretrained_models/word2vec-google-news-300')
word2vec_model = KeyedVectors.load('pretrained_models/word2vec-google-news-300')

In [118]:
import fasttext
import fasttext.util
import os

# Download fasttext and save it in pretrained_models
# fasttext.util.download_model('en', if_exists='ignore')
# os.rename('cc.en.300.bin', 'pretrained_models/cc.en.300.bin')
# os.rename('cc.en.300.bin.gz', 'pretrained_models/cc.en.300.bin.gz')
fasttext_model = fasttext.load_model('pretrained_models/cc.en.300.bin')



#### Some useful functions

In [119]:
def get_model_name(model):
    if type(model) == gensim.models.keyedvectors.KeyedVectors:
        return 'Word2Vec'
    elif type(model) == fasttext.FastText._FastText:
        return 'FastText'

In [120]:
embedding_size = 300

In [121]:
import numpy as np

In [122]:
def get_embedding(model, word):
    """Get the embedding of a word. It works with models not having words with uppercase letters."""
    try:
        return model[word]
    except KeyError:
        # print(f"'{word}' has been ignored!")
        return np.zeros((embedding_size,))
    except:
        return model[word.lower()]

In [123]:
def maxpooling_embedding(model, sentence):
    tokenised_sentence = list(gensim.utils.tokenize(sentence))
    sentence_embedding_matrix = np.zeros((embedding_size, len(tokenised_sentence)))                                    
    for index, token in enumerate(tokenised_sentence):
        sentence_embedding_matrix[:, index] = get_embedding(model, token)
    return np.max(sentence_embedding_matrix, axis=1)

#### Load and format datasets

In [124]:
def load_data_and_target(path, sep=' '):
    data = []
    target = []
    
    with open(path, encoding='utf-8') as f:
        for line in f:
            x, y = sep.join(line.split(sep)[1:]).rstrip('\n'), line.split(sep)[0]
            data.append(x)
            target.append(y)

    return data, target

In [125]:
from torch.utils.data import Dataset, DataLoader
from torch import FloatTensor, LongTensor
from typing import List

class QuestionDataset(Dataset):
    def __init__(self, dataset:List[str], target:np.array, sentence_aggregation_function, nlp_model):
        self.dataset = dataset
        self.doc_embeddings = [None for _ in range(len(dataset))]
        self.sentence_aggregation_function = sentence_aggregation_function 
        self.nlp_model = nlp_model
        self.target = target
    
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        if self.doc_embeddings[index] is None:
            self.doc_embeddings[index] = self.sentence_aggregation_function(self.nlp_model, self.dataset[index]) 
        return tuple((FloatTensor(self.doc_embeddings[index]), LongTensor([self.target[index]]).squeeze(0)))

In [126]:
X, y = load_data_and_target('./data/questions-train.txt')
X_test, y_test = load_data_and_target('./data/questions-test.txt')

valid_ratio = 0.1
valid_size = int(len(X) * valid_ratio)
train_size = len(X) - valid_size

X_train, y_train = X[:train_size], y[:train_size]
X_valid, y_valid = X[train_size:], y[train_size:]

In [127]:
categories = np.unique(y)

In [128]:
categorical_to_ordinal_values = dict()

for ord, cat in enumerate(categories):
    categorical_to_ordinal_values[cat] = ord

categorical_to_ordinal_values

{'ABBREVIATION': 0,
 'DEFINITION': 1,
 'DESCRIPTION': 2,
 'ENTITY': 3,
 'LOCATION': 4,
 'ORGANIZATION': 5,
 'PERSON': 6,
 'QUANTITY': 7,
 'TEMPORAL': 8}

In [129]:
def ordinal_encoding(dataset):
    dataset_encoded = []
    for cat in dataset:
        dataset_encoded.append(categorical_to_ordinal_values[cat])

    return dataset_encoded

In [130]:
y_train = ordinal_encoding(y_train)
y_valid = ordinal_encoding(y_valid)
y_test = ordinal_encoding(y_test)

#### MLP

In [131]:
from torch import nn

class MultiLayerPerceptron(nn.Module):
    
    def __init__(self, input_size, hidden_layer_size, output_size) :
        super().__init__()
        self.intput_layer = nn.Linear(input_size, hidden_layer_size)
        self.output_layer = nn.Linear(hidden_layer_size, output_size)
        
    def forward(self, x):
        x = self.intput_layer(x)
        x = nn.functional.relu_(x)
        x = self.output_layer(x)
        return x

In [132]:
from poutyne.framework import Experiment
from poutyne import set_seeds

set_seeds(42)
hidden_size = 100
n_classes = len(categories)

mlp = MultiLayerPerceptron(embedding_size, hidden_size, n_classes)

In [133]:
def training_loop(question, nlp_model, network, sentence_aggregation_function, batch_size=16, epochs=30):
    train_dataset = QuestionDataset(X_train, y_train, sentence_aggregation_function, nlp_model)
    valid_dataset = QuestionDataset(X_valid, y_valid, sentence_aggregation_function, nlp_model)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)

    # We save the results of training in corresponding path
    directory_name = f'./results/Q{question}/{get_model_name(nlp_model)}/{hidden_size}'

    experiment = Experiment(directory_name, 
                            network, 
                            optimizer = "Adam", 
                            task="classification")

    experiment.train(train_dataloader, valid_dataloader, epochs=epochs, disable_tensorboard=True)

    return experiment

In [134]:
def training_and_evaluation(question, nlp_model, network, sentence_aggregation_function=maxpooling_embedding, batch_size=8):
    experiment = training_loop(question, nlp_model, network, sentence_aggregation_function, batch_size=batch_size)

    test_dataset = QuestionDataset(X_test, y_test, sentence_aggregation_function, nlp_model)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
    experiment.test(test_dataloader)

In [135]:
import shutil

try:
    shutil.rmtree('results/Q1a')
    shutil.rmtree('results/Q1b')
except:
    print('Ignored!')

### 1.a FastText vs Word2Vec

#### Word2Vec

In [136]:
training_and_evaluation('1a', word2vec_model, mlp)

[35mEpoch: [36m 1/30 [35mTrain steps: [36m626 [35mVal steps: [36m70 [32m1.92s [35mloss:[94m 1.735182[35m acc:[94m 38.852230[35m fscore_macro:[94m 0.232045[35m val_loss:[94m 1.491099[35m val_acc:[94m 47.387387[35m val_fscore_macro:[94m 0.333104[0m
Epoch 1: val_acc improved from -inf to 47.38739, saving file to ./results/1a/Word2Vec/100\checkpoint_epoch_1.ckpt
[35mEpoch: [36m 2/30 [35mTrain steps: [36m626 [35mVal steps: [36m70 [32m1.55s [35mloss:[94m 1.252120[35m acc:[94m 58.368326[35m fscore_macro:[94m 0.424380[35m val_loss:[94m 1.196665[35m val_acc:[94m 57.477477[35m val_fscore_macro:[94m 0.443432[0m
Epoch 2: val_acc improved from 47.38739 to 57.47748, saving file to ./results/1a/Word2Vec/100\checkpoint_epoch_2.ckpt
[35mEpoch: [36m 3/30 [35mTrain steps: [36m626 [35mVal steps: [36m70 [32m1.50s [35mloss:[94m 1.053098[35m acc:[94m 64.367127[35m fscore_macro:[94m 0.506569[35m val_loss:[94m 1.085243[35m val_acc:[94m 61.441441[35m val

#### Fasttext

In [137]:
training_and_evaluation('1a', fasttext_model, mlp)

[35mEpoch: [36m 1/30 [35mTrain steps: [36m626 [35mVal steps: [36m70 [32m2.44s [35mloss:[94m 1.988206[35m acc:[94m 29.894021[35m fscore_macro:[94m 0.198548[35m val_loss:[94m 1.744446[35m val_acc:[94m 40.000000[35m val_fscore_macro:[94m 0.273885[0m
Epoch 1: val_acc improved from -inf to 40.00000, saving file to ./results/1a/FastText/100\checkpoint_epoch_1.ckpt
[35mEpoch: [36m 2/30 [35mTrain steps: [36m626 [35mVal steps: [36m70 [32m1.22s [35mloss:[94m 1.564321[35m acc:[94m 46.390722[35m fscore_macro:[94m 0.340762[35m val_loss:[94m 1.502372[35m val_acc:[94m 47.927928[35m val_fscore_macro:[94m 0.368353[0m
Epoch 2: val_acc improved from 40.00000 to 47.92793, saving file to ./results/1a/FastText/100\checkpoint_epoch_2.ckpt
[35mEpoch: [36m 3/30 [35mTrain steps: [36m626 [35mVal steps: [36m70 [32m1.25s [35mloss:[94m 1.389955[35m acc:[94m 51.649670[35m fscore_macro:[94m 0.416197[35m val_loss:[94m 1.389815[35m val_acc:[94m 53.513514[35m val

**Commentaire**: Les embeddings de Word2Vec donnent les meilleures valeurs de précision à l'entrainement (80% à l'entrainement et 70% à la validation) comparés à ceux de FastText (75% à l'entrainement et 60% à la validation). Cette différence peut s'expliquer par le fait que Word2Vec considère le mot en intégralité pour faire les embeddings tandis que FastText prend en compte la structure du mot et donc construit les embeddings du mots à partir de ceux de ses différents caractères. Cela pourrait induire un certian biais. Cette remarque est bien vérifiée par les valeurs de pertes. Word2Vec : 0.5 à l'entrainement et 0.9 à la validation tandis que pour FastText 0.7 à l'entrainement et 1.2 à la validation.

### Tâche 1.b: Impact de la taille de la couche cachée du réseau feedfoward 

In [138]:
hidden_sizes = [100, 150, 200, 250, 300, 350, 400]

In [139]:
for hidden_size in hidden_sizes:
    mlp = MultiLayerPerceptron(embedding_size, hidden_size, n_classes)

    training_and_evaluation('1b', word2vec_model, mlp)

[35mEpoch: [36m 1/30 [35mTrain steps: [36m626 [35mVal steps: [36m70 [32m1.90s [35mloss:[94m 1.743987[35m acc:[94m 38.292342[35m fscore_macro:[94m 0.225007[35m val_loss:[94m 1.504047[35m val_acc:[94m 47.207207[35m val_fscore_macro:[94m 0.329836[0m
Epoch 1: val_acc improved from -inf to 47.20721, saving file to ./results/1b/Word2Vec/100\checkpoint_epoch_1.ckpt
[35mEpoch: [36m 2/30 [35mTrain steps: [36m626 [35mVal steps: [36m70 [32m1.63s [35mloss:[94m 1.275326[35m acc:[94m 57.768446[35m fscore_macro:[94m 0.414409[35m val_loss:[94m 1.213579[35m val_acc:[94m 57.297297[35m val_fscore_macro:[94m 0.425125[0m
Epoch 2: val_acc improved from 47.20721 to 57.29730, saving file to ./results/1b/Word2Vec/100\checkpoint_epoch_2.ckpt
[35mEpoch: [36m 3/30 [35mTrain steps: [36m626 [35mVal steps: [36m70 [32m1.64s [35mloss:[94m 1.072776[35m acc:[94m 63.967207[35m fscore_macro:[94m 0.499438[35m val_loss:[94m 1.103064[35m val_acc:[94m 60.180180[35m val

In [140]:
for hidden_size in hidden_sizes:
    mlp = MultiLayerPerceptron(embedding_size, hidden_size, n_classes)

    training_and_evaluation('1b', fasttext_model, mlp)

[35mEpoch: [36m 1/30 [35mTrain steps: [36m626 [35mVal steps: [36m70 [32m2.71s [35mloss:[94m 1.757501[35m acc:[94m 37.012597[35m fscore_macro:[94m 0.235698[35m val_loss:[94m 1.599578[35m val_acc:[94m 43.423423[35m val_fscore_macro:[94m 0.298153[0m
Epoch 1: val_acc improved from -inf to 43.42342, saving file to ./results/1b/FastText/100\checkpoint_epoch_1.ckpt
[35mEpoch: [36m 2/30 [35mTrain steps: [36m626 [35mVal steps: [36m70 [32m1.82s [35mloss:[94m 1.430999[35m acc:[94m 50.549890[35m fscore_macro:[94m 0.359398[35m val_loss:[94m 1.381300[35m val_acc:[94m 52.792793[35m val_fscore_macro:[94m 0.390063[0m
Epoch 2: val_acc improved from 43.42342 to 52.79279, saving file to ./results/1b/FastText/100\checkpoint_epoch_2.ckpt
[35mEpoch: [36m 3/30 [35mTrain steps: [36m626 [35mVal steps: [36m70 [32m1.56s [35mloss:[94m 1.272325[35m acc:[94m 56.108778[35m fscore_macro:[94m 0.431559[35m val_loss:[94m 1.287204[35m val_acc:[94m 57.297297[35m val

**Commentaire**: La taille de la couche cachée du réseau MLP ne semble pas avoir un impact sur les performances du modèle utilisant les embeddings FastText. L'accuracy est environ 80% pour train et 60% pour validation.
Cependant la variation de la taille de la couche cachée a un impact sur les performances du MLP. En effet, l'accuracy de l'entrainement suit une tendance croissante en fonction de la taille de la couche cachée. Elle passe 80% pour une taille de 100 à 90% pour une taille de 300. Cette tendance n'est pas observée pour la validation. L'accuracy reste constante à 70% environ avec une certaine instabilité. Cette constance de l'accuracy lors de la validation et cette augmentation de celle de l'entrainement nous font penser que une variation croissante de la taille de la couche cachée induirait un surentrainement du model.
En perspective, il serait intéressant de combiner cette expérimentation en faisant varier d'autres hyperparamètres comme le learning rate pour déterminer la meilleure configuration.