<a href="https://colab.research.google.com/github/Simone999/nlp_assignment1/blob/main/assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
from typing import List, Callable, Dict, Iterable

import pandas as pd
import numpy as np
import tensorflow as tf
import keras as ks

from tqdm import tqdm

In [None]:
# Colab only: mount Google Drive and copy the treebank folder next to the
# notebook, so that later cells can read it from the working directory.
from google.colab import drive
drive.mount('/content/drive')
!cp -r "/content/drive/My Drive/dependency_treebank" "dependency_treebank"

Mounted at /content/drive


Structuring the dataset as a dataframe: one row per sentence, with its words, its tags and the split it belongs to.

In [None]:
# Folder holding the treebank files, expected under the current working directory.
dataset_name = "dependency_treebank"
dataset_path = os.path.join(os.getcwd(), dataset_name)
# Number of the last document (inclusive) of each split; create_dataset maps a
# document number to the file name wsj_XXXX.dp.
end_train = 100
end_validation = 150
end_test = 199

def create_dataset(start, end, split:str):
  """
  Read the documents numbered ``start``..``end`` and build a dataframe with
  one row per sentence.

  Every file ``wsj_XXXX.dp`` under ``dataset_path`` is expected to contain
  sentences separated by a blank line, one token per line, with tab-separated
  fields: the word first and its tag second (further fields are ignored).

  :param start: number of the first document to read
  :param end: number of the last document to read (inclusive)
  :param split: name written in the 'split' column of every row

  :return
      - dataframe with columns 'sentence' (list of words), 'labels' (list of
        tags) and 'split'
  """
  tagged_sentences = []
  for data_file in range(start, end+1):
    filename = os.path.join(dataset_path, "wsj_%04d.dp" % data_file)
    with open(filename, mode='r', encoding='utf-8') as text_file:
      corpus = text_file.read()

    # A trailing blank line or three newlines in a row would otherwise produce
    # an empty "sentence" (or one starting with an empty line), which has no
    # tag field and crashes the parsing below.
    for chunk in corpus.split("\n\n"):
      chunk = chunk.strip("\n")
      if chunk:
        tagged_sentences.append(chunk)

  X = [] # store input sequence
  Y = [] # store output sequence
  for sentence in tqdm(tagged_sentences):
      X_sentence = []
      Y_sentence = []

      for tagged_word in sentence.split("\n"):
          entity = tagged_word.split("\t")
          X_sentence.append(entity[0])  # entity[0] contains the word
          Y_sentence.append(entity[1])  # entity[1] contains corresponding tag
      X.append(X_sentence)
      Y.append(Y_sentence)

  assert len(tagged_sentences) == len(X)

  df = pd.DataFrame({'sentence':X, 'labels':Y})
  df['split'] = split
  return df

# Build the three splits from disjoint, consecutive document ranges.
train_set = create_dataset(1, end_train, 'train')
val_set = create_dataset(end_train+1, end_validation, 'validation')
# create_dataset treats both bounds as inclusive, so the test split has to
# start one document after the last validation document; starting at
# end_validation would put that document in both validation and test.
test_set = create_dataset(end_validation+1, end_test, 'test')
# Each split keeps its own 0-based row index in the concatenated frame.
dataset = pd.concat([train_set, val_set, test_set])

dataset

100%|██████████| 1963/1963 [00:00<00:00, 66811.26it/s]
100%|██████████| 1299/1299 [00:00<00:00, 57459.85it/s]
100%|██████████| 661/661 [00:00<00:00, 40034.58it/s]


Unnamed: 0,sentence,labels,split
0,"[Pierre, Vinken, ,, 61, years, old, ,, will, j...","[NNP, NNP, ,, CD, NNS, JJ, ,, MD, VB, DT, NN, ...",train
1,"[Mr., Vinken, is, chairman, of, Elsevier, N.V....","[NNP, NNP, VBZ, NN, IN, NNP, NNP, ,, DT, NNP, ...",train
2,"[Rudolph, Agnew, ,, 55, years, old, and, forme...","[NNP, NNP, ,, CD, NNS, JJ, CC, JJ, NN, IN, NNP...",train
3,"[A, form, of, asbestos, once, used, to, make, ...","[DT, NN, IN, NN, RB, VBN, TO, VB, NNP, NN, NNS...",train
4,"[The, asbestos, fiber, ,, crocidolite, ,, is, ...","[DT, NN, NN, ,, NN, ,, VBZ, RB, JJ, IN, PRP, V...",train
...,...,...,...
656,"[They, also, said, that, more, than, a, dozen,...","[PRP, RB, VBD, IN, JJR, IN, DT, NN, NNS, VBP, ...",test
657,"[Sen., Kennedy, said, in, a, separate, stateme...","[NNP, NNP, VBD, IN, DT, JJ, NN, IN, PRP, VBZ, ...",test
658,"[Trinity, Industries, Inc., said, it, reached,...","[NNP, NNPS, NNP, VBD, PRP, VBD, DT, JJ, NN, TO...",test
659,"[Terms, were, n't, disclosed, .]","[NNS, VBD, RB, VBN, .]",test


In [None]:
# Corpus statistics over all splits; words and tags are lowercased before
# counting, so entries differing only by case are counted once.
num_words = len({token.lower() for tokens in dataset['sentence'] for token in tokens})
num_tags   = len({tag.lower() for tags in dataset['labels'] for tag in tags})

total_sentences = len(dataset)
print("Total number of tagged sentences: {}".format(total_sentences))
print("Vocabulary size: {}".format(num_words))
print("Total number of tags: {}".format(num_tags))

Total number of tagged sentences: 3923
Vocabulary size: 10947
Total number of tags: 45


In [None]:
# Plot label distributions

# from matplotlib import pyplot as plt

# def flatten(arr):
#   return [item for sublist in arr for item in sublist]

# def plot_labels_distribution(dataset, title: str):
#     train_data = flatten((dataset.loc[dataset['split'] == "train"])['labels'])
#     val_data = flatten((dataset.loc[dataset['split'] == "validation"])['labels'])
#     test_data = flatten((dataset.loc[dataset['split'] == "test"])['labels'])
                    
#     classes = flatten(dataset['labels'])
#     bins = np.linspace(0, len(classes), len(classes) + 1, dtype='int32')
#     plt.title(title)
#     plt.hist([train_data, val_data, test_data], bins=bins, label=['train', 'val', 'test'])
    
#     plt.legend(loc='upper right')    
    
#     x_ticks_names = classes
#     x_ticks_pos = [(i + 0.5) for i in np.arange(len(x_ticks_names))]
    
#     plt.xticks(x_ticks_pos, x_ticks_names, rotation=90)
#     plt.tight_layout()
#     plt.show()

# plot_labels_distribution(dataset, 'Tags distribution');

# GloVe embeddings

In [None]:
import gensim
import gensim.downloader as gloader

def load_embedding_model(model_type: str='glove', embedding_dimension: int = 50) -> gensim.models.keyedvectors.KeyedVectors:
    """
    Fetch a pre-trained word embedding model through the gensim downloader.

    :param model_type: one of 'word2vec', 'glove' or 'fasttext'
        (surrounding whitespace and letter case are ignored)
    :param embedding_dimension: size of the embedding space; it only affects
        the name of the GloVe model, the other two names are fixed

    :return
        - the object returned by the gensim downloader for the selected model

    :raises AttributeError: if model_type is not one of the supported names
    :raises ValueError: re-raised from the downloader after printing a hint
        about the valid embedding dimensions
    """
    requested_type = model_type.strip().lower()

    # Supported model types mapped to the names passed to the downloader.
    known_models = {
        'word2vec': "word2vec-google-news-300",
        'glove': "glove-wiki-gigaword-{}".format(embedding_dimension),
        'fasttext': "fasttext-wiki-news-subwords-300",
    }

    if requested_type not in known_models:
        raise AttributeError("Unsupported embedding model type! Available ones: word2vec, glove, fasttext")

    try:
        return gloader.load(known_models[requested_type])
    except ValueError:
        print("Invalid embedding model name! Check the embedding dimension:")
        print("Word2Vec: 300")
        print("Glove: 50, 100, 200, 300")
        print('FastText: 300')
        raise

# Size of the GloVe vectors to load; the same value is used later to size the
# embedding matrix.
embedding_dimension=50
embedding_model = load_embedding_model(model_type="glove", embedding_dimension=embedding_dimension)



In [None]:
def get_words(sentences: Iterable[Iterable[str]]) -> set:
  """
  Collect the distinct tokens of a collection of tokenized sentences.

  :param sentences: iterable of sentences, each one an iterable of tokens
      (a plain string given as a sentence would contribute its characters)

  :return
      - set of the unique tokens
  """
  return {token for tokens in sentences for token in tokens}

class Tokenizer:
  """
  Two-way mapping between tokens and integer indices.

  Indices start at 1 and grow by one for every new token, in order of first
  appearance; index 0 is never assigned to a token.
  """
  def __init__(self, vocabulary=None):
    """
    Transform a set of sentences in a set of indices.
    The order of enumeration respects the order in which tokens first appear
    in the sentences passed to update_vocabulary.

    :param vocabulary: optional iterable of tokenized sentences whose tokens
        are added to the vocabulary right away
    """
    self.word_to_index = {}
    self.idx_to_word = {}
    self.__idx = 1

    if vocabulary:
      self.update_vocabulary(vocabulary)

  def get_oov_terms(self, words):
    """
    Return the given words that are not part of the tokenizer vocabulary yet.

    :param words: iterable of tokens to check

    :return
        - list of unknown tokens, without duplicates, in order of first
          appearance in ``words``
    """
    # dict.fromkeys removes duplicates while keeping the iteration order.
    return [word for word in dict.fromkeys(words) if word not in self.word_to_index]

  def update_vocabulary(self, sentences, verbose=True):
    """
    Update the vocabulary by looking at the list of sentences in input

    :param sentences: iterable of tokenized sentences (iterables of tokens)
    :param verbose: when True, print how many new terms were found and the
        vocabulary size before and after the update

    :return
        - list of the tokens that were added, in the order they were indexed
    """
    # Unique tokens in first-occurrence order. Going through a set here would
    # make the assigned indices depend on string hashing, which changes from
    # one interpreter session to the next.
    words = list(dict.fromkeys(token for tokens in sentences for token in tokens))
    oov_terms = self.get_oov_terms(words)

    old_len = len(self.word_to_index)
    self.__expand_vocabulary(oov_terms)

    if verbose:
      # No tokens at all: report 0% instead of dividing by zero.
      oov_percentage = float(len(oov_terms)) * 100 / len(words) if words else 0.0
      print(f"Total OOV terms: {len(oov_terms)} ({oov_percentage:.2f}%)")

      print("Vocabulary length before expansion:", old_len)
      print("Vocabulary length after expansion:", len(self.word_to_index))

    return oov_terms

  def __expand_vocabulary(self, oov_terms: Iterable[str]):
    """
    Assign the next free indices to the given terms, in iteration order.
    """
    for term in oov_terms:
      self.word_to_index[term] = self.__idx
      self.idx_to_word[self.__idx] = term
      self.__idx += 1

  def texts_to_sequences(self, sentences):
    """
    Transform a list of sentences in a list of sequences, according to the current vocabulary

    :raises KeyError: if a token is not in the vocabulary
    """
    return list(map(self.__sentence_to_sequence, sentences))

  def __sentence_to_sequence(self, sentence):
    """
    Map every token of one sentence to its index.
    """
    return list(map(lambda token: self.word_to_index[token], sentence))

  def sequences_to_texts(self, sequences):
    """
    Transform a list of sequences in a list of sentences, according to the current vocabulary

    :raises KeyError: if an index is not in the vocabulary
    """
    return list(map(self.__sequence_to_text, sequences))
  
  def __sequence_to_text(self, sequence):
    """
    Map every index of one sequence back to its token.
    """
    return list(map(lambda idx: self.idx_to_word[idx], sequence))

class EmbeddingMatrix():
  """
  Embedding matrix aligned with a word -> index vocabulary.

  Row ``i`` holds the vector of the word whose index is ``i``; rows that no
  word maps to are left at zero.
  """
  def __init__(self,
               embedding_model: gensim.models.keyedvectors.KeyedVectors,
               embedding_dimension,
               word_to_idx: Dict[str, int],
               oov_vector_factory = None) -> None:
    """
    Builds the embedding matrix of a specific dataset given a pre-trained word embedding model

    :param embedding_model: pre-trained word embedding model (gensim wrapper)
    :param embedding_dimension: dimension of the vectors in the embedding space
    :param word_to_idx: vocabulary map (word -> index) (dict)
    :param oov_vector_factory: optional zero-argument callable returning the
        vector of a word that the embedding model does not contain; by default
        a vector is sampled uniformly from [-0.05, 0.05)
    """
    self.oov_vector_factory = oov_vector_factory if oov_vector_factory else lambda: np.random.uniform(low=-0.05, high=0.05, size=embedding_dimension)

    vocab_size = len(word_to_idx)
    # One row more than the vocabulary size, so that index vocab_size is valid.
    self.embedding_matrix = np.zeros((vocab_size + 1, embedding_dimension), dtype=np.float32)

    for word, idx in tqdm(word_to_idx.items()):
      if word in embedding_model:
        embedding_vector = embedding_model[word]
      else:
        embedding_vector = self.oov_vector_factory()

      self.embedding_matrix[idx] = embedding_vector

  def update_matrix(self, word_to_idx: Dict[str, int]):
    """
    Grow the matrix so that it covers an expanded vocabulary.

    Existing rows are kept as they are; every word whose index lies beyond the
    previous last row receives a vector from ``oov_vector_factory``. A
    vocabulary that is not larger than the current matrix leaves its size
    unchanged.

    :param word_to_idx: expanded vocabulary map (word -> index) (dict)
    """
    old_rows, embedding_dimension = self.embedding_matrix.shape
    old_vocab_size = old_rows - 1
    missing_rows = (len(word_to_idx) + 1) - old_rows

    if missing_rows > 0:
      # Build a new array instead of resizing in place: ndarray.resize refuses
      # to work as soon as the array handed out by as_numpy() is referenced
      # anywhere else.
      new_rows = np.zeros((missing_rows, embedding_dimension), dtype=self.embedding_matrix.dtype)
      self.embedding_matrix = np.concatenate([self.embedding_matrix, new_rows], axis=0)

    for word, idx in tqdm(word_to_idx.items()):
      if idx > old_vocab_size:
        self.embedding_matrix[idx] = self.oov_vector_factory()

  def as_numpy(self):
    """
    Return the current embedding matrix as a numpy array.

    The array is not copied; update_matrix may replace it, so call this again
    after an update to get the expanded matrix.
    """
    return self.embedding_matrix

In [None]:
def add_oov(dataset, dataset_name: str):
  """
  Register the tokens of one split in the notebook-level ``tokenizer`` and
  extend the notebook-level ``embedding_matrix`` to cover them.

  Tokens are passed to the tokenizer exactly as stored in the split; no case
  normalization happens here.

  :param dataset: dataframe with a 'sentence' column of token lists
  :param dataset_name: name of the split, only used in the printed message
  """
  print()
  print(f'Adding {dataset_name} vocabulary ...')

  split_sentences = dataset.sentence
  tokenizer.update_vocabulary(split_sentences)

  expanded_vocabulary = tokenizer.word_to_index
  embedding_matrix.update_matrix(expanded_vocabulary)

# gensim 4 removed KeyedVectors.vocab in favour of key_to_index; use the new
# attribute when it exists and fall back to the old one otherwise.
if hasattr(embedding_model, 'key_to_index'):
  pretrained_words = embedding_model.key_to_index.keys()
else:
  pretrained_words = embedding_model.vocab.keys()

# The pre-trained vocabulary is passed as a single "sentence" so that its
# words get the first indices; the dataset words are appended afterwards.
tokenizer = Tokenizer([pretrained_words])
embedding_matrix = EmbeddingMatrix(embedding_model, embedding_dimension, tokenizer.word_to_index)

add_oov(train_set, 'training set')
add_oov(val_set, 'validation set')
add_oov(test_set, 'test set')

print('\n')
print('Shape of the embedding matrix', embedding_matrix.as_numpy().shape)

Total OOV terms: 400000 (100.00%)
Vocabulary length before expansion: 0
Vocabulary length after expansion: 400000


100%|██████████| 400000/400000 [00:01<00:00, 290261.90it/s]



Adding training set vocabulary ...
Total OOV terms: 2346 (29.29%)
Vocabulary length before expansion: 400000
Vocabulary length after expansion: 402346


100%|██████████| 402346/402346 [00:00<00:00, 2034650.42it/s]



Adding validation set vocabulary ...
Total OOV terms: 944 (16.02%)
Vocabulary length before expansion: 402346
Vocabulary length after expansion: 403290


100%|██████████| 403290/403290 [00:00<00:00, 2112721.46it/s]



Adding test set vocabulary ...
Total OOV terms: 455 (12.49%)
Vocabulary length before expansion: 403290
Vocabulary length after expansion: 403745


100%|██████████| 403745/403745 [00:00<00:00, 2023759.42it/s]



Shape of the embedding matrix (403746, 50)





In [None]:
# Sanity check: expected to be (vocabulary size + 1, embedding dimension).
embedding_matrix.as_numpy().shape

(403746, 50)

In [None]:
def create_dataloader(dataset: pd.DataFrame, max_seq_length):
  """
  Turn a split into a tf.data.Dataset of (word ids, tag ids) pairs.

  Sentences are encoded with the notebook-level ``tokenizer`` and tags with
  the notebook-level ``label_tokenizer``; both are then brought to
  ``max_seq_length`` (padding at the beginning, truncation at the end).

  :param dataset: dataframe with 'sentence' and 'labels' columns
  :param max_seq_length: length of every encoded sequence

  :return
      - unbatched tf.data.Dataset with one (sentence, labels) pair per row
  """
  def encode(column_tokenizer, column):
    # Tokens -> ids, then one fixed-length row per sentence.
    ids = column_tokenizer.texts_to_sequences(column)
    return ks.utils.pad_sequences(ids, maxlen=max_seq_length, padding="pre", truncating="post")

  sentence_ids = encode(tokenizer, dataset.sentence)
  label_ids = encode(label_tokenizer, dataset.labels)

  # Caching and prefetching are intentionally left out, as in the original cell.
  return tf.data.Dataset.from_tensor_slices((sentence_ids, label_ids))

# Length every encoded sentence is padded or truncated to.
max_seq_length = 1000

# Separate tokenizer for the tags, fitted on the labels of all the splits.
label_tokenizer = Tokenizer()
label_tokenizer.update_vocabulary(dataset.labels, verbose=False)

# Only the training and validation pipelines are built here.
train_ds = create_dataloader(train_set, max_seq_length)
val_ds = create_dataloader(val_set, max_seq_length)

In [None]:
# text, labels = next(train_ds.take(1).as_numpy_iterator())
# print(text)
# print(labels)
# print(len(train_set.sentence[0]))

# Network baseline

In [None]:
def create_model(embedding_weights, num_classes=None):
  """
  Build the baseline tagger: embedding -> bidirectional LSTM -> per-token softmax.

  :param embedding_weights: 2-D array (vocabulary rows, embedding dimension)
      used both to size and to initialise the embedding layer
  :param num_classes: number of units of the output layer; when omitted it is
      derived from the notebook-level ``label_tokenizer``

  :return
      - uncompiled Keras model mapping a batch of id sequences to one
        distribution over the classes per token
  """
  if num_classes is None:
    # Tag ids start at 1 and the padded positions of the targets carry id 0,
    # so the output needs one unit more than the number of tags.
    num_classes = len(label_tokenizer.word_to_index) + 1

  inputs = ks.layers.Input(shape=(None, ))
  x = ks.layers.Embedding(*embedding_weights.shape, weights=[embedding_weights])(inputs)
  rnn = ks.layers.LSTM(units=64, return_sequences=True)
  x = ks.layers.Bidirectional(rnn)(x)
  x = ks.layers.TimeDistributed(ks.layers.Dense(num_classes, activation='softmax'))(x)

  return ks.models.Model(inputs, x)


# Instantiate the baseline on top of the embedding matrix and list its layers.
embedding_weights = embedding_matrix.as_numpy()
model = create_model(embedding_weights)
model.summary()

# #Create Architecture
# lstm_model = Sequential()
# # vocabulary size — number of unique words in data
# # length of vector with which each word is represented
# lstm_model.add(Embedding(input_dim = len(vocabulary), 
# output_dim = 50, 
# # length of input sequence
# input_length = 100, 
# # word embedding matrix
# weights = [embedding_matrix],
# # True — update embeddings_weight matrix
# trainable = True 
# ))
# # add an LSTM layer which contains 64 LSTM cells
# # True — return whole sequence; False — return single output of the end of the sequence
# lstm_model.add(LSTM(64, return_sequences=True))
# lstm_model.add(TimeDistributed(Dense(num_tags, activation='softmax')))
# #compile model
# lstm_model.compile(loss      =  'categorical_crossentropy',
#                   optimizer =  'adam',
#                   metrics   =  ['acc'])
# # check summary of the model
# lstm_model.summary()


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 50)          20187300  
                                                                 
 bidirectional (Bidirectiona  (None, None, 128)        58880     
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, None, 45)         5805      
 ibuted)                                                         
                                                                 
Total params: 20,251,985
Trainable params: 20,251,985
Non-trainable params: 0
_________________________________________________________________


In [None]:
# The targets built by create_dataloader are integer tag ids, not one-hot
# vectors, so the sparse variant of the cross-entropy is the matching loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])

In [None]:
# Peek at the integer encoding of the first ten training sentences.
sequences = tokenizer.texts_to_sequences(train_set.sentence.values)
sequences[:10]

[[401341,
  400191,
  258712,
  238658,
  55202,
  72706,
  258712,
  381093,
  350838,
  66392,
  184243,
  225554,
  101864,
  134325,
  385117,
  402163,
  184888,
  112474],
 [400974,
  400191,
  324081,
  133587,
  188675,
  400284,
  400405,
  258712,
  66392,
  401196,
  264039,
  141565,
  112474],
 [401986,
  402053,
  258712,
  61636,
  55202,
  72706,
  278139,
  26824,
  133587,
  188675,
  401215,
  401359,
  400982,
  401037,
  258712,
  381361,
  204458,
  101864,
  134325,
  385117,
  188675,
  236465,
  402314,
  216114,
  20603,
  112474],
 [401540,
  247717,
  188675,
  132277,
  70566,
  67760,
  355336,
  1612,
  400849,
  91431,
  294753,
  51551,
  360728,
  101864,
  369443,
  223173,
  188675,
  60030,
  236271,
  35090,
  101864,
  141565,
  188675,
  288564,
  333655,
  355336,
  210782,
  280513,
  109699,
  222122,
  55202,
  48553,
  258712,
  241009,
  87185,
  112474],
 [401620,
  132277,
  115382,
  258712,
  400736,
  258712,
  324081,
  369554,
  1927

In [None]:
# Train the model defined above on the tf.data pipelines. No live cell defines
# `lstm_model` or `y_train`, and the embedding matrix is the initial weight of
# the embedding layer, not the training input.
# Batching is done on the datasets because fit() does not take batch_size for
# tf.data inputs. The targets are integer tag ids, so `model` must be compiled
# with a sparse categorical loss.
lstm_training = model.fit(train_ds.batch(128), validation_data=val_ds.batch(128), epochs=10)

ValueError: ignored

In [None]:
# Parse a GloVe text file into a {word: vector} dictionary.
# NOTE(review): the file is expected at Glove/glove.6B.50d.txt under the
# working directory, but no visible cell downloads or copies it - confirm
# where it comes from.
glove_file = os.path.join(os.getcwd(),"Glove", "glove.6B.50d.txt")

print ("Loading Glove Model")
with open(glove_file, encoding="utf8" ) as f:
    lines = f.readlines()
vocabulary = {}
for line in lines:
    # First whitespace-separated field is the word, the rest are the vector components.
    splits = line.split()
    vocabulary[splits[0]] = np.array([float(val) for val in splits[1:]])
print ("Done.",len(vocabulary.keys())," words loaded!")

Loading Glove Model
Done. 400000  words loaded!


In [None]:
def findembedding(word):
    """
    Look up the vector of a word in the notebook-level ``vocabulary`` dict.

    :param word: token to look up, matched exactly as given

    :return
        - the stored vector when the word is known, otherwise a list of 50 zeros
    """
    if word not in vocabulary:
        return [0]*50
    return vocabulary[word]

def glovesent(sentence):
    """
    Represent a sentence as the average of the embeddings of its tokens.

    :param sentence: list of tokens, or a plain string that is split on whitespace

    :return
        - 1-D array with the element-wise mean of the vectors returned by
          findembedding for every token
    """
    # The sentences of this notebook are already lists of tokens, and the
    # notebook's Tokenizer class offers no tokenize() method, so the tokens are
    # used directly; only raw strings still need to be split.
    tokens = sentence.split() if isinstance(sentence, str) else list(sentence)
    matrix = np.array([findembedding(word) for word in tokens])
    return np.average(matrix, axis=0)


# Iterating over a DataFrame yields its column names, so the 'sentence' column
# is selected explicitly to get one averaged vector per sentence.
glove_X_train = np.array([glovesent(sentence) for sentence in train_set['sentence']])
glove_X_test = np.array([glovesent(sentence) for sentence in test_set['sentence']])

print(glove_X_train.shape)
