<a href="https://colab.research.google.com/github/Simone999/nlp_assignment1/blob/main/assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import os

import pandas as pd
import numpy as np

from tqdm import tqdm

from typing import List, Callable, Dict, Iterable

import keras as ks

In [2]:
from google.colab import drive
drive.mount('/content/drive')
!cp -r "/content/drive/My Drive/dependency_treebank" "dependency_treebank"

Mounted at /content/drive


structuring dataframe

In [3]:
dataset_name = "dependency_treebank"
dataset_path = os.path.join(os.getcwd(), dataset_name)
end_train = 100
end_validation = 150
end_test = 199

def create_dataset(start, end, split:str):
  tagged_sentences = []
  for data_file in range(start, end+1):
    filename = os.path.join(dataset_path, "wsj_%04d.dp" % data_file)
    with open(filename, mode='r', encoding='utf-8') as text_file:  
      corpus = text_file.read()
      tagged_sentences += corpus.split("\n\n")

  X = [] # store input sequence
  Y = [] # store output sequence
  for sentence in tqdm(tagged_sentences):
      X_sentence = []
      Y_sentence = []

      for tagged_word in sentence.rstrip('\n').split("\n"):       
          entity = tagged_word.split("\t")
          X_sentence.append(entity[0])  # entity[0] contains the word
          Y_sentence.append(entity[1])  # entity[1] contains corresponding tag          
      X.append(X_sentence)
      Y.append(Y_sentence)

  assert len(tagged_sentences) == len(X)

  df = pd.DataFrame({'sentence':X, 'labels':Y})
  df['split'] = split
  return df

train_set = create_dataset(1, end_train, 'train')
val_set = create_dataset(end_train+1, end_validation, 'validation')
test_set = create_dataset(end_validation, end_test, 'test')
dataset = pd.concat([train_set, val_set, test_set])

dataset

100%|██████████| 1963/1963 [00:00<00:00, 61186.80it/s]
100%|██████████| 1299/1299 [00:00<00:00, 66392.90it/s]
100%|██████████| 661/661 [00:00<00:00, 49352.66it/s]


Unnamed: 0,sentence,labels,split
0,"[Pierre, Vinken, ,, 61, years, old, ,, will, j...","[NNP, NNP, ,, CD, NNS, JJ, ,, MD, VB, DT, NN, ...",train
1,"[Mr., Vinken, is, chairman, of, Elsevier, N.V....","[NNP, NNP, VBZ, NN, IN, NNP, NNP, ,, DT, NNP, ...",train
2,"[Rudolph, Agnew, ,, 55, years, old, and, forme...","[NNP, NNP, ,, CD, NNS, JJ, CC, JJ, NN, IN, NNP...",train
3,"[A, form, of, asbestos, once, used, to, make, ...","[DT, NN, IN, NN, RB, VBN, TO, VB, NNP, NN, NNS...",train
4,"[The, asbestos, fiber, ,, crocidolite, ,, is, ...","[DT, NN, NN, ,, NN, ,, VBZ, RB, JJ, IN, PRP, V...",train
...,...,...,...
656,"[They, also, said, that, more, than, a, dozen,...","[PRP, RB, VBD, IN, JJR, IN, DT, NN, NNS, VBP, ...",test
657,"[Sen., Kennedy, said, in, a, separate, stateme...","[NNP, NNP, VBD, IN, DT, JJ, NN, IN, PRP, VBZ, ...",test
658,"[Trinity, Industries, Inc., said, it, reached,...","[NNP, NNPS, NNP, VBD, PRP, VBD, DT, JJ, NN, TO...",test
659,"[Terms, were, n't, disclosed, .]","[NNS, VBD, RB, VBN, .]",test


In [4]:
num_words = len(set([word.lower() for sentence in dataset['sentence'] for word in sentence]))
num_tags   = len(set([word.lower() for sentence in dataset['labels'] for word in sentence]))

print("Total number of tagged sentences: {}".format(len(dataset)))
print("Vocabulary size: {}".format(num_words))
print("Total number of tags: {}".format(num_tags))

Total number of tagged sentences: 3923
Vocabulary size: 10947
Total number of tags: 45


In [None]:
# Plot label distributions

# from matplotlib import pyplot as plt

# def flatten(arr):
#   return [item for sublist in arr for item in sublist]

# def plot_labels_distribution(dataset, title: str):
#     train_data = flatten((dataset.loc[dataset['split'] == "train"])['labels'])
#     val_data = flatten((dataset.loc[dataset['split'] == "validation"])['labels'])
#     test_data = flatten((dataset.loc[dataset['split'] == "test"])['labels'])
                    
#     classes = flatten(dataset['labels'])
#     bins = np.linspace(0, len(classes), len(classes) + 1, dtype='int32')
#     plt.title(title)
#     plt.hist([train_data, val_data, test_data], bins=bins, label=['train', 'val', 'test'])
    
#     plt.legend(loc='upper right')    
    
#     x_ticks_names = classes
#     x_ticks_pos = [(i + 0.5) for i in np.arange(len(x_ticks_names))]
    
#     plt.xticks(x_ticks_pos, x_ticks_names, rotation=90)
#     plt.tight_layout()
#     plt.show()

# plot_labels_distribution(dataset, 'Tags distribution');

# Glove embeddings

In [None]:
import gensim
import gensim.downloader as gloader

def load_embedding_model(model_type: str='glove', embedding_dimension: int = 50) -> gensim.models.keyedvectors.KeyedVectors:
    """
    Loads a pre-trained word embedding model via gensim library.

    :param model_type: name of the word embedding model to load.
    :param embedding_dimension: size of the embedding space to consider

    :return
        - pre-trained word embedding model (gensim KeyedVectors object)
    """
    download_path = ""
    if model_type.strip().lower() == 'word2vec':
        download_path = "word2vec-google-news-300"

    elif model_type.strip().lower() == 'glove':
        download_path = "glove-wiki-gigaword-{}".format(embedding_dimension)
    elif model_type.strip().lower() == 'fasttext':
        download_path = "fasttext-wiki-news-subwords-300"
    else:
        raise AttributeError("Unsupported embedding model type! Available ones: word2vec, glove, fasttext")
        
    try:
        emb_model = gloader.load(download_path)
    except ValueError as e:
        print("Invalid embedding model name! Check the embedding dimension:")
        print("Word2Vec: 300")
        print("Glove: 50, 100, 200, 300")
        print('FastText: 300')
        raise e

    return emb_model

embedding_model = load_embedding_model(model_type="glove", embedding_dimension=50)



In [None]:
# Initialize the vocabulary with Glove vocabulary
vocabulary = {k: v.index for k, v in embedding_model.vocab.items()}

In [None]:
def get_words(df: pd.DataFrame):
  return set(token for tokens in df.sentence.values for token in tokens)

class Vectorizer(ks.layers.TextVectorization):
  def __init__(self) -> None:
     super().__init__()

  def expand_vocabulary(self, oov_terms: Iterable[str]):
    idx = max(vocabulary.values()) + 1
    for term in oov_terms:
      vocabulary[term] = idx
      idx += 1

    return vocabulary

def check_OOV_terms(vocabulary: Dict[str, int],
                    word_listing: Iterable[str]):
    """
    Checks differences between pre-trained embedding model vocabulary
    and dataset specific vocabulary in order to highlight out-of-vocabulary terms.

    :param embedding_model: pre-trained word embedding model (gensim wrapper)
    :param word_listing: dataset specific vocabulary (list)

    :return
        - list of OOV terms
    """
    embedding_vocabulary = set(vocabulary.keys())
    oov = set(word_listing).difference(embedding_vocabulary)
    return list(oov)

def expand_vocabulary(vocabulary: Dict[str, int], oov_terms: Iterable[str]):
  idx = max(vocabulary.values()) + 1
  for term in oov_terms:
    vocabulary[term] = idx
    idx += 1

  return vocabulary
  

word_listing = get_words(train_set)
oov_terms = check_OOV_terms(vocabulary, word_listing)
oov_percentage = float(len(oov_terms)) * 100 / len(word_listing)
print(f"Total OOV terms: {len(oov_terms)} ({oov_percentage:.2f}%)")

print("Vocabulary length before expansion:", len(vocabulary))
vocabulary = expand_vocabulary(vocabulary, oov_terms)
print("Vocabulary length after expansion:", len(vocabulary))

# expand embedding matrix with new (maybe random) vectors, iterating through oov terms

# from collections import OrderedDict

# def build_vocabulary(df: pd.DataFrame) -> (Dict[int, str],
#                                            Dict[str, int],
#                                            List[str]):
#     """
#     Given a dataset, builds the corresponding word vocabulary.

#     :param df: dataset from which we want to build the word vocabulary (pandas.DataFrame)
#     :return:
#       - word vocabulary: vocabulary index to word
#       - inverse word vocabulary: word to vocabulary index
#       - word listing: set of unique terms that build up the vocabulary
#     """
#     idx_to_word = OrderedDict()
#     word_to_idx = OrderedDict()
    
#     curr_idx = 0
#     for tokens in tqdm(df.sentence.values):
#         for token in tokens:
#             if token not in word_to_idx:
#                 word_to_idx[token] = curr_idx
#                 idx_to_word[curr_idx] = token
#                 curr_idx += 1

#     word_listing = list(idx_to_word.values())
#     return idx_to_word, word_to_idx, word_listing
 
# idx_to_word, word_to_idx, word_listing = build_vocabulary(train_set)
# print(f'[Debug] Index -> Word vocabulary size: {len(idx_to_word)}')
# print(f'[Debug] Word -> Index vocabulary size: {len(word_to_idx)}')
# print(f'[Debug] Some words: {[(idx_to_word[idx], idx) for idx in np.arange(10) + 1]}')

Total OOV terms: 2346 (29.29%)
Vocabulary length before expansion: 400000
Vocabulary length after expansion: 402346


In [None]:
def build_embedding_matrix(embedding_model: gensim.models.keyedvectors.KeyedVectors,
                           embedding_dimension: int,
                           word_to_idx: Dict[str, int],
                           vocab_size: int,
                           oov_terms: List[str]) -> np.ndarray:
    """
    Builds the embedding matrix of a specific dataset given a pre-trained word embedding model

    :param embedding_model: pre-trained word embedding model (gensim wrapper)
    :param word_to_idx: vocabulary map (word -> index) (dict)
    :param vocab_size: size of the vocabulary
    :param oov_terms: list of OOV terms (list)

    :return
        - embedding matrix that assigns a high dimensional vector to each word in the dataset specific vocabulary (shape |V| x d)
    """
    embedding_matrix = np.zeros((vocab_size, embedding_dimension), dtype=np.float32)
    for word, idx in tqdm(word_to_idx.items()):
        try:
            embedding_vector = embedding_model[word]
        except (KeyError, TypeError):
            embedding_vector = np.random.uniform(low=-0.05, high=0.05, size=embedding_dimension)

        embedding_matrix[idx] = embedding_vector

    return embedding_matrix

In [None]:
embedding_matrix = build_embedding_matrix(embedding_model,50,vocabulary,len(vocabulary),oov_terms)
embedding_matrix[0]

100%|██████████| 402346/402346 [00:01<00:00, 379624.15it/s]


array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
      dtype=float32)

In [None]:
#Create Architecture
lstm_model = Sequential()
# vocabulary size — number of unique words in data
# length of vector with which each word is represented
lstm_model.add(Embedding(input_dim = len(vocabulary), 
output_dim = 50, 
# length of input sequence
input_length = 100, 
# word embedding matrix
weights = [embedding_matrix],
# True — update embeddings_weight matrix
trainable = True 
))
# add an LSTM layer which contains 64 LSTM cells
# True — return whole sequence; False — return single output of the end of the sequence
lstm_model.add(LSTM(64, return_sequences=True))
lstm_model.add(TimeDistributed(Dense(num_tags, activation='softmax')))
#compile model
lstm_model.compile(loss      =  'categorical_crossentropy',
                  optimizer =  'adam',
                  metrics   =  ['acc'])
# check summary of the model
lstm_model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 50)           20117300  
                                                                 
 lstm_1 (LSTM)               (None, 100, 64)           29440     
                                                                 
 time_distributed_1 (TimeDis  (None, 100, 45)          2925      
 tributed)                                                       
                                                                 
Total params: 20,149,665
Trainable params: 20,149,665
Non-trainable params: 0
_________________________________________________________________


In [None]:
lstm_training = lstm_model.fit(embedding_matrix, y_train, batch_size=128, epochs=10)

ValueError: ignored

In [None]:
glove_file = os.path.join(os.getcwd(),"Glove", "glove.6B.50d.txt")

print ("Loading Glove Model")
with open(glove_file, encoding="utf8" ) as f:
    lines = f.readlines()
vocabulary = {}
for line in lines:
    splits = line.split()
    vocabulary[splits[0]] = np.array([float(val) for val in splits[1:]])
print ("Done.",len(vocabulary.keys())," words loaded!")

Loading Glove Model
Done. 400000  words loaded!


In [None]:
def findembedding(word):
    if word in vocabulary.keys():
        embedding = vocabulary[word]
    else:
        embedding = [0]*50
    return embedding

def glovesent(sentence):
    matrix = [findembedding(word) for word in tokenizer.tokenize(str(sentence))]
    matrix = np.array(matrix)
    return np.average(matrix, axis=0)


glove_X_train = np.array([glovesent(sentence) for sentence in train_set])
glove_X_test = np.array([glovesent(sentence) for sentence in test_set])

print(glove_X_train.shape)
