In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# %cd ./drive/MyDrive/Colab\ Notebooks/NLP_Project/

In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
import random as rnd

In [None]:
def get_vocab(vocab_path, tags_path):
    """
    Build the token -> index and tag -> index lookup tables from two text files.

    Inputs:
    - vocab_path: path to a UTF-8 text file holding one vocabulary token per line
    - tags_path: path to a UTF-8 text file holding one tag per line

    Returns:
    - vocab: dict mapping every token to its 0-based line number, plus a '<PAD>' entry
    - tag_map: dict mapping every tag to its 0-based line number
    """
    # Decode explicitly as UTF-8: the notebook looks up non-ASCII (Arabic) tokens
    # and the platform default encoding is not UTF-8 everywhere.
    with open(vocab_path, encoding='utf-8') as f:
        # Indices start at 0; if a token is repeated, its last line number wins.
        vocab = {token: i for i, token in enumerate(f.read().splitlines())}

    # '<PAD>' takes the next free index. If the file already lists '<PAD>' it
    # keeps its own index, so no index ever reaches len(vocab).
    vocab.setdefault('<PAD>', len(vocab))

    # Tags are needed to map label strings to class indices.
    with open(tags_path, encoding='utf-8') as f:
        tag_map = {tag: i for i, tag in enumerate(f.read().splitlines())}

    return vocab, tag_map

def get_params(vocab, tag_map, sentences_file, labels_file):
    """
    Convert a sentences file and a labels file into lists of index lists.

    Inputs:
    - vocab: dict token -> index; must contain 'UNK' if any token is out of vocabulary
    - tag_map: dict label -> index
    - sentences_file: UTF-8 text file, one sentence per line, tokens separated by a single space
    - labels_file: UTF-8 text file, one line per sentence, labels separated by spaces

    Returns:
    - sentences: list of lists of token indices
    - labels: list of lists of label indices
    - the number of sentences read

    Raises:
    - KeyError if a label is missing from tag_map, or if a token is missing
      from vocab and vocab has no 'UNK' entry
    """
    sentences = []
    # Decode explicitly as UTF-8 so the result does not depend on the platform locale.
    with open(sentences_file, encoding='utf-8') as f:
        for line in f.read().splitlines():
            # Replace each token by its index, falling back to the 'UNK' index.
            # NOTE(review): empty strings produced by split(' ') are kept here
            # (and go through the same lookup), while the label side drops
            # them - confirm the data never contains consecutive or trailing spaces.
            sentences.append([
                vocab[token] if token in vocab else vocab['UNK']
                for token in line.split(' ')
            ])

    labels = []
    with open(labels_file, encoding='utf-8') as f:
        for line in f.read().splitlines():
            # Replace each label by its index, skipping empty strings left by extra spaces.
            labels.append([tag_map[label] for label in line.split(' ') if label])

    return sentences, labels, len(sentences)


# Importing and discovering the data

In [None]:
# Build the lookup tables, then index the training, validation and test splits.
vocab, tag_map = get_vocab(
    './Dataset/characters/unique_chars.txt',
    './Dataset/characters/unique_labels.txt',
)
t_sentences, t_labels, t_size = get_params(
    vocab, tag_map,
    './Dataset/characters/t_chars.txt',
    './Dataset/characters/t_labels.txt',
)
v_sentences, v_labels, v_size = get_params(
    vocab, tag_map,
    './Dataset/characters/v_chars.txt',
    './Dataset/characters/v_labels.txt',
)
test_sentences, test_labels, test_size = get_params(
    vocab, tag_map,
    './Dataset/characters/test_chars.txt',
    './Dataset/characters/test_labels.txt',
)

In [None]:
# vocab maps a token string to its unique integer index.
# NOTE(review): this lookup raises KeyError if the token is not present in the
# vocabulary file - confirm it exists in unique_chars.txt.
print('vocab["الأعم"]:', vocab["الأعم"])
# Index reserved for the '<PAD>' token (added by get_vocab)
print('padded token:', vocab['<PAD>'])

In [None]:
# Show the tag -> index mapping returned by get_vocab
print(tag_map)

In [None]:
# Quick summary of the loaded data
print('The number of outputs is tag_map', len(tag_map))

# Vocabulary size; the '<PAD>' entry is counted
g_vocab_size = len(vocab)
print(f"Num of vocabulary words: {g_vocab_size}")
print('The vocab size is', g_vocab_size)

# Split sizes
print('The training size is', t_size)
print('The validation size is', v_size)

# First training example and its labels
print('An example of the first sentence is', t_sentences[0])
print('An example of its corresponding label is', t_labels[0])

# Sanity check shown as the cell output: tokens and labels line up one-to-one
len(t_labels[0]) == len(t_sentences[0])

# NERDataset
The class that implements the dataset for NER

In [None]:
class NERDataset(torch.utils.data.Dataset):
  """
  Dataset of padded token-id sequences and their padded label sequences.
  """

  def __init__(self, x, y, pad, label_pad=10):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - y: a list of lists where each list contains the label of each token in the sentence
    - pad: the id of the <PAD> token, used to pad every sentence to the longest one
    - label_pad: the value used to pad every label sequence to the longest one
      (default 10, the value this class has always used). Pick a value that is
      not a real class index if padded positions must be told apart from real labels.
    """
    # Pin the dtype so that an empty sequence still yields an integer tensor.
    self.x = nn.utils.rnn.pad_sequence(
        [torch.tensor(seq, dtype=torch.long) for seq in x],
        padding_value=pad, batch_first=True)
    self.y = nn.utils.rnn.pad_sequence(
        [torch.tensor(seq, dtype=torch.long) for seq in y],
        padding_value=label_pad, batch_first=True)
    # Remember the label padding value so callers can mask it out later.
    self.label_pad = label_pad

    # print the max length of the sentences
    print('The max length of the sentences is', self.x.shape[1])
    print('The max length of the labels is', self.y.shape[1])

  def __len__(self):
    """
    Return the length of the dataset (the number of sentences)
    """
    return len(self.x)

  def __getitem__(self, idx):
    """
    Return the (token ids, labels) pair stored at idx; idx is passed straight
    to tensor indexing, so anything a tensor accepts as an index works.
    """
    return self.x[idx], self.y[idx]

In [None]:
batch_size = 5

# The first eight training examples are enough to inspect how batching behaves
mini_sentences, mini_labels = t_sentences[:8], t_labels[:8]
mini_dataset = NERDataset(mini_sentences, mini_labels, vocab['<PAD>'])
dummy_dataloader = torch.utils.data.DataLoader(mini_dataset, batch_size=batch_size)

# Pull two consecutive batches and compare their shapes
dg = iter(dummy_dataloader)
(X1, Y1), (X2, Y2) = next(dg), next(dg)
print(Y1.shape, X1.shape, Y2.shape, X2.shape)

# First padded sentence of the first batch together with its padded labels
print(X1[0], "\n", Y1[0])

# NER
The class that implements the PyTorch model for NER

In [None]:
class NER(nn.Module):
  """
  Embedding -> LSTM -> Linear tagger that emits one score vector per token.
  """

  def __init__(self, vocab_size=None, embedding_dim=300, hidden_size=50, n_classes=None):
    """
    The constructor of our NER model
    Inputs:
    - vocab_size: the number of rows of the embedding table, i.e. the number of
      distinct token ids. Defaults to len(vocab), read from the notebook
      globals when the model is built.
    - embedding_dim: the embedding dimension
    - hidden_size: the hidden size of the LSTM
    - n_classes: the number of final classes (tags). Defaults to len(tag_map),
      read from the notebook globals when the model is built.

    Note: the embedding table is sized by the vocabulary, not by the number of
    sentences. To load a checkpoint saved with a different table size, pass
    that size explicitly as vocab_size.
    """
    super(NER, self).__init__()

    # Resolve the defaults at construction time rather than at class
    # definition time, so the class itself does not depend on the globals.
    if vocab_size is None:
      vocab_size = len(vocab)
    if n_classes is None:
      n_classes = len(tag_map)

    # (1) Create the embedding layer
    self.embedding = nn.Embedding(vocab_size, embedding_dim)

    # (2) Create an LSTM layer with hidden size = hidden_size and batch_first = True
    self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)

    # (3) Create a linear layer with number of neurons = n_classes
    self.linear = nn.Linear(hidden_size, n_classes)

  def forward(self, sentences):
    """
    This function does the forward pass of our model
    Inputs:
    - sentences: tensor of shape (batch_size, max_length)

    Returns:
    - final_output: tensor of shape (batch_size, max_length, n_classes)
    """
    embeddings = self.embedding(sentences)
    # Only the per-step outputs are needed; the final (hidden, cell) state is discarded.
    lstm_out, _ = self.lstm(embeddings)
    final_output = self.linear(lstm_out)
    return final_output

In [None]:
# Build the model with its default hyper-parameters and print its layers
model = NER()
print(model)

# Training

In [None]:
def train(model, train_dataset, batch_size=512, epochs=5, learning_rate=0.01, ignore_label=None):
  """
  This function implements the training logic
  Inputs:
  - model: the model to be trained; it is moved to the GPU when one is available
  - train_dataset: the training set of type NERDataset (any dataset yielding
    (input, label) tensor pairs works)
  - batch_size: integer represents the number of examples per step
  - epochs: integer represents the total number of epochs (full training pass)
  - learning_rate: the learning rate to be used by the optimizer
  - ignore_label: optional label value (e.g. the label padding value) that is
    excluded from both the loss and the accuracy. None (default) counts every
    position.

  Prints the mean batch loss and the token accuracy after every epoch.
  """
  # (1) create the dataloader of the training set (shuffled every epoch)
  train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

  # (2) cross entropy loss, optionally skipping the ignored label
  if ignore_label is None:
    criterion = torch.nn.CrossEntropyLoss()
  else:
    criterion = torch.nn.CrossEntropyLoss(ignore_index=ignore_label)

  # (3) create the optimizer (Adam)
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  # GPU configuration
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model = model.to(device)
  criterion = criterion.to(device)

  # Make sure layers such as dropout behave in training mode.
  model.train()

  for epoch_num in range(epochs):
    total_acc_train = 0
    total_loss_train = 0.0
    total_tokens = 0

    for train_input, train_label in tqdm(train_dataloader):

      # (4) move the train input and the train label to the device
      train_input = train_input.to(device)
      train_label = train_label.to(device)

      # (5) zero the gradients, then do the forward pass
      optimizer.zero_grad()
      output = model(train_input)

      # (6) flatten (batch, length, classes) -> (batch * length, classes) so
      #     that every token is one classification example
      batch_loss = criterion(output.reshape(-1, output.shape[-1]), train_label.reshape(-1))

      # (7) backward pass and weight update
      batch_loss.backward()
      optimizer.step()

      # (8) accumulate a plain float: adding the tensor itself would keep
      #     every batch's autograd graph alive until the end of the epoch
      total_loss_train += batch_loss.item()

      # (9) count the correct predictions and the positions they are measured on
      predictions = output.argmax(2)
      if ignore_label is None:
        total_acc_train += (predictions == train_label).sum().item()
        total_tokens += train_label.numel()
      else:
        counted = train_label != ignore_label
        total_acc_train += ((predictions == train_label) & counted).sum().item()
        total_tokens += counted.sum().item()

    # epoch loss: each batch loss is already a mean, so average over batches
    epoch_loss = total_loss_train / max(len(train_dataloader), 1)

    # (10) accuracy over all counted positions
    epoch_acc = total_acc_train / max(total_tokens, 1)
    print(
        f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
        | Train Accuracy: {epoch_acc}\n')

In [None]:
# Wrap each split in a NERDataset; every split uses the same <PAD> token id
train_dataset, val_dataset, test_dataset = (
    NERDataset(split_sentences, split_labels, vocab['<PAD>'])
    for split_sentences, split_labels in (
        (t_sentences, t_labels),
        (v_sentences, v_labels),
        (test_sentences, test_labels),
    )
)

In [None]:
# Train on the training split with train()'s default batch size, epochs and learning rate
train(model, train_dataset)

# Evaluation

In [None]:
def evaluate(model, test_dataset, batch_size=512, ignore_label=None):
  """
  This function takes a NER model and evaluates its performance (accuracy) on a test data
  Inputs:
  - model: a NER model; it is moved to the GPU when one is available
  - test_dataset: dataset of type NERDataset (any dataset yielding
    (input, label) tensor pairs works)
  - batch_size: integer represents the number of examples per step
  - ignore_label: optional label value (e.g. the label padding value) whose
    positions are left out of the accuracy. None (default) counts every position.

  Prints the accuracy over all counted positions.
  """
  # (1) create the test data loader
  test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

  # GPU Configuration
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model = model.to(device)

  # Evaluate in eval mode (matters for layers such as dropout) and put the
  # model back into whatever mode it was in afterwards.
  was_training = model.training
  model.eval()

  correct = 0
  counted_tokens = 0

  try:
    # (2) disable gradients
    with torch.no_grad():

      for test_input, test_label in tqdm(test_dataloader):
        # (3) move the test input and the test label to the device
        test_input = test_input.to(device)
        test_label = test_label.to(device)

        # (4) do the forward pass
        predictions = model(test_input).argmax(2)

        # (5) count correct predictions and the positions they are measured on
        if ignore_label is None:
          correct += (predictions == test_label).sum().item()
          counted_tokens += test_label.numel()
        else:
          counted = test_label != ignore_label
          correct += ((predictions == test_label) & counted).sum().item()
          counted_tokens += counted.sum().item()
  finally:
    model.train(was_training)

  # (6) calculate the overall accuracy (0.0 when nothing was counted)
  total_acc_test = correct / max(counted_tokens, 1)

  print(f'\nTest Accuracy: {total_acc_test}')

In [None]:
# Report the accuracy of the trained model on the test split
evaluate(model, test_dataset)

In [None]:
# Persist only the learned weights (the state_dict). torch.save does not create
# missing parent directories, so make sure the target folder exists first.
os.makedirs('./SavedModels', exist_ok=True)
torch.save(model.state_dict(), './SavedModels/model_lstm_linearNN')