In [None]:
# Colab only: mount Google Drive so the project files are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Make the project folder the working directory: the ./Dataset, ./SavedModels and
# ./Results paths used in the cells below are all relative to it.
%cd ./drive/MyDrive/Colab\ Notebooks/NLP_Project/

/content/drive/.shortcut-targets-by-id/1zPjf1cHfdKqObemkPReffGbQHU_wotr2/NLP_Project


# Import Modules


In [14]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
import random as rnd
from typing import Any, List, Optional
from torch.optim.lr_scheduler import StepLR
import gc

In [3]:
def get_vocab(vocab_path, tags_path):
    """Build the token -> index and tag -> index lookup tables.

    Args:
        vocab_path: Path of a text file holding one vocabulary token per line.
        tags_path: Path of a text file holding one tag per line.

    Returns:
        A ``(vocab, tag_map)`` tuple of dicts. Each maps a line of its file to
        that line's 0-based position. ``vocab`` additionally contains the key
        ``'<PAD>'``, whose index is the number of entries read from the file.
    """
    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # default text encoding (the data files contain non-ASCII characters).
    with open(vocab_path, encoding='utf-8') as f:
        vocab = {token: position for position, token in enumerate(f.read().splitlines())}

    # The padding token gets the first index after the tokens read from the file.
    vocab['<PAD>'] = len(vocab)

    # Tags are loaded the same way; they are needed to map labels to indices.
    with open(tags_path, encoding='utf-8') as f:
        tag_map = {tag: position for position, tag in enumerate(f.read().splitlines())}

    return vocab, tag_map

def get_params(vocab, tag_map, sentences_file, labels_file):
    """Convert a sentences file and a labels file into lists of index sequences.

    Args:
        vocab: Dict mapping tokens to indices; must contain ``'UNK'`` if any
            token of ``sentences_file`` is missing from it.
        tag_map: Dict mapping every label that occurs in ``labels_file`` to its index.
        sentences_file: Text file with one sentence per line, tokens separated
            by a single space.
        labels_file: Text file with one label sequence per line, labels
            separated by spaces.

    Returns:
        ``(sentences, labels, size)`` where ``sentences`` and ``labels`` are
        lists of lists of ints and ``size`` is ``len(sentences)``.

    Raises:
        KeyError: If a label is not in ``tag_map``, or an unknown token is met
            and ``vocab`` has no ``'UNK'`` entry.
    """
    sentences = []
    # UTF-8 is requested explicitly so parsing does not depend on the platform
    # default encoding.
    with open(sentences_file, encoding='utf-8') as f:
        for line in f.read().splitlines():
            # Unknown tokens fall back to the 'UNK' index. Note that empty
            # tokens (produced by consecutive spaces) are kept here.
            sentences.append([
                vocab[token] if token in vocab else vocab['UNK']
                for token in line.split(' ')
            ])

    labels = []
    with open(labels_file, encoding='utf-8') as f:
        for line in f.read().splitlines():
            # Empty strings (produced by consecutive spaces) are dropped
            # before each label is replaced by its index.
            labels.append([tag_map[label] for label in line.split(' ') if label])

    return sentences, labels, len(sentences)

# Importing and discovering the data


In [None]:
# Every character-level file lives in the same folder, so build the paths from it.
_CHAR_DIR = './Dataset/new_new_characters'

# Lookup tables first: they are needed to index every split below.
vocab, tag_map = get_vocab(f'{_CHAR_DIR}/unique_chars.txt', f'{_CHAR_DIR}/unique_labels.txt')

# Training, validation and the two test splits, each as (sentences, labels, size).
t_sentences, t_labels, t_size = get_params(
    vocab, tag_map, f'{_CHAR_DIR}/t_chars.txt', f'{_CHAR_DIR}/t_labels.txt')
v_sentences, v_labels, v_size = get_params(
    vocab, tag_map, f'{_CHAR_DIR}/v_chars.txt', f'{_CHAR_DIR}/v_labels.txt')
test_sentences, test_labels, test_size = get_params(
    vocab, tag_map, f'{_CHAR_DIR}/test_chars.txt', f'{_CHAR_DIR}/test_labels.txt')
test_sentences2, test_labels2, test_size2 = get_params(
    vocab, tag_map, f'{_CHAR_DIR}/test2_chars.txt', f'{_CHAR_DIR}/test2_labels.txt')

In [None]:
# Sanity report on the loaded data.
# g_vocab_size counts every vocabulary entry, including '<PAD>'.
g_vocab_size = len(vocab)

print('The number of outputs is tag_map', len(tag_map))
print(f"Num of vocabulary words: {g_vocab_size}")
print('The vocab size is', g_vocab_size)
print('The training size is', t_size)
print('The validation size is', v_size)
print('An example of the first sentence is', t_sentences[0])
print('An example of its corresponding label is', t_labels[0])

# Displayed as the cell result: the first sentence and its labels must align.
len(t_labels[0]) == len(t_sentences[0])

The number of outputs is tag_map 16
Num of vocabulary words: 39
The vocab size is 39
The training size is 116323
The validation size is 5907
An example of the first sentence is [24, 17, 0, 33, 34, 17, 24, 13, 32, 18, 0, 34, 17, 0, 10, 25, 33, 27, 0, 9, 24, 18, 0, 18, 0, 11, 15, 4, 2, 10, 18, 5, 1, 32, 15, 26, 16, 24, 17, 0, 33, 5, 0, 26, 22, 10, 24, 29, 12, 10, 33, 4, 27, 1, 4, 18, 15, 19, 10, 15, 6, 25, 10, 30, 5, 18, 0, 27, 23, 0, 18, 31, 17, 14, 17, 5, 31, 18, 32, 0, 31, 17, 14, 17, 5, 33, 31, 1, 18, 0, 25, 10, 1, 12, 15, 17, 15, 16, 4, 27, 0, 24, 18, 21, 31, 7, 6, 26, 5, 24, 28, 15, 17, 2, 25, 11, 1, 18, 15, 18, 5, 1, 32, 15, 26, 16, 24, 17, 0, 18, 5, 1, 2, 18, 23, 34, 17, 5, 26, 32, 0, 10, 29, 12, 31, 1, 33, 33, 17, 4, 0, 5, 23, 18, 0, 11, 1, 18, 15, 17, 27, 0, 24, 18, 21, 18, 0, 31, 7, 6, 26, 26, 10, 7, 15, 10, 6, 18, 0, 1, 14, 18, 23, 16, 17, 18, 0, 23, 14, 17, 25, 0, 0, 7, 1, 31, 17, 1, 6, 17, 28, 0, 4, 17, 23, 6, 15, 31, 6, 31, 25, 24, 17, 0, 31, 18, 0, 4, 17, 34, 7, 6, 18, 5,

True

# NERDataset

The class that implements the dataset for NER


In [5]:
class NERDataset(torch.utils.data.Dataset):
  """Dataset of right-padded (input, label) index tensors.

  All input sequences are padded to the length of the longest input and all
  label sequences to the length of the longest label sequence; both padded
  lengths are printed on construction.

  Args:
    x: List of sequences of token indices.
    y: List of sequences of label indices.
    pad: Value used to pad the sequences of ``x``.
    label_pad: Value used to pad the sequences of ``y``. When left as ``None``
      the module-level ``tag_map["pad"]`` is used, which is what this class
      always did before the parameter existed.
  """

  def __init__(self, x, y, pad, label_pad=None):
    if label_pad is None:
      # Backward-compatible default: read the label padding index from the
      # notebook-level tag map.
      label_pad = tag_map["pad"]
    self.x = nn.utils.rnn.pad_sequence(
        [torch.tensor(seq) for seq in x], padding_value=pad, batch_first=True)
    self.y = nn.utils.rnn.pad_sequence(
        [torch.tensor(seq) for seq in y], padding_value=label_pad, batch_first=True)
    print('The max length of the sentences is', self.x.shape[1])
    print('The max length of the labels is', self.y.shape[1])

  def __len__(self):
    """Number of sequences (rows of the padded input tensor)."""
    return len(self.x)

  def __getitem__(self, idx):
    """Return the padded input and padded label tensors at position ``idx``."""
    return self.x[idx], self.y[idx]

# Classifiers

The classes that implement the PyTorch model for Arabic diacritic classification


In [15]:
class BatchNormConv1d(nn.Module):
    """
    Bias-free nn.Conv1d, then an optional activation, then nn.BatchNorm1d.

    Args:
        input_dim: number of input channels of the convolution
        output_dim: number of output channels (also the batch-norm feature count)
        kernel_size, stride, padding: forwarded unchanged to nn.Conv1d
        activation: optional callable applied between the convolution and the
            batch normalisation; skipped when None
    """
    def __init__(
        self,
        input_dim: int,
        output_dim: int,
        kernel_size: int,
        stride: int,
        padding: int,
        activation: Any = None,
    ):
        super().__init__()
        conv_options = dict(
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            bias=False,
        )
        self.conv1d = nn.Conv1d(input_dim, output_dim, **conv_options)
        self.bn = nn.BatchNorm1d(output_dim)
        self.activation = activation

    def forward(self, x: Any):
        features = self.conv1d(x)
        if self.activation is None:
            return self.bn(features)
        return self.bn(self.activation(features))

class Prenet(nn.Module):
    """
    A stack of linear layers, each followed by ReLU and then dropout.

    Args:
        input_dim: feature size of the input to the first linear layer
        prenet_depth: output size of each linear layer, in order; any sequence
            of ints is accepted (it is copied, never modified)
        dropout: dropout probability applied after every layer (default 0.5)
    """
    def __init__(self, input_dim: int, prenet_depth: List[int] = [256, 128], dropout: float = 0.5):
        super().__init__()
        # Copy into a list so tuples work too and the default is never shared.
        depth = list(prenet_depth)
        # Layer i consumes the output size of layer i - 1 (input_dim for the first).
        in_sizes = [input_dim] + depth[:-1]
        self.layers = nn.ModuleList(
            [
                nn.Linear(in_size, out_size)
                for (in_size, out_size) in zip(in_sizes, depth)
            ]
        )
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs: torch.Tensor):
        """Apply every linear -> ReLU -> dropout stage in order and return the result."""
        for linear in self.layers:
            inputs = self.dropout(self.relu(linear(inputs)))
        return inputs


class Highway(nn.Module):
    """
      Highway layer, used to ease the training of deep networks.

      Computes ``relu(H(x)) * g + x * (1 - g)`` with the gate ``g = sigmoid(T(x))``.
      The H bias starts at 0 and the T bias at -1.
    """
    def __init__(self, in_size, out_size):
        super().__init__()
        self.H = nn.Linear(in_size, out_size)
        self.T = nn.Linear(in_size, out_size)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        # Overwrite the default bias initialisation of both projections.
        with torch.no_grad():
            self.H.bias.zero_()
            self.T.bias.fill_(-1)

    def forward(self, inputs: torch.Tensor):
        gate = self.sigmoid(self.T(inputs))
        candidate = self.relu(self.H(inputs))
        carried = inputs * (1.0 - gate)
        return candidate * gate + carried


class CBHG(nn.Module):
    """
    The CBHG module (1-D Convolution Bank + Highway network + Bidirectional GRU)
    """
    def __init__(
        self,
        input_dim: int,
        output_dim: int,
        K: int,
        projections: List[int],
    ):
        """
        input_dim (int): feature size of the input; also the channel count of
            every convolution in the bank and the width of the highway layers
        output_dim (int): hidden size of the GRU; the module output has
            2 * output_dim features because the GRU is bidirectional
        K (int): number of convolutions in the bank; kernel sizes are 1..K
        projections (List[int]): output channels of the successive projection
            convolutions applied after the bank
        """
        super().__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.relu = nn.ReLU()
        self.conv1d_banks = nn.ModuleList(
            [
                BatchNormConv1d(
                    input_dim,
                    input_dim,
                    kernel_size=k,
                    stride=1,
                    padding=k // 2,
                    activation=self.relu,
                )
                for k in range(1, K + 1)
            ]
        )
        self.max_pool1d = nn.MaxPool1d(kernel_size=2, stride=1, padding=1)

        # The first projection consumes the concatenated bank outputs.
        in_sizes = [K * input_dim] + projections[:-1]
        # ReLU after every projection except the last one.
        activations = [self.relu] * (len(projections) - 1) + [None]
        self.conv1d_projections = nn.ModuleList(
            [
                BatchNormConv1d(
                    in_size, out_size, kernel_size=3, stride=1, padding=1, activation=ac
                )
                for (in_size, out_size, ac) in zip(in_sizes, projections, activations)
            ]
        )

        # Only used in forward() when the last projection width differs from input_dim.
        self.pre_highway = nn.Linear(projections[-1], input_dim, bias=False)
        self.highways = nn.ModuleList([Highway(input_dim, input_dim) for _ in range(4)])

        self.gru = nn.GRU(input_dim, output_dim, 1, batch_first=True, bidirectional=True)

    def forward(self, inputs, input_lengths=None):
        """
        inputs: tensor laid out as (batch, time, input_dim)
        input_lengths: optional per-sequence lengths (tensor or list, any
            order, any device). When given, the GRU runs on a packed sequence
            and the result is padded back to the full input length.

        Returns a tensor of shape (batch, time, 2 * output_dim).
        """
        # Conv1d works on (batch, channels, time).
        x = inputs.transpose(1, 2)
        T = x.size(-1)

        # Concat conv1d bank outputs; even kernel sizes yield T + 1 steps, hence the trim.
        x = torch.cat([conv1d(x)[:, :, :T] for conv1d in self.conv1d_banks], dim=1)
        assert x.size(1) == self.input_dim * len(self.conv1d_banks)
        x = self.max_pool1d(x)[:, :, :T]

        for conv1d in self.conv1d_projections:
            x = conv1d(x)

        # Back to the original shape
        x = x.transpose(1, 2)

        if x.size(-1) != self.input_dim:
            x = self.pre_highway(x)

        # Residual connection (out-of-place, so no earlier layer's output is mutated)
        x = x + inputs
        for highway in self.highways:
            x = highway(x)

        if input_lengths is not None:
            # pack_padded_sequence needs CPU int64 lengths; enforce_sorted=False
            # lets callers pass batches that are not sorted by length.
            lengths = torch.as_tensor(input_lengths, dtype=torch.int64, device="cpu")
            x = nn.utils.rnn.pack_padded_sequence(
                x, lengths, batch_first=True, enforce_sorted=False
            )

        self.gru.flatten_parameters()
        outputs, _ = self.gru(x)

        if input_lengths is not None:
            # total_length keeps the time dimension equal to the input's even
            # when every sequence in the batch is shorter than T.
            outputs, _ = nn.utils.rnn.pad_packed_sequence(
                outputs, batch_first=True, total_length=T
            )

        return outputs


In [16]:
class CBHGModel(nn.Module):
    """
    Embedding -> optional Prenet -> CBHG -> stack of bidirectional LSTMs
    (each optionally followed by BatchNorm1d) -> linear projection to the
    target classes.
    """
    def __init__(
        self,
        input_vocab_size: int,
        target_vocab_size: int,
        embedding_dim: int = 256,
        use_prenet: bool = True,
        prenet_sizes: List[int] = [256, 256],
        gru_units: int = 256,
        cbhg_filters: int = 16,
        cbhg_projections: List[int] = [128, 256],
        post_cbhg_layers_units: List[int] = [256, 256],
        post_cbhg_use_batch_norm: bool = True
    ):
        """
        input_vocab_size (int): number of rows of the embedding table
        target_vocab_size (int): number of output classes
        embedding_dim (int): embedding size
        use_prenet (bool): whether the embeddings go through a Prenet first
        prenet_sizes (List[int]): layer sizes of the Prenet
        gru_units (int): hidden size of the GRU inside the CBHG module
        cbhg_filters (int): value passed to CBHG as K
        cbhg_projections (List[int]): projection sizes passed to CBHG
        post_cbhg_layers_units (List[int]): hidden size of each LSTM after the CBHG
        post_cbhg_use_batch_norm (bool): add a BatchNorm1d after every LSTM

        The list arguments are copied, so tuples are accepted and the defaults
        are never modified.
        """
        super().__init__()
        prenet_sizes = list(prenet_sizes)
        cbhg_projections = list(cbhg_projections)

        self.use_prenet = use_prenet
        self.embedding = nn.Embedding(input_vocab_size, embedding_dim)
        if self.use_prenet:
            self.prenet = Prenet(embedding_dim, prenet_depth=prenet_sizes)

        self.cbhg = CBHG(
            prenet_sizes[-1] if self.use_prenet else embedding_dim,
            gru_units,
            K=cbhg_filters,
            projections=cbhg_projections,
        )

        layers = []
        # Prepend the GRU size so entry i - 1 is always the input width of LSTM i.
        post_cbhg_layers_units = [gru_units] + list(post_cbhg_layers_units)

        for i in range(1, len(post_cbhg_layers_units)):
            layers.append(
                nn.LSTM(
                    # * 2 because the preceding recurrent layer is bidirectional
                    post_cbhg_layers_units[i - 1] * 2,
                    post_cbhg_layers_units[i],
                    bidirectional=True,
                    batch_first=True,
                )
            )
            if post_cbhg_use_batch_norm:
                layers.append(nn.BatchNorm1d(post_cbhg_layers_units[i] * 2))

        self.post_cbhg_layers = nn.ModuleList(layers)
        self.projections = nn.Linear(post_cbhg_layers_units[-1] * 2, target_vocab_size)
        self.post_cbhg_layers_units = post_cbhg_layers_units
        self.post_cbhg_use_batch_norm = post_cbhg_use_batch_norm


    def forward(
        self,
        src: torch.Tensor,
        lengths: Optional[torch.Tensor] = None,
    ):
        """
        src: tensor of token indices fed to the embedding layer
        lengths: optional sequence lengths, forwarded to the CBHG module

        Returns a dict with the single key "diacritics" holding the output of
        the final linear projection (last dimension = target_vocab_size).
        """
        embedding_out = self.embedding(src)
        cbhg_input = embedding_out
        if self.use_prenet:
            cbhg_input = self.prenet(embedding_out)
        outputs = self.cbhg(cbhg_input, lengths)

        # No state yet: the first LSTM starts from its default zero state.
        state = None

        for layer in self.post_cbhg_layers:
            if isinstance(layer, nn.BatchNorm1d):
                # BatchNorm1d normalises dimension 1, so move the features there and back.
                outputs = layer(outputs.permute(0, 2, 1))
                outputs = outputs.permute(0, 2, 1)
                continue
            # Every LSTM after the first is seeded with the final (h, c) of the
            # previous LSTM, so consecutive LSTMs must share a hidden size.
            outputs, state = layer(outputs, state)

        predictions = self.projections(outputs)
        output = {"diacritics": predictions}
        return output


In [None]:
# Create an instance of CBHG Model
# NOTE(review): input_vocab_size is set to the number of training + validation
# sentences, not to the vocabulary size (len(vocab)), so the embedding table has far
# more rows than there are token indices. Changing it alters the embedding shape, and
# state_dicts saved with the current shape would then fail to load -- confirm before fixing.
model = CBHGModel(
      input_vocab_size = len(t_sentences) + len(v_sentences),
      target_vocab_size = len(tag_map),
        )
# print(model)

# Training


In [None]:
# Prefix of every checkpoint file that train() writes under ./SavedModels/.
model_name = "model_cbhg_all_data_lr0.001_batch32"

In [None]:
def train(model,train_dataset, start_epoch=0,batch_size = 32, epochs = 10, learning_rate = 0.001):
  """Train ``model`` on ``train_dataset`` and save a checkpoint after every epoch.

  Uses Adam and a cross-entropy loss that ignores the padding label
  (``tag_map["pad"]``, read from the notebook globals). After each epoch the
  state dict is written to ``./SavedModels/{model_name}_epoch{N}`` where
  ``model_name`` is also a notebook global.

  Training stops early (before printing or saving that epoch) as soon as an
  epoch's accuracy is not better than the previous epoch's.

  Args:
    model: Module whose forward returns a dict with a ``'diacritics'`` entry
      of per-position class scores.
    train_dataset: Dataset yielding ``(input, label)`` tensor pairs.
    start_epoch: Number of epochs already trained; only affects the epoch
      numbers used in the log and in checkpoint names.
    batch_size: Mini-batch size.
    epochs: Maximum number of epochs to run in this call.
    learning_rate: Adam learning rate.

  Returns:
    None.
  """
  # (1) create the dataloader of the training set (shuffled every epoch)
  train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

  # (2) cross entropy loss that skips padded label positions
  pad_label = tag_map["pad"]
  criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_label)

  # (3) create the optimizer (Adam)
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  # GPU configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()
    criterion = criterion.cuda()

  # Make sure the checkpoint directory exists before the first save.
  os.makedirs('./SavedModels', exist_ok=True)

  # Must live outside the epoch loop, otherwise the early-stopping comparison
  # below always sees -1 and never triggers.
  prev_acc = -1

  for epoch_num in range(start_epoch,start_epoch+epochs):
    # Dropout / batch-norm must be in training mode even if the model was
    # switched to eval mode earlier in the session.
    model.train()
    total_correct = 0
    total_tokens = 0
    total_loss_train = 0
    for train_input, train_label in tqdm(train_dataloader):

      # (4) move the train label to the device
      train_label = train_label.to(device)

      # (5) move the train input to the device
      train_input = train_input.to(device)

      # (6) do the forward pass
      logits = model(train_input)['diacritics']

      # (7) loss calculation over all positions flattened into one axis
      batch_loss = criterion(logits.view(-1, logits.shape[-1]), train_label.view(-1))

      # (8) accumulate the (mean) batch loss
      total_loss_train += batch_loss.item()

      # (9) count correct predictions on real (non-padding) positions only,
      # consistent with the loss, which also ignores padding
      real_positions = train_label != pad_label
      total_correct += ((logits.argmax(2) == train_label) & real_positions).sum().item()
      total_tokens += real_positions.sum().item()

      # (10) zero your gradients
      optimizer.zero_grad()

      # (11) do the backward pass
      batch_loss.backward()

      # (12) update the weights with your optimizer
      optimizer.step()

    # epoch loss: each batch_loss is already a mean, so average over batches
    epoch_loss = total_loss_train / max(len(train_dataloader), 1)

    # (13) accuracy over the non-padding positions seen this epoch
    epoch_acc = total_correct / max(total_tokens, 1)
    if prev_acc >= epoch_acc:
      print("Finish training because too many epochs")
      return
    prev_acc = epoch_acc

    print(
        f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
        | Train Accuracy: {epoch_acc}\n')

    # clear the cache each epoch
    torch.cuda.empty_cache()
    torch.save(model.state_dict(), f'./SavedModels/{model_name}_epoch{epoch_num + 1}')

  ##############################################################################################################

In [None]:
# Padded tensor datasets for training and for the two test splits; inputs are
# padded with the '<PAD>' vocabulary index.
train_dataset = NERDataset(x=t_sentences, y=t_labels, pad=vocab['<PAD>'])
test_dataset = NERDataset(x=test_sentences, y=test_labels, pad=vocab['<PAD>'])
test_dataset2 = NERDataset(x=test_sentences2, y=test_labels2, pad=vocab['<PAD>'])

In [None]:
def load_model(model,model_name):
  """Load the weights stored in ``./SavedModels/{model_name}`` into ``model``.

  Args:
    model: Module whose architecture matches the saved state dict.
    model_name: File name of the checkpoint inside ``./SavedModels``.

  Returns:
    The same ``model`` object, with the loaded weights.
  """
  # map_location='cpu' lets a checkpoint written on a GPU runtime be opened on
  # a CPU-only one; load_state_dict then copies the values into the model's
  # parameters wherever they live.
  # NOTE: torch.load unpickles the file -- only open checkpoints you trust.
  state_dict = torch.load(f'./SavedModels/{model_name}', map_location='cpu')
  model.load_state_dict(state_dict)
  return model

In [None]:
# Number of epochs already trained; 0 means "train from scratch".
start_epoch = 4
if start_epoch:
  # start from a previously trained model with some epochs
  model = load_model(model, "model_cbhg_all_data_lr0.001_batch32_epoch4")

train(model, train_dataset, start_epoch=start_epoch, epochs=3)

# Save The Model


In [None]:
# Save the Diacritic Classifier Model
# The file name is the current value of the global model_name (no epoch suffix).
torch.save(model.state_dict(), f'./SavedModels/{model_name}')

In [None]:
# The training tensors are not used by the evaluation cells below; drop the
# reference and run the garbage collector.
del train_dataset
gc.collect()

# Evaluation


In [None]:
# Flat, position-aligned results of the most recent evaluate() call.
diacritic_results = []  # predicted label index per position
gold_results = []       # reference label index per position
test_input_list = []    # input token index per position (padding included)
def evaluate(model, test_dataset, batch_size = 32):
  """Run ``model`` over ``test_dataset`` and store flat per-position results.

  The module-level lists ``diacritic_results``, ``gold_results`` and
  ``test_input_list`` are emptied and then filled with, respectively, the
  predicted label, the gold label and the input token of every position of
  every batch (padding positions included).

  Args:
    model: Module whose forward returns a dict with a ``'diacritics'`` entry
      of per-position class scores.
    test_dataset: Dataset yielding ``(input, label)`` tensor pairs.
    batch_size: Mini-batch size used for inference.

  Returns:
    None; results are read from the three module-level lists.
  """
  # (1) create the test data loader
  test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

  # GPU Configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()

  # Start from empty lists so that re-running the evaluation does not mix the
  # results of several calls.
  diacritic_results.clear()
  gold_results.clear()
  test_input_list.clear()

  # Inference mode: disables dropout and makes batch-norm use running statistics.
  model.eval()

  # (2) disable gradients
  with torch.no_grad():

    for test_input, test_label in tqdm(test_dataloader):
      # (3) move the test input to the device
      test_input = test_input.to(device)

      # (4) do the forward pass and take the highest-scoring class per position
      prediction = model(test_input)['diacritics'].argmax(2)

      # (5) flatten everything to 1-D and append to the result lists
      diacritic_results.extend(prediction.cpu().numpy().flatten())
      gold_results.extend(test_label.cpu().numpy().flatten())
      test_input_list.extend(test_input.cpu().numpy().flatten())

In [None]:
# Run inference on the first test split; the per-position results are collected in
# diacritic_results / gold_results / test_input_list.
evaluate(model, test_dataset)

In [None]:
# Diacritic Error Rate (DER): percentage of non-padding positions whose
# predicted label differs from the gold label.
_not_padding = np.asarray(test_input_list) != vocab['<PAD>']  # Do not include padding in DER calculations
total_size = int(_not_padding.sum())
if total_size == 0:
  # Nothing to score (evaluate() not run yet, or the data is all padding).
  raise ValueError("DER is undefined: there are no non-padding positions to score")

# Number of misclassified non-padding positions
der = int((np.asarray(diacritic_results)[_not_padding] != np.asarray(gold_results)[_not_padding]).sum())
der /= total_size
der *= 100
print("DER = ",der,"%")
print("Accuracy = ",100 - der,"%")

In [None]:
# These lists are sorted in the order specified by the TA. They are indexed with the
# integer label / token indices to get readable names for the CSV report.
# NOTE(review): LIST_OF_DIACRITICS has 15 names while the tag map printed above has 16
# entries, so a label index of 15 cannot be looked up here -- confirm the intended mapping.
LIST_OF_DIACRITICS = [
    "FATHA",
    "FATHATAN",
    "DAMMA",
    "DAMMATAN",
    "KASRA",
    "KASRATAN",
    "SUKUN",
    "SHADDA",
    "SHADDA_FATHA",
    "SHADDA_FATHATAN",
    "SHADDA_DAMMA",
    "SHADDA_DAMMATAN",
    "SHADDA_KASRA",
    "SHADDA_KASRATAN",
    "_"
]
# NOTE(review): presumably in the same order as the vocabulary file -- verify.
LIST_OF_ARABIC_LETTERS = ['آ' ,'ض','ف','ص','أ','ت','ق','ث','ا','ه','غ','ة','ج','ك','م','ن','ي','ب','د','س','و','ل','ؤ','ش','إ','ط','ئ','ظ','ز','ى','ء','ر','ع','ذ','ح','خ','UNK','<pad>']

In [None]:
# Prepare the data that will be written in the CSV file
filtered_diacritic_results = [] # diacritic results without paddings
filtered_gold_results = [] # diacritic gold results without paddings
filtered_inputs = [] # inputs without paddings
state = [] # This marks the incorrect results in the CSV file

_pad_token = vocab['<PAD>']
for _token, _predicted, _gold in zip(test_input_list, diacritic_results, gold_results):
  if _token == _pad_token:
    continue
  filtered_diacritic_results.append(_predicted)
  filtered_gold_results.append(_gold)
  filtered_inputs.append(_token)
  state.append("Incorrect" if _predicted != _gold else "")

# Number of rows to export: at most 100, fewer when there is less data.
index = min(100, len(filtered_inputs))


def _name_of(names, position):
  """Return names[position], or a placeholder when position is outside the table."""
  if 0 <= position < len(names):
    return names[position]
  return f"UNKNOWN_{position}"


# Human-readable versions of the first `index` rows. The safe lookup keeps the
# report from crashing on an index that has no entry in the name tables.
inputs = [_name_of(LIST_OF_ARABIC_LETTERS, i) for i in filtered_inputs[:index]]
model_prediction = [_name_of(LIST_OF_DIACRITICS, i) for i in filtered_diacritic_results[:index]]
gold_out = [_name_of(LIST_OF_DIACRITICS, i) for i in filtered_gold_results[:index]]
state = state[0:index]

print(len(inputs))
print(len(model_prediction))
print(len(gold_out))
print(len(state))

In [None]:
# Write the first `index` predictions to a CSV file.
# NOTE: this reassigns the global model_name that train() and the "Save The Model"
# cell use for checkpoint file names; re-run the model_name cell before training again.
model_name = "CBHG"
df = pd.DataFrame(
    {
     'ID': range(len(filtered_diacritic_results[0:index])),
     'label': filtered_diacritic_results[0:index],
     # The following columns are provided for illustrative purposes only and are not necessary (will be commented when submitting the CSV file).
     'input':  inputs,
     'diactric':  model_prediction,
     'gold out': gold_out,
     'state' : state
     })

# Make sure the output folder exists so to_csv does not fail on a fresh checkout.
os.makedirs('./Results', exist_ok=True)
df.to_csv(f'./Results/result_{model_name}.csv', index=False)