In [136]:
!pip install bpemb
!pip install gensim
!python -m spacy download en_core_web_sm
!pip install datasets
!pip install nltk

Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m70.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [137]:
import torch
from torch.autograd import Variable
import numpy as np
import torch.functional as F
import torch.nn.functional as F
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [138]:
from datasets import load_dataset

dataset = load_dataset("copenlu/answerable_tydiqa")
train_set = dataset["train"]
validation_set = dataset["validation"]

In [229]:
sample_rows = train_set.filter(lambda example: example['language'] == 'arabic')
train_rows = sample_rows['document_plaintext']
train_rows = train_rows[:100]

validation_rows = validation_set.filter(lambda example: example['language'] == 'arabic')
validation_rows = validation_rows['document_plaintext']
validation_rows = validation_rows[:2]

In [230]:
train_rows[0]

"\n\nالمسألة الشرقية (بالإنجليزية: Eastern Question) (بالفرنسية: Question de l'orient): هي مسألة وجود العثمانيين المسلمين في أوروبا وطردهم منها واستعادة القسطنطينية من العثمانيين بعد سقوطها في 1453 وتهديد مصالح الدول الأوروبية في هذه المنطقة. كما يدل المصطلح على تصفية أملاك رجل أوروبا المريض في البلقان من طرف الدول الأوروبية."

In [231]:
import tensorflow as tf
import re
import string
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

def custom_standardization(input_data):
    lowercase = input_data.lower()
    lowercase = re.sub('\[\d+\]', ' ', lowercase)
    lowercase = re.sub('[%s]' % re.escape(string.punctuation), '', lowercase)
    return lowercase

In [232]:
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

def split_paragraphs_into_sentences(paragraphs):
    sentences = []
    for paragraph in paragraphs:
        sentences.extend(sent_tokenize(paragraph))
    return sentences

sentences = split_paragraphs_into_sentences(train_rows)
validation_sentences = split_paragraphs_into_sentences(validation_rows)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [233]:
standardized_sentences = [custom_standardization(sentence) for sentence in sentences]
filtered_sentences = [sentence for sentence in standardized_sentences if len(sentence.split()) > 2]

standardized_validation_sentences = [custom_standardization(sentence) for sentence in validation_sentences]
filtered_validation_sentences = [sentence for sentence in standardized_validation_sentences if len(sentence.split()) > 2]

In [234]:
filtered_sentences[:2]

['\n\nالمسألة الشرقية بالإنجليزية eastern question بالفرنسية question de lorient هي مسألة وجود العثمانيين المسلمين في أوروبا وطردهم منها واستعادة القسطنطينية من العثمانيين بعد سقوطها في 1453 وتهديد مصالح الدول الأوروبية في هذه المنطقة',
 'كما يدل المصطلح على تصفية أملاك رجل أوروبا المريض في البلقان من طرف الدول الأوروبية']

In [235]:
vocab = [word for sentence in filtered_sentences for word in sentence.split()]
vocab.append('<UNK>')
unique_vocab = list(set(vocab))
word2idx = {w: idx for (idx, w) in enumerate(unique_vocab)}
idx2word = {idx: w for (idx, w) in enumerate(word2idx)}
vocabulary_size = len(unique_vocab)

In [236]:
import numpy as np
def get_idx_pairs(input_sentences):
  window_size = 2
  idx_pairs = []
  for sen in input_sentences:
      indices = [word2idx[word] for sentence in input_sentences for word in sentence.split()]
      # for each word, threated as center word
      for center_word_pos in range(len(indices)):
          # for each window position
          for w in range(-window_size, window_size + 1):
              context_word_pos = center_word_pos + w
              # make soure not jump out sentence
              if context_word_pos < 0 or context_word_pos >= len(indices) or center_word_pos == context_word_pos:
                  continue
              context_word_idx = indices[context_word_pos]
              idx_pairs.append((indices[center_word_pos], context_word_idx))

  idx_pairs = np.array(idx_pairs) # it will be useful to have this as numpy array
  return idx_pairs
#define a function for the previous task


In [237]:
def get_idx_pairs(input_sentences, word2idx):
  window_size = 2
  idx_pairs = []
  for sen in input_sentences:
      indices = [word2idx.get(word, word2idx['<UNK>']) for word in sen.split()]
      # for each word, treated as center word
      for center_word_pos in range(len(indices)):
          # for each window position
          for w in range(-window_size, window_size + 1):
              context_word_pos = center_word_pos + w
              # make sure not jump out sentence
              if context_word_pos < 0 or context_word_pos >= len(indices) or center_word_pos == context_word_pos:
                  continue
              context_word_idx = indices[context_word_pos]
              idx_pairs.append((indices[center_word_pos], context_word_idx))

  idx_pairs = np.array(idx_pairs) # it will be useful to have this as numpy array
  return idx_pairs

In [238]:
idx_pairs = get_idx_pairs(filtered_sentences, word2idx)
idx_pairs

array([[2477, 1370],
       [2477, 4297],
       [1370, 2477],
       ...,
       [ 807, 4409],
       [4409, 1336],
       [4409,  807]])

In [239]:
idx_pairs_eval = get_idx_pairs(filtered_validation_sentences, word2idx)
idx_pairs_eval[:20]

array([[2826, 2252],
       [2826, 2722],
       [2252, 2826],
       [2252, 2722],
       [2252, 2826],
       [2722, 2826],
       [2722, 2252],
       [2722, 2826],
       [2722, 2826],
       [2826, 2252],
       [2826, 2722],
       [2826, 2826],
       [2826, 3634],
       [2826, 2722],
       [2826, 2826],
       [2826, 3634],
       [2826, 2826],
       [3634, 2826],
       [3634, 2826],
       [3634, 2826]])

In [240]:
def get_input_layer(word_idx):
    x = torch.zeros(vocabulary_size).float()
    x[word_idx] = 1.0
    return x

In [241]:
embedding_dims = 5
W1 = Variable(torch.randn(embedding_dims, vocabulary_size).float(), requires_grad=True)
W2 = Variable(torch.randn(vocabulary_size, embedding_dims).float(), requires_grad=True)
num_epochs = 100
learning_rate = 0.001

In [242]:
import torch.nn as nn

class SimpleModel(nn.Module):
    def __init__(self, W1, W2):
        super(SimpleModel, self).__init__()
        self.W1 = W1
        self.W2 = W2

    def forward(self, x):
        x = x.view(-1, 1)
        z1 = torch.matmul(self.W1, x)
        z2 = torch.matmul(self.W2, z1)
        return z2

In [243]:
def train_network(idx_pairs, embedding_dims, vocabulary_size, num_epochs=100, learning_rate=0.001):
    W1 = Variable(torch.randn(embedding_dims, vocabulary_size).float(), requires_grad=True)
    W2 = Variable(torch.randn(vocabulary_size, embedding_dims).float(), requires_grad=True)

    for epo in range(num_epochs):
        loss_val = 0
        for data, target in idx_pairs:
          x = Variable(get_input_layer(data)).float()
          y_true = Variable(torch.from_numpy(np.array([target])).long())
          z1 = torch.matmul(W1, x)
          z2 = torch.matmul(W2, z1)

          log_softmax = F.log_softmax(z2, dim=0)

          loss = F.nll_loss(log_softmax.view(1,-1), y_true)
          loss_val += loss.data
          loss.backward()
          W1.data -= learning_rate * W1.grad.data
          W2.data -= learning_rate * W2.grad.data

          W1.grad.data.zero_()
          W2.grad.data.zero_()
        if epo % 5 == 0:
            print(f'Loss at epo {epo}: {loss_val/len(idx_pairs)}')
    model = SimpleModel(W1, W2)
    return model


In [244]:
import math
def compute_perplexity(model, test_data):
    idx_pairs_text_data = get_idx_pairs(test_data, word2idx)
    criterion = nn.CrossEntropyLoss()
    total_loss = 0.
    for data, target in idx_pairs_text_data:
        x = Variable(get_input_layer(data)).float()
        y_true = Variable(torch.from_numpy(np.array([target])).long())

        z1 = torch.matmul(model.W1, x)
        z2 = torch.matmul(model.W2, z1)

        log_softmax = torch.nn.functional.log_softmax(z2, dim=0)
        loss = criterion(log_softmax.view(1,-1), y_true)
        total_loss += loss.item()

    mean_loss = total_loss / len(idx_pairs_text_data)
    perplexity = math.exp(mean_loss)
    return perplexity

In [245]:
embedding_dims = 20
num_epochs = 30
learning_rate = 0.01
model = train_network(idx_pairs, embedding_dims, vocabulary_size, num_epochs, learning_rate)

Loss at epo 0: 15.058667182922363
Loss at epo 5: 10.217268943786621
Loss at epo 10: 8.54259967803955
Loss at epo 15: 7.572190761566162
Loss at epo 20: 6.9180073738098145
Loss at epo 25: 6.429164409637451


In [246]:
perplexity = compute_perplexity(model, filtered_validation_sentences)
print(perplexity)

39915.24910830672
