In [1]:
!pip install bpemb
!pip install gensim
!python -m spacy download en_core_web_sm
!pip install datasets
!pip install nltk

Collecting bpemb
  Downloading bpemb-0.3.4-py3-none-any.whl (19 kB)
Collecting sentencepiece (from bpemb)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece, bpemb
Successfully installed bpemb-0.3.4 sentencepiece-0.1.99
2023-11-02 07:51:11.741211: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-02 07:51:11.741268: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-02 07:51:11.741300: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting t

In [2]:
import torch
from torch.autograd import Variable
import numpy as np
import torch.functional as F
import torch.nn.functional as F
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
from datasets import load_dataset

# Load the "copenlu/answerable_tydiqa" dataset and keep handles to its
# "train" and "validation" splits for the per-language filtering below.
dataset = load_dataset("copenlu/answerable_tydiqa")
train_set = dataset["train"]
validation_set = dataset["validation"]

Downloading readme:   0%|          | 0.00/4.94k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/71.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.49M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/116067 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13325 [00:00<?, ? examples/s]

In [4]:
# Arabic training subset: keep the first 400 documents and the first 400 questions.
sample_rows = train_set.filter(lambda row: row['language'] == 'arabic')
train_rows = sample_rows['document_plaintext'][:400]
question_rows = sample_rows['question_text'][:400]

# Arabic validation subset: keep the first 100 documents and questions.
# `validation_rows` first names the filtered split and is then rebound to the
# truncated list of document strings, so the questions are read out first.
validation_rows = validation_set.filter(lambda row: row['language'] == 'arabic')
validation_questions_rows = validation_rows['question_text'][:100]
validation_rows = validation_rows['document_plaintext'][:100]

Filter:   0%|          | 0/116067 [00:00<?, ? examples/s]

Filter:   0%|          | 0/13325 [00:00<?, ? examples/s]

In [5]:
# Peek at the first Arabic training document (raw text, before standardization).
train_rows[0]

"\n\nالمسألة الشرقية (بالإنجليزية: Eastern Question) (بالفرنسية: Question de l'orient): هي مسألة وجود العثمانيين المسلمين في أوروبا وطردهم منها واستعادة القسطنطينية من العثمانيين بعد سقوطها في 1453 وتهديد مصالح الدول الأوروبية في هذه المنطقة. كما يدل المصطلح على تصفية أملاك رجل أوروبا المريض في البلقان من طرف الدول الأوروبية."

In [6]:
import tensorflow as tf
import re
import string
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

def custom_standardization(input_data):
    """Lowercase a string and strip bracketed numbers and ASCII punctuation.

    Args:
        input_data: The string to normalise.

    Returns:
        The lowercased string in which every ``[<digits>]`` group has been
        replaced by a single space and every character listed in
        ``string.punctuation`` has been removed. Whitespace (including
        newlines) is neither trimmed nor collapsed.
    """
    lowercase = input_data.lower()
    # Raw string literal: '\[' and '\d' are not valid string escapes, so the
    # non-raw form triggers an invalid-escape warning on current Python versions.
    lowercase = re.sub(r'\[\d+\]', ' ', lowercase)
    # string.punctuation only contains ASCII punctuation, so non-ASCII
    # punctuation marks pass through unchanged.
    lowercase = re.sub('[%s]' % re.escape(string.punctuation), '', lowercase)
    return lowercase

In [7]:
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

def split_paragraphs_into_sentences(paragraphs):
    """Flatten ``paragraphs`` into one list of sentences.

    Every paragraph is handed to ``sent_tokenize`` and the resulting sentences
    are concatenated in paragraph order.
    """
    return [
        sentence
        for paragraph in paragraphs
        for sentence in sent_tokenize(paragraph)
    ]

# Sentence-split the Arabic training and validation documents.
sentences = split_paragraphs_into_sentences(train_rows)
validation_sentences = split_paragraphs_into_sentences(validation_rows)

# Sentence-split the matching question texts.
sentences_question = split_paragraphs_into_sentences(question_rows)
validation_sentences_question = split_paragraphs_into_sentences(validation_questions_rows)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [8]:
# Normalise every sentence, then keep only those that still contain more than
# two whitespace-separated tokens.

# Training documents
standardized_sentences = list(map(custom_standardization, sentences))
filtered_sentences = [text for text in standardized_sentences if len(text.split()) > 2]

# Training questions
standardized_sentences_question = list(map(custom_standardization, sentences_question))
filtered_sentences_question = [text for text in standardized_sentences_question if len(text.split()) > 2]

# Validation documents
standardized_validation_sentences = list(map(custom_standardization, validation_sentences))
filtered_validation_sentences = [text for text in standardized_validation_sentences if len(text.split()) > 2]

# Validation questions
standardized_validation_sentences_question = list(map(custom_standardization, validation_sentences_question))
filtered_validation_sentences_question = [text for text in standardized_validation_sentences_question if len(text.split()) > 2]


In [9]:
# Show the first two standardized, length-filtered training sentences.
filtered_sentences[:2]

['\n\nالمسألة الشرقية بالإنجليزية eastern question بالفرنسية question de lorient هي مسألة وجود العثمانيين المسلمين في أوروبا وطردهم منها واستعادة القسطنطينية من العثمانيين بعد سقوطها في 1453 وتهديد مصالح الدول الأوروبية في هذه المنطقة',
 'كما يدل المصطلح على تصفية أملاك رجل أوروبا المريض في البلقان من طرف الدول الأوروبية']

In [10]:
# Document vocabulary: every whitespace token of the filtered training
# sentences plus the '<UNK>' placeholder used for out-of-vocabulary words.
vocab = [token for line in filtered_sentences for token in line.split()] + ['<UNK>']
unique_vocab = list(set(vocab))
# Token -> index and index -> token lookups over the de-duplicated vocabulary.
word2idx = dict(zip(unique_vocab, range(len(unique_vocab))))
idx2word = dict(enumerate(word2idx))
vocabulary_size = len(unique_vocab)

In [11]:
# Question vocabulary, built the same way as the document vocabulary but from
# the filtered training questions.
question_vocab = [token for line in filtered_sentences_question for token in line.split()] + ['<UNK>']
unique_question_vocab = list(set(question_vocab))
word2idx_question = dict(zip(unique_question_vocab, range(len(unique_question_vocab))))
idx2word_question = dict(enumerate(word2idx_question))
vocabulary_size_question = len(unique_question_vocab)

In [12]:
def get_idx_pairs(input_sentences, word2idx, window_size=2):
    """Build (center, context) index pairs for skip-gram style training.

    Args:
        input_sentences: Iterable of strings; each one is tokenised with
            ``str.split()``.
        word2idx: Mapping from token to integer index. It must contain the key
            ``'<UNK>'``, whose index is used for tokens missing from the mapping.
        window_size: Number of positions to the left and to the right of the
            center word that count as context. Defaults to 2, the value that
            was previously hard-coded.

    Returns:
        An integer numpy array of shape ``(n_pairs, 2)`` whose rows are
        ``(center_index, context_index)``. The shape is ``(0, 2)`` when no
        pair can be formed, so callers can always unpack rows.
    """
    idx_pairs = []
    for sen in input_sentences:
        indices = [word2idx.get(word, word2idx['<UNK>']) for word in sen.split()]
        for center_word_pos, center_word_idx in enumerate(indices):
            # Clamp the window to the sentence so it never runs off either end.
            first = max(0, center_word_pos - window_size)
            last = min(len(indices), center_word_pos + window_size + 1)
            for context_word_pos in range(first, last):
                # A word is not its own context.
                if context_word_pos == center_word_pos:
                    continue
                idx_pairs.append((center_word_idx, indices[context_word_pos]))

    # Explicit dtype + reshape keep the result a 2-D integer array even when
    # idx_pairs is empty (np.array([]) alone would be float64 of shape (0,)).
    return np.array(idx_pairs, dtype=np.int64).reshape(-1, 2)

In [13]:
# (center, context) index pairs for the filtered training documents.
idx_pairs = get_idx_pairs(filtered_sentences, word2idx)
idx_pairs  # not the last expression of the cell, so nothing is displayed

# Same for the filtered training questions, using the question vocabulary.
idx_pairs_question = get_idx_pairs(filtered_sentences_question, word2idx_question)


In [14]:
# Pairs for the validation documents, indexed with the training vocabulary;
# get_idx_pairs maps words that are missing from word2idx to '<UNK>'.
idx_pairs_eval = get_idx_pairs(filtered_validation_sentences, word2idx)
idx_pairs_eval[:20]  # not the last expression of the cell, so nothing is displayed

# Pairs for the validation questions, indexed with the question vocabulary.
idx_pairs_eval_question = get_idx_pairs(filtered_validation_sentences_question, word2idx_question)

In [15]:
def get_input_layer(word_idx, vocab_size=None):
    """Return a one-hot float vector with 1.0 at position ``word_idx``.

    Args:
        word_idx: Position to set to 1.0.
        vocab_size: Length of the returned vector. When omitted, the
            notebook-level ``vocabulary_size`` in effect at call time is used,
            which is the previous behaviour.

    Returns:
        A 1-D ``torch.float32`` tensor of zeros with a single 1.0.
    """
    if vocab_size is None:
        # Backward-compatible fallback. Note that the global is reassigned by
        # each language section, so passing the size explicitly is safer.
        vocab_size = vocabulary_size
    x = torch.zeros(vocab_size).float()
    x[word_idx] = 1.0
    return x

In [16]:
# Notebook-level defaults: embedding width, randomly initialised weight
# matrices of shape (embedding_dims, vocabulary_size) and its transpose shape,
# epoch count and learning rate.
# NOTE(review): train_network() allocates its own local W1/W2, so these two
# tensors are not the ones it trains — confirm whether they are still needed.
embedding_dims = 5
W1 = Variable(torch.randn(embedding_dims, vocabulary_size).float(), requires_grad=True)
W2 = Variable(torch.randn(vocabulary_size, embedding_dims).float(), requires_grad=True)
num_epochs = 100
learning_rate = 0.001

In [17]:
import torch.nn as nn

class SimpleModel(nn.Module):
    """Two-matrix linear model computing ``W2 @ (W1 @ x)``.

    The matrices are stored on the instance exactly as they are passed in.
    """

    def __init__(self, W1, W2):
        super().__init__()
        self.W1, self.W2 = W1, W2

    def forward(self, x):
        """Reshape ``x`` into a column vector and project it through W1, then W2."""
        column = x.view(-1, 1)
        hidden = torch.matmul(self.W1, column)
        return torch.matmul(self.W2, hidden)

In [26]:
def train_network(idx_pairs, embedding_dims, vocabulary_size, num_epochs=100, learning_rate=0.001):
    """Train a two-matrix skip-gram style model with per-sample gradient descent.

    Args:
        idx_pairs: Iterable of ``(center_index, context_index)`` pairs, e.g.
            the array returned by ``get_idx_pairs``.
        embedding_dims: Width of the hidden (embedding) layer.
        vocabulary_size: Number of distinct indices; fixes the shapes of both
            weight matrices. Every index in ``idx_pairs`` must be smaller.
        num_epochs: Number of full passes over ``idx_pairs``.
        learning_rate: Step size of the plain gradient-descent update.

    Returns:
        A ``SimpleModel`` wrapping the trained ``W1`` (embedding_dims x
        vocabulary_size) and ``W2`` (vocabulary_size x embedding_dims).

    Side effects:
        Prints the summed loss of each epoch.
    """
    W1 = torch.randn(embedding_dims, vocabulary_size, dtype=torch.float32, requires_grad=True)
    W2 = torch.randn(vocabulary_size, embedding_dims, dtype=torch.float32, requires_grad=True)

    for epo in range(num_epochs):
        # Accumulate as a Python float rather than a float32 tensor.
        loss_val = 0.0
        for data, target in idx_pairs:
            y_true = torch.tensor([int(target)], dtype=torch.long)

            # Selecting column `data` equals W1 @ one_hot(data) but needs no
            # one-hot vector, so the result depends only on this function's
            # own `vocabulary_size` argument and not on notebook globals.
            z1 = W1[:, int(data)]
            z2 = torch.matmul(W2, z1)

            log_softmax = F.log_softmax(z2, dim=0)
            loss = F.nll_loss(log_softmax.view(1, -1), y_true)
            loss_val += loss.item()
            loss.backward()

            # Manual SGD step; no_grad keeps the in-place update out of autograd.
            with torch.no_grad():
                W1.sub_(learning_rate * W1.grad)
                W2.sub_(learning_rate * W2.grad)
            W1.grad.zero_()
            W2.grad.zero_()

        print(f'Loss at epo {epo}: {loss_val}')
    return SimpleModel(W1, W2)


In [19]:
import math
def compute_perplexity(model, test_data):
    """Compute exp(mean cross-entropy) of ``model`` over the pairs in ``test_data``.

    Args:
        model: Object exposing weight matrices ``W1`` and ``W2`` (such as the
            ``SimpleModel`` returned by ``train_network``).
        test_data: Iterable of standardized sentences; converted to
            (center, context) index pairs with ``get_idx_pairs``.

    Returns:
        ``math.exp`` of the average per-pair cross-entropy loss.

    Raises:
        ValueError: If ``test_data`` yields no (center, context) pair.

    Note:
        The pairs are indexed with the notebook-level ``word2idx`` and the
        one-hot inputs are sized by ``get_input_layer``; both reflect whichever
        vocabulary is current when this function is called, so it must be the
        one ``model`` was trained with.
    """
    idx_pairs_text_data = get_idx_pairs(test_data, word2idx)
    if len(idx_pairs_text_data) == 0:
        raise ValueError("test_data produced no (center, context) pairs; perplexity is undefined")

    total_loss = 0.
    # Evaluation only: no gradients are needed.
    with torch.no_grad():
        for data, target in idx_pairs_text_data:
            x = get_input_layer(data)
            y_true = torch.tensor([int(target)], dtype=torch.long)

            z1 = torch.matmul(model.W1, x)
            z2 = torch.matmul(model.W2, z1)

            # cross_entropy applies log-softmax itself, so it takes the raw scores.
            loss = F.cross_entropy(z2.view(1, -1), y_true)
            total_loss += loss.item()

    mean_loss = total_loss / len(idx_pairs_text_data)
    return math.exp(mean_loss)

In [33]:
# Hyper-parameters for this training run (override the earlier defaults).
embedding_dims = 5
num_epochs = 3
learning_rate = 0.01
# idx_pairs and vocabulary_size are notebook globals that later sections
# reassign; in top-to-bottom order they hold the Arabic document data here.
model_document = train_network(idx_pairs, embedding_dims, vocabulary_size, num_epochs, learning_rate)


Loss at epo 0: 1464274.875
Loss at epo 1: 1383878.0
Loss at epo 2: 1338681.625


In [34]:
# Evaluate the trained document model on the filtered validation sentences.
perplexity = compute_perplexity(model_document, filtered_validation_sentences)
print(perplexity)


14706.011276997506


# Now train on Bengali


In [36]:
# Filter each split once and read both columns from the same result; the
# previous version ran an identical filter twice per split.
bengali_train_subset = train_set.filter(lambda example: example['language'] == 'bengali')
bengali_train_rows = bengali_train_subset['document_plaintext'][:400]
bengali_train_row_question = bengali_train_subset['question_text'][:400]

bengali_validation_subset = validation_set.filter(lambda example: example['language'] == 'bengali')
bengali_validation_rows = bengali_validation_subset['document_plaintext'][:100]
bengali_validation_rows_question = bengali_validation_subset['question_text'][:100]


Filter:   0%|          | 0/13325 [00:00<?, ? examples/s]

In [37]:
# Sentence-split the Bengali documents and questions.
bengali_sentences = split_paragraphs_into_sentences(bengali_train_rows)
bengali_validation_sentences = split_paragraphs_into_sentences(bengali_validation_rows)

bengali_question_sentences = split_paragraphs_into_sentences(bengali_train_row_question)
bengali_validation_question_sentences = split_paragraphs_into_sentences(bengali_validation_rows_question)

# Normalise every sentence, then keep only those with more than two
# whitespace-separated tokens.
bengali_standardized_sentences = list(map(custom_standardization, bengali_sentences))
bengali_filtered_sentences = [text for text in bengali_standardized_sentences if len(text.split()) > 2]

bengali_standardized_sentences_question = list(map(custom_standardization, bengali_question_sentences))
bengali_filtered_sentences_question = [text for text in bengali_standardized_sentences_question if len(text.split()) > 2]

bengali_standardized_validation_sentences = list(map(custom_standardization, bengali_validation_sentences))
bengali_filtered_validation_sentences = [text for text in bengali_standardized_validation_sentences if len(text.split()) > 2]

bengali_standardized_validation_sentences_question = list(map(custom_standardization, bengali_validation_question_sentences))
bengali_filtered_validation_sentences_question = [text for text in bengali_standardized_validation_sentences_question if len(text.split()) > 2]

In [None]:
# Show the first five standardized, length-filtered Bengali training sentences.
bengali_filtered_sentences[:5]

In [38]:
# Bengali document vocabulary (tokens of the filtered training sentences plus
# '<UNK>'). This rebinds the notebook-level word2idx / idx2word /
# vocabulary_size used by get_input_layer and compute_perplexity.
bengali_vocab = [token for line in bengali_filtered_sentences for token in line.split()] + ['<UNK>']
bengali_unique_vocab = list(set(bengali_vocab))
word2idx = dict(zip(bengali_unique_vocab, range(len(bengali_unique_vocab))))
idx2word = dict(enumerate(word2idx))
vocabulary_size = len(bengali_unique_vocab)

In [39]:
# Bengali question vocabulary; rebinds the notebook-level question lookups.
bengali_vocab_question = [token for line in bengali_filtered_sentences_question for token in line.split()] + ['<UNK>']
bengali_unique_vocab_question = list(set(bengali_vocab_question))
word2idx_question = dict(zip(bengali_unique_vocab_question, range(len(bengali_unique_vocab_question))))
idx2word_question = dict(enumerate(word2idx_question))
vocabulary_size_question = len(bengali_unique_vocab_question)


In [40]:
# (center, context) pairs for the Bengali training documents; rebinds idx_pairs.
idx_pairs = get_idx_pairs(bengali_filtered_sentences, word2idx)
idx_pairs[:10]  # not the last expression of the cell, so nothing is displayed

# Pairs for the Bengali training questions, using the question vocabulary.
idx_pairs_question = get_idx_pairs(bengali_filtered_sentences_question, word2idx_question)

# Validation pairs, indexed with the training vocabularies.
# NOTE(review): the Arabic section calls the equivalent document variable
# idx_pairs_eval, not idx_pairs_evaluation.
idx_pairs_evaluation = get_idx_pairs(bengali_filtered_validation_sentences, word2idx)
idx_pairs_eval_question = get_idx_pairs(bengali_filtered_validation_sentences_question, word2idx_question)

In [41]:
# Hyper-parameters for the Bengali document run.
embedding_dims = 3
num_epochs = 5
learning_rate = 0.01
# Rebinds model_document, replacing any model previously stored under that name.
model_document = train_network(idx_pairs, embedding_dims, vocabulary_size, num_epochs, learning_rate)

Loss at epo 0: 1376317.625
Loss at epo 1: 1326118.25
Loss at epo 2: 1303853.375
Loss at epo 3: 1286253.125
Loss at epo 4: 1265331.125


RuntimeError: ignored

In [42]:
# Evaluate the Bengali document model on the filtered Bengali validation sentences.
perplexity = compute_perplexity(model_document, bengali_filtered_validation_sentences)
print(perplexity)

24240.872142106164


# Indonesian


In [43]:
# Filter each split once and read both columns from the same result; the
# previous version ran an identical filter twice per split.
indonesian_train_subset = train_set.filter(lambda example: example['language'] == 'indonesian')
indonesian_train_rows = indonesian_train_subset['document_plaintext'][:400]
indonesian_train_rows_question = indonesian_train_subset['question_text'][:400]

indonesian_validation_subset = validation_set.filter(lambda example: example['language'] == 'indonesian')
indonesian_validation_rows = indonesian_validation_subset['document_plaintext'][:100]
indonesian_validation_rows_question = indonesian_validation_subset['question_text'][:100]

Filter:   0%|          | 0/116067 [00:00<?, ? examples/s]

Filter:   0%|          | 0/13325 [00:00<?, ? examples/s]

In [44]:
# Sentence-split the Indonesian documents and questions.
indonesian_sentences = split_paragraphs_into_sentences(indonesian_train_rows)
indonesian_validation_sentences = split_paragraphs_into_sentences(indonesian_validation_rows)

indonesian_sentences_question = split_paragraphs_into_sentences(indonesian_train_rows_question)
indonesian_validation_sentences_question = split_paragraphs_into_sentences(indonesian_validation_rows_question)

# Normalise every sentence, then keep only those with more than two
# whitespace-separated tokens.
indonesian_standardized_sentences = list(map(custom_standardization, indonesian_sentences))
indonesian_filtered_sentences = [text for text in indonesian_standardized_sentences if len(text.split()) > 2]

indonesian_standardized_sentences_question = list(map(custom_standardization, indonesian_sentences_question))
indonesian_filtered_sentences_question = [text for text in indonesian_standardized_sentences_question if len(text.split()) > 2]

indonesian_standardized_validation_sentences = list(map(custom_standardization, indonesian_validation_sentences))
indonesian_filtered_validation_sentences = [text for text in indonesian_standardized_validation_sentences if len(text.split()) > 2]

indonesian_standardized_validation_sentences_question = list(map(custom_standardization, indonesian_validation_sentences_question))
indonesian_filtered_validation_sentences_question = [text for text in indonesian_standardized_validation_sentences_question if len(text.split()) > 2]

In [45]:
# Shows the standardized sentences *before* the length filter, so entries with
# two or fewer tokens can still appear here; indonesian_filtered_sentences is
# the list with those removed.
indonesian_standardized_sentences[:5]

['ernest douwes dekker wafat dini hari tanggal 28 agustus 1950 tertulis di batu nisannya 29 agustus 1950 versi van der veur 2006 dan dimakamkan di tmp cikutra bandung',
 'pada tanggal 18 februari 2008 desain yoo kerl dari iarc terpilih dalam kompetisi desain pemerintah metropolitan seoul',
 '  bagian atas depan bangunan baru ini dirancang untuk menonjol dalam bentuk lengkung',
 'perancang yoo merangkul tiga kata kunci yakni tradisi warga negara dan masa depan dalam rancangannya yang mencerminkan elemen horisontal dari arsitektur tradisional tingkat rendah korea dan penafsiran ulang dari nuansa mendalam dan lekukan atap atap',
 ' ']

In [46]:
# Indonesian document vocabulary. It has to be built from the Indonesian
# training sentences: get_idx_pairs indexes indonesian_filtered_sentences with
# this word2idx, and any word missing from it collapses to '<UNK>'.
indonesian_vocab = [word for sentence in indonesian_filtered_sentences for word in sentence.split()]
indonesian_vocab.append('<UNK>')
indonesian_unique_vocab = list(set(indonesian_vocab))
# Rebinds the notebook-level lookups used by get_input_layer and compute_perplexity.
word2idx = {w: idx for (idx, w) in enumerate(indonesian_unique_vocab)}
idx2word = {idx: w for (idx, w) in enumerate(word2idx)}
vocabulary_size = len(indonesian_unique_vocab)

In [47]:
# Indonesian question vocabulary; rebinds the notebook-level question lookups.
indonesian_vocab_question = [token for line in indonesian_filtered_sentences_question for token in line.split()] + ['<UNK>']
indonesian_unique_vocab_question = list(set(indonesian_vocab_question))
word2idx_question = dict(zip(indonesian_unique_vocab_question, range(len(indonesian_unique_vocab_question))))
idx2word_question = dict(enumerate(word2idx_question))
vocabulary_size_question = len(indonesian_unique_vocab_question)


In [48]:
# (center, context) pairs for the Indonesian training documents; rebinds idx_pairs.
idx_pairs = get_idx_pairs(indonesian_filtered_sentences, word2idx)
idx_pairs[:10]  # not the last expression of the cell, so nothing is displayed

# Pairs for the Indonesian training questions, using the question vocabulary.
idx_pairs_question = get_idx_pairs(indonesian_filtered_sentences_question, word2idx_question)

In [None]:
# Hyper-parameters for the Indonesian document run.
embedding_dims = 5
num_epochs = 3
learning_rate = 0.01
# Stored as model_training (the earlier sections use the name model_document).
model_training = train_network(idx_pairs, embedding_dims, vocabulary_size, num_epochs, learning_rate)


In [None]:
# Evaluate the Indonesian document model on the filtered Indonesian validation sentences.
perplexity = compute_perplexity(model_training, indonesian_filtered_validation_sentences)
print(perplexity)