In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

arabic_df = pd.read_feather('./arabic_train_set.feather')

[nltk_data] Downloading package stopwords to /home/duy/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/duy/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
import re
# as per recommendation from @freylis, compile once only
CLEANR = re.compile('<.*?>')

def cleanhtml(raw_html):
  cleantext = re.sub(CLEANR, '', raw_html)
  return cleantext

In [3]:
def break_into_sentences(paragraph):
    sentences = list()
    temp_sentence = list()
    flag = False
    for ch in paragraph.strip():
        if ch in [u'؟', u'!', u'.', u':', u'؛']:
            flag = True
        elif flag:
            sentences.append(''.join(temp_sentence).strip())
            temp_sentence = []
            flag = False

        temp_sentence.append(ch)

    else:
        sentences.append(''.join(temp_sentence).strip())
        return sentences

In [4]:
def remove_ref(sentence):
  result = re.sub("(\[\d\])", "", sentence)
  return result

In [5]:
def clean_arabic(l_arabic):
  l_cleaned_arabic = []
  for p in l_arabic:
    ss = break_into_sentences(remove_ref(cleanhtml(p)))
    for s in ss:
      l_cleaned_arabic.append(s)
  return l_cleaned_arabic

In [7]:
from datasets import load_dataset
dataset = load_dataset("copenlu/answerable_tydiqa")
valid_set = dataset["validation"].filter(lambda example, idx: example['language'] == 'arabic', with_indices=True)


Downloading readme: 100%|██████████████████████████████████████| 4.94k/4.94k [00:00<00:00, 16.5MB/s]
Downloading metadata: 100%|████████████████████████████████████| 2.47k/2.47k [00:00<00:00, 7.11MB/s]
Downloading data files:   0%|                                                 | 0/2 [00:00<?, ?it/s]
Downloading data:   0%|                                                 | 0.00/71.6M [00:00<?, ?B/s][A
Downloading data:   6%|██▎                                     | 4.19M/71.6M [00:01<00:24, 2.70MB/s][A
Downloading data:  18%|███████                                 | 12.6M/71.6M [00:02<00:12, 4.73MB/s][A
Downloading data:  29%|███████████▋                            | 21.0M/71.6M [00:04<00:09, 5.12MB/s][A
Downloading data:  41%|████████████████▍                       | 29.4M/71.6M [00:05<00:07, 6.03MB/s][A
Downloading data:  53%|█████████████████████                   | 37.7M/71.6M [00:06<00:05, 6.63MB/s][A
Downloading data:  64%|█████████████████████████▊              | 46.1M/71

In [11]:
l_cleaned_arabic_train = clean_arabic(list(arabic_df['document_plaintext']))
l_cleaned_arabic_val = clean_arabic(list(valid_set['document_plaintext']))

In [12]:
import torch
import os
from collections import defaultdict

UNK = '<unk>'

class Dictionary(object):
    def __init__(self):
        self.idx2word = {}
        self.word2idx = defaultdict(lambda: 0)
        self.idx = 0
        self.idx2word[0] = UNK
        self.word2idx[UNK] = 0
        self.idx = 1

    def add_word(self, word):
        if not word in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __len__(self):
        return len(self.word2idx)


class Corpus(object):
    def __init__(self):
        self.dictionary = Dictionary()

    def get_data(self, l_arabic, batch_size=20):
        # Add words to the dictionary
        # with open(path, 'r') as f:
        #     tokens = 0
        #     for line in f:
        #         words = line.split() + ['<eos>']
        #         tokens += len(words)
        #         for word in words:
        #             self.dictionary.add_word(word)

        tokens = 0
        for l in l_arabic:
          words = word_tokenize(l) + ['<eos>']
          tokens += len(words)
          for word in words:
            self.dictionary.add_word(word)

        # Tokenize the file content
        ids = torch.LongTensor(tokens)
        token = 0
        # with open(path, 'r') as f:
        #     for line in f:
        #         words = line.split() + ['<eos>']
        #         for word in words:
        #             ids[token] = self.dictionary.word2idx[word]
        #             token += 1
        for l in l_arabic:
          words = word_tokenize(l) + ['<eos>']
          for word in words:
            ids[token] = self.dictionary.word2idx[word]
            token += 1

        num_batches = ids.size(0) // batch_size
        ids = ids[:num_batches*batch_size]
        return ids.view(batch_size, -1)

In [13]:
import torch
import torch.nn as nn
import numpy as np
from torch.nn.utils import clip_grad_norm_

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')

# Hyper-parameters
embed_size = 128
hidden_size = 1024
num_layers = 1
num_epochs = 5
num_samples = 200     # number of words to be sampled
batch_size = 45
seq_length = 30
learning_rate = 2e-4

device

device(type='cuda')

In [14]:
corpus = Corpus()
ids = corpus.get_data(l_cleaned_arabic_train, batch_size)
vocab_size = len(corpus.dictionary)
num_batches = ids.size(1) // seq_length

test_corpus = Corpus()
test_ids = test_corpus.get_data(l_cleaned_arabic_val, batch_size)
test_vocab_size =  len(test_corpus.dictionary)
test_num_batches = test_ids.size(1) // seq_length

print(num_batches)
# print(test_num_batches)

2014


In [15]:
class RNNLM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNLM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, h):
        # Embed word ids to vectors
        x = self.embed(x)

        # Forward propagate LSTM
        out, (h, c) = self.lstm(x, h)

        # Reshape output to (batch_size*sequence_length, hidden_size)
        out = out.reshape(out.size(0)*out.size(1), out.size(2))

        # Decode hidden states of all time steps
        out = self.linear(out)
        return out, (h, c)

model = RNNLM(vocab_size, embed_size, hidden_size, num_layers).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [18]:
def detach(states):
    return [state.detach() for state in states]

In [None]:
# Train the model
for epoch in range(num_epochs):
    # Set initial hidden and cell states
    states = (torch.zeros(num_layers, batch_size, hidden_size).to(device),
              torch.zeros(num_layers, batch_size, hidden_size).to(device))

    for i in range(0, ids.size(1) - seq_length, seq_length):
        # Get mini-batch inputs and targets
        inputs = ids[:, i:i+seq_length].to(device)
        targets = ids[:, (i+1):(i+1)+seq_length].to(device)
        # print('input', inputs, 'output', targets)


        # Forward pass
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        states = detach(states)
        # print(inputs.shape)
        # print(states[0].shape)
        # print(targets.shape)
        outputs, states = model(inputs, states)
        loss = criterion(outputs, targets.reshape(-1))

        # Backward and optimize
        model.zero_grad()
        loss.backward()
        clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        step = (i+1) // seq_length
        if step % 500 == 0:
            print ('Epoch [{}/{}], Step[{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'
                   .format(epoch+1, num_epochs, step, num_batches, loss.item(), np.exp(loss.item())))

Epoch [1/5], Step[0/2014], Loss: 12.3197, Perplexity: 224057.99
Epoch [1/5], Step[500/2014], Loss: 8.0985, Perplexity: 3289.48
Epoch [1/5], Step[1000/2014], Loss: 8.0992, Perplexity: 3291.81
Epoch [1/5], Step[1500/2014], Loss: 7.7678, Perplexity: 2363.30
Epoch [1/5], Step[2000/2014], Loss: 7.8666, Perplexity: 2608.75
Epoch [2/5], Step[0/2014], Loss: 7.7353, Perplexity: 2287.74
Epoch [2/5], Step[500/2014], Loss: 6.6503, Perplexity: 773.01
Epoch [2/5], Step[1000/2014], Loss: 6.9875, Perplexity: 1082.96
Epoch [2/5], Step[1500/2014], Loss: 6.6587, Perplexity: 779.52
Epoch [2/5], Step[2000/2014], Loss: 6.8869, Perplexity: 979.41
Epoch [3/5], Step[0/2014], Loss: 6.8098, Perplexity: 906.67
Epoch [3/5], Step[500/2014], Loss: 5.6924, Perplexity: 296.61
Epoch [3/5], Step[1000/2014], Loss: 6.1823, Perplexity: 484.11
Epoch [3/5], Step[1500/2014], Loss: 5.8611, Perplexity: 351.09
Epoch [3/5], Step[2000/2014], Loss: 6.0567, Perplexity: 426.97
Epoch [4/5], Step[0/2014], Loss: 6.0479, Perplexity: 423.

In [None]:
torch.save(model.state_dict(), './arabic_document.pt')

In [16]:
model.load_state_dict(torch.load('./arabic_document.pt'))

<All keys matched successfully>

In [19]:
import math

# Test the model
states = (torch.zeros(num_layers, batch_size, hidden_size).to(device),
              torch.zeros(num_layers, batch_size, hidden_size).to(device))
test_loss = 0.
with torch.no_grad():
    for i in range(0, test_ids.size(1) - seq_length, seq_length):
        # Get mini-batch inputs and targets
        inputs = test_ids[:, i:i+seq_length].to(device)
        targets = test_ids[:, (i+1):(i+1)+seq_length].to(device)
        
        # Forward pass
        states = detach(states)
        outputs, states = model(inputs, states)
        test_loss += criterion(outputs, targets.reshape(-1)).item()

test_loss = test_loss / test_num_batches
print('-' * 89)
print('test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('-' * 89)


# Generate texts using trained model
# with torch.no_grad():
#     with open('sample.txt', 'w') as f:
#         # Set intial hidden ane cell states
#         state = (torch.zeros(num_layers, 1, hidden_size).to(device),
#                  torch.zeros(num_layers, 1, hidden_size).to(device))

#         # Select one word id randomly
#         prob = torch.ones(vocab_size)
#         input = torch.multinomial(prob, num_samples=1).unsqueeze(1).to(device)

#         for i in range(num_samples):
#             # Forward propagate RNN 
#             output, state = model(input, state)

#             # Sample a word id
#             prob = output.exp()
#             word_id = torch.multinomial(prob, num_samples=1).item()

#             # Fill input with sampled word id for the next time step
#             input.fill_(word_id)

#             # File write
#             word = corpus.dictionary.idx2word[word_id]
#             word = '\n' if word == '<eos>' else word + ' '
#             f.write(word)

#             if (i+1) % 100 == 0:
#                 print('Sampled [{}/{}] words and save to {}'.format(i+1, num_samples, 'sample.txt'))

-----------------------------------------------------------------------------------------
test loss 12.04 | test ppl 168735.00
-----------------------------------------------------------------------------------------
