In [76]:
import pandas as pd
import numpy as np 
from torch import nn
import torch
from torchtext import data
from torch.nn  import functional as F
import torch.optim as  optim 
# Read the tab-separated sentence pairs; the column names are supplied
# explicitly through `names`.
df = pd.read_csv(
    "data/arabic_english.txt",
    delimiter="\t",
    names=["eng", "ar"],
)
df

Unnamed: 0,eng,ar
0,Hi.,مرحبًا.
1,Run!,اركض!
2,Help!,النجدة!
3,Jump!,اقفز!
4,Stop!,قف!
...,...,...
24633,rising voices promoting a more linguistically ...,شاركنا تحدي ابداع ميم بلغتك الام تعزيزا للتنوع...
24634,following last year s successful campaign we i...,استكمالا لنجاح حملة العام السابق ندعوكم للمشار...
24635,during last year s challenge we also met langu...,تعرفنا خلال تحدي العام الماضي على ابطال لغويين...
24636,to take part just follow the simple steps outl...,للمشاركة في التحدي اتبع الخطوات الموضحة على ال...


In [2]:
# Download the `en_core_web_sm` spaCy package; this is the name later passed
# to `spacy.load(...)` when the English tokenizer is created.
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl (13.7 MB)

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


# Tokenizer

In [61]:
import math
import pandas as pd
import math
import pandas as pd
import torchtext
import torch
import torch.nn as nn
import random
import re
import spacy
from torchtext import data
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy.lang.ar import Arabic
from nltk.translate.bleu_score import sentence_bleu
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
from torch import Tensor


# Seed both Python's and PyTorch's RNGs. Seeding only `random` leaves weight
# initialisation and dropout unseeded, so a fresh kernel would not reproduce
# the same run.
random.seed(0)
torch.manual_seed(0)

# Tab-separated sentence pairs; column names are supplied explicitly.
df = pd.read_csv("data/arabic_english.txt",delimiter="\t",names=["eng","ar"])

# Prerequisite (run once):
#   python -m spacy download en_core_web_sm
spacy_eng = spacy.load("en_core_web_sm")

# Arabic side: a plain spaCy Tokenizer built on the vocab of the `Arabic`
# language class.
arab = Arabic()
ar_Tokenizer = Tokenizer(arab.vocab)

def engTokenizer(text):
    """Split an English string into a list of token strings.

    Runs the tokenizer of the loaded spaCy pipeline ``spacy_eng`` over
    ``text`` and collects the ``.text`` of each token, in order.
    """
    tokens = []
    for token in spacy_eng.tokenizer(text):
        tokens.append(token.text)
    return tokens

def arTokenizer(sentence):
    """Split an Arabic sentence into a list of token strings.

    Periods, single and double quotes, newlines and ``+`` characters are
    replaced by spaces, runs of whitespace are collapsed to a single space,
    the ends are stripped, and the cleaned text is passed to ``ar_Tokenizer``.
    """
    # Step 1: blank out the punctuation characters listed in the class.
    without_punct = re.sub(r"[\.\'\"\n+]", " ", sentence)
    # Step 2: normalise whitespace so no empty tokens are produced by gaps.
    normalized = re.sub(r"\s+", " ", without_punct).strip()
    return [token.text for token in ar_Tokenizer(normalized)]

# English (source) field: tokenized with engTokenizer and wrapped in
# <sos>/<eos> markers.
SRC = data.Field(
    tokenize=engTokenizer,
    batch_first=False,
    init_token="<sos>",
    eos_token="<eos>",
)

# Arabic (target) field: tokenized with arTokenizer; the start/end markers
# are the Arabic strings given below.
TARGET = data.Field(
    tokenize=arTokenizer,
    batch_first=False,
    tokenizer_language="ar",
    init_token="ببدأ",
    eos_token="نهها",
)

class TextDataset(data.Dataset):
    """torchtext dataset built from a DataFrame of English/Arabic sentence pairs.

    Args:
        df: DataFrame providing an ``eng`` and an ``ar`` column.
        src_field: Field applied to the ``eng`` column.
        target_field: Field applied to the ``ar`` column.
        is_test: Accepted for backward compatibility; it is not used.
        **kwargs: Forwarded unchanged to ``data.Dataset.__init__``.
    """

    def __init__(self, df, src_field, target_field, is_test=False, **kwargs):
        fields = [('eng', src_field), ('ar', target_field)]
        # Walk the two columns in lockstep instead of iterrows(), which builds
        # a Series object for every row.
        samples = [
            data.Example.fromlist([eng, ar], fields)
            for eng, ar in zip(df["eng"], df["ar"])
        ]

        super().__init__(samples, fields, **kwargs)

    def __len__(self):
        """Return the number of examples held by the dataset."""
        # The base class keeps the list under `self.examples`; this class
        # never assigns a `self.samples` attribute.
        return len(self.examples)

    def __getitem__(self, idx):
        """Return the example stored at position ``idx``."""
        return self.examples[idx]

torchdataset = TextDataset(df,SRC,TARGET)

# `random.seed(0)` returns None, so using its return value as `random_state`
# only worked through the seeding side effect. Seed first, then hand the
# resulting RNG state to `split` explicitly.
random.seed(0)
train_data, valid_data = torchdataset.split(split_ratio=0.8, random_state=random.getstate())

# Both vocabularies are built from the training split only, with min_freq=2.
SRC.build_vocab(train_data,min_freq=2)
TARGET.build_vocab(train_data,min_freq=2)

# Quick sanity check: the 50 most frequent Arabic tokens and their counts.
print(TARGET.vocab.freqs.most_common(50))




[('في', 10114), ('من', 8806), ('على', 5213), ('ان', 2562), ('عن', 2299), ('العالمية', 2260), ('الاصوات', 2191), ('الى', 1867), ('لا', 1599), ('هذا', 1572), ('ما', 1385), ('التي', 1384), ('هذه', 1157), ('مع', 1077), ('الذي', 847), ('أن', 824), ('ذلك', 808), ('كان', 799), ('لم', 797), ('او', 797), ('الانترنت', 785), ('توم', 733), ('هل', 715), ('و', 715), ('كل', 671), ('بعد', 668), ('هو', 616), ('قبل', 580), ('تم', 572), ('موقع', 562), ('حول', 552), ('عام', 505), ('العالم', 496), ('حيث', 488), ('كما', 485), ('بين', 463), ('اكثر', 448), ('المدون', 447), ('قد', 445), ('غير', 441), ('خلال', 432), ('أنا', 432), ('إلى', 428), ('يوم', 427), ('هناك', 426), ('كانت', 424), ('بعض', 420), ('ايضا', 419), ('هي', 419), ('اي', 418)]


In [63]:
class Transformer(nn.Module):
    """Sequence-to-sequence model: token + learned position embeddings around ``nn.Transformer``.

    ``forward`` takes index tensors whose first dimension is the sequence
    position and whose second dimension is the batch, and returns one score
    per target-vocabulary entry for every target position.

    Args:
        embedding_size: Width of the embeddings and of the transformer model.
        src_vocab_size: Number of rows in the source token embedding.
        trg_vocab_size: Number of rows in the target token embedding and
            size of the output projection.
        src_pad_idx: Source index treated as padding when masking.
        num_heads: Number of attention heads.
        num_encoder_layers: Number of encoder layers.
        num_decoder_layers: Number of decoder layers.
        forward_expansion: Passed as the fifth positional argument of
            ``nn.Transformer`` (see the note in ``__init__``).
        dropout: Dropout probability used on the embeddings and inside the
            transformer.
        max_len: Number of positions in the positional embedding tables.
        device: Stored as ``self.device``; kept for backward compatibility.
    """

    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_len,
        device,
    ):
        super().__init__()
        self.src_embeddings = nn.Embedding(src_vocab_size, embedding_size)
        self.src_positional_embeddings = nn.Embedding(max_len, embedding_size)
        self.trg_embeddings = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_positional_embeddings = nn.Embedding(max_len, embedding_size)
        self.device = device
        # NOTE(review): the fifth positional parameter of nn.Transformer is
        # `dim_feedforward`, so `forward_expansion` is used as the absolute
        # feed-forward width, not as a multiplier of `embedding_size`.
        # Left unchanged to keep existing behavior -- confirm this is intended.
        self.transformer = nn.Transformer(
            embedding_size,
            num_heads,
            num_encoder_layers,
            num_decoder_layers,
            forward_expansion,
            dropout,
        )

        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src):
        """Return a boolean mask that is True where ``src`` holds the padding index.

        ``src`` is transposed first, so the mask has the batch dimension
        before the sequence dimension.
        """
        src_mask = src.transpose(0, 1) == self.src_pad_idx

        return src_mask

    def forward(self, src, trg):
        """Compute target-vocabulary scores for every position of ``trg``.

        Args:
            src: 2-D index tensor, sequence position first, batch second.
            trg: 2-D index tensor with the same layout.

        Returns:
            Tensor with ``trg``'s two dimensions followed by one dimension of
            size ``trg_vocab_size``.

        Raises:
            ValueError: If either sequence is longer than the positional
                embedding table.
        """
        src_seq_length, src_batch = src.shape
        trg_seq_length, trg_batch = trg.shape

        # Fail with a readable message instead of an out-of-range embedding
        # lookup when a sequence does not fit the positional table.
        max_positions = self.src_positional_embeddings.num_embeddings
        if src_seq_length > max_positions or trg_seq_length > max_positions:
            raise ValueError(
                "sequence length (src={}, trg={}) exceeds max_len={}".format(
                    src_seq_length, trg_seq_length, max_positions
                )
            )

        # Position indices are created directly on the inputs' device, so the
        # method does not depend on any variable defined outside the class.
        src_positions = (
            torch.arange(0, src_seq_length, device=src.device)
            .unsqueeze(1)
            .expand(src_seq_length, src_batch)
        )
        trg_positions = (
            torch.arange(0, trg_seq_length, device=trg.device)
            .unsqueeze(1)
            .expand(trg_seq_length, trg_batch)
        )

        embed_src = self.dropout(
            self.src_embeddings(src) + self.src_positional_embeddings(src_positions)
        )
        embed_trg = self.dropout(
            self.trg_embeddings(trg) + self.trg_positional_embeddings(trg_positions)
        )

        src_padding_mask = self.make_src_mask(src)
        # Square subsequent mask over the target positions, moved to the
        # target's device (previously this read a notebook-global `device`).
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(trg.device)

        out = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask=src_padding_mask,
            tgt_mask=trg_mask,
        )
        out = self.fc_out(out)

        return out
        

# Training phase

In [64]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [65]:
BATCH_SIZE = 64


# Build the training and validation batch iterators over the two splits.
train_iterator, valid_iterator  = data.BucketIterator.splits(
    (train_data,valid_data), 
    batch_size = BATCH_SIZE,
    sort=None,
    sort_within_batch=False,
    # Examples expose their text under the field names `eng` / `ar`; there is
    # no `SRC` attribute, so the key must read `x.eng`.
    sort_key = lambda x: len(x.eng),
    shuffle=None, 
    device=device
)

In [66]:
# --- Checkpoint flags ---
load_model, save_model = False, True

# --- Optimisation settings ---
num_epochs = 60
learning_rate = 3e-4
step = 0

# --- Architecture ---
embedding_size = 512
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
forward_expansion = 4
dropout = 0.10
max_len = 250

# Index of the "<pad>" token in the English vocabulary.
src_pad_idx = SRC.vocab.stoi["<pad>"]

# Vocabulary sizes determine the embedding and output-layer dimensions.
src_vocab_size = len(SRC.vocab)
print("Size of english vocabulary:",src_vocab_size)

trg_vocab_size = len(TARGET.vocab)
print("Size of arabic vocabulary:",trg_vocab_size)

# Build the model and move it to the selected device.
model = Transformer(
    embedding_size=embedding_size,
    src_vocab_size=src_vocab_size,
    trg_vocab_size=trg_vocab_size,
    src_pad_idx=src_pad_idx,
    num_heads=num_heads,
    num_encoder_layers=num_encoder_layers,
    num_decoder_layers=num_decoder_layers,
    forward_expansion=forward_expansion,
    dropout=dropout,
    max_len=max_len,
    device=device,
).to(device)





Size of english vocabulary: 12886
Size of arabic vocabulary: 22062


In [67]:
# Peek at the first training batch only: show the English index tensor and
# the batch object's type, then stop iterating.
for batch in train_iterator:
    print(batch.eng)
    print(type(batch))
    break

tensor([[    2,     2,     2,  ...,     2,     2,     2],
        [ 2121,     0,   232,  ...,     0,  2709,    15],
        [  224,  1377,  2451,  ...,    31, 11479,   221],
        ...,
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1]], device='cuda:0')
<class 'torchtext.data.batch.Batch'>


In [77]:
import torch, gc
# Run a garbage-collection pass first, then call PyTorch's CUDA cache
# clean-up so memory from earlier cells can be released before training.
gc.collect()
torch.cuda.empty_cache()

In [81]:
# Mean loss per epoch, one entry appended per epoch for each list.
loss_track = []
loss_validation_track= []


optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# The loss is computed against Arabic target indices, so the ignored padding
# index must come from the TARGET vocabulary rather than the source one.
pad_idx = TARGET.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index = pad_idx)

for epoch in range(num_epochs):
    # ---- training pass ----
    stepLoss=[]
    model.train()
    for batch in train_iterator:
        input_data = batch.eng.to(device)
        target = batch.ar.to(device)

        optimizer.zero_grad()
        # The decoder is fed the target without its last position ...
        output = model(input_data,target[:-1])

        # ... and scored against the target shifted by one position.
        output = output.reshape(-1,trg_vocab_size)
        target = target[1:].reshape(-1)

        loss = criterion(output,target)
        loss.backward()

        optimizer.step()
        stepLoss.append(loss.item())

    loss_track.append(np.mean(stepLoss))
    print("train crossentropy at epoch {} loss: ".format(epoch),np.mean(stepLoss))

    # ---- validation pass ----
    stepValidLoss=[]
    model.eval() # evaluation mode: dropout is disabled
    # No gradients are needed here; skipping graph construction keeps the
    # validation pass from holding on to activation memory.
    with torch.no_grad():
        for batch in valid_iterator:
            input_sentence = batch.eng.to(device)
            target = batch.ar.to(device)

            output = model(input_sentence,target[:-1])
            output = output.reshape(-1,trg_vocab_size)
            target = target[1:].reshape(-1)
            loss = criterion(output,target)

            stepValidLoss.append(loss.item())

    # Recorded inside the epoch loop so every epoch gets a validation entry.
    loss_validation_track.append(np.mean(stepValidLoss))
    print("validation crossentropy at epoch {} loss: ".format(epoch),np.mean(stepValidLoss))

RuntimeError: CUDA out of memory. Tried to allocate 970.00 MiB (GPU 0; 8.00 GiB total capacity; 5.13 GiB already allocated; 150.07 MiB free; 5.79 GiB reserved in total by PyTorch)