In [1]:
import numpy as np
import pandas as pd
import time 
import matplotlib.pyplot as plt

In [2]:
import torch

In [3]:
from bs4 import BeautifulSoup
import os
import re

In [4]:
directory = 'corpus_morphological_analysis'
file_paths = []
for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    file_paths.append(f)    

In [5]:
temp_file_paths = file_paths[:2000]

In [6]:
content = []
#i = 0
for filepath in temp_file_paths :
    #print(i)
    with open(filepath, encoding='utf-8') as f :
        html = f.read()
    soup = BeautifulSoup(html, features="html.parser")
    for script in soup(["script", "style"]):
        script.extract()  
    text = soup.get_text()
    content.append(text)
    #i+=1

In [7]:
split_list = []
for item in content : 
    tmp = item.splitlines()
    split_list.append(tmp)

In [8]:
work_list = []
for k in split_list :
    l = [item for item in k if 'لا توجد نتائج لتحليل هذه الكلمة' not in item]
    tmp_l = [item.replace("#",'') for item in l]
    work_list.append(tmp_l)

In [9]:
final_list = []
for k in work_list :
    tst = [item.split(':') for item in k]
    final_list.append(tst)

In [10]:
# identifie le prefixe, racine et le suffixe et les placent dans un dictionnaire
def identify(word_l):
    if len(word_l) < 4 :
        return None
    dictt = {}
    dictt['word'] = word_l[0]
    # le cas s'il existe un préfixe
    if word_l[2] != '' and word_l[2] != ' ' :   
        if word_l[4] not in word_l[0]: 
            if word_l[5] in word_l[0] and word_l[5] != '': 
                dictt['prefixe'] = word_l[2]
                dictt['root'] = word_l[8]
                dictt['suffixe'] = word_l[9]
            elif word_l[3] in word_l[0] and word_l[3] != '':
                dictt['prefixe'] = word_l[2]
                dictt['root'] = word_l[3]
                dictt['suffixe'] = ''
        else :
            dictt['prefixe'] = word_l[2]
            dictt['root'] = word_l[7]
            dictt['suffixe'] = word_l[8]
    # s'il n'existe pas un préfixe
    else : 
        if word_l[2] == '' : 
            dictt['prefixe'] = word_l[2]
            dictt['root'] = word_l[6]
            dictt['suffixe'] = word_l[7]
        elif  word_l[2] == ' ' :
            dictt['prefixe'] = ''
            dictt['root'] = word_l[3]
            dictt['suffixe'] = ''    
    return dictt

In [11]:
# filtre la liste de mots en liste de dictionnaires
def word_to_dict_list(wordlist):
    dictlist = []
    for k in wordlist : 
        dictlist.append(identify(k))
    return dictlist

In [12]:
final = []
for k in final_list: 
    for j in k :
        s = identify(j)
        if s == None :
            continue
        final.append(identify(j))

In [13]:
def dic_to_list(listt):
    L = []
    for k in listt : 
        tmp = []
        #print(k)
        if len(k) == 4 : 
            tmp.append(k['word'])
            tmp.append(k['prefixe'])
            tmp.append(k['root'])
            tmp.append(k['suffixe'])
            L.append(tmp)
    return L
data = dic_to_list(final)

In [14]:
final_l = dic_to_list(final)
final_l[5990]

['يقدمها', 'ي', 'قدم', 'هَا']

In [15]:
# experimenting with our first neural network to predict the root of a word.

data_1 = []

for word in final_l:
    tmp = []
    word[0] = word[0].replace('+', ' ')
    word[2] = word[2].replace('+', ' ')
    tmp.append(word[0])
    tmp.append(word[2])
    data_1.append(tmp)
    #print(tmp)
#data_1

In [16]:
# we extract our dictionnary from the our dataset 
def extract_dict(listt) :
    dictt = []
    for word in listt :
        for item in word : 
            tmp = set(item)
            for k in tmp : 
                if k not in dictt : 
                    dictt.append(k)
    return dictt       
dic = extract_dict(data_1)

In [17]:
root_data = []
for word in data : 
    tmp =[]
    tmp.append(word[0])
    tmp.append(word[2])
    root_data.append(tmp)
#root_data

In [18]:
class encode :
    
    def __init__(self, data):
        self.data = data
        
    def code_sequence(self):
        L = []
        for word in self.data : 
            tmp_string = code_word(word)
            L.append(tmp_string)
        final_string = '#'.join(map(str,L))
        return final_string 
    
    def code_normal_text(self):
        L = []
        for word in self.data : 
            L.append(word[1])
        final_string = ' '.join(map(str,L))
        return final_string
        
final_str = encode(root_data).code_normal_text()      

In [19]:
data_root = []
for item in root_data : 
    tmp = []
    if len(item[1]) <= 3 and len(item[1]) != 0:
        tmp.append('$'+item[0]+'£')
        tmp.append('$'+item[1]+'£')
        data_root.append(tmp)

In [20]:
def data_padding(dat, padding_char):
    #Le'ts create a padding character : 
    pad_char = padding_char
    padded_data = []
    
    ls_words = []
    ls_roots = []

    for instance in dat : 
        ls_words.append(instance[0])
        ls_roots.append(instance[1])
    max_len_words = max([len(item) for item in ls_words])
    max_len_roots = max([len(item) for item in ls_roots])
    
    for instance in dat: 
        tmp = []
        word,root = instance[0], instance[1]
        while(len(word) != max_len_words):
            word += pad_char
        tmp.append(word)
        while(len(root) != max_len_roots):
            root += pad_char
        tmp.append(root)
        padded_data.append(tmp)
        
    return padded_data

In [21]:
data_padding(data_root,'%')[:10]

[['$مسقط£%%%%%%%%%', '$سقط£'],
 ['$يقام£%%%%%%%%%', '$قمي£'],
 ['$الرابع£%%%%%%%', '$ربع£'],
 ['$والعشرين£%%%%%', '$عشر£'],
 ['$من£%%%%%%%%%%%', '$من£%'],
 ['$شهر£%%%%%%%%%%', '$شهر£'],
 ['$فبراير£%%%%%%%', '$رير£'],
 ['$المقبل£%%%%%%%', '$قبل£'],
 ['$قصر£%%%%%%%%%%', '$قصر£'],
 ['$البستان£%%%%%%', '$بسس£']]

In [22]:
    
    def train_batch(self, batch):
                
        self.optimizer.zero_grad()
        for instance in batch: 
            '''
            word = instance[0] : the word to extract root from
            target_word = instance[1] : the root of the word
            '''
            word = instance[0]
            target_root = instance[1]
            
            res1 = self.encode_word(word)
            predicted_root = []
            original_root_seq = self.word_to_idx_seq(target_root)
            predicted_root_seq = []
            res_char = target_root[0]
            
            i = 0
            for char in target_root : 
                if len(predicted_root) == len(target_root):
                    break
                
                
                if random.random() < self.teacher_forcing_ratio and i!=0 : 
                    res_char = char
                    i+= 1
                
                res_char , hidd , idx = self.decode_word(res1, res_char)
                predicted_root_seq.append(idx)
                test = hidd
                predicted_root.append(res_char)

            predicted_root = ''.join(predicted_root)

            predicted_root_seq = torch.tensor(predicted_root_seq)

            org = self.embedding(original_root_seq)
            pred = self.embedding(predicted_root_seq)
            
            
            loss = self.criterion(org, pred)
            
            loss.backward()

            print('target root :', target_root)
            print('predicted root : ', predicted_root)

            print('the loss : ', loss.item())  
        self.optimizer.step()
        
        return
    
            
            
        #print(em1)
        

In [23]:
def prepare_data(self):
    
    #Le'ts create a padding for ouriinstances : 
    
    pad_char = ''
    padded_data = []
    ls_words = []
    ls_roots = []
    for instance in self.data : 
        ls_words.append(instance[0])
        ls_roots.append(instance[1])
    max_len_words = max([len(item) for item in ls_words])
    max_len_roots = max([len(item) for item in ls_roots])
    
    for instance in self.data: 
        tmp = []
        word,root = instance[0], instance[1]
        while(len(word) != max_len_words):
            word += pad_char
        tmp.append(word)
        while(len(root) != max_len_roots):
            root += pad_char
        tmp.append(root)
        padded_data.append(tmp)
    

    # let's create our vocab : 
    
    vocab = []
    for word in padded_data :
        for item in word : 
            tmp = set(item)
            for k in tmp : 
                if k not in vocab : 
                    vocab.append(k)
    
    
    # Let's create our dictionnary with unique indexes
    
    char_to_idx_map = {char: idx for idx, char in enumerate(dictt)}
    
    # Let's now split our data to batches
   
    final_data = []
    for instance in padded_data : 
        tmp = []
        word = self.word_to_seq(instance[0])
        root = self.word_to_seq(instance[1])
        tmp.append(word)
        tmp.append(root)
        final_data.append(tmp)
        
    size= self.batch_size 
    batches = [final_data[i:i + size] for i in range(0, len(final_data), size)]
    
    return batches , vocab , char_to_idx_map

In [24]:
max([len(item[0]) for item in data_root])

15

In [29]:
print(len(data_root))
for item in data_root :
    if len(item[0])==15:
        data_root.pop(data_root.index(item))
print(len(data_root))

794906
794906


In [96]:
# implementation of a paper : 

import tensorflow as tf
import numpy as np
import torch.nn as nn
import torch 
import string
import torch.optim as optim
import random


class model(nn.Module): 

    def __init__(self, data, batch_size ,embedding_size, hidden_size,num_layers ,dropout, teacher_forcing_ratio, learning_rate):
        super().__init__()
        
        '''
        our input data are arabic words with their roots, with the hypothesis that each word has it's own root.
        
        (our dataset for this is named root_data)
        '''
        self.sow = '$'
        self.eow = '£'
        self.lr = learning_rate
        self.ratio = 0.8
        self.batch_size = batch_size
        self.data = data
        self.batches, self.vocab, self.char_index_dic = self.prepare_data(self.data)
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        
        self.num_layers = num_layers
        
        self.dropout = dropout
        self.embedding = nn.Embedding(num_embeddings = len(self.vocab), embedding_dim = self.embedding_size, padding_idx = self.char_index_dic['%']) 
        
        
        #self.bigru = nn.GRU(input_size=self.embedding_size, hidden_size=self.hidden_size, num_layers=self.num_layers, bidirectional=True, batch_first=True)
        
        self.BILSTM = nn.LSTM(input_size=self.embedding_size, hidden_size=self.hidden_size, num_layers=self.num_layers, bidirectional=True, batch_first=True, dropout = self.dropout)

        
        #self.gru = nn.GRU(input_size= self.embedding_size ,hidden_size = self.hidden_size * 2, num_layers = self.num_layers, batch_first = True)
        self.LSTM = nn.LSTM(input_size= self.embedding_size ,hidden_size = self.hidden_size*2, num_layers = self.num_layers, batch_first = True, dropout = self.dropout)

        
        self.criterion = nn.CrossEntropyLoss(ignore_index =self.char_index_dic['%'])
        self.teacher_forcing_ratio = teacher_forcing_ratio
        self.Linear = nn.Linear(self.hidden_size * 2,len(self.vocab))
        #self.Linear = nn.Linear(self.hidden_size * 2,1)
        
        #self.optimizer = optim.Adam([*self.bigru.parameters(), *self.gru.parameters(), *self.Linear.parameters()], lr = 0.001)
        #self.optimizer = optim.Adam([*self.BILSTM.parameters(), *self.LSTM.parameters()], lr = 0.1)
        #self.optimizer = optim.Adam(self.parameters(), lr = self.lr)
        self.optimizer = optim.RMSprop(self.parameters(), lr = self.lr)
    
    def prepare_data(self, data):
    
        #Le'ts create a padding for ouriinstances : 

        pad_char = '%'
        padded_data = []
        ls_words = []
        ls_roots = []
        for instance in data : 
            ls_words.append(instance[0])
            ls_roots.append(instance[1])
        
        # Let's calculate the biggest length
        max_len_words = max([len(item) for item in ls_words])
        max_len_roots = max([len(item) for item in ls_roots])

        # Now we pad the word until we reach the max length
        for instance in data: 
            tmp = []
            word,root = instance[0], instance[1]
            while(len(word) != max_len_words):
                word += pad_char
            tmp.append(word)
            while(len(root) != max_len_roots):
                root += pad_char
            tmp.append(root)
            padded_data.append(tmp)

        # let's create our vocab : 

        vocab = []
        for word in padded_data :
            for item in word : 
                tmp = set(item)
                for k in tmp : 
                    if k not in vocab : 
                        vocab.append(k)

        # Let's create our dictionnary with unique indexes

        char_to_idx_map = {char: idx for idx, char in enumerate(vocab)}

        # Let's now split our data to batches

        final_data = []
        for instance in padded_data : 
            tmp = []
            word = [char_to_idx_map[char] for char in instance[0]]
            root = [char_to_idx_map[char] for char in instance[1]]
            tmp.append(word)
            tmp.append(root)
            final_data.append(tmp)

        size= self.batch_size 
        batches = [final_data[i:i + size] for i in range(0, len(final_data), size)]
        
        return batches , vocab , char_to_idx_map
    
    
    def word_to_seq(self, word):
        '''
        this function returns a sequence of the unique indexes for the given word 
        (sequence is tensor that can be changed using a .tolist() )
        '''
        word_char_idx_seq =[self.char_index_dic[char] for char in word]    
        return word_char_idx_seq # word sequence
    
    
    # Let's now construct our model : 
    
    # we should think about character embeddings in order to create an embeded matrix for each word
        
    
    
    def encode(self, batch):    
        '''
        input : a batch of sequences of instances : [word_seq , root_seq] * batch_size
                input_size : (input_size,2)
        '''
        
        word_batch = [] # list of words in the batch
        root_batch = [] # list of roots in the batch
        
        for instance in batch : 
            word_batch.append(instance[0])
            root_batch.append(instance[1])
            
        word_batch = torch.tensor(word_batch)
        root_batch = torch.tensor(root_batch)
        
        # we create embedding of the word batch : 
        
        embedded_word_batch = self.embedding(word_batch)
                
        outputs, (hidden, cell) = self.BILSTM(embedded_word_batch) # we pass the emebedded vector through the bi-GRU 
        
        
        # hidden size : [2 * num_layers, batch_size , hidden_size]
        
        # we want hidden size : [num_layers , batch_size  , 2 * hidden_size]
        
        # we return an adequate layer for the decoder : 
        
        final_hid, final_ce = [], []
        for k in range(0,hidden.size(0), 2):
            
            tmp_hid = hidden[k:k+2 , :, :]
            tmp_ce = cell[k:k+2, :, :]
            
            
            cct_hid = torch.cat((tmp_hid[0], tmp_hid[1]), dim  = 1).tolist()
            cct_ce = torch.cat((tmp_ce[0], tmp_ce[1]), dim  = 1).tolist()
            
            final_hid.append(cct_hid)
            final_ce.append(cct_ce)
        
        final_hid, final_ce = torch.tensor(final_hid), torch.tensor(final_ce)
        
        #print(final_hid.size(), final_ce.size())
            
        return root_batch , (final_hid, final_ce)
    
    
    def decode(self, encoder_hidden_cell , batch, teacher_forcing_bool):
        
        '''
        input : encoding_hidden_layer => corresponds to the concatenation of the final hidden layers 
                                        of the bidirectionnal gru in our encoder
                
                batch : subset of data that contains the roots of the words we encoded.
                
        output : we'll see :) 
        
        '''

        (hidden_layer , cell) , root_batch = encoder_hidden_cell , batch 
                        
        embedded_char = self.embedding(torch.unsqueeze(root_batch[:, 0], 1))
            
        outputs = []
        
        for i in range(root_batch.size(1)): 
            
            decoder_output , (hidden_layer, cell) = self.LSTM(embedded_char, (hidden_layer, cell))
                        
            input_dense = nn.Linear(self.hidden_size * 2,self.embedding_size)
            input_decoder_output = input_dense(decoder_output)
            
            embedded_char = input_decoder_output
    
            mask = np.where([random.random() < self.teacher_forcing_ratio for i in range(root_batch.size(0))])[0]
            
            teacher_forcing_input = self.embedding(torch.unsqueeze(torch.clone(root_batch[:, i]), 1))
            
            if teacher_forcing_bool : 

                embedded_char[mask] = teacher_forcing_input[mask] 
                
            Dense_decoded_output = self.Linear(decoder_output)
            
            soft = nn.Softmax(dim = 2)
            
            soft_out = soft(Dense_decoded_output)

            outputs.append(soft_out)

        return outputs 
                            
        
    
    def train_model(self, batches, teacher_forcing_bool):
                
        train_batches = batches        
         
        epoch_loss = 0
        
        n = 0            
        
        for batch in train_batches :

            self.optimizer.zero_grad()

            root_batch, encoder_states = self.encode(batch)

            outputs = self.decode(encoder_states, root_batch, teacher_forcing_bool)

            a = [torch.squeeze(item, 1) for item in outputs]
            a = [torch.unsqueeze(item, 0) for item in a]

            output = torch.cat(a, dim = 0)
            
            output_dim = output.shape[-1]

            output = output.view(-1, output_dim)

            trg = root_batch.transpose(0, 1)

            trg = trg.reshape(-1)

            #print(output.size(),trg.size())
            
            loss = self.criterion(output, trg)

            loss.backward()

            torch.nn.utils.clip_grad_norm_(self.parameters(), 1)

            self.optimizer.step()

            epoch_loss+=loss.item()

            n+=1

            print('the loss of the train batch ', n ,' is : ', loss.item())
    
        return epoch_loss/n

    def evaluate_model(self, batches, teacher_forcing_bool):
        '''
        this method evaluates our model :=)
        will be similar to train but without the teacher forcing/ using an optimizer 
        '''          
        self.eval()

        val_batches = batches

        n = 0

        epoch_loss = 0
        
        with torch.no_grad() :

            for batch in val_batches :

                root_batch, encoder_states = self.encode(batch)

                outputs = self.decode(encoder_states, root_batch, teacher_forcing_bool)

                a = [torch.squeeze(item, 1) for item in outputs]
                a = [torch.unsqueeze(item, 0) for item in a]

                output = torch.cat(a, dim = 0)

                output_dim = output.shape[-1]

                output = output.view(-1, output_dim)

                trg = root_batch.transpose(0, 1)

                trg = trg.reshape(-1)
                
                #print(output.size(), trg.size())
                
                loss = self.criterion(output, trg)

                epoch_loss+=loss.item()

                n+=1

                print('the loss of the val batch ', n ,' is : ', loss.item())

        return epoch_loss / n
    
    def predict(self, word):
        '''
        this is the adaptation of encoder-decoder network on a single word w/o optimization
        '''

        # Let's turn the word into a sequence of word indexes 
        word_seq = self.word_to_seq(word)

        # Let's create an embedding of the word seq
        embedded_word = self.embedding(torch.tensor(word_seq))

        # Let's feed our word embedding to the encoder network
        outputs, (hidden, cell) = self.BILSTM(embedded_word)
        
        #print(hidden.size())
        
        final_hid, final_ce = [], []
        for k in range(0,hidden.size(0), 2):
            
            tmp_hid = hidden[k:k+2 ,:]
            tmp_ce = cell[k:k+2, :]

            cct_hid = torch.cat((tmp_hid[0], tmp_hid[1]), dim  = -1).tolist()
            cct_ce = torch.cat((tmp_ce[0], tmp_ce[1]), dim  = -1).tolist()

            final_hid.append(cct_hid)
            final_ce.append(cct_ce)
        
        final_hidden, final_cell = torch.tensor(final_hid), torch.tensor(final_ce)

        #initialize the input of the decoder

        embedded_char = torch.unsqueeze(self.embedding(torch.tensor(self.char_index_dic[self.sow])), 0)

        prediction_output = [] # a list of the outputs of the decoder 
     
        # we create a softmax layer : 

        soft = nn.Softmax(dim = 1)
        
        key_list = list(self.char_index_dic.keys())
        val_list = list(self.char_index_dic.values())
        
        for i in range(5):
                        
            decoder_output , (final_hidden, final_cell) = self.LSTM(embedded_char, (final_hidden, final_cell))

            input_dense = nn.Linear(self.hidden_size * 2,self.embedding_size)
            input_decoder_output = input_dense(decoder_output)

            embedded_char = input_decoder_output

            Dense_decoded_output = self.Linear(decoder_output)
            prediction_output.append(soft(Dense_decoded_output))

                
        best_char_indexes = [torch.argmax(item).item() for item in prediction_output]


 
        position = [val_list.index(item) for item in best_char_indexes]
        result_char = [key_list[pos] for pos in position]
        predicted_root = ''.join(result_char)
           
        print(predicted_root)
    
        return predicted_root

    

    def fit(self, num_epochs):
        
        """
        let's first prepare our data
        
        """
        
        print(f'The model has {self.count_parameters():,} trainable parameters')
        
        data = self.data
        
        data = random.sample(data, len(data))
        data_size = len(data)
        middle_index = int(data_size * self.ratio)        
        train_data , val_data = data[:middle_index], data[middle_index:]
        
        train_batches, voc, dic = self.prepare_data(train_data)
        val_batches ,voc , dic = self.prepare_data(val_data)
        
        epochs = list(range(num_epochs))
        
        best_val_loss = 1000
        best_model_par = 0
        
        losses =[]
        predicted_roots = []
        test_word = '$' + 'تحليل' + '£'
        
        
        for epoch in epochs : 
            
            print('epoch num : ', epoch) 
            
            t1 = time.time()
            
            train_batches = random.sample(train_batches , len(train_batches))
            #val_batches = random.sample(val_batches, len(val_batches))
                        
            train_loss = self.train_model(train_batches, 1)
            val_loss = self.evaluate_model(val_batches, 0) # we set the teacher forcing to false            
            t2 = time.time()
            
            
            predicted_root = self.predict(test_word)
            print(predicted_root)
            predicted_roots.append(predicted_root)
            
            
            
            tmp = [train_loss, val_loss]
            losses.append(tmp)
            
            print('the training loss : ', train_loss , 'the val loss :', val_loss)
            print('epoch num : ' ,epoch , ' lasted : ', t2 - t1 , 'seconds')
            
            if val_loss < best_val_loss :
                
                best_val_loss = val_loss 
                best_model_par = self.state_dict()
            
            torch.save(best_model_par, 'best_model.pt')
            
        return losses
    
    
    def count_parameters(self):
        return sum(torch.numel(p) for p in self.parameters() if p.requires_grad)

        


In [30]:
'''
stuff to test in order to reduce overfitting :/

==> shuffling the dataset before each epoch -- Done 
==> initialize the encoder with random values instead of zeros. 
==> reclean my dataset and check for outliers 
==> recheck the structure of my code for the model 

'''

'\nstuff to test in order to reduce overfitting :/\n\n==> shuffling the dataset before each epoch  \n==> reclean my dataset and check for outliers\n==> recheck the structure of my code for the model\n\n'

In [31]:
len(data_root)

794906

In [81]:
d = []
for item in data_root:
    if len(item[0]) > 5 :
        d.append(item)
print(len(d))

549072


In [None]:
mod = model(d, 256, 64 , 256 , 6 , 0.2 ,0.35, 0.0001)
res = mod.fit(5)

The model has 20,256,743 trainable parameters
epoch num :  0
the loss of the train batch  1  is :  3.6634202003479004
the loss of the train batch  2  is :  3.6625521183013916
the loss of the train batch  3  is :  3.6615493297576904
the loss of the train batch  4  is :  3.6597795486450195
the loss of the train batch  5  is :  3.653594493865967
the loss of the train batch  6  is :  3.585444211959839
the loss of the train batch  7  is :  3.482663154602051
the loss of the train batch  8  is :  3.4820497035980225
the loss of the train batch  9  is :  3.4808826446533203
the loss of the train batch  10  is :  3.480748414993286
the loss of the train batch  11  is :  3.4803426265716553
the loss of the train batch  12  is :  3.4795522689819336
the loss of the train batch  13  is :  3.4793527126312256
the loss of the train batch  14  is :  3.4790451526641846
the loss of the train batch  15  is :  3.4781334400177
the loss of the train batch  16  is :  3.4775454998016357
the loss of the train batch

the loss of the train batch  142  is :  3.264816999435425
the loss of the train batch  143  is :  3.257237434387207
the loss of the train batch  144  is :  3.255398750305176
the loss of the train batch  145  is :  3.2619471549987793
the loss of the train batch  146  is :  3.254399061203003
the loss of the train batch  147  is :  3.264723539352417
the loss of the train batch  148  is :  3.255960464477539
the loss of the train batch  149  is :  3.254455327987671
the loss of the train batch  150  is :  3.253262758255005
the loss of the train batch  151  is :  3.258193254470825
the loss of the train batch  152  is :  3.2616536617279053
the loss of the train batch  153  is :  3.262356996536255
the loss of the train batch  154  is :  3.257763624191284
the loss of the train batch  155  is :  3.250488758087158
the loss of the train batch  156  is :  3.25148868560791
the loss of the train batch  157  is :  3.263176679611206
the loss of the train batch  158  is :  3.2521653175354004
the loss of 

the loss of the train batch  283  is :  3.251084327697754
the loss of the train batch  284  is :  3.2478246688842773
the loss of the train batch  285  is :  3.251664638519287
the loss of the train batch  286  is :  3.2462611198425293
the loss of the train batch  287  is :  3.2506563663482666
the loss of the train batch  288  is :  3.2489469051361084
the loss of the train batch  289  is :  3.2501208782196045
the loss of the train batch  290  is :  3.249495267868042
the loss of the train batch  291  is :  3.2490742206573486
the loss of the train batch  292  is :  3.2417709827423096
the loss of the train batch  293  is :  3.2462568283081055
the loss of the train batch  294  is :  3.25225567817688
the loss of the train batch  295  is :  3.2467100620269775
the loss of the train batch  296  is :  3.2563345432281494
the loss of the train batch  297  is :  3.249711275100708
the loss of the train batch  298  is :  3.2452938556671143
the loss of the train batch  299  is :  3.243373155593872
the 

the loss of the train batch  424  is :  3.2417383193969727
the loss of the train batch  425  is :  3.248202323913574
the loss of the train batch  426  is :  3.237534999847412
the loss of the train batch  427  is :  3.242185592651367
the loss of the train batch  428  is :  3.2393295764923096
the loss of the train batch  429  is :  3.253091335296631
the loss of the train batch  430  is :  3.2470204830169678
the loss of the train batch  431  is :  3.2482364177703857
the loss of the train batch  432  is :  3.246723175048828
the loss of the train batch  433  is :  3.2443487644195557
the loss of the train batch  434  is :  3.247401237487793
the loss of the train batch  435  is :  3.248199462890625
the loss of the train batch  436  is :  3.247392177581787
the loss of the train batch  437  is :  3.247722864151001
the loss of the train batch  438  is :  3.2456612586975098
the loss of the train batch  439  is :  3.2433664798736572
the loss of the train batch  440  is :  3.2474324703216553
the lo

the loss of the train batch  565  is :  3.2413547039031982
the loss of the train batch  566  is :  3.244192361831665
the loss of the train batch  567  is :  3.248594284057617
the loss of the train batch  568  is :  3.249035358428955
the loss of the train batch  569  is :  3.2521603107452393
the loss of the train batch  570  is :  3.2416460514068604
the loss of the train batch  571  is :  3.24820876121521
the loss of the train batch  572  is :  3.2560057640075684
the loss of the train batch  573  is :  3.2458336353302
the loss of the train batch  574  is :  3.2476093769073486
the loss of the train batch  575  is :  3.244934558868408
the loss of the train batch  576  is :  3.2405786514282227
the loss of the train batch  577  is :  3.2428605556488037
the loss of the train batch  578  is :  3.2419116497039795
the loss of the train batch  579  is :  3.237229585647583
the loss of the train batch  580  is :  3.242180109024048
the loss of the train batch  581  is :  3.243151903152466
the loss 

the loss of the train batch  706  is :  3.242140769958496
the loss of the train batch  707  is :  3.2326278686523438
the loss of the train batch  708  is :  3.239206552505493
the loss of the train batch  709  is :  3.2379119396209717
the loss of the train batch  710  is :  3.2374565601348877
the loss of the train batch  711  is :  3.2408199310302734
the loss of the train batch  712  is :  3.230748176574707
the loss of the train batch  713  is :  3.244701862335205
the loss of the train batch  714  is :  3.2358930110931396
the loss of the train batch  715  is :  3.242849588394165
the loss of the train batch  716  is :  3.2404041290283203
the loss of the train batch  717  is :  3.236936092376709
the loss of the train batch  718  is :  3.236704111099243
the loss of the train batch  719  is :  3.2351248264312744
the loss of the train batch  720  is :  3.236184597015381
the loss of the train batch  721  is :  3.2380311489105225
the loss of the train batch  722  is :  3.24908185005188
the los

the loss of the train batch  847  is :  3.2417359352111816
the loss of the train batch  848  is :  3.238518238067627
the loss of the train batch  849  is :  3.2455036640167236
the loss of the train batch  850  is :  3.237635850906372
the loss of the train batch  851  is :  3.2430624961853027
the loss of the train batch  852  is :  3.2342371940612793
the loss of the train batch  853  is :  3.2419254779815674
the loss of the train batch  854  is :  3.229968309402466
the loss of the train batch  855  is :  3.2341103553771973
the loss of the train batch  856  is :  3.231009006500244
the loss of the train batch  857  is :  3.234459400177002
the loss of the train batch  858  is :  3.23561429977417
the loss of the train batch  859  is :  3.2426810264587402
the loss of the train batch  860  is :  3.237022876739502
the loss of the train batch  861  is :  3.244290590286255
the loss of the train batch  862  is :  3.2437615394592285
the loss of the train batch  863  is :  3.2384042739868164
the lo

the loss of the train batch  988  is :  3.246032476425171
the loss of the train batch  989  is :  3.238886594772339
the loss of the train batch  990  is :  3.239346742630005
the loss of the train batch  991  is :  3.2360587120056152
the loss of the train batch  992  is :  3.235269069671631
the loss of the train batch  993  is :  3.231198787689209
the loss of the train batch  994  is :  3.2397282123565674
the loss of the train batch  995  is :  3.236402750015259
the loss of the train batch  996  is :  3.233729839324951
the loss of the train batch  997  is :  3.2345457077026367
the loss of the train batch  998  is :  3.232931613922119
the loss of the train batch  999  is :  3.2349441051483154
the loss of the train batch  1000  is :  3.244171142578125
the loss of the train batch  1001  is :  3.229979991912842
the loss of the train batch  1002  is :  3.230489492416382
the loss of the train batch  1003  is :  3.233675479888916
the loss of the train batch  1004  is :  3.2387053966522217
the 

the loss of the train batch  1127  is :  3.235210418701172
the loss of the train batch  1128  is :  3.2365193367004395
the loss of the train batch  1129  is :  3.233816623687744
the loss of the train batch  1130  is :  3.2407968044281006
the loss of the train batch  1131  is :  3.2354209423065186
the loss of the train batch  1132  is :  3.2280120849609375
the loss of the train batch  1133  is :  3.2375595569610596
the loss of the train batch  1134  is :  3.2341740131378174
the loss of the train batch  1135  is :  3.2384166717529297
the loss of the train batch  1136  is :  3.2426998615264893
the loss of the train batch  1137  is :  3.2467851638793945
the loss of the train batch  1138  is :  3.23193621635437
the loss of the train batch  1139  is :  3.2418057918548584
the loss of the train batch  1140  is :  3.237238883972168
the loss of the train batch  1141  is :  3.2357468605041504
the loss of the train batch  1142  is :  3.239203691482544
the loss of the train batch  1143  is :  3.234

In [None]:
res = np.array(res)
plt.figure()
x = list(range(5))
plt.plot(x, res[:,0])
plt.plot(x, res[:,1])
plt.title("evolution of loss with epochs")
plt.xlabel('epochs ')
plt.ylabel('loss')
plt.show()

In [None]:
# implementation of a paper : 

import tensorflow as tf
import numpy as np
import torch.nn as nn
import torch 
import string
import torch.optim as optim
import random


class model(nn.Module): 
    
    def __init__(self, data, batch_size ,embedding_size, hidden_size, num_layers, teacher_forcing_ratio):
        super().__init__()
        
        '''
        our input data are arabic words with their roots, with the hypothesis that each word has it's own root.
        
        (our dataset for this is named root_data)
        '''
        self.sow = '$' # the start of root character 
        self.eow = '£' # the end of root character
        self.batch_size = batch_size
        self.data = data
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.vocab = self.extract_dict()
        self.char_index_dic = self.char_to_index()
        print(self.char_index_dic)
        self.num_layers = num_layers
        self.embedding = nn.Embedding(num_embeddings = len(self.vocab), embedding_dim = self.embedding_size) 
        self.bigru = nn.GRU(input_size=self.embedding_size, hidden_size=self.hidden_size, num_layers=self.num_layers, bidirectional=True)
        self.gru = nn.GRU(input_size= self.embedding_size ,hidden_size = self.hidden_size, num_layers = self.num_layers)
        self.criterion = nn.CrossEntropyLoss()
        self.teacher_forcing_ratio = teacher_forcing_ratio
        self.Linear = nn.Linear(self.hidden_size,len(self.vocab))
        self.optimizer = optim.Adam([*self.bigru.parameters(), *self.gru.parameters(), *self.Linear.parameters()], lr = 0.1)

    
    def extract_dict(self) :
        '''
        this function extracts all the unique characters from the given dataset
        '''
        dictt = []
        for word in self.data :
            for item in word : 
                tmp = set(item)
                for k in tmp : 
                    if k not in dictt : 
                        dictt.append(k)
        return dictt 
    
    
    # Let's now construct our model : 
    
    # we should think about character embeddings in order to create an embeded matrix for each word
    
    def char_to_index(self):
        '''
        this function creates unique indexes of each character
        '''
        char_to_idx_map = {char: idx for idx, char in enumerate(self.vocab)}
        return char_to_idx_map  
    
    def data_batches(self): 
        size = self.batch_size
        
        batches = [self.data[i:i + size] for i in range(0, len(self.data), size)]
        
        return batches
        
    
    def word_to_idx_seq(self, word):
        '''
        this function returns a sequence of the unique indexes for the given word 
        (sequence is tensor that can be changed using a .tolist() )
        '''
        word_char_idx_tensor = torch.tensor([self.char_index_dic[char] for char in word])        
        return word_char_idx_tensor # word sequence
    
    def word_to_seq(self, word):
        '''
        this function returns a sequence of the unique indexes for the given word 
        (sequence is tensor that can be changed using a .tolist() )
        '''
        word_char_idx_tensor =[self.char_index_dic[char] for char in word]    
        return word_char_idx_tensor # word sequence
    
    
    def encode_word(self, word):
        
            
        # we create embedding of the sequence : 
        word_seq = self.word_to_idx_seq(word)
        embedded_vec = self.embedding(word_seq)
        
        outputs, hidden = self.bigru(embedded_vec) # we pass the emebedded vector through the bi-GRU 

        
        '''
        kaynin two cases : 
        
             case1 :  we work on the outputs  ==> chosen
             case2 :  we work on the final hidden state ==> discarted 
            
        '''
        
        
        '''test_hidden = torch.flatten(hidden)
        final_hidden = torch.unsqueeze(test_hidden, 0)'''
        
        
        # can also be outputs.
        
        encoder_output = torch.mean(hidden , dim=0)  # Average the hidden vectors across all time steps. Shape: (hidden_size*2,) if bidirectional, else (hidden_size,)
        
        final_output = torch.unsqueeze(encoder_output, 0)
               
        
        return final_output
    
    
    def decode_word(self, encoding, character):

        
        '''
        encoding : output of the encoder network. 
        
        '''
        
        seq = self.word_to_idx_seq(self.sow)
        embedded_sow= self.embedding(self.word_to_idx_seq(character))   # starts with self.sow

        input_size = embedded_sow.size(1)

        hidden_size = encoding.size(1)
                 
        dec_out , dec_hidden = self.gru(embedded_sow,encoding)
                
        a = self.Linear(dec_out)
        m = nn.Softmax(dim = 1)
        
        output = m(a)
        
        top1 = output.argmax(1)[0].item()
        
        key_list = list(self.char_index_dic.keys())
        val_list = list(self.char_index_dic.values())

        position = val_list.index(top1)
        
        result_char = key_list[position]

        return result_char, dec_hidden , top1
        
        
    def train_batch(self, batch):
                
        
        for instance in batch: 
            '''
            word = instance[0] : the word to extract root from
            target_word = instance[1] : the root of the word
            '''
            word = instance[0]
            target_root = instance[1]
            
            hidd = self.encode_word(word)
            predicted_root = []
            original_root_seq = self.word_to_idx_seq(target_root)
            predicted_root_seq = []
            res_char = target_root[0]
            for char in target_root : 
                if len(predicted_root) == len(target_root):
                    break
                
                
                if random.random() < self.teacher_forcing_ratio : 
                    res_char = char
                
                res_char , hidd , idx = self.decode_word(hidd, res_char)
                predicted_root_seq.append(idx)
                test = hidd
                predicted_root.append(res_char)

            predicted_root = ''.join(predicted_root)

            predicted_root_seq = torch.tensor(predicted_root_seq)

            org = self.embedding(original_root_seq)
            pred = self.embedding(predicted_root_seq)
            
            self.optimizer.zero_grad()
            loss = self.criterion(org, pred)
            
            loss.backward()
            self.optimizer.step()

            print('target root :', target_root)
            print('predicted root : ', predicted_root)

            print('the loss : ', loss.item())        
        
        return None
    
    
    def train(self):
        
        batches = self.data_batches()
        
        for batch in batches : 
            
            self.train_batch(batch)

        #print(em1)
        
        

mod = model(data_root, 100, 50, 100, 1, 0.9)
mod.train()*