# Wiki dataset

In [1]:
import json
import os
import pickle
import re
import string

import hazm
import nltk
import numpy as np
import pandas as pd
from datasets import load_dataset
from hazm import word_tokenize,Normalizer
from nltk.tokenize import word_tokenize as eng_tokenize
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from tqdm.auto import tqdm
# nltk.download('punkt');

In [2]:
# Locations of the two sides of the wiki corpus (.fa = Farsi, .en = English).
farsi_path = "./PEPC_Bidirectional/wiki_extracted_200k.fa"
english_path = "./PEPC_Bidirectional/wiki_extracted_200k.en"

In [3]:
class preprocess1:
    """Tokenize, clean and index a line-aligned Farsi/English corpus.

    Typical use is a single call to ``Transfrom``, which reads both corpus
    files, tokenizes every line pair, drops unwanted tokens and builds a
    token -> integer index for each language.
    """

    def __init__(self):
        # Sorted, de-duplicated token lists (filled by Tokenize).
        self.vocab_fa = []
        self.vocab_en = []
        # One list of tokens per corpus line (filled by Tokenize).
        self.dataset_en = []
        self.dataset_fa = []
        # End-of-sentence marker appended to every tokenized line.
        self.EOS = "EOS"
        # Character class of punctuation marks; note that it does not
        # contain ',' or '?', so tokens made of those survive cleaning.
        self.punctuation_pattern = '[' + re.escape('!"#$%&\'()*+-./:;<=>@[\\]^_`{|}~') + ']'
        # \d on a str pattern matches any Unicode decimal digit, which
        # includes Persian digits such as '۱'.
        self.number_pattern = r'\d'

    def ReadCorpus(self, farsi_path, english_path):
        """Load both corpus files into memory, one list entry per line.

        The files are opened explicitly as UTF-8 so that reading Farsi text
        does not depend on the platform's default encoding.
        """
        with open(farsi_path, "r", encoding="utf-8") as f:
            self.farsi_corpus = f.readlines()
        with open(english_path, "r", encoding="utf-8") as f:
            self.english_corpus = f.readlines()

    def Tokenize(self):
        """Tokenize and clean every line pair, then build sorted vocabularies.

        Appends one token list per line to ``dataset_en`` / ``dataset_fa`` and
        replaces ``vocab_en`` / ``vocab_fa`` with sorted lists of unique tokens.
        Iteration is driven by the English corpus; an IndexError is raised if
        the Farsi corpus has fewer lines.
        """
        # Sets avoid keeping every token occurrence in memory before dedup.
        en_vocab = set(self.vocab_en)
        fa_vocab = set(self.vocab_fa)
        for i in tqdm(range(len(self.english_corpus))):
            en_tokenized = eng_tokenize(self.english_corpus[i])
            fa_tokenized = word_tokenize(self.farsi_corpus[i])
            en_tokenized.append(self.EOS)
            fa_tokenized.append(self.EOS)
            # Clean takes the Farsi tokens first but returns the English
            # tokens first; keyword arguments keep the two languages from
            # being swapped.
            en_tokenized, fa_tokenized = self.Clean(
                fa_tokenized=fa_tokenized, en_tokenized=en_tokenized
            )
            self.dataset_en.append(en_tokenized)
            self.dataset_fa.append(fa_tokenized)
            en_vocab.update(en_tokenized)
            fa_vocab.update(fa_tokenized)
        self.vocab_en = sorted(en_vocab)
        self.vocab_fa = sorted(fa_vocab)

    def IndexVocab(self):
        """Map every vocabulary token to its position in the sorted vocabulary."""
        self.en_index = {token: i for i, token in enumerate(self.vocab_en)}
        self.fa_index = {token: i for i, token in enumerate(self.vocab_fa)}

    def _keep_token(self, token):
        """Return True when the token contains no punctuation mark and no digit."""
        return (
            not re.search(self.punctuation_pattern, token)
            and not re.search(self.number_pattern, token)
        )

    def Clean(self, fa_tokenized, en_tokenized):
        """Drop every token that contains a punctuation mark or a digit.

        Parameters are (Farsi tokens, English tokens); the return value is
        (English tokens, Farsi tokens). Call with keyword arguments to avoid
        mixing up the order.
        """
        en_tokenized = [x for x in en_tokenized if self._keep_token(x)]
        fa_tokenized = [x for x in fa_tokenized if self._keep_token(x)]
        return en_tokenized, fa_tokenized

    def Transfrom(self, en_path, fa_path, del_emoji=False, del_numbers=False):
        """Run the whole pipeline: read, tokenize/clean, index.

        ``del_emoji`` and ``del_numbers`` are stored on the instance but are
        not consulted by any method of this class.

        Returns:
            (vocab_en, vocab_fa, dataset_en, dataset_fa, en_index, fa_index)
        """
        self.del_emoji = del_emoji
        self.del_numbers = del_numbers
        self.ReadCorpus(fa_path, en_path)
        self.Tokenize()
        self.IndexVocab()
        return self.vocab_en, self.vocab_fa, self.dataset_en, self.dataset_fa, self.en_index, self.fa_index

In [None]:
# Run the full wiki preprocessing pipeline and unpack its six results.
prep1 = preprocess1()
(
    vocab_en,
    vocab_fa,
    dataset_en,
    dataset_fa,
    en_index,
    fa_index,
) = prep1.Transfrom(en_path=english_path, fa_path=farsi_path)

In [None]:
# Persist the preprocessing results under ./preprocessed.
# The directory is created first so the writes do not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs("./preprocessed", exist_ok=True)

# Vocabularies and tokenized datasets are stored as pickles.
for name, obj in (
    ("vocab_en", vocab_en),
    ("vocab_fa", vocab_fa),
    ("dataset_en", dataset_en),
    ("dataset_fa", dataset_fa),
):
    with open("./preprocessed/" + name, "wb") as fp:
        pickle.dump(obj, fp)

# Token -> index mappings are stored as JSON.
for name, obj in (
    ("en_index.json", en_index),
    ("fa_index.json", fa_index),
):
    with open("./preprocessed/" + name, "w") as outfile:
        json.dump(obj, outfile)

In [None]:
import re
import string

# Scratch check: does the digit pattern fire on a string that starts with a
# Persian digit?
inputString = "۱ dldl"
punctuation_pattern = "[" + re.escape('!"#$%&\'()*+-/:;<=>@[\\]^_`{|}~') + "]"
number_pattern = r"\d"
re.search(number_pattern, inputString) is not None

# Hugging Face dataset

In [None]:
# Load the "tep_en_fa_para" dataset and keep the "translation" column of its
# train split.
dataset = (
    load_dataset("tep_en_fa_para")
    ["train"]
    ["translation"]
)

In [4]:
# Accumulators: one token list per sentence, plus the flat token streams that
# are later de-duplicated into vocabularies.
dataset_en, dataset_fa = [], []
vocab_en, vocab_fa = [], []
# hazm normalizer applied to text before tokenization.
normalizer = Normalizer()
# Runs of these characters are collapsed into one space-padded occurrence.
# The dot is escaped: an unescaped r".+" would match (and replace) the whole
# sentence.
_SPACED_RUN_RULES = (
    (re.compile(r",+"), " , "),
    (re.compile(r"\.+"), " . "),
    (re.compile(r"_+"), " _ "),
)
# Characters removed outright: $ # = @ digits ~ ' > < + - / * "
# Inside a character class '$' is a literal dollar sign, not the
# end-of-string anchor.
_REMOVED_CHARS = re.compile(r"[$#=@\d~'><+\-/*\"]")


def _clean_sentence(sentence):
    """Apply the shared cleaning rules to one sentence and return the result."""
    for pattern, replacement in _SPACED_RUN_RULES:
        sentence = pattern.sub(replacement, sentence)
    return _REMOVED_CHARS.sub("", sentence)


def clean(farsi_s,eng_s):
    """Clean a Farsi/English sentence pair with identical rules.

    Runs of ',', '.' and '_' become a single space-padded character, and the
    characters $ # = @ ~ ' > < + - / * " as well as all digits are removed.

    Args:
        farsi_s: the Farsi sentence.
        eng_s: the English sentence.

    Returns:
        (cleaned_farsi, cleaned_english)
    """
    return _clean_sentence(farsi_s), _clean_sentence(eng_s)
# Tokenize every translation pair and collect the tokens per language.
for dic in tqdm(dataset):
    farsi_s = dic["fa"]
    # The English side lives under the "en" key (it was previously read
    # from "fa", so both sides held the same Farsi sentence).
    eng_s = dic["en"]
    farsi_s, eng_s = clean(farsi_s, eng_s)
    # Farsi: hazm normalizer + hazm tokenizer.
    dataset_fa.append(word_tokenize(normalizer.normalize(farsi_s)) + ["_EOS_"])
    # English: NLTK tokenizer, as in the wiki pipeline (needs the NLTK
    # 'punkt' resources to be available).
    dataset_en.append(eng_tokenize(eng_s) + ["_EOS_"])
    vocab_en += dataset_en[-1]
    vocab_fa += dataset_fa[-1]

  0%|          | 0/612087 [00:00<?, ?it/s]

In [5]:
# Collapse the raw token streams into sorted lists of unique tokens.
vocab_en = sorted(set(vocab_en))
vocab_fa = sorted(set(vocab_fa))

In [6]:
# Token -> position lookup tables, following the sorted vocabulary order.
en_index = dict(zip(vocab_en, range(len(vocab_en))))
fa_index = dict(zip(vocab_fa, range(len(vocab_fa))))

In [10]:
# Scratch check: turn commas into spaces, then tokenize with hazm.
s = "تاب,سس".replace(",", " ")
word_tokenize(s)

['تاب', 'سس']

In [12]:
# Scratch check: tokenize two words separated by several spaces; the recorded
# output shows just the two words.
word_tokenize("مسممس   مسمس")

['مسممس', 'مسمس']

In [38]:
# Scratch check: strip hyphens from a sample string.
s = "spsp -,slsl ,,, , , , ss,,,"
# Comma-run pattern kept from the experiment; the call below does not use it.
pat = r',+(?=\s*\b)'
# Raw string: "\-" in a normal string literal is an invalid escape sequence
# and triggers a warning on recent Python versions.
re.sub(r"\-", "", s)

'spsp ,slsl ,,, , , , ss,,,'