### Notes 

T5 Paper: https://arxiv.org/pdf/1910.10683.pdf

T5 Tokenizer: https://github.com/huggingface/transformers/blob/master/src/transformers/tokenization_t5.py

Important Tasks: https://docs.google.com/document/d/1weIZM6QTlnitpPQmpg-WeV2RW70TnYmDuogBQPr5mB0/edit

In [150]:
#James Chartouni
#Joey Park
#Raef Khan

import torch
import pandas as pd
import numpy as np
import pickle

import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import sentencepiece as spm

from transformers import T5Tokenizer


In [23]:
ls data/AD_NMT-master

LAV-MSA-2-both.pkl        Magribi_MSA-test.pkl      english-Arabic-train.pkl
LAV-MSA-2-test.pkl        Magribi_MSA-train.pkl     english-arabic-test.pkl
LAV-MSA-2-train.pkl       README.md
Magribi_MSA-both.pkl      english-Arabic-both.pkl


In [38]:
file_path = 'data/AD_NMT-master/'

with open(file_path + "english-Arabic-train.pkl", 'rb') as handle:
    train_ds = pickle.load(handle) 
    
with open(file_path + "english-Arabic-test.pkl", 'rb') as handle:
    test_ds = pickle.load(handle) 
    
with open(file_path + "LAV-MSA-2-both.pkl", 'rb') as handle:
    data_LAV_MSA = pickle.load(handle) 

with open(file_path + "english-Arabic-both.pkl", 'rb') as handle:
    data_English_MSA = pickle.load(handle) 
    
with open(file_path + "Magribi_MSA-both.pkl", 'rb') as handle:
    data_Magribi_MSA = pickle.load(handle) 
    

In [32]:
print(train_ds[0:5])
print(test_ds[0:5])

[['Tom was also there', 'كان توم هنا ايضا'], ['That old woman lives by herself', 'تلك المراة العجوز تسكن بمفردها'], ['He went abroad for the purpose of studying English', 'سافر خارج البلد ليتعلم الانجليزية'], ['There is a fork missing', 'هناك شوكة ناقصة'], ["I don't know this game", 'لا اعرف هذه اللعبة']]
[["Where's your money?", 'اين مالك؟'], ['Be prepared', 'كن مستعدا'], ["I figured you'd be impressed", 'توقعت انك ستنبهر'], ['May I come in?', 'هل بامكاني الدخول؟'], ['Read through the article', 'اقرا المقالة']]


## Build Vocabulary 

Sentence Piece Google Colab
https://github.com/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb

In [88]:
len(data_English_MSA)

10001

In [128]:
"""
Create a text file with all the MSA vocab available for SentencePiece to create a library 
"""

text_file_en = open("data/english_data.txt", "wt")
text_file_msa = open("data/arabic_data.txt", "wt")

MSA_text = ""
EN_text = ""

for line in data_English_MSA:
        english = line[0]
        english_words = english.split(" ")
        for count, word in enumerate(english_words):
            text_file_en.write(word)
        text_file_en.write("\n")
        
        arabic = line[1]
        arabic_words = arabic.split(" ")
        for count, word in enumerate(arabic_words):
            text_file_msa.write(word)
        text_file_msa.write("\n")


text_file.close()

In [138]:
spm.SentencePieceTrainer.train('--input=data/arabic_data.txt --model_prefix=data/msa --vocab_size=2000')
spm.SentencePieceTrainer.train('--input=data/english_data.txt --model_prefix=data/en --vocab_size=2000')

True

In [144]:
sp = spm.SentencePieceProcessor()
sp.load('en.model')

True

In [145]:
ls

Env_Setup_Instructions.txt  en.vocab
Model_1.ipynb               msa.model
README.md                   msa.vocab
[34mdata[m[m/                       [34mtransformers[m[m/
en.model


In [146]:
print(sp.encode_as_pieces('This is a test'))
print(sp.encode_as_ids('This is a test'))

['▁This', '▁', 'is', '▁', 'a', '▁', 't', 'est']
[89, 83, 12, 83, 8, 83, 6, 309]


## TF Tokenizer

https://huggingface.co/transformers/model_doc/t5.html#t5tokenizer

In [151]:
ls data/

[34mAD_NMT-master[m[m/          UNv1.0.ar-en.tar.gz.01  english_data.txt
AD_NMT-master.zip       arabic_data.txt         msa.model
PADIC.xml               en.model                msa.vocab
UNv1.0.ar-en.tar.gz.00  en.vocab


In [153]:
cat msa.vocab

<unk>	0
<s>	0
</s>	0
ا	-3.13275
▁	-3.1368
ت	-3.45305
ي	-3.56965
ال	-3.57586
؟	-3.73331
ب	-3.99997
م	-4.04006
ك	-4.05122
ة	-4.19093
ل	-4.22863
ه	-4.35769
ن	-4.60958
في	-4.72085
و	-4.78685
ع	-4.84808
▁ا	-4.85412
توم	-4.92814
من	-5.00118
ها	-5.08404
ف	-5.09055
ان	-5.12223
د	-5.12656
ر	-5.13282
ق	-5.13918
نا	-5.25225
ح	-5.31822
ني	-5.32711
ج	-5.3788
ط	-5.38223
ما	-5.40789
ص	-5.43782
خ	-5.44452
ش	-5.55425
ين	-5.66536
ز	-5.68319
ذلك	-5.6844
على	-5.69837
▁هل	-5.71044
لا	-5.73919
ى	-5.75836
الى	-5.78152
س	-5.78609
انت	-5.82002
!	-5.83917
ون	-5.86576
ية	-5.87871
▁انا	-5.89159
عن	-5.91744
ض	-5.92319
هذا	-5.94499
هنا	-5.95061
لي	-5.97166
الم	-5.97322
غ	-5.97556
ار	-5.99059
ست	-6.0039
ئ	-6.08552
اء	-6.10566
اني	-6.1508
فيال	-6.15133
ته	-6.15667
بال	-6.16746
كل	-6.17003
وا	-6.17988
▁سا	-6.18937
▁لقد	-6.20255
عمل	-6.24192
الان	-6.27717
هم	-6.28594
ث	-6.28779
ذ	-6.30442
بعد	-6.3089
ذهب	-6.31993
انا	-6.32773
ات	-6.34609
هو	

In [154]:
msa_tokenizer = T5Tokenizer("data/msa.vocab")
en_tokenizer = T5Tokenizer("data/en.vocab")

RuntimeError: Internal: /Users/travis/build/google/sentencepiece/src/sentencepiece_processor.cc(73) [model_proto->ParseFromArray(serialized.data(), serialized.size())] 