<a href="https://colab.research.google.com/github/jameschartouni/arabic_translation/blob/master/Model_1_Final_Backwards.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Notes 

T5 Paper: https://arxiv.org/pdf/1910.10683.pdf

T5 Tokenizer: https://github.com/huggingface/transformers/blob/master/src/transformers/tokenization_t5.py

Important Tasks: https://docs.google.com/document/d/1weIZM6QTlnitpPQmpg-WeV2RW70TnYmDuogBQPr5mB0/edit

In [1]:
#installation step
!pip install transformers
!pip install t5
!pip install sentencepiece
#creating the folders 
!mkdir data/
!mkdir data/AD_NMT-master
!mkdir data/train/
!mkdir data/test/
!mkdir data/val/
!mkdir data/model/
!mkdir data/config/
#fetching the pkl files
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1V9crCmqvgQcv0Sx2MCNWB9AET2j6M6FW' -O data/AD_NMT-master/english-Arabic-both.pkl
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1UzL4cOWTMCee83KBUh2QO_H62AFVpDQV' -O data/AD_NMT-master/LAV-MSA-2-both.pkl
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1UjDX7cCG2S23SPfSHxSPdVayMTxB5Y16' -O data/AD_NMT-master/Magribi_MSA-both.pkl
# !wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1fEVj9jCxvcKn9zg8lO43i2sWZquegg5H' -O data/operative_config.gin
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1UGKswXSqHSxWpx57cEDzvNeJaqbAuyt8' -O data/padic.xml

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ae/05/c8c55b600308dc04e95100dc8ad8a244dd800fe75dfafcf1d6348c6f6209/transformers-3.1.0-py3-none-any.whl (884kB)
[K     |████████████████████████████████| 890kB 3.4MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 15.2MB/s 
Collecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 18.2MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB

In [2]:
#James Chartouni
#Joey Park
#Raef Khan

import torch
from torch.optim import SGD
import pandas as pd
import numpy as np
import pickle
import os, io, glob
import functools

import sentencepiece as spm

import transformers
import t5
from t5.data import preprocessors
import tensorflow as tf
import tensorflow_datasets as tfds
from sklearn.model_selection import train_test_split
tf.compat.v1.enable_eager_execution()

INFO:tensorflow:tokens_length=568 inputs_length=512 targets_length=114 noise_density=0.15 mean_noise_span_length=3.0 


## Prepare Datasets

We need to take our training, validation and test sets from the pkl files and write them out as `.csv` files (one `source,target` pair per line) in the format that the T5 tasks defined below parse with `tf.data.TextLineDataset` and `tf.io.decode_csv`.

### PADIC Dataset Parsing

In [3]:
import xml.etree.ElementTree as ET

# Parse the PADIC corpus: each child of the root is one sentence element holding
# the same sentence in several dialects plus Modern Standard Arabic (MSA).
padic_tree = ET.parse('data/padic.xml')

# One list of [dialect_text, msa_text] pairs per dialect.
padic_alg_msa, padic_ann_msa, padic_syr_msa, padic_pal_msa, padic_mor_msa = [], [], [], [], []

for sentence in padic_tree.getroot():
  # The MSA rendering is shared by all five pairs, so look it up once.
  msa_text = sentence.find('MODERN-STANDARD-ARABIC').text.strip()
  padic_alg_msa.append([sentence.find('ALGIERS').text.strip(), msa_text])
  padic_ann_msa.append([sentence.find('ANNABA').text.strip(), msa_text])
  padic_syr_msa.append([sentence.find('SYRIAN').text.strip(), msa_text])
  padic_pal_msa.append([sentence.find('PALESTINIAN').text.strip(), msa_text])
  padic_mor_msa.append([sentence.find('MOROCCAN').text.strip(), msa_text])

In [4]:
# Show the first [dialect, MSA] pair of every PADIC dialect list.
for padic_pairs in (padic_alg_msa, padic_ann_msa, padic_syr_msa, padic_pal_msa, padic_mor_msa):
  print(padic_pairs[0])

['EAdw AlnAs ytbAkAw bdyt nhdr mn qlby tqwl nhdy fAlnAs', "tEAlt >SwAt AlnAs bAlbkA'،  bd>t >tHdv bAnfEAl w k>nny >hdy fy AlnAs"]
['EAdwA AlnAs ytbAkAw bdyt nhdr bg$ w qwl ElyA nhdy fy AlnAs', "tEAlt >SwAt AlnAs bAlbkA'،  bd>t >tHdv bAnfEAl w k>nny >hdy fy AlnAs"]
['Ely Swt AlnAs bAlbky w bl~$t >Hky bESbyp w k>ny Em Ahdy bAlnAs', "tEAlt >SwAt AlnAs bAlbkA'،  bd>t >tHdv bAnfEAl w k>nny >hdy fy AlnAs"]
['SArwA AlnAs ySyHwA bSwt EAly wbdyt AHky wAnA mnfEl wk>ny bhdy fy AlnAs', "tEAlt >SwAt AlnAs bAlbkA'،  bd>t >tHdv bAnfEAl w k>nny >hdy fy AlnAs"]
['nAs bdAw tytbAkAw wbdyt tnhdr b nfEl bHAl <lY tnhdy AlnAs', "tEAlt >SwAt AlnAs bAlbkA'،  bd>t >tHdv bAnfEAl w k>nny >hdy fy AlnAs"]


In [5]:
# Fixed seed so "Restart & Run All" reproduces the same split. Because every PADIC
# list is built from the same sentences in the same order, sharing one seed also
# puts each sentence on the same side (train or val) for all five dialects.
PADIC_SPLIT_SEED = 42

alg_msa_train, alg_msa_val = train_test_split(padic_alg_msa, test_size=.15, random_state=PADIC_SPLIT_SEED)
ann_msa_train, ann_msa_val = train_test_split(padic_ann_msa, test_size=.15, random_state=PADIC_SPLIT_SEED)
syr_msa_train, syr_msa_val = train_test_split(padic_syr_msa, test_size=.15, random_state=PADIC_SPLIT_SEED)
pal_msa_train, pal_msa_val = train_test_split(padic_pal_msa, test_size=.15, random_state=PADIC_SPLIT_SEED)
mor_msa_train, mor_msa_val = train_test_split(padic_mor_msa, test_size=.15, random_state=PADIC_SPLIT_SEED)

In [6]:
# Every dialect list has the same number of examples, so one pair of sizes is enough.
for padic_split in (alg_msa_train, alg_msa_val):
  print(len(padic_split))

6131
1082


### Initial Loading from Pickle

In [7]:
# Sanity check: list the pickle files fetched by the setup cell.
ls data/AD_NMT-master

english-Arabic-both.pkl  LAV-MSA-2-both.pkl  Magribi_MSA-both.pkl


In [8]:
file_path = 'data/AD_NMT-master/'


def _load_pairs(pickle_name):
    """Unpickle and return the object stored in ``file_path + pickle_name``."""
    # NOTE(review): pickle.load can execute arbitrary code; these files come from
    # a download made with certificate checks disabled, so trust the source first.
    with open(file_path + pickle_name, 'rb') as handle:
        return pickle.load(handle)


data_MSA_English_both = _load_pairs("english-Arabic-both.pkl")
data_LAV_MSA_both = _load_pairs("LAV-MSA-2-both.pkl")
data_Magribi_MSA_both = _load_pairs("Magribi_MSA-both.pkl")
    

In [9]:
# A few examples: head and tail of the English/MSA pairs, then the head of each dialect set.
for sample_pairs in (
    data_MSA_English_both[0:5],
    data_MSA_English_both[-5:],
    data_LAV_MSA_both[0:5],
    data_Magribi_MSA_both[0:5],
):
  print(sample_pairs)

[['Tom was also there', 'كان توم هنا ايضا'], ['That old woman lives by herself', 'تلك المراة العجوز تسكن بمفردها'], ['He went abroad for the purpose of studying English', 'سافر خارج البلد ليتعلم الانجليزية'], ['There is a fork missing', 'هناك شوكة ناقصة'], ["I don't know this game", 'لا اعرف هذه اللعبة']]
[['Please send us more information', 'ارسل الينا المزيد من المعلومات اذا تكرمت'], ['I am an only child', 'انا طفل وحيد ابي و امي'], ['Make good use of your time', 'استفد من وقتك جيدا'], ["Fighting won't settle anything", 'لن يحل القتال اي شيء'], ['Practice makes perfect', 'الممارسة هي الطريق الى الاتقان']]
[['لا انا بعرف وحدة راحت ع فرنسا و معا شنتا حطت فيها الفرش', 'لا اعرف واحدة ذهبت الى فرنسا و لها غرفة و ضعت فيها الافرشة'], ['روح بوشك و فتول عاليسار', 'اذهب تقدم و استدر يسارا'], ['لا لا لازم انه يكون عندك موضوع ما في اشي', ' لا لا يجب ان يكون لديك موضوع هذا ضروري'], ['اوعي تبعدي من هون بلاش تضيعي ', 'لا تبتعد عن هنا حتى لا تفقد الطريق '], ['قصدي صراحة يما انا كمان كرهته من يوم ما 

In [10]:
# Split every corpus 70/15/15 into train / validation / test.
# A fixed seed makes the split reproducible across kernel restarts, and the
# intermediate *_holdout names avoid re-assigning *_val from itself, so
# re-running this cell always starts from the full corpus.
SPLIT_SEED = 42

msa_en_train, msa_en_holdout = train_test_split(data_MSA_English_both, test_size=.3, random_state=SPLIT_SEED)
msa_en_val, msa_en_test = train_test_split(msa_en_holdout, test_size=.5, random_state=SPLIT_SEED)
lav_msa_train, lav_msa_holdout = train_test_split(data_LAV_MSA_both, test_size=.3, random_state=SPLIT_SEED)
lav_msa_val, lav_msa_test = train_test_split(lav_msa_holdout, test_size=.5, random_state=SPLIT_SEED)
mag_msa_train, mag_msa_holdout = train_test_split(data_Magribi_MSA_both, test_size=.3, random_state=SPLIT_SEED)
mag_msa_val, mag_msa_test = train_test_split(mag_msa_holdout, test_size=.5, random_state=SPLIT_SEED)

In [11]:
# Print train / validation / test sizes for each corpus, one number per line.
for corpus_splits in (
    (msa_en_train, msa_en_val, msa_en_test),
    (lav_msa_train, lav_msa_val, lav_msa_test),
    (mag_msa_train, mag_msa_val, mag_msa_test),
):
  for split_pairs in corpus_splits:
    print(len(split_pairs))

7000
1500
1501
11063
2371
2371
13815
2960
2961


In [12]:
# Root output directory read (as a global) by the list_to_* writer functions below.
# The trailing slash matters: the writers build paths by string concatenation.
file_path = 'data/'

# Lookup table from Arabic characters (and two ligature-like pairs) to Latin-script
# replacements. Built once at definition time instead of on every call.
_ARABIC_ENGLISH_MAPPING = {
    'ا':'a',
    'ب':'b',
    'ت':'t',
    'ث':'p',
    'ج':'g',
    'ح':'7',
    'خ':'x',
    'د':'d',
    'ذ':'v',
    'ر':'r',
    'ز':'z',
    'س':'s',
    'ش':'j',
    'ص':'9',
    'ض':'(',
    'ط':'6',
    'ظ':'^',
    'ع':'3',
    'غ':'#',
    'ف':'f',
    'ق':'q',
    'ك':'k',
    'ل':'l',
    'م':'m',
    'ن':'n',
    'ه':'h',
    'و':'w',
    'ي':'y',
    'ة':'t',
    'ء':'2',
    'أ':'2',
    'ؤ':'2',
    'ئ':'2',
    'ى':'Y',
    'لا':'la',
    'لأ':'la2',
    '؟':'?',
}


def remappings(arabic_char):
  """Return the Latin-script replacement for ``arabic_char``.

  Args:
    arabic_char: the string to look up. Most keys are single characters; the
      two-character keys only match when passed in as a whole.

  Returns:
    The mapped string, or ``arabic_char`` unchanged when it has no entry.
  """
  return _ARABIC_ENGLISH_MAPPING.get(arabic_char, arabic_char)

def list_to_csv_msa_en(ds, src='msa', trg='en', datatype=''):
    """Write each pair in ``ds`` as one ``first,second`` line of a CSV file.

    The file is ``<file_path>/<datatype>/<datatype>_<src>_<trg>.csv``, where
    ``file_path`` is the module-level output root. The directory must exist.

    Args:
        ds: iterable of two-item sequences of strings.
        src: source-language tag used in the file name.
        trg: target-language tag used in the file name.
        datatype: split name; used as both sub-directory and file-name prefix.
    """
    out_name = datatype + '_' + src + '_' + trg + '.' + 'csv'
    out_path = os.path.join(file_path, datatype, out_name)

    # Explicit UTF-8 so Arabic text is written the same way whatever the locale.
    with open(out_path, 'wt', encoding='utf-8') as out_file:
        for pair in ds:
            # NOTE(review): fields are not quoted, so a comma inside a sentence
            # yields an extra column for the CSV reader downstream.
            out_file.write(pair[0] + ',' + pair[1] + '\n')

def list_to_csv_ar(ds, src='msa', trg='en', datatype=''):
    """Write each pair in ``ds`` as one ``second,first`` line of a CSV file.

    Same as ``list_to_csv_msa_en`` but with the two columns swapped: element 1
    of each pair is written first, element 0 second. The file is
    ``<file_path>/<datatype>/<datatype>_<src>_<trg>.csv``; the directory must exist.

    Args:
        ds: iterable of two-item sequences of strings.
        src: source-language tag used in the file name.
        trg: target-language tag used in the file name.
        datatype: split name; used as both sub-directory and file-name prefix.
    """
    out_name = datatype + '_' + src + '_' + trg + '.' + 'csv'
    out_path = os.path.join(file_path, datatype, out_name)

    # Explicit UTF-8 so Arabic text is written the same way whatever the locale.
    with open(out_path, 'wt', encoding='utf-8') as out_file:
        for pair in ds:
            # NOTE(review): fields are not quoted, so a comma inside a sentence
            # yields an extra column for the CSV reader downstream.
            out_file.write(pair[1] + ',' + pair[0] + '\n')

def list_to_csv_test(ds, src='msa', trg='en', datatype=''):
    """Write one prompt line per pair, built from element 0 of each pair.

    Each line is ``translate <src> to <trg>: <pair[0]>``. The file is
    ``<file_path>/<datatype>/<datatype>_<src>_<trg>.csv``; the directory must exist.

    Args:
        ds: iterable of two-item sequences of strings; only element 0 is used.
        src: source-language name, used in both the prompt and the file name.
        trg: target-language name, used in both the prompt and the file name.
        datatype: split name; used as both sub-directory and file-name prefix.
    """
    out_name = datatype + '_' + src + '_' + trg + '.' + 'csv'
    out_path = os.path.join(file_path, datatype, out_name)

    # Lower-case "translate" matches the prefix the task preprocessors add at
    # training time (e.g. "translate English to MSA: "); the vocabulary is
    # case-sensitive, so a capitalised prefix would be tokenised differently.
    prompt = "translate " + src + " to " + trg + ": "

    # Explicit UTF-8 so Arabic text is written the same way whatever the locale.
    with open(out_path, 'wt', encoding='utf-8') as out_file:
        for pair in ds:
            out_file.write(prompt + pair[0] + '\n')

def list_to_csv_test_ar(ds, src='msa', trg='en', datatype=''):
    """Write one prompt line per pair, built from element 1 of each pair.

    Each line is ``translate <src> to <trg>: <pair[1]>``. The file is
    ``<file_path>/<datatype>/<datatype>_<src>_<trg>.csv``; the directory must exist.

    Args:
        ds: iterable of two-item sequences of strings; only element 1 is used.
        src: source-language name, used in both the prompt and the file name.
        trg: target-language name, used in both the prompt and the file name.
        datatype: split name; used as both sub-directory and file-name prefix.
    """
    out_name = datatype + '_' + src + '_' + trg + '.' + 'csv'
    out_path = os.path.join(file_path, datatype, out_name)

    # Lower-case "translate" matches the prefix the task preprocessors add at
    # training time (e.g. "translate MSA to Levantine: "); the vocabulary is
    # case-sensitive, so a capitalised prefix would be tokenised differently.
    prompt = "translate " + src + " to " + trg + ": "

    # Explicit UTF-8 so Arabic text is written the same way whatever the locale.
    with open(out_path, 'wt', encoding='utf-8') as out_file:
        for pair in ds:
            out_file.write(prompt + pair[1] + '\n')

In [13]:
# English/MSA pairs are stored as [english, arabic] (see the samples printed above),
# so list_to_csv_msa_en writes "english,arabic": English is the source column.
list_to_csv_msa_en(msa_en_train, 'en', 'msa', 'train')
list_to_csv_msa_en(msa_en_val, 'en', 'msa', 'val')
# Test files hold prompt lines only, built from element 0 (the English sentence).
list_to_csv_test(msa_en_test, 'English', 'MSA', 'test')

# Levantine pairs appear to be [dialect, msa]; list_to_csv_ar swaps the columns
# so MSA becomes the source and the dialect the target.
list_to_csv_ar(lav_msa_train, 'msa', 'lav', 'train')
list_to_csv_ar(lav_msa_val, 'msa', 'lav', 'val')
# Prompt lines built from element 1 of each pair.
list_to_csv_test_ar(lav_msa_test, 'MSA', 'Levantine', 'test')

# Same treatment for the Maghrebi pairs.
list_to_csv_ar(mag_msa_train, 'msa', 'mag', 'train')
list_to_csv_ar(mag_msa_val, 'msa', 'mag', 'val')
list_to_csv_test_ar(mag_msa_test, 'MSA', 'Maghrabi', 'test')

## Training SentencePiece Model

In [14]:
# Combine the training pairs of all three corpora (English/MSA, Maghrebi/MSA,
# Levantine/MSA); only training data feeds the SentencePiece vocabulary.
spm_input_ds = msa_en_train + mag_msa_train + lav_msa_train
#for i, v in enumerate(spm_input_ds):
  #print(v)

In [15]:
def list_to_input(ds):
    """Write both sides of every pair in ``ds`` to ``<file_path>/spm_input.txt``.

    Each pair contributes two lines (element 0, then element 1), giving the
    one-sentence-per-line layout of the SentencePiece training input.

    Args:
        ds: iterable of two-item sequences of strings.
    """
    out_name = 'spm_input' + '.' + 'txt'
    # os.path.join avoids the doubled slash the old concatenation produced when
    # file_path already ends with '/'.
    out_path = os.path.join(file_path, out_name)

    # Explicit UTF-8 so Arabic text is written the same way whatever the locale.
    with open(out_path, 'wt', encoding='utf-8') as sentencelinefile:
        for pair in ds:
            sentencelinefile.write(pair[0] + '\n' + pair[1] + '\n')

In [16]:
# Writes data/spm_input.txt, the file the SentencePiece trainer reads in the next cell.
list_to_input(spm_input_ds)

In [17]:
VOCAB_SIZE = 32128

# Trainer flags, one per entry; joined with single spaces into the argument string.
_spm_train_flags = [
    '--input=data/spm_input.txt',
    '--model_prefix=data/model/spm',   # writes data/model/spm.model (loaded later via `filepath`)
    '--vocab_size=' + str(VOCAB_SIZE),
    '--unk_id=2',
    '--bos_id=-1',                     # -1: no BOS piece
    '--eos_id=1',
    '--pad_id=0',
    '--hard_vocab_limit=False',
]
spm.SentencePieceTrainer.train(' '.join(_spm_train_flags))

In [18]:
# Path of the SentencePiece model trained above; passed to SentencePieceVocabulary
# when the tasks are registered. Not to be confused with `file_path` (the data root).
filepath = 'data/model/spm.model'

## Tensor Processing + Add to TaskRegistry

In [19]:
#  !wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=13yx1kO7Skr39VZXI8V8SOl-OhXFGafzE' -O data/key.json

--2020-09-02 01:23:56--  https://docs.google.com/uc?export=download&id=13yx1kO7Skr39VZXI8V8SOl-OhXFGafzE
Resolving docs.google.com (docs.google.com)... 172.217.212.139, 172.217.212.113, 172.217.212.100, ...
Connecting to docs.google.com (docs.google.com)|172.217.212.139|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-14-7c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/1nhh3lt839ffh9pkv48hep03pl41fbb2/1599009825000/01542315482457206469/*/13yx1kO7Skr39VZXI8V8SOl-OhXFGafzE?e=download [following]
--2020-09-02 01:23:56--  https://doc-14-7c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/1nhh3lt839ffh9pkv48hep03pl41fbb2/1599009825000/01542315482457206469/*/13yx1kO7Skr39VZXI8V8SOl-OhXFGafzE?e=download
Resolving doc-14-7c-docs.googleusercontent.com (doc-14-7c-docs.googleusercontent.com)... 172.217.214.132, 2607:f8b0:4001:c05::84
Connecting to doc-14-7c-docs.googleusercontent.com (doc-14

In [20]:
# import os
# os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'data/key.json'

In [19]:
print("Installing dependencies...")
# Colab magic selecting the TensorFlow 2.x runtime.
%tensorflow_version 2.x
# NOTE(review): t5 is already installed by the first cell; this repeats it quietly.
!pip install -q t5

import functools
import os
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds

import t5

# Bucket root (Colab form field). DATA_DIR / MODELS_DIR are derived from it.
# NOTE(review): later cells in the visible notebook use hard-coded
# gs://bucket-remove-excess-params/... paths rather than these constants.
BASE_DIR = "gs://mllu-t5" #@param { type: "string" }
if not BASE_DIR or BASE_DIR == "gs://":
  raise ValueError("You must enter a BASE_DIR.")
DATA_DIR = os.path.join(BASE_DIR, "data")
MODELS_DIR = os.path.join(BASE_DIR, "models")
ON_CLOUD = True


if ON_CLOUD:
  print("Setting up GCS access...")
  import tensorflow_gcs_config
  from google.colab import auth
  # Set credentials for GCS reading/writing from Colab and TPU.
  try:
    # tpu = tf.distribute.cluster_resolver.TPUClusterResolver('mllu-tpu', zone = 'us-central1-b', project = 'mllu-t5')
    # TPU_ADDRESS = tpu.get_master()
    # TPU_TOPOLOGY = 'v2-8'

    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    TPU_ADDRESS = tpu.get_master()
    TPU_TOPOLOGY = "v2-8"
    print('Running on TPU:', TPU_ADDRESS)
  except ValueError as err:
    # RuntimeError (not bare BaseException) so ordinary `except Exception`
    # handlers and tooling see it; `from err` keeps the resolver's own message.
    raise RuntimeError('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err
  auth.authenticate_user()
  tf.config.experimental_connect_to_host(TPU_ADDRESS)
  tensorflow_gcs_config.configure_gcs_from_colab_auth()

# From here on `tf` (tensorflow.compat.v1) runs with TF2 behaviour disabled.
tf.disable_v2_behavior()

# Improve logging.
from contextlib import contextmanager
import logging as py_logging

if ON_CLOUD:
  tf.get_logger().propagate = False
  py_logging.root.setLevel('INFO')

@contextmanager
def tf_verbosity_level(level):
  """Temporarily set TensorFlow's logging verbosity to ``level``.

  Use as ``with tf_verbosity_level(...):``. The previous verbosity is restored
  when the block exits, including when the block raises.

  Args:
    level: value passed to ``tf.logging.set_verbosity``.
  """
  og_level = tf.logging.get_verbosity()
  tf.logging.set_verbosity(level)
  try:
    yield
  finally:
    # Without try/finally an exception inside the `with` body would leave the
    # temporary verbosity in place for the rest of the session.
    tf.logging.set_verbosity(og_level)

Installing dependencies...
Setting up GCS access...
Running on TPU: grpc://10.61.25.138:8470


INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0


Instructions for updating:
non-resource variables are not supported in the long term


Instructions for updating:
non-resource variables are not supported in the long term


### English to MSA Task (reverse of Arabic to English)

In [38]:
# GCS location of the English -> MSA CSV for each split.
# NOTE(review): nothing in the visible notebook uploads the locally written
# data/train/train_en_msa.csv to this bucket; confirm the copies match the current split.
msa_en_split_csv_path = dict(
    train="gs://bucket-remove-excess-params/data/train_backwards/train_en_msa.csv",
    validation="gs://bucket-remove-excess-params/data/val_backwards/val_en_msa.csv",
)
# Example counts per split, taken from the in-memory splits.
msa_en_example_count = dict(
    train=len(msa_en_train),
    validation=len(msa_en_val),
)

In [39]:
def msa_en_translation_dataset_fn(split, shuffle_files=False):
  """Return a dataset of ``{"source", "target"}`` string dicts for ``split``.

  Reads the CSV registered for ``split`` in ``msa_en_split_csv_path`` line by
  line and splits each line on "," into two string columns (quotes are not
  treated specially). ``shuffle_files`` is accepted but not used.
  """
  decode_pair = functools.partial(
      tf.io.decode_csv,
      record_defaults=["", ""],
      field_delim=",",
      use_quote_delim=False,
  )
  csv_lines = tf.data.TextLineDataset(msa_en_split_csv_path[split])
  column_pairs = csv_lines.map(
      decode_pair, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  # Name the two columns so the preprocessor can address them by key.
  return column_pairs.map(lambda *fields: dict(zip(["source", "target"], fields)))

In [40]:
#turn the ds of dictionaries and change the keys to inputs and targets that the model
def msa_en_translation_preprocessor(ds):
  """Map ``{"source", "target"}`` examples to the ``inputs``/``targets`` keys.

  ``inputs`` is the source sentence prefixed with the task prompt; ``targets``
  is the target sentence unchanged.
  """
  task_prefix = "translate English to MSA: "

  def add_task_prefix(example):
    return {
        "inputs": tf.strings.join([task_prefix, example["source"]]),
        "targets": example["target"],
    }

  return ds.map(add_task_prefix, num_parallel_calls=tf.data.experimental.AUTOTUNE)

In [41]:
# Remove any earlier registration so this cell can be re-run, then (re)register the task.
t5.data.TaskRegistry.remove('translation_msa_en')
t5.data.TaskRegistry.add(
    # Name of the Task. Despite the name, the prompt direction is English -> MSA.
    'translation_msa_en',
    # Supply a function which returns a tf.data.Dataset
    dataset_fn=msa_en_translation_dataset_fn,
    splits=["train", "validation"],
    # Supply a function which preprocesses text from the tf.data.Dataset.
    text_preprocessor=[msa_en_translation_preprocessor],
    # Lowercase targets before computing metrics.

    postprocess_fn=t5.data.postprocessors.lower_text, 

    # BLEU is the evaluation metric.

    metric_fns=[t5.evaluation.metrics.bleu],
    #metric_fns=[],

    # Not required, but helps for mixing and auto-caching.
    num_input_examples=msa_en_example_count,
    # Tokenise with the SentencePiece model trained earlier (`filepath`).
    output_features=t5.data.Feature(vocabulary=t5.data.SentencePieceVocabulary(filepath)),
    # Disabled: denoising token preprocessor pipeline (kept for reference).
    # token_preprocessor=[
    #   functools.partial(
    #       preprocessors.select_random_chunk,
    #       feature_key="targets",
    #       max_length=65536
    #   ),
    #   functools.partial(
    #       preprocessors.reduce_concat_tokens,
    #       feature_key="targets",
    #       batch_size=128
    #   ),
    #   preprocessors.split_tokens_to_inputs_length,
    #   functools.partial(
    #       preprocessors.denoise,
    #       inputs_fn=preprocessors.noise_span_to_unique_sentinel,
    #       targets_fn=preprocessors.nonnoise_span_to_unique_sentinel,
    #       noise_density=0.15,
    #       noise_mask_fn=preprocessors.iid_noise_mask,
    #   )
    # ]
)

In [42]:
# Sanity check: fetch the registered task and print five tokenised validation
# examples, using 32-token limits for both inputs and targets.
review_task = t5.data.TaskRegistry.get("translation_msa_en")
ds = review_task.get_dataset(split="validation", sequence_length={"inputs": 32, "targets": 32})
print("A few preprocessed validation examples...")
for ex in tfds.as_numpy(ds.take(5)):
  print(ex)

  return dataset.map(my_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)


A few preprocessed validation examples...
{'inputs_plaintext': b'translate English to MSA: Whose books are these?', 'inputs': array([ 6537,  1193,    51,   325, 11829,  1291,  1238,  1259,  2394,
         243,  4493,    62,     1]), 'targets_plaintext': b'\xd9\x84\xd9\x85\xd9\x86 \xd9\x87\xd8\xb0\xd9\x87 \xd8\xa7\xd9\x84\xd9\x83\xd8\xaa\xd8\xa8\xd8\x9f', 'targets': array([1825,   80, 4597,   26,    1])}
{'inputs_plaintext': b'translate English to MSA: He was able to build a small house', 'inputs': array([ 6537,  1193,    51,   325, 11829,  1291,   174,   199,     5,
        2429,    51, 13611,    75,  3778,   922,     1]), 'targets_plaintext': b'\xd8\xa7\xd8\xb3\xd8\xaa\xd8\xb7\xd8\xa7\xd8\xb9 \xd8\xa7\xd9\x86 \xd9\x8a\xd8\xa8\xd9\x86\xd9\x8a \xd8\xa8\xd9\x8a\xd8\xaa\xd8\xa7 \xd8\xb5\xd8\xba\xd9\x8a\xd8\xb1\xd8\xa7', 'targets': array([7556,   14, 6498,   41, 6973,  350,    6,    1])}
{'inputs_plaintext': b'translate English to MSA: Tom likes tea better than coffee', 'inputs': array([ 6

### MSA to Levantine Task (reverse of Levantine to MSA)

In [43]:
# GCS location of the MSA -> Levantine CSV for each split.
# NOTE(review): nothing in the visible notebook uploads the locally written CSVs
# to this bucket; confirm the copies match the current split.
lav_msa_split_csv_path = dict(
    train="gs://bucket-remove-excess-params/data/train_backwards/train_msa_lav.csv",
    validation="gs://bucket-remove-excess-params/data/val_backwards/val_msa_lav.csv",
)
# Example counts per split, taken from the in-memory splits.
lav_msa_example_count = dict(
    train=len(lav_msa_train),
    validation=len(lav_msa_val),
)

In [44]:
def lav_msa_translation_dataset_fn(split, shuffle_files=False):
  """Return a dataset of ``{"source", "target"}`` string dicts for ``split``.

  Reads the CSV registered for ``split`` in ``lav_msa_split_csv_path`` line by
  line and splits each line on "," into two string columns (quotes are not
  treated specially). ``shuffle_files`` is accepted but not used.
  """
  decode_pair = functools.partial(
      tf.io.decode_csv,
      record_defaults=["", ""],
      field_delim=",",
      use_quote_delim=False,
  )
  csv_lines = tf.data.TextLineDataset(lav_msa_split_csv_path[split])
  column_pairs = csv_lines.map(
      decode_pair, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  # Name the two columns so the preprocessor can address them by key.
  return column_pairs.map(lambda *fields: dict(zip(["source", "target"], fields)))

# Sanity check: print the first five raw (UTF-8 byte string) training examples.
for example in tfds.as_numpy(lav_msa_translation_dataset_fn("train").take(5)):
    print(example)

{'source': b'\xd9\x88 \xd8\xb1\xd9\x81\xd8\xb9 \xd8\xb5\xd9\x88\xd8\xaa\xd9\x87 \xd9\x82\xd9\x84\xd9\x8a\xd9\x84\xd8\xa7 \xd9\x82\xd8\xaf\xd8\xa7\xd9\x85\xd9\x83 \xd8\xb4\xd8\xac\xd8\xb1\xd9\x87 \xd9\x88 \xd9\x84\xd9\x85 \xd9\x8a\xd8\xac\xd9\x8a\xd8\xa8 \xd8\xb9\xd9\x84\xd9\x8a\xd9\x87', 'target': b'\xd8\xb9\xd9\x84\xd9\x89 \xd8\xb5\xd9\x88\xd8\xaa\xd9\x88 \xd9\x88\xd8\xad\xd9\x83\xd9\x89 \xd9\x82\xd8\xaf\xd8\xa7\xd9\x85\xd9\x83 \xd8\xb4\xd8\xac\xd8\xb1\xd8\xa9 \xd8\xa8\xd8\xb1\xd8\xb6\xd9\x88 \xd9\x85\xd8\xa7 \xd8\xb1\xd8\xaf \xd8\xb9\xd9\x84\xd9\x8a'}
{'source': b'\xd8\xa8\xd8\xa7\xd9\x84\xd8\xb7\xd8\xa8\xd8\xb9', 'target': b'\xd8\xb7\xd8\xa8\xd8\xb9\xd8\xa7'}
{'source': b'\xd8\xa7\xd9\x87\xd9\x84\xd8\xa7 \xd8\xa7\xd9\x86\xd8\xb3\xd8\xaa\xd9\x89 \xd9\x85\xd9\x8a\xd9\x88\xd8\xaf\xd8\xa9 \xd8\xa7\xd9\x88\xd9\x84\xd8\xa7 \xd8\xa7\xd9\x86\xd8\xaa\xd9\x8a \xd9\x84\xd8\xb3\xd9\x87 \xd8\xb5\xd8\xba\xd9\x8a\xd8\xb1\xd8\xa9 \xd8\xb9\xd9\x84\xd9\x8a \xd8\xa7\xd9\x84\xd8\xac\xd9\x88\xd8\xa7\xd8

In [45]:
#turn the ds of dictionaries and change the keys to inputs and targets that the model
def lav_msa_translation_preprocessor(ds):
  """Map ``{"source", "target"}`` examples to the ``inputs``/``targets`` keys.

  ``inputs`` is the source sentence prefixed with the task prompt; ``targets``
  is the target sentence unchanged.
  """
  task_prefix = "translate MSA to Levantine: "

  def add_task_prefix(example):
    return {
        "inputs": tf.strings.join([task_prefix, example["source"]]),
        "targets": example["target"],
    }

  return ds.map(add_task_prefix, num_parallel_calls=tf.data.experimental.AUTOTUNE)

In [46]:
# Remove any earlier registration so this cell can be re-run, then (re)register the task.
t5.data.TaskRegistry.remove('translation_lav_msa')
t5.data.TaskRegistry.add(
    # Name of the Task. Despite the name, the prompt direction is MSA -> Levantine.
    'translation_lav_msa',
    # Supply a function which returns a tf.data.Dataset
    dataset_fn=lav_msa_translation_dataset_fn,
    splits=["train", "validation"],
    # Supply a function which preprocesses text from the tf.data.Dataset.
    text_preprocessor=[lav_msa_translation_preprocessor],
    # Lowercase targets before computing metrics.

    postprocess_fn = t5.data.postprocessors.lower_text, 

    # BLEU is the evaluation metric.

    metric_fns=[t5.evaluation.metrics.bleu],
    #metric_fns=[],

    # Not required, but helps for mixing and auto-caching.
    num_input_examples=lav_msa_example_count,
    # Tokenise with the SentencePiece model trained earlier (`filepath`).
    output_features=t5.data.Feature(vocabulary=t5.data.SentencePieceVocabulary(filepath)),
    # Disabled: denoising token preprocessor pipeline (kept for reference).
    # token_preprocessor=[
    #   functools.partial(
    #       preprocessors.select_random_chunk,
    #       feature_key="targets",
    #       max_length=65536
    #   ),
    #   functools.partial(
    #       preprocessors.reduce_concat_tokens,
    #       feature_key="targets",
    #       batch_size=128
    #   ),
    #   preprocessors.split_tokens_to_inputs_length,
    #   functools.partial(
    #       preprocessors.denoise,
    #       inputs_fn=preprocessors.noise_span_to_unique_sentinel,
    #       targets_fn=preprocessors.nonnoise_span_to_unique_sentinel,
    #       noise_density=0.15,
    #       noise_mask_fn=preprocessors.iid_noise_mask,
    #   )
    # ]
)

### MSA to Maghrebi Task (reverse of Maghrebi to MSA)

In [47]:
# GCS location of the MSA -> Maghrebi CSV for each split.
# NOTE(review): nothing in the visible notebook uploads the locally written CSVs
# to this bucket; confirm the copies match the current split.
mag_msa_split_csv_path = dict(
    train="gs://bucket-remove-excess-params/data/train_backwards/train_msa_mag.csv",
    validation="gs://bucket-remove-excess-params/data/val_backwards/val_msa_mag.csv",
)
# Example counts per split, taken from the in-memory splits.
mag_msa_example_count = dict(
    train=len(mag_msa_train),
    validation=len(mag_msa_val),
)

In [48]:
def mag_msa_translation_dataset_fn(split, shuffle_files=False):
  """Return a dataset of ``{"source", "target"}`` string dicts for ``split``.

  Reads the CSV registered for ``split`` in ``mag_msa_split_csv_path`` line by
  line and splits each line on "," into two string columns (quotes are not
  treated specially). ``shuffle_files`` is accepted but not used.
  """
  decode_pair = functools.partial(
      tf.io.decode_csv,
      record_defaults=["", ""],
      field_delim=",",
      use_quote_delim=False,
  )
  csv_lines = tf.data.TextLineDataset(mag_msa_split_csv_path[split])
  column_pairs = csv_lines.map(
      decode_pair, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  # Name the two columns so the preprocessor can address them by key.
  return column_pairs.map(lambda *fields: dict(zip(["source", "target"], fields)))

# Sanity check: print the first five training examples, raw and decoded.
for example in tfds.as_numpy(mag_msa_translation_dataset_fn("train").take(5)):
    print(example)
    print(example['source'].decode())
    print(example['target'].decode())
    # Lengths are in bytes (the values are byte strings), not characters.
    print(len(example['source']))
    print(len(example['target']))

{'source': b'\xd8\xb3\xd8\xa7\xd8\xb9\xd9\x88\xd8\xaf', 'target': b'\xd8\xaf\xd9\x88\xd9\x83 \xd9\x86\xd9\x88\xd9\x84\xd9\x8a'}
ساعود
دوك نولي
10
15
{'source': b'\xd9\x88 \xd8\xa7\xd8\xaa\xd8\xb5\xd9\x84\xd9\x8a \xd8\xa8\xd8\xa8\xd8\xa7\xd9\x82\xd9\x8a \xd9\x84\xd9\x83\xd9\x8a \xd9\x8a\xd9\x82\xd9\x88\xd9\x84 \xd9\x84\xd9\x83\xd9\x88\xd9\x83\xd8\xa9 \xd8\xa7\xd9\x86 \xd8\xaa\xd8\xac\xd9\x84\xd8\xa8 \xd9\x84\xd9\x86\xd8\xa7 \xd8\xa7\xd8\xa7\xd9\x84\xd8\xae\xd8\xa8\xd8\xb2', 'target': b'\xd9\x88 \xd8\xb9\xd9\x8a\xd8\xb7\xd9\x8a \xd9\x84 \xd8\xa8\xd8\xa7\xd9\x82\xd9\x8a \xd8\xa8\xd8\xa7\xd8\xb4 \xd8\xaa\xd9\x82\xd9\x88\xd9\x84 \xd9\x84 \xd9\x83\xd9\x88\xd9\x83\xd8\xa9 \xd8\xaa\xd8\xac\xd9\x8a\xd8\xa8 \xd9\x84\xd9\x8a\xd9\x86\xd8\xa7 \xd8\xa7\xd9\x84\xd8\xae\xd8\xa8\xd8\xb2'}
و اتصلي بباقي لكي يقول لكوكة ان تجلب لنا االخبز
و عيطي ل باقي باش تقول ل كوكة تجيب لينا الخبز
85
80
{'source': b'\xd9\x84\xd8\xa7 \xd8\xa7\xd9\x86\xd8\xa7 \xd9\x84\xd8\xa7 \xd8\xa7\xd8\xb9\xd8\xaa\xd9\x82\xd8\xaf \xd8

In [49]:
  #turn the ds of dictionaries and change the keys to inputs and targets that the model
def mag_msa_translation_preprocessor(ds):
  """Map ``{"source", "target"}`` examples to the ``inputs``/``targets`` keys.

  ``inputs`` is the source sentence prefixed with the task prompt; ``targets``
  is the target sentence unchanged.
  """
  task_prefix = "translate MSA to Maghrabi: "

  def add_task_prefix(example):
    return {
        "inputs": tf.strings.join([task_prefix, example["source"]]),
        "targets": example["target"],
    }

  return ds.map(add_task_prefix, num_parallel_calls=tf.data.experimental.AUTOTUNE)

In [50]:
# Remove any earlier registration so this cell can be re-run, then (re)register the task.
t5.data.TaskRegistry.remove('translation_mag_msa')
t5.data.TaskRegistry.add(
    # Name of the Task. Despite the name, the prompt direction is MSA -> Maghrebi.
    'translation_mag_msa',
    # Supply a function which returns a tf.data.Dataset
    dataset_fn=mag_msa_translation_dataset_fn,
    splits=["train", "validation"],
    # Supply a function which preprocesses text from the tf.data.Dataset.
    text_preprocessor=[mag_msa_translation_preprocessor],
    # Lowercase targets before computing metrics.

    postprocess_fn = t5.data.postprocessors.lower_text, 
    
    # BLEU is the evaluation metric.
    
    metric_fns=[t5.evaluation.metrics.bleu],
    #metric_fns=[],
    
    # Not required, but helps for mixing and auto-caching.
    num_input_examples=mag_msa_example_count,
    # Tokenise with the SentencePiece model trained earlier (`filepath`).
    output_features=t5.data.Feature(vocabulary=t5.data.SentencePieceVocabulary(filepath)),
    # Disabled: denoising token preprocessor pipeline (kept for reference).
    # token_preprocessor=[
    #   functools.partial(
    #       preprocessors.select_random_chunk,
    #       feature_key="targets",
    #       max_length=65536
    #   ),
    #   functools.partial(
    #       preprocessors.reduce_concat_tokens,
    #       feature_key="targets",
    #       batch_size=128
    #   ),
    #   preprocessors.split_tokens_to_inputs_length,
    #   functools.partial(
    #       preprocessors.denoise,
    #       inputs_fn=preprocessors.noise_span_to_unique_sentinel,
    #       targets_fn=preprocessors.nonnoise_span_to_unique_sentinel,
    #       noise_density=0.15,
    #       noise_mask_fn=preprocessors.iid_noise_mask,
    #   )
    # ]
)

## Dataset Mixture

In [51]:
# Remove any earlier registration so this cell can be re-run, then register a
# mixture of the three translation tasks, each with the same rate of 1.0.
t5.data.MixtureRegistry.remove("translation_msa")
t5.data.MixtureRegistry.add(
    "translation_msa",
    ["translation_msa_en", "translation_lav_msa", "translation_mag_msa"],
     default_rate=1.0
)

## Pre-Training

In [36]:
#gotta get the base config and add the new tasks' task params
#!wget "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json" -O data/config/t5-base-config.json

In [37]:
# if torch.cuda.is_available():
#     device = torch.device("cuda")
# else:
#     device = torch.device("cpu")

# #Using the base config from Huggingface T5 Model
# config = transformers.T5Config.from_json_file(json_file="data/config/t5-base-config.json")
# model = t5.models.HfPyTorchModel(config, "/tmp/hft5/", device)

In [38]:
#ls /tmp/hft5

In [52]:
 # Fetch the gin operative config parsed in the next code cell.
 !wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1pjeI4EBOjES-gZ-s4rdVlKCAqWYV5FU7' -O data/operative_config.gin
 # NOTE(review): key.json looks like a GCP credentials file (see the commented-out
 # GOOGLE_APPLICATION_CREDENTIALS cell) served from a shareable link with TLS checks
 # disabled. No visible cell uses it; if it is a real key, rotate it and stop distributing it this way.
 !wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=13yx1kO7Skr39VZXI8V8SOl-OhXFGafzE' -O data/key.json

--2020-09-02 14:52:57--  https://docs.google.com/uc?export=download&id=1pjeI4EBOjES-gZ-s4rdVlKCAqWYV5FU7
Resolving docs.google.com (docs.google.com)... 172.217.219.102, 172.217.219.101, 172.217.219.138, ...
Connecting to docs.google.com (docs.google.com)|172.217.219.102|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-0g-7c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/mrqjjdcbopearuikv1hnbag7a7ll6g9v/1599058350000/01542315482457206469/*/1pjeI4EBOjES-gZ-s4rdVlKCAqWYV5FU7?e=download [following]
--2020-09-02 14:52:58--  https://doc-0g-7c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/mrqjjdcbopearuikv1hnbag7a7ll6g9v/1599058350000/01542315482457206469/*/1pjeI4EBOjES-gZ-s4rdVlKCAqWYV5FU7?e=download
Resolving doc-0g-7c-docs.googleusercontent.com (doc-0g-7c-docs.googleusercontent.com)... 172.217.212.132, 2607:f8b0:4001:c03::84
Connecting to doc-0g-7c-docs.googleusercontent.com (doc-0g

In [40]:
#pip install --upgrade google-cloud-storage

In [53]:
 # Load the downloaded operative config into gin. unlock_config() allows parsing
 # even if the gin configuration has already been locked.
 import gin
 with gin.unlock_config():
   gin.parse_config_file("data/operative_config.gin")

In [54]:
# Re-resolve the TPU address for the model below.
# NOTE(review): this overrides the TPU_TOPOLOGY = "v2-8" set in the setup cell with
# '2x2'; confirm which form the installed t5 version expects.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
TPU_ADDRESS = tpu.get_master()
TPU_TOPOLOGY = '2x2'
print(TPU_ADDRESS)

grpc://10.61.25.138:8470


In [57]:
# tpu = tf.distribute.cluster_resolver.TPUClusterResolver('mllu-tpu', zone = 'us-central1-b', project = 'mllu-t5')
# TPU_ADDRESS = tpu.get_master()
# TPU_TOPOLOGY = 'v2-8'
# TPU_ADDRESS

MODEL_SIZE = 'base'
# Per-size settings: (model_parallelism, train_batch_size, keep_checkpoint_max).
# Indexed by MODEL_SIZE (previously the literal 'base', so changing MODEL_SIZE
# silently had no effect).
model_parallelism, train_batch_size, keep_checkpoint_max = {
    "small": (1, 256, 16),
    "base": (2, 128, 8),
    "large": (8, 64, 4),
    "3B": (8, 16, 1),
    "11B": (8, 16, 1)}[MODEL_SIZE]


# NOTE(review): keep_checkpoint_max is unpacked above but not passed to MtfModel;
# pass it if the per-size retention limit is meant to apply.
model = t5.models.MtfModel(
    model_dir='gs://bucket-remove-excess-params/model/final_backwards_individual',
    tpu=TPU_ADDRESS,
    tpu_topology=TPU_TOPOLOGY,
    model_parallelism=model_parallelism,
    batch_size=train_batch_size,
    # Same 32-token limits as the dataset preview earlier in the notebook.
    sequence_length={"inputs": 32, "targets": 32},
    learning_rate_schedule=0.003,
    save_checkpoints_steps=10000,
    iterations_per_loop=2000,
)

In [58]:
# Train on the three-task mixture for 10000 steps; this matches
# save_checkpoints_steps above, so a checkpoint exists at step 10000 for eval.
model.train(mixture_or_task_name='translation_msa', steps=10000)

INFO:tensorflow:Using config: {'_model_dir': 'gs://bucket-remove-excess-params/model/final_backwards', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.61.25.138:8470"
    }
  }
}
isolate_session_state: true
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({'worker': ['10.61.25.138:8470']}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.61.25.138:8470', '_evaluation_master': 'grpc://10.61.25.138:8470', '_is_chief': True, '_num_ps_replicas': 0, '

  return dataset.map(my_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)


INFO:tensorflow:num_cores_per_replica: 1
INFO:tensorflow:computation_shape: [1, 1, 1, 1]
INFO:tensorflow:num_replicas: 8
INFO:tensorflow:device_assignment.topology.device_coordinates: [[[0 0 0 0]
  [0 0 0 1]
  [1 0 0 0]
  [1 0 0 1]
  [0 1 0 0]
  [0 1 0 1]
  [1 1 0 0]
  [1 1 0 1]]]
INFO:tensorflow:device_assignment.core_assignment: [[[0 0 0 0]]

 [[0 0 0 1]]

 [[1 0 0 0]]

 [[1 0 0 1]]

 [[0 1 0 0]]

 [[0 1 0 1]]

 [[1 1 0 0]]

 [[1 1 0 1]]]
INFO:tensorflow:auto_logical_to_physical_tpu logical_shape=[4, 2] physical_shape=[2, 2, 2]
INFO:tensorflow:auto_logical_to_physical_tpu logical_shape=[2] physical_shape=[1, 1, 2]
INFO:tensorflow:auto_logical_to_physical_tpu logical_to_physical = [(0, 0, 0), (0, 0, 1)]
INFO:tensorflow:auto_logical_to_physical_tpu logical_to_physical = [(0, 0, 0), (0, 0, 1), (0, 1, 0), (0, 1, 1), (1, 1, 0), (1, 1, 1), (1, 0, 0), (1, 0, 1)]
INFO:tensorflow:SimdMeshImpl init: Shape[batch=4, model=2] LayoutRules{('batch', 'batch'), ('ensemble', 'ensemble'), ('heads', 'mo

In [None]:
#STEPS = 10000 #@param {type: "integer"}
#model.train(
#    mixture_or_task_name="translation_msa",
#    steps=STEPS,
#    save_steps=STEPS/5,                                                   
#    sequence_length={"inputs": 32, "targets": 32},
#    split="train",
#    batch_size=32,
#    optimizer=functools.partial(transformers.AdamW, lr=1e-4),
#)

In [None]:
# FINETUNE_STEPS = 25000 #@param {type: "integer"}

# model.finetune(
#     mixture_or_task_name="translation_msa",
#     pretrained_model_dir="gs://mllu-bucket2/model",
#     finetune_steps=FINETUNE_STEPS
# )

In [59]:
# Evaluation needs less memory than training, so a larger batch size could be
# used here; that override is currently left disabled.
#model.batch_size = train_batch_size * 4

# Score the checkpoint saved at step 10000 on the validation split of the
# "translation_msa" mixture.
model.eval(mixture_or_task_name="translation_msa",
           checkpoint_steps=10000, split="validation")

INFO:tensorflow:Using config: {'_model_dir': 'gs://bucket-remove-excess-params/model/final_backwards', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.61.25.138:8470"
    }
  }
}
isolate_session_state: true
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({'worker': ['10.61.25.138:8470']}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.61.25.138:8470', '_evaluation_master': 'grpc://10.61.25.138:8470', '_is_chief': True, '_num_ps_replicas': 0, '

  return dataset.map(my_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)


INFO:tensorflow:Checkpoint path gs://bucket-remove-excess-params/model/final_backwards/model.ckpt-10000
INFO:tensorflow:Querying Tensorflow master (grpc://10.61.25.138:8470) for TPU system metadata.
INFO:tensorflow:Initializing TPU system (master: grpc://10.61.25.138:8470) to fetch topology for model parallelism. This might take a while.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, -1, 860019331057456105)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, -6454476014470149357)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 2889123826406870324)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2

In [None]:
#STEPS = 10000
#model.finetune(
#    mixture_or_task_name='translation_msa',
#    finetune_steps=STEPS,
#    pretrained_model_dir='/tmp/hft5/',
#    sequence_length={"inputs": 32, "targets": 32},
#    split="train",
#    batch_size=32,
#    save_steps=STEPS/5,
#    optimizer=functools.partial(transformers.AdamW, lr=1e-4),
#)

In [60]:
# Look up the registered "translation_lav_msa" task and build its validation
# split, with inputs limited to 128 tokens and targets to 32 tokens.
review_task = t5.data.TaskRegistry.get("translation_lav_msa")
ds = review_task.get_dataset(
    sequence_length={"inputs": 128, "targets": 32},
    split="validation",
)

# Show the first five preprocessed examples, converted to NumPy values.
print("A few preprocessed validation examples...")
for ex in tfds.as_numpy(ds.take(5)):
    print(ex)

  return dataset.map(my_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)


A few preprocessed validation examples...
{'inputs_plaintext': b'translate MSA to Levantine: \xd8\xa7\xd9\x84\xd8\xb0\xd9\x89 \xd9\x8a\xd8\xb9\xd8\xb1\xd9\x81 \xd9\x85\xd9\x86 \xd8\xa7\xd9\x83\xd9\x88\xd9\x86 \xd8\xa7\xd9\x86\xd8\xa7 \xd8\xb3\xd9\x88\xd9\x81 \xd8\xa7\xd8\xad\xd8\xb6\xd8\xb1 \xd9\x84\xd9\x87 \xd8\xb3\xd9\x83\xd8\xb1 \xd8\xac\xd9\x84\xd8\xa7\xd8\xa8 \xd9\x85\xd9\x86 \xd9\x82\xd8\xb1\xd9\x8a\xd8\xaa\xd9\x86\xd8\xa7 \xd8\xa8\xd9\x86\xd8\xac\xd8\xb9 \xd8\xa7\xd9\x84\xd9\x82\xd8\xb7\xd9\x8a\xd9\x87', 'inputs': array([ 6537,   325, 11829,    51,  1977, 20834,  5671,  6034,  1291,
         363,    12,   508,    10,    15,   669,    16,   388,   574,
          88,  1814,  9481,    10,  1929,    34,   362,  8081,  3188,
         380,     1]), 'targets_plaintext': b'\xd8\xb9\xd9\x84\xd9\x89 \xd9\x81\xd9\x83\xd8\xb1\xd9\x87 \xd8\xa7\xd9\x84\xd9\x84\xd9\x8a \xd8\xa8\xd8\xb9\xd8\xb1\xd9\x81 \xd8\xa7\xd9\x86\xd8\xa7 \xd9\x85\xd9\x8a\xd9\x86 \xd8\xa8\xd8\xac\xd9\x8a\xd8\xa8\xd9\x84\xd

## Evaluation

In [None]:
# # Evaluate after fine-tuning
# model.eval(
#     mixture_or_task_name="translation_msa",
#     checkpoint_steps= 20000,
#     sequence_length={"inputs": 32, "targets": 32},
#     batch_size=32,
# )

In [61]:
# Alternative checkpoint (disabled):
#model.export(checkpoint_step=35000, vocabulary=t5.data.SentencePieceVocabulary(filepath))

# Export the checkpoint saved at step 10000 using the SentencePiece vocabulary
# built from `filepath`. The call is kept as the cell's last expression so its
# return value is displayed as the cell output.
model.export(
    vocabulary=t5.data.SentencePieceVocabulary(filepath),
    checkpoint_step=10000,
)

INFO:tensorflow:Using config: {'_model_dir': 'gs://bucket-remove-excess-params/model/final_backwards', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.61.25.138:8470"
    }
  }
}
isolate_session_state: true
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({'worker': ['10.61.25.138:8470']}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.61.25.138:8470', '_evaluation_master': 'grpc://10.61.25.138:8470', '_is_chief': True, '_num_ps_replicas': 0, '

  return dataset.map(my_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)


INFO:tensorflow:Variable decoder/block_000/layer_000/SelfAttention/k                  size 589824       slice_size 589824       Shape[d_model=768, heads=768]                               
INFO:tensorflow:Variable decoder/block_000/layer_000/SelfAttention/o                  size 589824       slice_size 589824       Shape[heads=768, d_model=768]                               
INFO:tensorflow:Variable decoder/block_000/layer_000/SelfAttention/q                  size 589824       slice_size 589824       Shape[d_model=768, heads=768]                               
INFO:tensorflow:Variable decoder/block_000/layer_000/SelfAttention/relative_attention_bias size 384          slice_size 384          Shape[heads=12, buckets=32]                                 
INFO:tensorflow:Variable decoder/block_000/layer_000/SelfAttention/v                  size 589824       slice_size 589824       Shape[d_model=768, heads=768]                               
INFO:tensorflow:Variable decoder/block_000/layer_0

b'gs://bucket-remove-excess-params/model/final_backwards/1599059630'

## Predictions

In [62]:
# Ad-hoc romanised prompts. They are only referenced by the disabled in-memory
# predict call at the bottom of this cell.
inputs = [
    "translate Levantine to MSA: hla kyfk", #Hey how are you
    "translate MSA to English: ana la 2ryd an anam", #I don't want to sleep
]

# Run inference over the English/MSA test file with the checkpoint saved at
# step 10000 and write the predictions to the output bucket.
# Decoding is greedy: with beam_size=1, temperature=0.0 picks the most probable
# token at each step. temperature=1.0 would instead sample from the predicted
# distribution, so the output file would differ from run to run and could not
# be compared reliably against the references.
model.predict(
    input_file="gs://bucket-remove-excess-params/data/test_backwards/test_English_MSA.csv",
    output_file="gs://bucket-remove-excess-params/data/out_backwards/out_en_msa.csv",
    checkpoint_steps=10000,
    beam_size=1,
    temperature=0.0,
    vocabulary=t5.data.SentencePieceVocabulary(filepath),
)

# Disabled: predict directly from the in-memory `inputs` list above.
#model.predict(
#    inputs,
#    sequence_length={"inputs": 32},
#    batch_size=2,
#    output_file="/tmp/hft5/example_predictions.txt",
#    vocabulary=t5.data.SentencePieceVocabulary(filepath),
#)

INFO:tensorflow:Using config: {'_model_dir': 'gs://bucket-remove-excess-params/model/final_backwards', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.61.25.138:8470"
    }
  }
}
isolate_session_state: true
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({'worker': ['10.61.25.138:8470']}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.61.25.138:8470', '_evaluation_master': 'grpc://10.61.25.138:8470', '_is_chief': True, '_num_ps_replicas': 0, '

In [63]:
# Run inference over the MSA/Levantine test file with the checkpoint saved at
# step 10000 and write the predictions to the output bucket.
# Decoding is greedy: with beam_size=1, temperature=0.0 picks the most probable
# token at each step. temperature=1.0 would instead sample from the predicted
# distribution, so the output file would differ from run to run and could not
# be compared reliably against the references.
model.predict(
    input_file="gs://bucket-remove-excess-params/data/test_backwards/test_MSA_Levantine.csv",
    output_file="gs://bucket-remove-excess-params/data/out_backwards/out_msa_lav.csv",
    checkpoint_steps=10000,
    beam_size=1,
    temperature=0.0,
    vocabulary=t5.data.SentencePieceVocabulary(filepath),
)

INFO:tensorflow:Using config: {'_model_dir': 'gs://bucket-remove-excess-params/model/final_backwards', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.61.25.138:8470"
    }
  }
}
isolate_session_state: true
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({'worker': ['10.61.25.138:8470']}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.61.25.138:8470', '_evaluation_master': 'grpc://10.61.25.138:8470', '_is_chief': True, '_num_ps_replicas': 0, '

In [64]:
# Run inference over the MSA/Maghrabi test file with the checkpoint saved at
# step 10000 and write the predictions to the output bucket.
# Decoding is greedy: with beam_size=1, temperature=0.0 picks the most probable
# token at each step. temperature=1.0 would instead sample from the predicted
# distribution, so the output file would differ from run to run and could not
# be compared reliably against the references.
model.predict(
    input_file="gs://bucket-remove-excess-params/data/test_backwards/test_MSA_Maghrabi.csv",
    output_file="gs://bucket-remove-excess-params/data/out_backwards/out_msa_mag.csv",
    checkpoint_steps=10000,
    beam_size=1,
    temperature=0.0,
    vocabulary=t5.data.SentencePieceVocabulary(filepath),
)

INFO:tensorflow:Using config: {'_model_dir': 'gs://bucket-remove-excess-params/model/final_backwards', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.61.25.138:8470"
    }
  }
}
isolate_session_state: true
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({'worker': ['10.61.25.138:8470']}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.61.25.138:8470', '_evaluation_master': 'grpc://10.61.25.138:8470', '_is_chief': True, '_num_ps_replicas': 0, '