<a href="https://colab.research.google.com/github/jameschartouni/arabic_translation/blob/google-cloud/Model_1_Clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Notes 

T5 Paper: https://arxiv.org/pdf/1910.10683.pdf

T5 Tokenizer: https://github.com/huggingface/transformers/blob/master/src/transformers/tokenization_t5.py

Important Tasks: https://docs.google.com/document/d/1weIZM6QTlnitpPQmpg-WeV2RW70TnYmDuogBQPr5mB0/edit

In [0]:
#installation step
!pip install transformers
!pip install sentencepiece
!pip install bpemb
print("Installing dependencies...")
%tensorflow_version 2.x
!pip install -q t5

In [0]:
#creating the folders 
!mkdir data/
!mkdir data/AD_NMT-master
!mkdir data/train/
!mkdir data/test/
!mkdir data/val/
#fetching the pkl files
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1V9crCmqvgQcv0Sx2MCNWB9AET2j6M6FW' -O data/AD_NMT-master/english-Arabic-both.pkl
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1V8_tp8ZlWUYaX7QQL46t0uSRNrVehSf1' -O data/AD_NMT-master/english-Arabic-test.pkl
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1V7X0qtuDIyjTHY0wh-ZNoVwsiF4lId2e' -O data/AD_NMT-master/english-Arabic-train.pkl
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1UzL4cOWTMCee83KBUh2QO_H62AFVpDQV' -O data/AD_NMT-master/LAV-MSA-2-both.pkl
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1UpfCbkxhztof7dvNjeAs1bHjD4SER6h3' -O data/AD_NMT-master/LAV-MSA-2-test.pkl
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1UlAZGtYsSfXzK7hrC_PbxQFqTSXD0DMw' -O data/AD_NMT-master/LAV-MSA-2-train.pkl
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1UjDX7cCG2S23SPfSHxSPdVayMTxB5Y16' -O data/AD_NMT-master/Magribi_MSA-both.pkl
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1UaVWIqRXo0rxuxDF4KArA4bEK1TaLX3l' -O data/AD_NMT-master/Magribi_MSA-test.pkl
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1UYvlhdYAdfa4riP_4hn3-IEVd1ZUXVTQ' -O data/AD_NMT-master/Magribi_MSA-train.pkl

In [0]:
#James Chartouni
#Joey Park
#Raef Khan

import torch
from torch.optim import SGD
import pandas as pd
import numpy as np
import pickle
import os, io, glob

import sentencepiece as spm
from bpemb import BPEmb

import transformers
import t5
import tensorflow_datasets as tfds
from sklearn.model_selection import train_test_split

import functools
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import tensorflow.compat.v1 as tf

In [0]:
# Root location for everything this notebook reads and writes (Colab form field).
BASE_DIR = "gs://t5at" #@param { type: "string" }
# Reject an empty value or a bare scheme with no bucket name.
if BASE_DIR == "gs://" or not BASE_DIR:
  raise ValueError("You must enter a BASE_DIR.")
# Sub-locations for the CSV data and for model checkpoints.
DATA_DIR, MODELS_DIR = (
    os.path.join(BASE_DIR, subdir) for subdir in ("data", "models")
)
# Gates the TPU/GCS setup and logging tweaks that follow in this cell.
ON_CLOUD = True


if ON_CLOUD:
  print("Setting up GCS access...")
  # Colab-only packages, imported lazily so the cell can still be parsed off-Colab
  # when ON_CLOUD is False.
  import tensorflow_gcs_config
  from google.colab import auth
  # Set credentials for GCS reading/writing from Colab and TPU.
  TPU_TOPOLOGY = "2x2"
  try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    TPU_ADDRESS = tpu.get_master()
    print('Running on TPU:', TPU_ADDRESS)
  except ValueError as err:
    # Raise a regular exception (not bare BaseException) and keep the original
    # resolver error attached as the cause. The message states the remedy
    # directly because this notebook has no separate instructions cell.
    raise RuntimeError(
        'ERROR: Not connected to a TPU runtime; in Colab select '
        'Runtime > Change runtime type > Hardware accelerator > TPU.'
    ) from err
  auth.authenticate_user()
  tensorflow_gcs_config.configure_gcs_from_colab_auth('/device:CPU:0')
  tf.config.experimental_connect_to_host(TPU_ADDRESS)
  tensorflow_gcs_config.configure_gcs_from_colab_auth()

# Switch the tf.compat.v1 API to TF1-style behaviour for the rest of the notebook.
tf.disable_v2_behavior()

# Improve logging.
from contextlib import contextmanager
import logging as py_logging

# On cloud: stop the TF logger from propagating records to ancestor loggers,
# and set the Python root logger to INFO.
if ON_CLOUD:
  tf_logger = tf.get_logger()
  tf_logger.propagate = False
  py_logging.getLogger().setLevel(py_logging.INFO)

@contextmanager
def tf_verbosity_level(level):
  """Temporarily set the TensorFlow logging verbosity inside a ``with`` block.

  Args:
    level: value passed to ``tf.logging.set_verbosity`` for the duration of
      the block.

  Yields:
    None. The body of the ``with`` statement runs at the requested verbosity.
  """
  og_level = tf.logging.get_verbosity()
  tf.logging.set_verbosity(level)
  try:
    yield
  finally:
    # Restore the previous level even when the body raises, so a failed cell
    # does not leave the kernel stuck at the temporary verbosity.
    tf.logging.set_verbosity(og_level)

## Initial Loading from Pickle

In [0]:
# Vocabulary-size constant.
# NOTE(review): not referenced by any other cell visible in this notebook; confirm it is still needed.
VOCAB_SIZE = 5000
# List the directory the pickle files were downloaded into (bare `ls` relies on IPython's shell alias).
ls data/AD_NMT-master

In [0]:
file_path = 'data/AD_NMT-master/'


def _load_pickle(name, directory=file_path):
    """Unpickle and return the object stored in ``directory + name``.

    SECURITY: ``pickle.load`` can execute arbitrary code contained in the file.
    These files are downloaded from the network, so only load them when the
    source is trusted.

    Args:
        name: file name of the pickle inside ``directory``.
        directory: path prefix (with trailing slash). Bound to the value of
            ``file_path`` at definition time, so a later reassignment of
            ``file_path`` does not affect this helper.

    Returns:
        Whatever object the pickle file contains.
    """
    with open(directory + name, 'rb') as handle:
        return pickle.load(handle)


# MSA / English corpus: train(+val), test, and the combined file.
data_MSA_English_trainval = _load_pickle("english-Arabic-train.pkl")
data_MSA_English_test = _load_pickle("english-Arabic-test.pkl")
data_MSA_English_both = _load_pickle("english-Arabic-both.pkl")

# Levantine / MSA corpus.
data_LAV_MSA_trainval = _load_pickle("LAV-MSA-2-train.pkl")
data_LAV_MSA_test = _load_pickle("LAV-MSA-2-test.pkl")
data_LAV_MSA_both = _load_pickle("LAV-MSA-2-both.pkl")

# Maghrebi / MSA corpus.
data_Magribi_MSA_trainval = _load_pickle("Magribi_MSA-train.pkl")
data_Magribi_MSA_test = _load_pickle("Magribi_MSA-test.pkl")
data_Magribi_MSA_both = _load_pickle("Magribi_MSA-both.pkl")
    

In [0]:
# Show the first five entries of two of the combined corpora.
for corpus in (data_MSA_English_both, data_LAV_MSA_both):
    print(corpus[:5])

# Print the length of the train + val dataset, then of the combined (train + val + test) dataset.
for corpus in (data_MSA_English_trainval, data_MSA_English_both):
    print(len(corpus))

## Prepare Datasets

example: https://iwslt2010.fbk.eu/node/32/

We need to take our training, validation, and test sets from the pkl files and write them out as `.csv` files (one `source,target` pair per line) so that the `tf.data` input pipeline defined below can read them.

In [0]:
# Show the contents of the pickle directory (bare `ls` relies on IPython's shell alias).
ls data/AD_NMT-master/

In [0]:
#splits the train dataset into train and validation sets, define test set as datafile
msa_en_train, msa_en_val = train_test_split(data_MSA_English_trainval, test_size=.2)
msa_en_test = data_MSA_English_test

lav_msa_train, lav_msa_val = train_test_split(data_LAV_MSA_trainval, test_size=.2)
lav_msa_test = data_LAV_MSA_test

mag_msa_train, mag_msa_val = train_test_split(data_Magribi_MSA_trainval, test_size=.2)
mag_msa_test = data_Magribi_MSA_test

In [0]:
# Report the train and validation sizes of each corpus, in the order
# MSA/English, Levantine/MSA, Maghrebi/MSA.
split_pairs = [
    (msa_en_train, msa_en_val),
    (lav_msa_train, lav_msa_val),
    (mag_msa_train, mag_msa_val),
]
for train_part, val_part in split_pairs:
    print(len(train_part))
    print(len(val_part))

In [0]:
file_path = 'data/'

def list_to_csv(ds, src='msa', trg='en', datatype=''):
    """Write sentence pairs to ``<file_path>/<datatype>/<datatype>_<src>_<trg>.csv``.

    Each pair becomes one line of the form ``pair[1] + ',' + pair[0]``, i.e. the
    second element of the pair is written first.

    Args:
        ds: iterable of indexable pairs of strings.
        src: source-language tag used in the file name.
        trg: target-language tag used in the file name.
        datatype: split name; used both as the sub-directory and as the file
            name prefix.

    Notes:
        Fields are written without quoting or escaping.
        NOTE(review): a ',' or line break inside either sentence would yield a
        row with the wrong number of fields for the two-column CSV readers used
        later in this notebook. Confirm the corpora contain neither.
    """
    out_dir = os.path.join(file_path, datatype)
    # Create the target directory on demand so the function does not depend on
    # a separate mkdir cell having been run.
    os.makedirs(out_dir, exist_ok=True)
    out_name = datatype + '_' + src + '_' + trg + '.' + 'csv'

    # Explicit UTF-8: the text is Arabic and is later decoded as UTF-8, so the
    # platform default encoding must not be relied upon.
    with open(os.path.join(out_dir, out_name), 'wt', encoding='utf-8') as out_file:
        out_file.writelines(pair[1] + ',' + pair[0] + '\n' for pair in ds)

In [0]:
# Write one CSV per (corpus, split): arguments are (pairs, src tag, trg tag, split folder).
csv_jobs = [
    (msa_en_train, 'msa', 'en', 'train'),
    (msa_en_val, 'msa', 'en', 'val'),
    (msa_en_test, 'msa', 'en', 'test'),

    (lav_msa_train, 'lav', 'msa', 'train'),
    (lav_msa_val, 'lav', 'msa', 'val'),
    (lav_msa_test, 'lav', 'msa', 'test'),

    (mag_msa_train, 'mag', 'msa', 'train'),
    (mag_msa_val, 'mag', 'msa', 'val'),
    (mag_msa_test, 'mag', 'msa', 'test'),
]
for pairs, src_tag, trg_tag, split_folder in csv_jobs:
    list_to_csv(pairs, src_tag, trg_tag, split_folder)

In [0]:
# Recursively copy everything under the local data/ folder to DATA_DIR with gsutil.
# `$DATA_DIR` refers to the Python variable DATA_DIR; IPython substitutes it before the shell runs.
!gsutil -m cp -r "data/*" $DATA_DIR

## Loading MultiBPEmb: Multilingual BPEmb SentencePiece Model

In [0]:
# Load the multilingual BPEmb model; its SentencePiece model file is used as
# the task vocabulary further down.
multibpemb = BPEmb(
    lang="multi",
    vs=1000000,
    dim=300,
)

## Tensor Processing + Add to TaskRegistry

### MSA to English Task

In [0]:
# Location of the MSA/English CSV for each split, under DATA_DIR.
msa_en_split_csv_path = dict(
    train=os.path.join(DATA_DIR, "train/train_msa_en.csv"),
    validation=os.path.join(DATA_DIR, "val/val_msa_en.csv"),
)
# Hard-coded number of examples per split.
# NOTE(review): not derived from the actual split sizes; verify against the printed lengths.
msa_en_example_count = dict(train=7200, validation=1800)

In [0]:
def msa_en_translation_dataset_fn(split, shuffle_files=False):
  """Build a dataset of {"source", "target"} string pairs for ``split``.

  Args:
    split: key into ``msa_en_split_csv_path``.
    shuffle_files: accepted for interface compatibility; not used by the body.

  Returns:
    A tf.data dataset whose elements are dicts with keys "source" (first CSV
    column) and "target" (second CSV column).
  """
  # Two string columns, split on ',' with quote handling disabled.
  # NOTE(review): a line with more or fewer than two fields presumably fails to parse; confirm.
  parse_line = functools.partial(
      tf.io.decode_csv,
      record_defaults=["", ""],
      field_delim=",",
      use_quote_delim=False,
  )

  def as_example(source, target):
    return {"source": source, "target": target}

  lines = tf.data.TextLineDataset(msa_en_split_csv_path[split])
  columns = lines.map(
      parse_line, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  return columns.map(as_example)

# Smoke test: print the first five parsed training examples.
msa_en_preview = msa_en_translation_dataset_fn("train").take(5)
for example in tfds.as_numpy(msa_en_preview):
    print(example)

In [0]:
def msa_en_translation_preprocessor(ds):
  """Convert {"source", "target"} examples into {"inputs", "targets"} examples.

  The task prefix is prepended to every source string; targets pass through
  unchanged.

  Args:
    ds: dataset of dicts with "source" and "target" string entries.

  Returns:
    The mapped dataset with keys "inputs" and "targets".
  """
  task_prefix = "Translate MSA to English: "

  def add_prefix(example):
    prefixed_source = tf.strings.join([task_prefix, example["source"]])
    return {"inputs": prefixed_source, "targets": example["target"]}

  return ds.map(add_prefix, num_parallel_calls=tf.data.experimental.AUTOTUNE)

In [0]:
# Path of the BPEmb SentencePiece model file, as a plain string for the task registry.
spm_model_file = multibpemb.model_file
filepath = str(spm_model_file)
print(filepath)

In [0]:
# Drop any previous registration first so this cell can be re-run.
t5.data.TaskRegistry.remove("msa_en_translation")
t5.data.TaskRegistry.add(
    # Name of the Task.
    "msa_en_translation",
    # Function which returns a tf.data.Dataset for a split.
    dataset_fn=msa_en_translation_dataset_fn,
    splits=["train", "validation"],
    # Function(s) which preprocess text from the tf.data.Dataset.
    text_preprocessor=[msa_en_translation_preprocessor],
    # Tokenize with the multilingual BPEmb SentencePiece model (`filepath`),
    # not with t5.data.DEFAULT_SPM_PATH.
    # NOTE(review): this differs from the vocabulary of the public pre-trained
    # T5 checkpoints; confirm that is intended before loading one of them.
    sentencepiece_model_path=filepath,
    # Lowercase targets before computing metrics.
    postprocess_fn=t5.data.postprocessors.lower_text,
    # Exact-match accuracy alone says little about translation quality, so
    # BLEU is reported alongside it.
    metric_fns=[t5.evaluation.metrics.accuracy, t5.evaluation.metrics.bleu],
)

### Levantine to MSA Task

In [0]:
# Location of the Levantine/MSA CSV for each split, under DATA_DIR.
lav_msa_split_csv_path = dict(
    train=os.path.join(DATA_DIR, "train/train_lav_msa.csv"),
    validation=os.path.join(DATA_DIR, "val/val_lav_msa.csv"),
)
# Hard-coded number of examples per split.
# NOTE(review): not derived from the actual split sizes; verify against the printed lengths.
lav_msa_example_count = dict(train=11044, validation=2761)

In [0]:
def lav_msa_translation_dataset_fn(split, shuffle_files=False):
  """Build a dataset of {"source", "target"} string pairs for ``split``.

  Args:
    split: key into ``lav_msa_split_csv_path``.
    shuffle_files: accepted for interface compatibility; not used by the body.

  Returns:
    A tf.data dataset whose elements are dicts with keys "source" (first CSV
    column) and "target" (second CSV column).
  """
  # Two string columns, split on ',' with quote handling disabled.
  # NOTE(review): a line with more or fewer than two fields presumably fails to parse; confirm.
  parse_line = functools.partial(
      tf.io.decode_csv,
      record_defaults=["", ""],
      field_delim=",",
      use_quote_delim=False,
  )

  def as_example(source, target):
    return {"source": source, "target": target}

  lines = tf.data.TextLineDataset(lav_msa_split_csv_path[split])
  columns = lines.map(
      parse_line, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  return columns.map(as_example)

# Smoke test: print the first five parsed training examples.
lav_msa_preview = lav_msa_translation_dataset_fn("train").take(5)
for example in tfds.as_numpy(lav_msa_preview):
    print(example)

In [0]:
def lav_msa_translation_preprocessor(ds):
  """Convert {"source", "target"} examples into {"inputs", "targets"} examples.

  The task prefix is prepended to every source string; targets pass through
  unchanged.

  Args:
    ds: dataset of dicts with "source" and "target" string entries.

  Returns:
    The mapped dataset with keys "inputs" and "targets".
  """
  task_prefix = "Translate Levantine to MSA: "

  def add_prefix(example):
    prefixed_source = tf.strings.join([task_prefix, example["source"]])
    return {"inputs": prefixed_source, "targets": example["target"]}

  return ds.map(add_prefix, num_parallel_calls=tf.data.experimental.AUTOTUNE)

In [0]:
# Drop any previous registration first so this cell can be re-run.
t5.data.TaskRegistry.remove("lav_msa_translation")
t5.data.TaskRegistry.add(
    # Name of the Task.
    "lav_msa_translation",
    # Function which returns a tf.data.Dataset for a split.
    dataset_fn=lav_msa_translation_dataset_fn,
    splits=["train", "validation"],
    # Function(s) which preprocess text from the tf.data.Dataset.
    text_preprocessor=[lav_msa_translation_preprocessor],
    # Tokenize with the multilingual BPEmb SentencePiece model (`filepath`),
    # not with t5.data.DEFAULT_SPM_PATH.
    # NOTE(review): this differs from the vocabulary of the public pre-trained
    # T5 checkpoints; confirm that is intended before loading one of them.
    sentencepiece_model_path=filepath,
    # Lowercase targets before computing metrics.
    postprocess_fn=t5.data.postprocessors.lower_text,
    # Exact-match accuracy alone says little about translation quality, so
    # BLEU is reported alongside it.
    metric_fns=[t5.evaluation.metrics.accuracy, t5.evaluation.metrics.bleu],
)

### Maghrib to MSA Task

In [0]:
# Location of the Maghrebi/MSA CSV for each split, under DATA_DIR.
mag_msa_split_csv_path = dict(
    train=os.path.join(DATA_DIR, "train/train_mag_msa.csv"),
    validation=os.path.join(DATA_DIR, "val/val_mag_msa.csv"),
)
# Hard-coded number of examples per split.
# NOTE(review): not derived from the actual split sizes; verify against the printed lengths.
mag_msa_example_count = dict(train=14188, validation=3548)

In [0]:
def mag_msa_translation_dataset_fn(split, shuffle_files=False):
  """Build a dataset of {"source", "target"} string pairs for ``split``.

  Args:
    split: key into ``mag_msa_split_csv_path``.
    shuffle_files: accepted for interface compatibility; not used by the body.

  Returns:
    A tf.data dataset whose elements are dicts with keys "source" (first CSV
    column) and "target" (second CSV column).
  """
  # Two string columns, split on ',' with quote handling disabled.
  # NOTE(review): a line with more or fewer than two fields presumably fails to parse; confirm.
  parse_line = functools.partial(
      tf.io.decode_csv,
      record_defaults=["", ""],
      field_delim=",",
      use_quote_delim=False,
  )

  def as_example(source, target):
    return {"source": source, "target": target}

  lines = tf.data.TextLineDataset(mag_msa_split_csv_path[split])
  columns = lines.map(
      parse_line, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  return columns.map(as_example)

# Smoke test: for the first five training examples print the raw dict, the
# decoded source and target text, and the length of each raw value.
mag_msa_preview = mag_msa_translation_dataset_fn("train").take(5)
for example in tfds.as_numpy(mag_msa_preview):
    source_raw, target_raw = example['source'], example['target']
    print(example)
    print(source_raw.decode())
    print(target_raw.decode())
    print(len(source_raw))
    print(len(target_raw))

In [0]:
def mag_msa_translation_preprocessor(ds):
  """Convert {"source", "target"} examples into {"inputs", "targets"} examples.

  The task prefix is prepended to every source string; targets pass through
  unchanged.

  Args:
    ds: dataset of dicts with "source" and "target" string entries.

  Returns:
    The mapped dataset with keys "inputs" and "targets".
  """
  task_prefix = "Translate Maghrib to Arabic: "

  def add_prefix(example):
    prefixed_source = tf.strings.join([task_prefix, example["source"]])
    return {"inputs": prefixed_source, "targets": example["target"]}

  return ds.map(add_prefix, num_parallel_calls=tf.data.experimental.AUTOTUNE)

In [0]:
# Drop any previous registration first so this cell can be re-run.
t5.data.TaskRegistry.remove("mag_msa_translation")
t5.data.TaskRegistry.add(
    # Name of the Task.
    "mag_msa_translation",
    # Function which returns a tf.data.Dataset for a split.
    dataset_fn=mag_msa_translation_dataset_fn,
    splits=["train", "validation"],
    # Function(s) which preprocess text from the tf.data.Dataset.
    text_preprocessor=[mag_msa_translation_preprocessor],
    # Tokenize with the multilingual BPEmb SentencePiece model (`filepath`),
    # not with t5.data.DEFAULT_SPM_PATH.
    # NOTE(review): this differs from the vocabulary of the public pre-trained
    # T5 checkpoints; confirm that is intended before loading one of them.
    sentencepiece_model_path=filepath,
    # Lowercase targets before computing metrics.
    postprocess_fn=t5.data.postprocessors.lower_text,
    # Exact-match accuracy alone says little about translation quality, so
    # BLEU is reported alongside it.
    metric_fns=[t5.evaluation.metrics.accuracy, t5.evaluation.metrics.bleu],
)

## Dataset Mixture

In [0]:
# Combine the three translation tasks into one mixture, each with rate 1.0.
# Any earlier registration is removed first so the cell can be re-run.
ar_translation_tasks = [
    "msa_en_translation",
    "lav_msa_translation",
    "mag_msa_translation",
]
t5.data.MixtureRegistry.remove("ar_translation")
t5.data.MixtureRegistry.add("ar_translation", ar_translation_tasks, default_rate=1.0)

## Fine Tune

In [0]:
MODEL_SIZE = "base" #@param["small", "base", "large"]
# Public GCS path for T5 pre-trained model checkpoints
BASE_PRETRAINED_DIR = "gs://t5-data/pretrained_models"
PRETRAINED_DIR = os.path.join(BASE_PRETRAINED_DIR, MODEL_SIZE)
# Checkpoints for this notebook's runs are written under MODELS_DIR/<size>.
MODEL_DIR = os.path.join(MODELS_DIR, MODEL_SIZE)

# Per-size settings: (model_parallelism, train_batch_size, keep_checkpoint_max).
size_settings = {
    "small": (1, 256, 16),
    "base": (2, 128, 8),
    "large": (8, 64, 4),
}
model_parallelism, train_batch_size, keep_checkpoint_max = size_settings[MODEL_SIZE]

tf.io.gfile.makedirs(MODEL_DIR)
# Mesh TensorFlow Transformer model wrapper from the t5 library.
mtf_model_kwargs = dict(
    model_dir=MODEL_DIR,
    tpu=TPU_ADDRESS,
    tpu_topology=TPU_TOPOLOGY,
    model_parallelism=model_parallelism,
    batch_size=train_batch_size,
    sequence_length={"inputs": 128, "targets": 128},
    learning_rate_schedule=0.003,
    save_checkpoints_steps=5000,
    # Only cap the number of retained checkpoints when running on cloud.
    keep_checkpoint_max=keep_checkpoint_max if ON_CLOUD else None,
    iterations_per_loop=100,
)
model = t5.models.MtfModel(**mtf_model_kwargs)



In [0]:
STEPS = 1000000 #@param {type: "integer"}
# MODEL_SIZE is deliberately not reassigned here: `model` and MODEL_DIR were
# already built from the value chosen in the model-configuration cell, and
# overwriting it afterwards could leave the variable out of sync with them.
# NOTE(review): this calls train() and does not reference PRETRAINED_DIR;
# confirm whether starting from a pre-trained checkpoint was intended.
model.train(
    mixture_or_task_name="ar_translation",
    steps=STEPS,
)


## Evaluation

In [0]:
# Use a larger batch size for evaluation, which requires less memory.
# Evaluate every saved checkpoint on the mixture, using a batch four times
# larger than the training batch.
eval_batch_multiplier = 4
model.batch_size = train_batch_size * eval_batch_multiplier
model.eval(mixture_or_task_name="ar_translation", checkpoint_steps="all")

## Predictions

In [0]:
msa_en_1 = "احبك" #@param {type:"string"}
msa_en_2 = "ماذا تقولون" #@param {type:"string"}
msa_en_3 = "انا في بيتي" #@param {type:"string"}
msa_en_4 = "مرحبا" #@param {type:"string"}

questions = [msa_en_1, msa_en_2, msa_en_3, msa_en_4]

now = time.time()
# Write out the supplied sentences to a text file; predictions go to a sibling file.
predict_inputs_path = os.path.join(MODEL_DIR, "predict_inputs_%d.txt" % now)
predict_outputs_path = os.path.join(MODEL_DIR, "predict_outputs_%d.txt" % now)
# Manually apply the task preprocessing by prepending the MSA-to-English prefix.
with tf.io.gfile.GFile(predict_inputs_path, "w") as f:
  for q in questions:
    f.write("Translate MSA to English: %s\n" % q.lower())

# Ignore any logging so that we only see the model's answers.
with tf_verbosity_level('ERROR'):
  model.batch_size = 8  # Min size for small model on v2-8 with parallelism 1.
  model.predict(
      input_file=predict_inputs_path,
      output_file=predict_outputs_path,
      # Select the most probable output token at each step.
      temperature=0,
  )


def _checkpoint_step(path):
  """Return the numeric suffix after the last '-' in ``path``, or -1 if it is not a number."""
  suffix = path.split("-")[-1]
  return int(suffix) if suffix.isdigit() else -1


# The output filename has the checkpoint step appended, so glob for all of
# them and take the latest. Sort numerically by step: a plain string sort
# would rank "-1000" before "-999" and pick the wrong file.
prediction_files = sorted(
    tf.io.gfile.glob(predict_outputs_path + "*"), key=_checkpoint_step)
if not prediction_files:
  # Fail with an explicit message instead of an IndexError on [-1].
  raise FileNotFoundError(
      "No prediction files found matching %s*" % predict_outputs_path)
latest_prediction_file = prediction_files[-1]
print("\nPredictions using checkpoint %s:\n" % latest_prediction_file.split("-")[-1])
with tf.io.gfile.GFile(latest_prediction_file) as f:
  for q, a in zip(questions, f):
    if q:
      print("Q: " + q)
      print("A: " + a)
      print()