In [1]:
!pip install --no-cache-dir transformers sentencepiece datasets sacrebleu

Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 5.4 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 46.0 MB/s 
[?25hCollecting datasets
  Downloading datasets-1.16.1-py3-none-any.whl (298 kB)
[K     |████████████████████████████████| 298 kB 52.2 MB/s 
[?25hCollecting sacrebleu
  Downloading sacrebleu-2.0.0-py3-none-any.whl (90 kB)
[K     |████████████████████████████████| 90 kB 51.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 40.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 17.5 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center>\n<img src=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
!apt install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 37 not upgraded.
Need to get 2,129 kB of archives.
After this operation, 7,662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]
Fetched 2,129 kB in 1s (1,704 kB/s)
Selecting previously unselected package git-lfs.
(Reading database ... 155222 files and directories currently installed.)
Preparing to unpack .../git-lfs_2.3.4-1_amd64.deb ...
Unpacking git-lfs (2.3.4-1) ...
Setting up git-lfs (2.3.4-1) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...


In [4]:
import transformers

print(transformers.__version__)

4.12.5


We will see how to easily load the dataset for this task using 🤗 Datasets and how to fine-tune a model on it using the Trainer API.

In [5]:
model_checkpoint = "google/mt5-small"

In [6]:
from datasets import Dataset, load_metric, load_dataset

metric = load_metric("sacrebleu")

Downloading:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

In [7]:
import torch
import numpy as np
import pandas as pd
import os
from google.colab import drive
import logging

In [8]:
drive.mount('content')

Mounted at content


In [9]:
PATH_TO_DATASET = "content/MyDrive"

In [10]:
data  =  pd.read_excel(os.path.join(PATH_TO_DATASET, "Data.xlsx"), index_col = 0)

In [11]:
data.head()

Unnamed: 0,Arabic_transcript,English_transcript
0,مرحبا بكم في مقدمة لعلوم البيانات مع بايثون. ه...,Welcome to an introduction to Data Science wit...
1,مرحباً، أنا (كريس بروكز)، هيئة التدريس هنا بكل...,"Hi, I'm Chris Brooks, faculty here at the Univ..."
2,مرحبا. أريد أن أريكم قليلا عن نظام دفتر جوبيتر...,Hi. I want to show you a little bit about the ...
3,في بقية هذه الوحدة، سأقوم بتقديم نظرة عامة أسا...,"In the rest of this module, I'm going to provi..."
5,تحدثنا عن السلاسل عندما تحدثنا عن القوائم والت...,We talked about strings when we talked about l...


In [12]:
data.shape

(16818, 2)

In [13]:
import random

random.seed(10)

train_set, validate_set, test_set = np.split(data.sample(frac=1), [int(.8*len(data)), int(.9*len(data))])
# https://datascience.stackexchange.com/questions/15135/train-test-validation-set-splitting-in-sklearn 



In [14]:
train_set.shape, validate_set.shape, test_set.shape

((13454, 2), (1682, 2), (1682, 2))

In [15]:
train_set.isnull().sum() / train_set.shape[0] *100

Arabic_transcript     0.044596
English_transcript    0.007433
dtype: float64

In [16]:
train_set.dropna(inplace=True)
validate_set.dropna(inplace=True)

In [17]:
train_set.isnull().sum() / train_set.shape[0] *100

Arabic_transcript     0.0
English_transcript    0.0
dtype: float64

In [18]:
train_set = train_set.drop_duplicates(subset=['Arabic_transcript', 'English_transcript'])

# train_set.head()
train_set['English_transcript'] = train_set.apply(lambda row: row.English_transcript.lower(), axis=1)
train_set.columns = ['ar', 'en']

In [19]:
validate_set = validate_set.drop_duplicates(subset=['Arabic_transcript', 'English_transcript'])

# train_set.head()
validate_set['English_transcript'] = validate_set.apply(lambda row: row.English_transcript.lower(), axis=1)
validate_set.columns = ['ar', 'en']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


text generated seems to be not random. I have got something wrong here

### Pre-processing for arabic text?

In [20]:
train_set.head()

Unnamed: 0,ar,en
15884,التكرار هو برمجة ديناميكية ، على ما أعتقد. حسنًا,"recurrences is dynamic programming, i guess."
10424,أو يساوي [ضوضاء].,or equal to [noise].
599,أم ، إنه تطبيق أقل قليلاً من,"um, it is a little bit less applied than"
10454,أعتقد أن هذا هو لام ثيتا.,i guess this is l of theta.
5551,أممم ، تقدير الاحتمالية القصوى.,"um, maximum likelihood estimation."


**Text cleaning?**

In [21]:
pip install mpu

Collecting mpu
  Downloading mpu-0.23.1-py3-none-any.whl (69 kB)
[?25l[K     |████▊                           | 10 kB 14.2 MB/s eta 0:00:01[K     |█████████▍                      | 20 kB 12.8 MB/s eta 0:00:01[K     |██████████████                  | 30 kB 14.6 MB/s eta 0:00:01[K     |██████████████████▉             | 40 kB 17.4 MB/s eta 0:00:01[K     |███████████████████████▌        | 51 kB 6.4 MB/s eta 0:00:01[K     |████████████████████████████▏   | 61 kB 6.8 MB/s eta 0:00:01[K     |████████████████████████████████| 69 kB 3.5 MB/s 
[?25hInstalling collected packages: mpu
Successfully installed mpu-0.23.1


In [22]:
train_set['translation'] = train_set.apply(lambda row: {'en':row['en'], 'ar':row['ar']},
                            axis=1)
train_set = train_set.drop(columns = ['ar', 'en'])  #https://stackoverflow.com/questions/55136065/convert-multiple-pandas-column-into-json 
train_set.head()

Unnamed: 0,translation
15884,"{'en': 'recurrences is dynamic programming, i ..."
10424,"{'en': 'or equal to [noise].', 'ar': 'أو يساوي..."
599,"{'en': 'um, it is a little bit less applied th..."
10454,"{'en': 'i guess this is l of theta.', 'ar': 'أ..."
5551,"{'en': 'um, maximum likelihood estimation.', '..."


In [23]:
validate_set['translation'] = validate_set.apply(lambda row: {'en':row['en'], 'ar':row['ar']},
                            axis=1)
validate_set = validate_set.drop(columns = ['ar', 'en'])  #https://stackoverflow.com/questions/55136065/convert-multiple-pandas-column-into-json 
validate_set.head()

Unnamed: 0,translation
13033,"{'en': 'so, i'm nesting it. i'm re-expanding w..."
13866,{'en': 'and then the agent has its own state s...
10207,"{'en': 'then the midpoint of this line, um,', ..."
796,"{'en': 'uh, today is supervised learning.', 'a..."
447,"{'en': 'uh, for example, um,', 'ar': 'آه ، على..."


In [24]:
import json

import mpu.io
# https://www.py4u.net/discuss/191017 
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_json.html


with open('data.json', 'w', encoding='utf-8') as file:
    train_set.to_json(file, orient="records", force_ascii=False, lines = True)

translation = load_dataset('json', data_files= 'data.json')
# https://huggingface.co/docs/datasets/loading_datasets.html#json-files 

Using custom data configuration default-03f72d56c3c1234e


Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-03f72d56c3c1234e/0.0.0/c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-03f72d56c3c1234e/0.0.0/c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [25]:
with open('data_val.json', 'w', encoding='utf-8') as file:
    validate_set.to_json(file, orient="records", force_ascii=False, lines = True)

translation_val = load_dataset('json', data_files= 'data_val.json')
# https://huggingface.co/docs/datasets/loading_datasets.html#json-files 

Using custom data configuration default-252c6e62a66f7a83


Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-252c6e62a66f7a83/0.0.0/c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-252c6e62a66f7a83/0.0.0/c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [26]:
translation

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 13299
    })
})

In [27]:
translation_val

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1676
    })
})

# Fine tuning 
I will be using 'data' dataset. It contains (2 coursera courses, springer file, DS_codata_org, yt_stanford). 

I will mainly depend on this Nb from huggingface notebooks
https://github.com/huggingface/notebooks/blob/master/examples/translation.ipynb 

In [28]:
cuda = torch.device('cuda') 

### Create custom Datasets Class

## Preprocessing the data

In [29]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=False)  #https://discuss.huggingface.co/t/error-with-new-tokenizers-urgent/2847/4

Downloading:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/553 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.11M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [30]:
if "t5" in model_checkpoint:
    tokenizer.src_lang = "en_XX"
    tokenizer.tgt_lang = "ar_AR"  
    tokenizer.source_prefix = "translate English to Arabic: "

In [31]:
tokenizer("Hello, this one sentence!")

{'input_ids': [30273, 261, 714, 1371, 259, 98923, 309, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [32]:
tokenizer(["Hello, this one sentence!", "This is another sentence."])

{'input_ids': [[30273, 261, 714, 1371, 259, 98923, 309, 1], [1494, 339, 259, 7845, 259, 98923, 260, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}

In [33]:
# if model_checkpoint in ["t5-small", "t5-base", "t5-larg", "t5-3b", "t5-11b"]:
#     prefix = "translate English to Arabic: "
# else:
#     prefix = ""

since we are using mt5 small, I will add this manually

In [34]:
prefix = "translate English to Arabic: "

In [35]:
model_checkpoint

'google/mt5-small'

In [36]:
max_input_length = 128
max_target_length = 128
source_lang = "en"
target_lang = "ar"

def preprocess_function(examples):
    inputs = [prefix + ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [37]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=5):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [38]:
show_random_elements(translation["train"])

Unnamed: 0,translation
0,"{'en': 'but, but whenever we insert into s, um,', 'ar': 'ولكن ، ولكن عندما ندخل في حرف S ،'}"
1,"{'en': 'just yet just- just- just- you wait,', 'ar': 'فقط حتى الآن - فقط - فقط - أنت انتظر ،'}"
2,"{'en': 'you want to go from this black square to this side of the island and here we have', 'ar': 'أن ننتقل من هذا المربع الأسود إلى هذا الجانب من الجزيرة ، وهنا'}"
3,"{'en': '[background].', 'ar': '[الخلفية].'}"
4,"{'en': 'uh, similarly you can go from mu back to eta with the inverse of this,', 'ar': 'آه ، بالمثل يمكنك العودة من Mu إلى Eta بعكس هذا ،'}"


In [39]:
metric

Metric(name: "sacrebleu", features: {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}, usage: """
Produces BLEU scores along with its sufficient statistics
from a source against one or more references.

Args:
    predictions: The system stream (a sequence of segments).
    references: A list of one or more reference streams (each a sequence of segments).
    smooth_method: The smoothing method to use. (Default: 'exp').
    smooth_value: The smoothing value. Only valid for 'floor' and 'add-k'. (Defaults: floor: 0.1, add-k: 1).
    tokenize: Tokenization method to use for BLEU. If not provided, defaults to 'zh' for Chinese, 'ja-mecab' for
        Japanese and '13a' (mteval) otherwise.
    lowercase: Lowercase the data. If True, enables case-insensitivity. (Default: False).
    force: Insist that your tokenized input is actually detokenized.

Returns:
    'score': BLEU score,
    'counts'

### Tokenization

In [40]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

To apply this function on all the pairs of sentences in our dataset, we just use the map method of our dataset object we created earlier. This will apply the function on all the elements of all the splits in dataset, so our training, validation and testing data will be preprocessed in one single command.

In [41]:
tokenized_datasets = translation.map(preprocess_function, batched=True)

  0%|          | 0/14 [00:00<?, ?ba/s]

In [42]:
tokenized_datasets_val = translation_val.map(preprocess_function, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

##Fine-tuning the model


In [43]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [44]:
batch_size = 4
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=False, # to avoid nan error but this will make training much slower
    # push_to_hub=True,
)

In [45]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

we have to do a bit of pre-processing to decode the predictions into texts:

In [46]:
import numpy as np

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [47]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets_val["train"], # this is validation, But I couldn't create a dataset that is composed of training and validation. 
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running training *****
  Num examples = 13299
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 3325


Epoch,Training Loss,Validation Loss


Saving model checkpoint to mt5-small-finetuned-en-to-ar/checkpoint-500
Configuration saved in mt5-small-finetuned-en-to-ar/checkpoint-500/config.json
Model weights saved in mt5-small-finetuned-en-to-ar/checkpoint-500/pytorch_model.bin
tokenizer config file saved in mt5-small-finetuned-en-to-ar/checkpoint-500/tokenizer_config.json
Special tokens file saved in mt5-small-finetuned-en-to-ar/checkpoint-500/special_tokens_map.json
Copy vocab file to mt5-small-finetuned-en-to-ar/checkpoint-500/spiece.model
Saving model checkpoint to mt5-small-finetuned-en-to-ar/checkpoint-1000
Configuration saved in mt5-small-finetuned-en-to-ar/checkpoint-1000/config.json
Model weights saved in mt5-small-finetuned-en-to-ar/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in mt5-small-finetuned-en-to-ar/checkpoint-1000/tokenizer_config.json
Special tokens file saved in mt5-small-finetuned-en-to-ar/checkpoint-1000/special_tokens_map.json
Copy vocab file to mt5-small-finetuned-en-to-ar/checkpoint-10

## Inference

In [None]:
# https://huggingface.co/docs/transformers/model_doc/mt5 

#  from transformers import MT5Model, T5Tokenizer
# >>> model = MT5Model.from_pretrained("google/mt5-small")
# >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")

ext = 'I spent a lot of time obsessing over whether I should continue learning data science, or invest my time learning something else'
ext = prefix + ext

input_ids = tokenizer(ext, return_tensors="pt").to('cuda').input_ids  # Batch size 1
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


In [None]:
# OR 

# from transformers import MT5ForConditionalGeneration, MT5Tokenizer

# model_size = "small"
# model_name = f"persiannlp/mt5-{model_size}-parsinlu-translation_en_fa"
# tokenizer = MT5Tokenizer.from_pretrained(model_name)
# model = MT5ForConditionalGeneration.from_pretrained(model_name)

# https://huggingface.co/persiannlp/mt5-small-parsinlu-translation_en_fa 
def run_model(input_string, **generator_args):
    input_ids = tokenizer.encode(input_string, return_tensors="pt").to('cuda')
    res = model.generate(input_ids, **generator_args)
    output = tokenizer.batch_decode(res, skip_special_tokens=True)
    print(output)
    return output

run_model(ext)
