In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import torch
import torch.nn as nn
import torch.optim as optim
import os

# Prefer the first CUDA GPU when one is usable, otherwise fall back to the CPU.
# `is_available` has to be *called*: the bare function object is always truthy,
# which selected "cuda:0" even on CPU-only machines. Building the device from
# the chosen string also guarantees a torch.device in both branches.
device_gpu = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device_gpu

device(type='cuda', index=0)

In [None]:
!pip install simpletransformers
from simpletransformers.t5 import T5Model, T5Args

Collecting simpletransformers
  Downloading simpletransformers-0.61.13-py3-none-any.whl (221 kB)
[?25l[K     |█▌                              | 10 kB 38.7 MB/s eta 0:00:01[K     |███                             | 20 kB 32.3 MB/s eta 0:00:01[K     |████▍                           | 30 kB 19.4 MB/s eta 0:00:01[K     |██████                          | 40 kB 16.0 MB/s eta 0:00:01[K     |███████▍                        | 51 kB 8.8 MB/s eta 0:00:01[K     |████████▉                       | 61 kB 9.2 MB/s eta 0:00:01[K     |██████████▍                     | 71 kB 9.0 MB/s eta 0:00:01[K     |███████████▉                    | 81 kB 10.1 MB/s eta 0:00:01[K     |█████████████▎                  | 92 kB 10.2 MB/s eta 0:00:01[K     |██████████████▉                 | 102 kB 8.4 MB/s eta 0:00:01[K     |████████████████▎               | 112 kB 8.4 MB/s eta 0:00:01[K     |█████████████████▊              | 122 kB 8.4 MB/s eta 0:00:01[K     |███████████████████▎            | 133 

In [None]:
def _read_lines(path):
    """Return the lines of the UTF-8 text file at ``path`` without newline characters."""
    # Always decode as UTF-8 so the result does not depend on the platform's
    # default locale encoding (the .src files also contain non-ASCII text).
    with open(path, "r", encoding="utf-8") as f:
        return [line.strip("\n") for line in f]


def _build_translation_frame(data_path, split):
    """Build the bidirectional translation frame for one split ("train" or "test")."""
    hindi_text = _read_lines(os.path.join(data_path, split + ".trg"))
    english_text = _read_lines(os.path.join(data_path, split + ".src"))

    data = []
    # zip() stops at the shorter file, so unmatched trailing lines are dropped.
    for hindi, english in zip(hindi_text, english_text):
        data.append(["translate hindi to english", hindi, english])
        data.append(["translate english to hindi", english, hindi])

    return pd.DataFrame(data, columns=["prefix", "input_text", "target_text"])


def prepare_translation_datasets(data_path):
    """Load the parallel Hindi/English corpus and build train/eval DataFrames.

    Reads ``train.trg``/``train.src`` and ``test.trg``/``test.src`` from
    ``data_path``. The ``.trg`` files hold the Hindi lines and the ``.src``
    files the English lines; line *i* of one file is paired with line *i* of
    the other. Every pair produces two rows, one per translation direction.

    Args:
        data_path: Directory containing the four corpus files.

    Returns:
        A ``(train_df, eval_df)`` tuple of DataFrames with the columns
        ``prefix``, ``input_text`` and ``target_text``.

    Raises:
        FileNotFoundError: If one of the corpus files is missing.
        UnicodeDecodeError: If a corpus file is not valid UTF-8.
    """
    train_df = _build_translation_frame(data_path, "train")
    eval_df = _build_translation_frame(data_path, "test")

    return train_df, eval_df

In [None]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then give PyDrive the application-default
# credentials so that `drive` can be used to fetch files from Google Drive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
if (not os.path.isfile('TatoebaChallenge.zip')):
  #Download a file based on its id
  file_id = '1IPjBpKayleATwvorsKXn6fysDaZuX5ue' #https://colab.research.google.com/drive/1RzzAb7fV60h8SKT6bsRYeGtPPwqbXXPi?usp=sharing
  downloaded = drive.CreateFile({'id':file_id})
  #print('Downloaded content "{}"'.format(downloaded.GetContentString()))
  downloaded.GetContentFile('TatoebaChallenge.zip')

if (not os.path.isdir('TatoebaChallenge')):
  !unzip TatoebaChallenge.zip -d TatoebaChallenge

Archive:  TatoebaChallenge.zip
  inflating: TatoebaChallenge/TatoebaChallenge/test.id  
  inflating: TatoebaChallenge/TatoebaChallenge/test.src  
  inflating: TatoebaChallenge/TatoebaChallenge/test.trg  
  inflating: TatoebaChallenge/TatoebaChallenge/train.id  
  inflating: TatoebaChallenge/TatoebaChallenge/train.src  
  inflating: TatoebaChallenge/TatoebaChallenge/train.trg  


In [None]:
# Directory holding train.src/.trg and test.src/.trg after extraction.
_tatoeba_dir = "/content/TatoebaChallenge/TatoebaChallenge"
train_df_full, eval_df_full = prepare_translation_datasets(_tatoeba_dir)

In [None]:
# Sanity-check the loaded frames: sizes first, then the first five rows of each.
for _frame in (train_df_full, eval_df_full):
    print(_frame.shape)
for _frame in (train_df_full, eval_df_full):
    print(_frame.head(5))

(3005330, 3)
(10000, 3)
                       prefix  ...                                        target_text
0  translate hindi to english  ...  “ I quickly recognized the ring of truth, ” he...
1  translate english to hindi  ...  मुझे याद है, मैंने यह भी कहा था कि दुनिया में ...
2  translate hindi to english  ...                                         Select All
3  translate english to hindi  ...                                          सभी चुनें
4  translate hindi to english  ...  I will give advice with my eye upon you. ” — P...

[5 rows x 3 columns]
                       prefix  ...                          target_text
0  translate hindi to english  ...             I like studying English.
1  translate english to hindi  ...  मुझे अंग्रेज़ी पढ़ना अच्छा लगता है।
2  translate hindi to english  ...                 Tom has a black cat.
3  translate english to hindi  ...        टॉम के पास एक काली बिल्ली है।
4  translate hindi to english  ...                       You came back.

[5 ro

In [None]:
# Write the prepared frames to TSV and read them straight back.
# index=False keeps the row index out of the file; otherwise each save/load
# round trip adds another "Unnamed: 0" column whenever this cell is re-run.
train_df_full.to_csv("./train.tsv", sep="\t", index=False)
eval_df_full.to_csv("./eval.tsv", sep="\t", index=False)

# dtype=str with keep_default_na=False reads every field back verbatim as
# text, so sentences such as "NA" or "null" (and empty lines) are not parsed
# as missing values and then stringified to the literal "nan".
train_df_full = pd.read_csv("./train.tsv", sep="\t", dtype=str, keep_default_na=False)
eval_df_full = pd.read_csv("./eval.tsv", sep="\t", dtype=str, keep_default_na=False)

# Chunk sizes used by later cells to slice the train / eval frames.
train_rows_perstep = 20000
eval_rows_perstep = 60

In [None]:
#print(train_df_full.head(5))
#print(eval_df_full.head(5))
#print(train_df_converted.head(5))
#print(eval_df_converted.head(5))
#train_df_full["prefix"] = ""
#eval_df_full["prefix"] = ""

                       prefix  ...                                        target_text
0  translate hindi to english  ...  “ I quickly recognized the ring of truth, ” he...
1  translate english to hindi  ...  मुझे याद है, मैंने यह भी कहा था कि दुनिया में ...
2  translate hindi to english  ...                                         Select All
3  translate english to hindi  ...                                          सभी चुनें
4  translate hindi to english  ...  I will give advice with my eye upon you. ” — P...

[5 rows x 3 columns]
                       prefix  ...                          target_text
0  translate hindi to english  ...             I like studying English.
1  translate english to hindi  ...  मुझे अंग्रेज़ी पढ़ना अच्छा लगता है।
2  translate hindi to english  ...                 Tom has a black cat.
3  translate english to hindi  ...        टॉम के पास एक काली बिल्ली है।
4  translate hindi to english  ...                       You came back.

[5 rows x 3 columns]
  Unname

In [None]:
# Settings for the mT5 fine-tuning run, applied to a fresh T5Args in one pass.
_t5_settings = {
    "max_seq_length": 96,
    "train_batch_size": 20,
    "eval_batch_size": 20,
    "num_train_epochs": 1,
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 10000,
    "use_multiprocessing": False,
    "fp16": False,
    "save_steps": -1,
    "save_eval_checkpoints": False,
    "no_cache": True,
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "preprocess_inputs": False,
    "num_return_sequences": 1,
    "wandb_project": "MT5 Hindi-English Translation_trial",
}

model_args = T5Args()
for _option, _setting in _t5_settings.items():
    setattr(model_args, _option, _setting)

# Start from the pretrained google/mt5-small weights.
model = T5Model("mt5", "google/mt5-small", args=model_args)

# First time

In [None]:
# First pass: train on the leading 288000 rows, evaluate on the full eval frame.
train_df = train_df_full.iloc[:288000]
eval_df = eval_df_full

for _frame in (train_df, eval_df):
    print(_frame.shape)

(288000, 4)
(10000, 4)


In [None]:
# Preview the first five training rows (head() defaults to n=5).
train_df.head()

Unnamed: 0.1,Unnamed: 0,prefix,input_text,target_text
0,0,translate hindi to english,"मुझे याद है, मैंने यह भी कहा था कि दुनिया में ...","“ I quickly recognized the ring of truth, ” he..."
1,1,translate english to hindi,"“ I quickly recognized the ring of truth, ” he...","मुझे याद है, मैंने यह भी कहा था कि दुनिया में ..."
2,2,translate hindi to english,सभी चुनें,Select All
3,3,translate english to hindi,Select All,सभी चुनें
4,4,translate hindi to english,"जितना ज़्यादा हम यीशु के बारे में सीखते हैं, उ...",I will give advice with my eye upon you. ” — P...


In [None]:
# Print the preview explicitly: only a cell's last expression is displayed, so a
# bare `eval_df.head(5)` placed above `eval_df.columns` is evaluated and discarded.
print(eval_df.head(5))
eval_df.columns

Index(['Unnamed: 0', 'prefix', 'input_text', 'target_text'], dtype='object')

In [None]:
# Confirm the (rows, columns) size of the training slice before training starts.
train_df.shape

(288000, 4)

In [None]:
import time
from datetime import datetime

# Log the local wall-clock time immediately before training ...
dt_object = datetime.now()
print(dt_object)

model.train_model(train_df, eval_data=eval_df)

# ... and again once training has returned.
dt_object = datetime.now()
print(dt_object)

2021-08-27 00:57:13.687310


  0%|          | 0/288000 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Using Adafactor for T5


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Running Epoch 0 of 1:   0%|          | 0/14400 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    la

  0%|          | 0/10000 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    la

2021-08-27 02:48:43.201794


# Subsequent loops

In [None]:
# Take the fourth block of `eval_rows_perstep` rows from the full eval frame.
print(eval_df_full.shape)
_eval_start = eval_rows_perstep * 3
eval_df = eval_df_full.iloc[_eval_start:_eval_start + eval_rows_perstep]
print(eval_df.shape)

(10000, 4)
(60, 4)


In [None]:
# Select training chunk `n`: rows [n * train_rows_perstep, (n + 1) * train_rows_perstep).
train_rows_perstep = 288000
eval_rows_perstep = 10000
n = 1
print("value of n is", n)

_chunk_start = train_rows_perstep * n
train_df = train_df_full.iloc[_chunk_start:_chunk_start + train_rows_perstep]
# Evaluation uses the first `eval_rows_perstep` (10000) rows.
eval_df = eval_df_full.iloc[:eval_rows_perstep]

print(train_df.shape)
print(eval_df.shape)

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7fd90726fb10>> (for pre_run_cell):


Exception: ignored

value of n is 1
(288000, 4)
(10000, 4)
Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7fd90726fb10>> (for post_run_cell):


Exception: ignored

In [None]:
# Settings for the continued mT5 fine-tuning run, applied to a fresh T5Args.
_t5_settings = {
    "max_seq_length": 96,
    "train_batch_size": 20,
    "eval_batch_size": 20,
    "num_train_epochs": 1,
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 10000,
    "use_multiprocessing": False,
    "fp16": False,
    "save_steps": -1,
    "save_eval_checkpoints": False,
    "no_cache": True,
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "preprocess_inputs": False,
    "num_return_sequences": 1,
    "wandb_project": "MT5 Hindi-English Translation_v1_26Aug_continue_thirdtime",
}

model_args = T5Args()
for _option, _setting in _t5_settings.items():
    setattr(model_args, _option, _setting)

# Load the pretrained google/mt5-small weights.
model = T5Model("mt5", "google/mt5-small", args=model_args)

#model = T5Model("mt5", "/content/outputs", args=model_args)
#model.train_model(train_df, eval_data=eval_df)  

Downloading:   0%|          | 0.00/553 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): this import rebinds the global name `drive`, which an earlier
# cell assigned to a PyDrive GoogleDrive client. Re-running the archive download
# cell after this one would call CreateFile on the colab module instead —
# confirm whether one of the two should use a different name.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Select training chunk `n`: rows [n * train_rows_perstep, (n + 1) * train_rows_perstep).
# iloc clips at the end of the frame, so the last chunk may be shorter.
train_rows_perstep = 288000
eval_rows_perstep = 10000
n = 10
print("value of n is", n)

_chunk_start = train_rows_perstep * n
train_df = train_df_full.iloc[_chunk_start:_chunk_start + train_rows_perstep]
# Evaluation uses the first `eval_rows_perstep` (10000) rows.
eval_df = eval_df_full.iloc[:eval_rows_perstep]

print(train_df_full.shape)
print(train_df.shape)
print(eval_df.shape)

value of n is 10
(3005330, 4)
(125330, 4)
(10000, 4)


In [None]:
# Reload the model saved under /content/outputs and train on the current chunk.
_resume_dir = "/content/outputs"
model = T5Model("mt5", _resume_dir, args=model_args)
model.train_model(train_df, eval_data=eval_df)

  0%|          | 0/125330 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Using Adafactor for T5


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Running Epoch 0 of 1:   0%|          | 0/6267 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    la

(6267,
 {'eval_loss': [1.4394848656654358],
  'global_step': [6267],
  'train_loss': [1.1695255041122437]})

In [None]:
# Recursively archive the training output directory into /content/outputs.zip.
!zip -r /content/outputs.zip /content/outputs/

  adding: content/outputs/ (stored 0%)
  adding: content/outputs/training_args.bin (deflated 50%)
  adding: content/outputs/best_model/ (stored 0%)
  adding: content/outputs/best_model/training_args.bin (deflated 50%)
  adding: content/outputs/best_model/model_args.json (deflated 62%)
  adding: content/outputs/best_model/spiece.model (deflated 46%)
  adding: content/outputs/best_model/scheduler.pt (deflated 51%)
  adding: content/outputs/best_model/pytorch_model.bin (deflated 21%)
  adding: content/outputs/best_model/optimizer.pt (deflated 44%)
  adding: content/outputs/best_model/tokenizer_config.json (deflated 37%)
  adding: content/outputs/best_model/eval_results.txt (stored 0%)
  adding: content/outputs/best_model/special_tokens_map.json (deflated 34%)
  adding: content/outputs/best_model/config.json (deflated 45%)
  adding: content/outputs/checkpoint-14400-epoch-1/ (stored 0%)
  adding: content/outputs/checkpoint-14400-epoch-1/training_args.bin (deflated 50%)
  adding: content/out

In [None]:
# Unpack a saved outputs archive into /content/outputs.
# NOTE(review): an archive created with `zip -r /content/outputs.zip /content/outputs/`
# stores its entries as content/outputs/..., which would extract to
# /content/outputs/content/outputs/ — confirm the archive layout matches this target.
!unzip /content/outputs.zip -d /content/outputs

Archive:  /content/outputs.zip
   creating: /content/outputs/best_model/
  inflating: /content/outputs/best_model/config.json  
 extracting: /content/outputs/best_model/eval_results.txt  
  inflating: /content/outputs/best_model/model_args.json  
  inflating: /content/outputs/best_model/optimizer.pt  
  inflating: /content/outputs/best_model/pytorch_model.bin  
  inflating: /content/outputs/best_model/scheduler.pt  
  inflating: /content/outputs/best_model/special_tokens_map.json  
  inflating: /content/outputs/best_model/spiece.model  
  inflating: /content/outputs/best_model/tokenizer_config.json  
  inflating: /content/outputs/best_model/training_args.bin  
   creating: /content/outputs/checkpoint-14400-epoch-1/
  inflating: /content/outputs/checkpoint-14400-epoch-1/config.json  
  inflating: /content/outputs/checkpoint-14400-epoch-1/model_args.json  
  inflating: /content/outputs/checkpoint-14400-epoch-1/optimizer.pt  
  inflating: /content/outputs/checkpoint-14400-epoch-1/pytorch_

# Create dataset from Sadhguru's data
## Tatoeba dataset format:
### A dataframe whose first 2 columns contain row numbers starting at 0, 1, 2, ...
### Third column "..."
### Fourth column: English line in the first row and Hindi line in the second row
### Each English index matches the corresponding Hindi index

  Unnamed: 0  ....                                       target_text
0          0  ...  “ I quickly recognized the ring of truth, ” he...
1          1  ...  मुझे याद है, मैंने यह भी कहा था कि दुनिया में ...
2          2  ...                                         Select All
3          3  ...                                          सभी चुनें
4          4  ...  I will give advice with my eye upon you. ” — P...

[5 rows x 4 columns]
  Unnamed: 0  ...                          target_text
0          0  ...             I like studying English.
1          1  ...  मुझे अंग्रेज़ी पढ़ना अच्छा लगता है।
2          2  ...                 Tom has a black cat.
3          3  ...        टॉम के पास एक काली बिल्ली है।
4          4  ...                       You came back.

[5 rows x 4 columns]