In [1]:
from transformers import AutoTokenizer , AutoConfig , AutoModelForSeq2SeqLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "t5-small"
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name , use_fast = True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name , config = config)



In [3]:

from datasets import load_dataset
dataset = load_dataset('findnitai/english-to-hinglish')

In [4]:
master = []
for line in dataset['train']['translation']:
    master.append(line['en'])
    master.append(line['hi_ng'])

def gen_training_data():
    return (master[i : i+500]
    for i in range(0, len(master), 500)
    )
tokenizer_training_data = gen_training_data()

In [5]:
tokenizer_training_data
#generator function

<generator object gen_training_data.<locals>.<genexpr> at 0x7fb63337c190>

In [5]:
trained_tokenizer = tokenizer.train_new_from_iterator(tokenizer_training_data , vocab_size = 32128)
#training the tokenizer on the dataset





In [6]:
#saving the tokenizer
trained_tokenizer.save_pretrained("english_to_hinglish_tokenizer")

('english_to_hinglish_tokenizer/tokenizer_config.json',
 'english_to_hinglish_tokenizer/special_tokens_map.json',
 'english_to_hinglish_tokenizer/tokenizer.json')

In [17]:
dataset['train']['translation']

[{'en': "What's the name of the movie",
  'hi_ng': 'film ka kya naam hai',
  'source': 1},
 {'en': 'Hi, the rotten tomatoes score is great but the meta critic score seems a little low a movie of this quality. ',
  'hi_ng': 'namaste, sada hua tomatoes score mahaan hai, lekin meta critic score is gunavatta kee philm se thoda kam lagata hai.',
  'source': 1},
 {'en': 'Do you think you will like the movie',
  'hi_ng': 'kya aapako lagata hai ki aapako film pasand aaegee',
  'source': 1},
 {'en': 'What kind of movie is it',
  'hi_ng': 'yah kis tarah kee philm hai',
  'source': 1},
 {'en': 'when was the movie made?',
  'hi_ng': 'film  kab banee thee?',
  'source': 1},
 {'en': 'Wonder woman, I think i would enjoy this movie very much',
  'hi_ng': 'aashchary hai ki mahila, mujhe lagata hai ki mujhe is film mein bahut maja aaega',
  'source': 1},
 {'en': 'Whats the name of the movie',
  'hi_ng': 'film ka kya naam hai',
  'source': 1},
 {'en': 'It is a action movie set in the DC comic world',
  '

In [19]:
dataset_dict = dataset.data

In [7]:
output_json_path = "english_to_hinglish.json"

In [16]:
'''
formatted_dataset = {"translation": []}
data_list = [{"en": example["en"], "hi_ng": example["hi_ng"]} for example in dataset['train']['translation']]
for data in data_list:
    formatted_dataset["translation"].append(data)
'''
formatted_dataset = [{'translation':{"en": example["en"], "hi_ng": example["hi_ng"]}} for example in dataset['train']['translation']]

In [12]:
import json
with open(output_json_path, 'w', encoding='utf-8') as json_file:
    json.dump(formatted_dataset, json_file, ensure_ascii=False)


In [None]:
'''
formatted_dataset = load_dataset(
    "json",
    data_files="english_to_hinglish.json"
  )
'''
#load again when needed

In [26]:
source_prefix = "Translate English to Hinglish : "
source_lang = "en"
target_lang = "hi_ng"
max_source_length = 128 
max_target_length = 128 # target and source length task dependent (translation, summary etc.)
padding = "max_length" # padding to max length
num_epochs = 3


In [10]:
def preprocess(source_data):
    inputs = [k[source_lang] for k in source_data['translation']]
    targets = [k[target_lang] for k in source_data['translation']]
    inputs = [source_prefix + inp for inp in inputs]
    #using the previously trained tokenizer
    model_inputs = trained_tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)
    labels = trained_tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)

    labels["input_id"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [14]:
raw_datasets = load_dataset(
    "json",
    data_files="english_to_hinglish.json"
  )

In [15]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 189102
    })
})

In [19]:
train_dataset = raw_datasets["train"]

In [20]:
train_dataset

Dataset({
    features: ['translation'],
    num_rows: 189102
})

In [19]:
train_dataset['translation']

[{'en': "What's the name of the movie", 'hi_ng': 'film ka kya naam hai'},
 {'en': 'Hi, the rotten tomatoes score is great but the meta critic score seems a little low a movie of this quality. ',
  'hi_ng': 'namaste, sada hua tomatoes score mahaan hai, lekin meta critic score is gunavatta kee philm se thoda kam lagata hai.'},
 {'en': 'Do you think you will like the movie',
  'hi_ng': 'kya aapako lagata hai ki aapako film pasand aaegee'},
 {'en': 'What kind of movie is it', 'hi_ng': 'yah kis tarah kee philm hai'},
 {'en': 'when was the movie made?', 'hi_ng': 'film  kab banee thee?'},
 {'en': 'Wonder woman, I think i would enjoy this movie very much',
  'hi_ng': 'aashchary hai ki mahila, mujhe lagata hai ki mujhe is film mein bahut maja aaega'},
 {'en': 'Whats the name of the movie', 'hi_ng': 'film ka kya naam hai'},
 {'en': 'It is a action movie set in the DC comic world',
  'hi_ng': 'yah deesee komik duniya mein sthaapit ek ekshan philm hai'},
 {'en': 'Who stars in the movie', 'hi_ng': 

In [20]:
train_dataset = train_dataset.map(preprocess, batched=True, remove_columns="translation")

Map: 100%|██████████| 189102/189102 [00:43<00:00, 4348.46 examples/s]


In [21]:
train_dataset

Dataset({
    features: ['translation'],
    num_rows: 189102
})

In [21]:
from transformers import default_data_collator


In [24]:
data_collator = default_data_collator

trainer_args_in = {
    'output_dir': 'full-hinglish-translator',
    'overwrite_output_dir' : True,
    'do_train' : True,
    # 'do_valid' : False,
    'per_device_train_batch_size' : 8,
    'num_train_epochs' : num_epochs,
}


In [28]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, HfArgumentParser
parser = HfArgumentParser((Seq2SeqTrainingArguments))
training_args = parser.parse_dict(trainer_args_in)

trainer = Seq2SeqTrainer(model=model, args=training_args[0], train_dataset=train_dataset, tokenizer=trained_tokenizer, data_collator=data_collator)

train_result = trainer.train(resume_from_checkpoint='my-t5-hinglish-translator') #None
trainer.save_model()

You are resuming training from a checkpoint trained with 4.30.2 of Transformers but your current version is 4.34.0. This is not recommended and could yield to errors or unwanted behaviors.
  0%|          | 264/118190 [1:21:34<607:20:05, 18.54s/it]
  0%|          | 149/118190 [01:08<12:18:20,  2.66it/s]

KeyboardInterrupt: 