# Installing Necessary Libraries

In [1]:
# !pip install transformers[torch]
# !pip install transformers[sentencepiece]
# !pip install sentencepiece
# !pip install datasets

# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorWithPadding




In [2]:
dataset_name = "eng_to_kn.csv"
model_name = "t5-small"

In [3]:
df = pd.read_csv(dataset_name)
df.shape

(4093524, 3)

In [4]:
df.head()

Unnamed: 0,idx,src,tgt
0,0,Hes a scientist.,ಇವರು ಸಂಶೋಧಕ ಸ್ವಭಾವದವರು.
1,1,'But we speak the truth aur ye sach hai ke Guj...,"""ಆದರೆ ಸತ್ಯ ಹೊರ ಬಂದೇ ಬರುತ್ತದೆ ಎಂದು ಹೇಳಿದ ರಾಹುಲ್..."
2,2,8 lakh crore have been looted.,ಕಳ್ಳತನವಾಗಿದ್ದ 8 ಲಕ್ಷ ರೂ.
3,3,I read a lot into this as well.,ಇದರ ಬಗ್ಗೆ ನಾನೂ ಸಾಕಷ್ಟು ಓದಿದ್ದೇನೆ.
4,4,She was found dead with the phone's battery ex...,ಆಕೆಯ ತಲೆಯ ಹತ್ತಿರ ಇರಿಸಿಕೊಂಡಿದ್ದ ಫೋನ್‌ನ ಬ್ಯಾಟರಿ ...


In [5]:
df.drop(['idx'], axis=1, inplace=True)

In [57]:
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['src', 'tgt'],
    num_rows: 4093524
})

# Model and tokenizer

In [54]:
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Dataset

In [8]:
# train_dataset = load_dataset(dataset_name, split='train')
# test_dataset = load_dataset(dataset_name, split='test')

In [9]:
# train_dataset.remove_columns(['date'])
# test_dataset.remove_columns(['date'])

# Pre-process

In [55]:
def add_prompt(examples):
    examples['prompt'] = f""" Context: Translate the following English Sentence to Kannada
        English: {examples['src']}"""
        # Kannada: {examples['tgt']}
    return examples


In [58]:
dataset = dataset.shuffle(42).select(range(52002)).train_test_split(test_size=0.1, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['src', 'tgt'],
        num_rows: 46801
    })
    test: Dataset({
        features: ['src', 'tgt'],
        num_rows: 5201
    })
})

In [59]:
train_dataset = dataset['train']
test_dataset = dataset['test']

In [60]:
train_dataset = train_dataset.map(add_prompt)
test_dataset = test_dataset.map(add_prompt)

Map:   0%|          | 0/46801 [00:00<?, ? examples/s]

Map:   0%|          | 0/5201 [00:00<?, ? examples/s]

In [26]:
train_dataset

Dataset({
    features: ['src', 'tgt', 'prompt'],
    num_rows: 46801
})

In [61]:
max_length = 128
def preprocess_data(examples):
    inputs = tokenizer(examples['prompt'], padding="max_length", truncation=True, max_length=max_length)
    targets = tokenizer(examples['tgt'], padding="max_length", truncation=True, max_length=max_length)
    target_input_ids = []
    for ids in targets['input_ids']:
      target_input_ids.append([id if id != tokenizer.pad_token_id else -100 for id in ids])
    inputs.update({'labels': target_input_ids})
    return inputs

In [62]:
train_dataset = train_dataset.map(
      preprocess_data,
      batched=True
    )

test_dataset = test_dataset.map(
      preprocess_data,
      batched=True
    )

Map:   0%|          | 0/46801 [00:00<?, ? examples/s]

Map:   0%|          | 0/5201 [00:00<?, ? examples/s]

In [42]:
tokenizer.pad_token_id

0

In [47]:
train_dataset

Dataset({
    features: ['src', 'tgt', 'prompt', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 46801
})

In [49]:
train_dataset['labels'][0]

[3,
 2,
 3,
 2,
 3,
 2,
 3,
 2,
 3,
 2,
 3,
 2,
 3,
 2,
 5,
 1,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100]

In [29]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [30]:
TRAINING_OUTPUT = "./models/translate_en_to_kn"
batch_size = 12
epochs = 1
training_args = TrainingArguments(
    output_dir=TRAINING_OUTPUT,
    gradient_accumulation_steps=batch_size,
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    load_best_model_at_end=True,
    save_strategy="no",
    save_total_limit=2,
    fp16=True,
    learning_rate=1e-05,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

In [31]:
trainer.train()



Step,Training Loss
500,0.6953


TrainOutput(global_step=975, training_loss=0.5733911915314504, metrics={'train_runtime': 1743.9147, 'train_samples_per_second': 80.51, 'train_steps_per_second': 0.559, 'total_flos': 4749752851759104.0, 'train_loss': 0.5733911915314504, 'epoch': 3.0})

In [33]:
trainer.save_model(TRAINING_OUTPUT)

In [34]:
MODEL_PATH = "Sharathhebbar24/t5_translate_en_to_kn"
tokenizer.push_to_hub(
    MODEL_PATH,
    token="<HF_TOKEN>"
)
model.push_to_hub(
    MODEL_PATH,
    token="<HF_TOKEN>"
)




Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

HTTP Error 503 thrown while requesting PUT https://hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com/repos/c9/40/c940edfc29d3d115424e614b05717e81694dbe0f3d851182e9da7a3bfb1887fb/d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIA2JU7TKAQFN2FTF47%2F20240105%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240105T070304Z&X-Amz-Expires=900&X-Amz-Signature=e8bd3db9dbae0b4432f03b91be9f961d5b35fdc7f042819117b6fbe0f3ab9976&X-Amz-SignedHeaders=host&x-amz-storage-class=INTELLIGENT_TIERING&x-id=PutObject
Retrying in 1s [Retry 1/5].


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Sharathhebbar24/t5_translate_en_to_kn/commit/239b1472c5057671006babbd1f97f502059bd56c', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='239b1472c5057671006babbd1f97f502059bd56c', pr_url=None, pr_revision=None, pr_num=None)

In [28]:
mod = "Sharathhebbar24/t5_translate_en_to_kn"
mod1 = T5ForConditionalGeneration.from_pretrained(mod)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [None]:
testing = """Context: Translate the following English Sentence to Kannada
English: My name is Sharath"""
inputs = tokenizer(testing, return_tensors='pt', max_length=512, padding="max_length", truncation=True)
outputs = model.generate(inputs['input_ids'],
        max_length=128,
        no_repeat_ngram_size=3,
        num_beams=6,
        early_stopping=True
      )
summary = tokenizer.decode(
      outputs[0],
      skip_special_tokens=True,
)
summary