In [None]:
# Mount Google Drive at /content/drive; the training checkpoints are later
# written under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from google.colab import files
import io

# Upload the raw parallel-corpus CSV from the local machine
uploaded = files.upload()

# For each uploaded file: normalise the column names, drop incomplete rows,
# split 85/15 into train/validation, save both parts and offer them for download.
for filename, raw_bytes in uploaded.items():
    df = pd.read_csv(io.BytesIO(raw_bytes))
    df.columns = ["src", "tgt"]  # standard source/target column names
    df = df.dropna()

    train_df, val_df = train_test_split(df, test_size=0.15, random_state=42)

    split_files = ((train_df, "train.csv"), (val_df, "val.csv"))
    for split_df, split_path in split_files:
        split_df.to_csv(split_path, index=False)
    for _, split_path in split_files:
        files.download(split_path)


Saving Tel_to_Eng.csv to Tel_to_Eng (1).csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Preview the first rows of the DataFrame loaded in the upload cell
df.head()

Unnamed: 0,src,tgt
0,అతని కాళ్ళు పొడవుగా ఉన్నాయి.,His legs are long.
1,సాయికి మంచి అనుభూతి కలుగుతోంది,Sai is feeling good
2,మీరు ఎక్కడికి వెళుతున్నారు,Where are you going
3,"తెలుగు, ఇంగ్లీష్, తమిళం, ఫ్రెంచ్","Telugu, english, tamil, french"
4,అధికారిక సమాచారం,Official Information


In [None]:
!pip install transformers datasets sentencepiece accelerate




In [None]:
!pip install sacrebleu
!pip install evaluate
!pip install numpy



In [None]:
import os
# Set the WANDB_DISABLED environment variable before the trainer is built
os.environ.update({"WANDB_DISABLED": "true"})


In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import MT5Tokenizer, MT5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import torch
from torch.utils.data import DataLoader
from transformers import logging
logging.set_verbosity_error()

# 1. Load the train/validation splits written by the split cell
train_df = pd.read_csv("train.csv")
val_df = pd.read_csv("val.csv")
# Both frames get the column names that the preprocessing function reads.
for split_frame in (train_df, val_df):
    split_frame.columns = ["translation_input", "translation_target"]

split_frames = {"train": train_df, "validation": val_df}
dataset = DatasetDict(
    {split_name: Dataset.from_pandas(frame) for split_name, frame in split_frames.items()}
)

# 2. Tokenizer and model are loaded from the same pretrained checkpoint
model_name = "google/mt5-small"
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

# 3. Truncation limits (in tokens) for source and target sequences
max_input_length = max_target_length = 64

def preprocess_function(examples):
    """Tokenize one batch of Telugu -> English sentence pairs.

    Args:
        examples: Batch dict supplied by ``Dataset.map(..., batched=True)``.
            ``examples["translation_input"]`` holds the source sentences and
            ``examples["translation_target"]`` the reference translations.

    Returns:
        The tokenizer output for the prefixed source sentences, with an added
        ``"labels"`` entry containing the token ids of the targets.

    Notes:
        Sequences are truncated to ``max_input_length`` / ``max_target_length``
        but intentionally not padded here. The notebook's
        ``DataCollatorForSeq2Seq`` pads each batch to its own longest sequence
        and fills label padding with -100, so padding every example to the
        maximum length (and rewriting pad ids to -100 by hand) is unnecessary.
    """
    inputs = ["translate Telugu to English: " + src for src in examples["translation_input"]]
    targets = list(examples["translation_target"])

    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True,
    )

    # `text_target=` tokenizes the targets as labels; it replaces the
    # deprecated `with tokenizer.as_target_tokenizer():` context manager.
    labels = tokenizer(
        text_target=targets,
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs



# 4. Tokenize dataset
# The raw text columns are dropped so only the tokenized model inputs remain.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# 5. Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/mt5_checkpoints",  # checkpoints are saved to Drive
    save_strategy="epoch",
    save_total_limit=10,  # keep at most 10 checkpoints on disk
    eval_strategy="epoch",  # same cadence as save_strategy (needed by load_best_model_at_end)
    # The previous value (5e-6) was far too small for mT5 with Adafactor: the
    # recorded runs show the loss staying between 25 and 28 after 900 optimizer
    # steps. 1e-3 is the commonly recommended Adafactor rate when the rate is
    # set externally; lower it (e.g. 3e-4) if training becomes unstable.
    learning_rate=1e-3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,  # effective train batch size: 4 * 8 = 32 per device
    num_train_epochs=30,
    generation_num_beams=5,
    warmup_steps=200,
    weight_decay=0.01,
    optim="adafactor",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    predict_with_generate=True,
    report_to="none",
)


# 6. Data collator: pads inputs and labels per batch
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# 7. Trainer
# `processing_class` replaces the deprecated `tokenizer` argument (the old name
# triggered the warning shown in this cell's recorded output).
# NOTE: to stop early when eval_loss stops improving, import
# EarlyStoppingCallback from transformers and pass
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)].
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
)

Map:   0%|          | 0/939 [00:00<?, ? examples/s]



Map:   0%|          | 0/166 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


In [None]:
# 8. Train: start fine-tuning with the trainer configured above
trainer.train()

{'eval_loss': 29.846744537353516, 'eval_runtime': 104.1059, 'eval_samples_per_second': 1.595, 'eval_steps_per_second': 0.403, 'epoch': 1.0}
{'loss': 28.2976, 'grad_norm': 4340.7509765625, 'learning_rate': 2.45e-07, 'epoch': 1.6808510638297873}
{'eval_loss': 29.798160552978516, 'eval_runtime': 101.5916, 'eval_samples_per_second': 1.634, 'eval_steps_per_second': 0.413, 'epoch': 2.0}
{'train_runtime': 5398.8211, 'train_samples_per_second': 0.348, 'train_steps_per_second': 0.011, 'train_loss': 28.11660410563151, 'epoch': 2.0}


TrainOutput(global_step=60, training_loss=28.11660410563151, metrics={'train_runtime': 5398.8211, 'train_samples_per_second': 0.348, 'train_steps_per_second': 0.011, 'train_loss': 28.11660410563151, 'epoch': 2.0})

In [None]:
# Auto-resume: continue training from the latest saved checkpoint
trainer.train(resume_from_checkpoint=True)

{'loss': 28.0126, 'grad_norm': 1614.7989501953125, 'learning_rate': 7.45e-07, 'epoch': 5.0}
{'eval_loss': 29.400402069091797, 'eval_runtime': 91.2471, 'eval_samples_per_second': 1.819, 'eval_steps_per_second': 0.46, 'epoch': 5.0}
{'eval_loss': 29.260543823242188, 'eval_runtime': 96.0335, 'eval_samples_per_second': 1.729, 'eval_steps_per_second': 0.437, 'epoch': 6.0}
{'train_runtime': 4892.2981, 'train_samples_per_second': 1.152, 'train_steps_per_second': 0.037, 'train_loss': 9.114385986328125, 'epoch': 6.0}


TrainOutput(global_step=180, training_loss=9.114385986328125, metrics={'train_runtime': 4892.2981, 'train_samples_per_second': 1.152, 'train_steps_per_second': 0.037, 'train_loss': 9.114385986328125, 'epoch': 6.0})

In [None]:
# Auto-resume: continue training from the latest saved checkpoint
trainer.train(resume_from_checkpoint=True)

{'loss': 28.0194, 'grad_norm': 6884.53515625, 'learning_rate': 9.95e-07, 'epoch': 6.680851063829787}
{'eval_loss': 28.98779296875, 'eval_runtime': 99.7169, 'eval_samples_per_second': 1.665, 'eval_steps_per_second': 0.421, 'epoch': 7.0}
{'eval_loss': 28.8094482421875, 'eval_runtime': 100.3284, 'eval_samples_per_second': 1.655, 'eval_steps_per_second': 0.419, 'epoch': 8.0}
{'loss': 26.8814, 'grad_norm': 2988.59716796875, 'learning_rate': 5.1e-07, 'epoch': 8.340425531914894}
{'eval_loss': 28.671979904174805, 'eval_runtime': 97.625, 'eval_samples_per_second': 1.7, 'eval_steps_per_second': 0.43, 'epoch': 9.0}
{'loss': 27.1539, 'grad_norm': 3401.314208984375, 'learning_rate': 1e-08, 'epoch': 10.0}
{'eval_loss': 28.618934631347656, 'eval_runtime': 107.752, 'eval_samples_per_second': 1.541, 'eval_steps_per_second': 0.39, 'epoch': 10.0}
{'train_runtime': 10304.8537, 'train_samples_per_second': 0.911, 'train_steps_per_second': 0.029, 'train_loss': 10.873833618164063, 'epoch': 10.0}


TrainOutput(global_step=300, training_loss=10.873833618164063, metrics={'train_runtime': 10304.8537, 'train_samples_per_second': 0.911, 'train_steps_per_second': 0.029, 'train_loss': 10.873833618164063, 'epoch': 10.0})

In [None]:
# Auto-resume: continue training from the latest saved checkpoint
trainer.train(resume_from_checkpoint=True)

{'eval_loss': 28.454113006591797, 'eval_runtime': 105.2406, 'eval_samples_per_second': 1.577, 'eval_steps_per_second': 0.399, 'epoch': 11.0}
{'loss': 27.0504, 'grad_norm': 2000.451416015625, 'learning_rate': 4.04e-07, 'epoch': 11.680851063829786}
{'eval_loss': 28.321855545043945, 'eval_runtime': 105.0442, 'eval_samples_per_second': 1.58, 'eval_steps_per_second': 0.4, 'epoch': 12.0}
{'eval_loss': 28.22585678100586, 'eval_runtime': 104.0915, 'eval_samples_per_second': 1.595, 'eval_steps_per_second': 0.403, 'epoch': 13.0}
{'loss': 26.7651, 'grad_norm': 3917.4267578125, 'learning_rate': 2.0399999999999997e-07, 'epoch': 13.340425531914894}
{'eval_loss': 28.156118392944336, 'eval_runtime': 101.5933, 'eval_samples_per_second': 1.634, 'eval_steps_per_second': 0.413, 'epoch': 14.0}
{'loss': 26.1949, 'grad_norm': 1834.6971435546875, 'learning_rate': 4e-09, 'epoch': 15.0}
{'eval_loss': 28.12410545349121, 'eval_runtime': 103.7543, 'eval_samples_per_second': 1.6, 'eval_steps_per_second': 0.405, 'ep

TrainOutput(global_step=450, training_loss=8.89004150390625, metrics={'train_runtime': 13091.0007, 'train_samples_per_second': 1.076, 'train_steps_per_second': 0.034, 'train_loss': 8.89004150390625, 'epoch': 15.0})

In [None]:
# Auto-resume: continue training from the latest saved checkpoint
trainer.train(resume_from_checkpoint=True)

{'eval_loss': 27.973684310913086, 'eval_runtime': 100.688, 'eval_samples_per_second': 1.649, 'eval_steps_per_second': 0.417, 'epoch': 16.0}
{'loss': 26.5018, 'grad_norm': 4711.1220703125, 'learning_rate': 2.5249999999999996e-07, 'epoch': 16.680851063829788}
{'eval_loss': 27.866628646850586, 'eval_runtime': 98.1586, 'eval_samples_per_second': 1.691, 'eval_steps_per_second': 0.428, 'epoch': 17.0}
{'eval_loss': 27.80292320251465, 'eval_runtime': 99.9508, 'eval_samples_per_second': 1.661, 'eval_steps_per_second': 0.42, 'epoch': 18.0}
{'loss': 26.1125, 'grad_norm': 2361.188232421875, 'learning_rate': 1.275e-07, 'epoch': 18.340425531914892}


In [None]:
# Auto-resume: continue training from the latest saved checkpoint
trainer.train(resume_from_checkpoint=True)

{'loss': 27.3819, 'grad_norm': 2361.188232421875, 'learning_rate': 1.275e-07, 'epoch': 18.340425531914892}
{'eval_loss': 27.72854232788086, 'eval_runtime': 102.7793, 'eval_samples_per_second': 1.615, 'eval_steps_per_second': 0.409, 'epoch': 19.0}
{'loss': 26.496, 'grad_norm': 1709.9044189453125, 'learning_rate': 2.5e-09, 'epoch': 20.0}
{'eval_loss': 27.707263946533203, 'eval_runtime': 95.3995, 'eval_samples_per_second': 1.74, 'eval_steps_per_second': 0.44, 'epoch': 20.0}
{'train_runtime': 5067.1431, 'train_samples_per_second': 3.706, 'train_steps_per_second': 0.118, 'train_loss': 2.664367370605469, 'epoch': 20.0}


TrainOutput(global_step=600, training_loss=2.664367370605469, metrics={'train_runtime': 5067.1431, 'train_samples_per_second': 3.706, 'train_steps_per_second': 0.118, 'train_loss': 2.664367370605469, 'epoch': 20.0})

In [None]:
# Auto-resume: continue training from the latest saved checkpoint
trainer.train(resume_from_checkpoint=True)

{'eval_loss': 27.6241512298584, 'eval_runtime': 85.5726, 'eval_samples_per_second': 1.94, 'eval_steps_per_second': 0.491, 'epoch': 21.0}
{'loss': 26.244, 'grad_norm': 6425.9951171875, 'learning_rate': 1.836363636363636e-07, 'epoch': 21.680851063829788}
{'eval_loss': 27.563472747802734, 'eval_runtime': 93.0795, 'eval_samples_per_second': 1.783, 'eval_steps_per_second': 0.451, 'epoch': 22.0}
{'eval_loss': 27.514848709106445, 'eval_runtime': 91.0059, 'eval_samples_per_second': 1.824, 'eval_steps_per_second': 0.462, 'epoch': 23.0}
{'loss': 25.5397, 'grad_norm': 2366.50634765625, 'learning_rate': 9.272727272727272e-08, 'epoch': 23.340425531914892}
{'eval_loss': 27.483154296875, 'eval_runtime': 87.6279, 'eval_samples_per_second': 1.894, 'eval_steps_per_second': 0.479, 'epoch': 24.0}
{'loss': 26.1701, 'grad_norm': 3107.159912109375, 'learning_rate': 1.8181818181818182e-09, 'epoch': 25.0}
{'eval_loss': 27.474637985229492, 'eval_runtime': 83.111, 'eval_samples_per_second': 1.997, 'eval_steps_pe

TrainOutput(global_step=750, training_loss=5.196923502604166, metrics={'train_runtime': 11236.744, 'train_samples_per_second': 2.089, 'train_steps_per_second': 0.067, 'train_loss': 5.196923502604166, 'epoch': 25.0})

In [None]:
# Auto-resume: continue training from the latest saved checkpoint
trainer.train(resume_from_checkpoint=True)

{'eval_loss': 27.393049240112305, 'eval_runtime': 82.639, 'eval_samples_per_second': 2.009, 'eval_steps_per_second': 0.508, 'epoch': 26.0}
{'loss': 26.0747, 'grad_norm': 4073.810546875, 'learning_rate': 1.442857142857143e-07, 'epoch': 26.680851063829788}
{'eval_loss': 27.320003509521484, 'eval_runtime': 89.3087, 'eval_samples_per_second': 1.859, 'eval_steps_per_second': 0.47, 'epoch': 27.0}
{'eval_loss': 27.254470825195312, 'eval_runtime': 83.3916, 'eval_samples_per_second': 1.991, 'eval_steps_per_second': 0.504, 'epoch': 28.0}
{'loss': 25.4936, 'grad_norm': 12136.3359375, 'learning_rate': 7.285714285714286e-08, 'epoch': 28.340425531914892}
{'eval_loss': 27.23776626586914, 'eval_runtime': 89.2422, 'eval_samples_per_second': 1.86, 'eval_steps_per_second': 0.471, 'epoch': 29.0}
{'loss': 25.4114, 'grad_norm': 1643.6024169921875, 'learning_rate': 1.4285714285714286e-09, 'epoch': 30.0}
{'eval_loss': 27.22842025756836, 'eval_runtime': 82.4206, 'eval_samples_per_second': 2.014, 'eval_steps_pe

TrainOutput(global_step=900, training_loss=4.276646728515625, metrics={'train_runtime': 11114.1284, 'train_samples_per_second': 2.535, 'train_steps_per_second': 0.081, 'train_loss': 4.276646728515625, 'epoch': 30.0})

In [None]:
# Resume from a specific checkpoint directory (for explicit control over which
# checkpoint is loaded instead of picking the latest one)
trainer.train(resume_from_checkpoint="/content/drive/MyDrive/mt5_checkpoints/checkpoint-60")

{'train_runtime': 2.7455, 'train_samples_per_second': 684.025, 'train_steps_per_second': 21.854, 'train_loss': 0.0, 'epoch': 2.0}


TrainOutput(global_step=60, training_loss=0.0, metrics={'train_runtime': 2.7455, 'train_samples_per_second': 684.025, 'train_steps_per_second': 21.854, 'train_loss': 0.0, 'epoch': 2.0})

In [None]:
# Check that training continued correctly: display the trainer's current global step
trainer.state.global_step


In [None]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, Seq2SeqTrainer

# Same path as the trainer's output_dir; also read by the checkpoint-listing cell.
checkpoint_dir = "/content/drive/MyDrive/mt5_checkpoints"

# Resume training from the last checkpoint.
# Use this after a Colab crash or disconnect: with resume_from_checkpoint=True,
# Hugging Face automatically picks the latest checkpoint.
trainer.train(resume_from_checkpoint=True)


In [None]:
#Check which checkpoint is latest
import os

def _checkpoint_step(name):
    """Return the step number encoded in a ``checkpoint-<step>`` name, or -1 if there is none."""
    suffix = name.rpartition("-")[2]
    return int(suffix) if suffix.isdecimal() else -1


# os.listdir returns entries in arbitrary order, and a plain string sort would
# put "checkpoint-100" before "checkpoint-60"; sort by the numeric step instead
# so the last element really is the latest checkpoint.
checkpoints = sorted(
    (c for c in os.listdir(checkpoint_dir) if c.startswith("checkpoint")),
    key=_checkpoint_step,
)
print("Available checkpoints:", checkpoints)
print("Latest checkpoint:", checkpoints[-1] if checkpoints else None)
