In [None]:
# !pip install -U transformers
# !pip install -U sentence-transformers

In [None]:
import os

import nltk
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from datasets import load_metric
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, GenerationConfig

In [None]:
# Location of the sentence-pair CSV (a mounted Google Drive path).
TRAINING_CSV = "/content/drive/MyDrive/Smruti-GEC-for-Gujarati/data/mt5-training-data.csv"

# Only the two text columns are read: "input" (sentence to correct) and
# "reference" (its corrected form).
df = pd.read_csv(TRAINING_CSV, usecols=["input", "reference"])

# Wrap the frame as a `datasets.Dataset` so it can be split and mapped later.
dataset = Dataset.from_pandas(df)

In [None]:
# Hold out 20% of the pairs for evaluation; the fixed seed keeps the split
# identical between runs. Note that `dataset` is rebound from a single Dataset
# to the {"train", "test"} split returned by train_test_split.
SPLIT_SEED = 42
dataset = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

# Smoke-test variant kept for reference: work on a 100-example sample instead.
# dataset = dataset.shuffle(seed=42).select(range(100)).train_test_split(test_size=0.2, seed=42) #0.01

In [None]:
from transformers import MT5ForConditionalGeneration , MT5Tokenizer

# Checkpoint to load weights and vocabulary from. The commented alternative
# switches to the stock multilingual base model.
model_name = "vrund1346/smruti-gujarati"
# model_name = "google/mt5-base"
tokenizer = MT5Tokenizer.from_pretrained(model_name)#, src_lang="gu_IN", tgt_lang="gu_IN")
model = MT5ForConditionalGeneration.from_pretrained(model_name)

# Generation settings start from the base model's defaults and are then
# overridden below; these are the settings handed to the trainer for
# generation-based evaluation.
gen_config = GenerationConfig.from_pretrained("google/mt5-base")
generation_overrides = {
    "early_stopping": 'never',
    "max_length": 64,           # same cap as the tokenization length used for training
    "num_beams": 4,
    "length_penalty": 1.0,
    "no_repeat_ngram_size": 0,
    "use_cache": False,
}
for option_name, option_value in generation_overrides.items():
    setattr(gen_config, option_name, option_value)

In [None]:
max_length = 64


def preprocess(examples):
    """Tokenize a batch of sentence pairs into fixed-length model features.

    Args:
        examples: A batch with an "input" column (sentences to correct) and a
            "reference" column (corrected sentences).

    Returns:
        The tokenizer output for the "input" column, with an extra "labels"
        entry holding the token ids of the "reference" column. Both sides are
        truncated and padded to ``max_length`` tokens.
    """
    # Identical tokenizer settings are used for the source and target sides.
    shared_options = dict(max_length=max_length, truncation=True, padding="max_length")

    features = tokenizer(examples["input"], **shared_options)
    target_encoding = tokenizer(examples["reference"], **shared_options)

    # NOTE(review): the labels keep the tokenizer's pad id in padded positions
    # rather than -100, so padding presumably contributes to the training
    # loss - confirm this is intended. Any change that masks them must keep
    # compute_metrics able to decode the labels.
    features["labels"] = target_encoding["input_ids"]
    return features


# Tokenize both splits in batches and drop the raw text columns so that only
# model-ready features remain.
tokenized_dataset = dataset.map(
    preprocess,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

Map:   0%|          | 0/19806 [00:00<?, ? examples/s]

Map:   0%|          | 0/201 [00:00<?, ? examples/s]

In [None]:
# Sanity check on the first training pair: show the raw text, the input token
# ids, and the text recovered by decoding each side's ids.
ex = dataset["train"][0]
input_ids = tokenizer(ex["input"])["input_ids"]
target_ids = tokenizer(ex["reference"])["input_ids"]

print("Error Sentence:", ex["input"])
print("Correct Sentence:", ex["reference"])
print(input_ids)
print("Tokenized Input:", tokenizer.decode(input_ids))
print("Tokenized Target:", tokenizer.decode(target_ids))

Error Sentence: કૉડલીવર અને કાળજી હોઈશ કુસુમાયુધ ખરેખર માંદો પડ્યો .
Correct Sentence: કૉડલીવર અને કાળજી છતાં કુસુમાયુધ ખરેખર માંદો પડ્યો .
[6601, 40625, 5747, 8354, 46453, 259, 2641, 28956, 3263, 12120, 6993, 2800, 13253, 60260, 4252, 196507, 68933, 7907, 11448, 6978, 64030, 259, 2252, 36672, 27574, 10127, 259, 260, 1]
Tokenized Input: કૉડલીવર અને કાળજી હોઈશ કુસુમાયુધ ખરેખર માંદો પડ્યો .</s>
Tokenized Target: કૉડલીવર અને કાળજી છતાં કુસુમાયુધ ખરેખર માંદો પડ્યો .</s>


In [None]:
# Print the padding id as seen by the tokenizer and by the model config, in
# that order, so a mismatch between the two is easy to spot.
for pad_id in (tokenizer.pad_token_id, model.config.pad_token_id):
    print(pad_id)

0
0


In [None]:
# Training configuration for the fine-tuning run.
#
# SECURITY: the Hub access token used to be written here as a string literal.
# A token that has been saved in a notebook must be treated as leaked - revoke
# it on the Hugging Face account and create a new one. The token is now read
# from the HF_TOKEN environment variable; when that is unset, None is passed
# and the library falls back to the credentials stored by a prior Hub login.
#
# NOTE(review): evaluation, logging and checkpointing all use a 5000-step
# interval, while the run recorded in this notebook finished at
# global_step=2476 with empty loss tables - the intervals look too large for
# a single epoch on this dataset; confirm the intended values.
training_args = Seq2SeqTrainingArguments(
    output_dir="./smruti-guj-gec",
    eval_strategy="steps",
    eval_steps=5000,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_steps=5000,
    save_total_limit=2,
    push_to_hub=True,
    hub_model_id="vrund1346/smruti-gujarati",
    hub_strategy="every_save",
    hub_token=os.environ.get("HF_TOKEN"),  # never hard-code credentials
    logging_steps=5000,
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=1,
    gradient_checkpointing=True,
    save_strategy="steps",
    # load_best_model_at_end=False,
    fp16=False,
    # gradient_accumulation_steps=8,
    report_to="none",
    predict_with_generate=True,  # evaluate on generated text, as compute_metrics expects
    generation_config=gen_config
)

In [None]:
# BLEU scorer used by compute_metrics.
bleu = load_metric("bleu")

# Batch collator for the trainer: receives the tokenizer and the model, and is
# asked to pad batch lengths up to a multiple of 8.
collator_options = dict(tokenizer=tokenizer, model=model, pad_to_multiple_of=8)
data_collator = DataCollatorForSeq2Seq(**collator_options)

def compute_metrics(eval_pred):
    """Score generated corrections against the references with BLEU.

    Args:
        eval_pred: A ``(predictions, label_ids)`` pair of token-id batches.
            ``predictions`` may also be a tuple whose first element holds the
            token ids.

    Returns:
        A dict with a single "bleu" entry computed over whitespace-split
        tokens of the decoded texts.
    """
    preds, labels = eval_pred

    # Some models hand back a tuple of outputs; only the token ids are scored.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the ignore index used for masked/padded label positions. It is
    # not a vocabulary id, so it is swapped for the pad id before decoding
    # (skip_special_tokens then drops it from the text).
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    pred_texts = tokenizer.batch_decode(preds, skip_special_tokens=True)
    label_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Show every prediction next to its reference for manual inspection.
    for p, l in zip(pred_texts, label_texts):
        print(f"\nPrediction: {p}\nReference:  {l}")

    # The metric takes tokenized predictions and, per prediction, a list of
    # tokenized references (exactly one reference each here).
    pred_tokens = [p.split() for p in pred_texts]
    label_tokens = [[l.split()] for l in label_texts]
    return {"bleu": bleu.compute(predictions=pred_tokens, references=label_tokens)["bleu"]}

# Gather everything the trainer needs in one place: the model, the run
# configuration, the tokenized train/test splits, and the batching and
# scoring helpers.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_dataset["train"],
    "eval_dataset": tokenized_dataset["test"],
    "tokenizer": tokenizer,
    "compute_metrics": compute_metrics,
    "data_collator": data_collator,
}
trainer = Seq2SeqTrainer(**trainer_kwargs)

  bleu = load_metric("bleu")
  trainer = Seq2SeqTrainer(


In [None]:
# Run the fine-tuning loop. The returned summary is kept in a variable and
# left as the cell's final expression so the notebook still displays it.
train_result = trainer.train()
train_result

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


TrainOutput(global_step=2476, training_loss=0.06744318208709865, metrics={'train_runtime': 1811.8322, 'train_samples_per_second': 10.931, 'train_steps_per_second': 1.367, 'total_flos': 2968538460585984.0, 'train_loss': 0.06744318208709865, 'epoch': 1.0})

In [None]:
import gc


def release_memory():
    """Run a garbage-collection pass, then empty the CUDA caching allocator."""
    gc.collect()
    torch.cuda.empty_cache()


release_memory()