In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import os
import sys
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import re

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from datasets import Dataset as DS
from datasets import load_metric

import seaborn as sn

from torchmetrics.text import WordErrorRate, CharErrorRate

import matplotlib.pyplot as plt

import random

import time

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler, StandardScaler


import gc

import inspect
    
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer, 
    Seq2SeqTrainingArguments,
    pipeline
)

    
# Confirmation that every import above resolved. This cell only imports
# packages (nothing is installed here), and a normal newline is used instead of
# a carriage return so later output cannot overwrite the message.
print("All libraries have been imported successfully!")

In [None]:
# Pick the accelerator once; the inference pipeline cell receives this string.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Read the training and test tables, in that order, from the Kaggle input mount.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/full-regipa-dataset/trainset.csv",
        "/kaggle/input/full-regipa-dataset/testset.csv",
    )
)

In [None]:
# Discard every training row that has a missing value in any column.
train_df = train_df.dropna()

In [None]:
# Hold out 10% of the training rows for validation. A fixed random_state makes
# the shuffle reproducible, so validation scores from different kernel sessions
# are measured on the same rows.
SPLIT_SEED = 42
train_df, val_df = train_test_split(
    train_df, test_size=0.1, shuffle=True, random_state=SPLIT_SEED
)
# Renumber both frames from 0; later cells look up row 0 by index label.
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

In [None]:
# Release any unused cached GPU memory before the model is loaded.
torch.cuda.empty_cache()

In [None]:
# Checkpoint identifier shared by the tokenizer and the seq2seq model.
MODEL_NAME = "google/umt5-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

In [None]:
# Tokenizer sanity check: encode the first training sentence (index label 0).
t = train_df.loc[0, "Contents"]
f = tokenizer.encode(t)

In [None]:
# Display the raw sentence that was just encoded.
t

In [None]:
# Decode the ids back to text to compare against the raw sentence shown above.
tokenizer.decode(f)

In [None]:
# Same sanity check for the target side: encode the first IPA transcription.
i = train_df.loc[0, "IPA"]
ig = tokenizer.encode(i)

In [None]:
# Display the raw IPA string for the first training row.
i

In [None]:
# Display the token ids the tokenizer produced for that IPA string.
ig

In [None]:
# District tags in the "<Name>" form that prepare_dataset prepends to each input.
districts = [
    "<Kishoreganj>",
    "<Narail>",
    "<Narsingdi>",
    "<Rangpur>",
    "<Tangail>",
    "<Chittagong>",
]

In [None]:
# Register the district tags with the tokenizer as additional special tokens.
# The value displayed by this cell is the number of tokens that were added.
tokenizer.add_tokens(districts, special_tokens=True)

In [None]:
# Tokenizer size after the district tags were added; used below to resize the model.
len(tokenizer)

In [None]:
# Resize the model's token embeddings to the tokenizer's current length so the
# newly added district tokens have embedding rows.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Batch collator for the trainer, built from the extended tokenizer.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [None]:
# Strip ASCII letters and digits from the source text. The earlier class
# "[a-zA-z0-9]" used the range A-z, which also covers the punctuation
# characters [ \ ] ^ _ ` that sit between "Z" and "a" in ASCII.
alpha_pat = "[a-zA-Z0-9]"

train_df["Contents"] = train_df["Contents"].str.replace(alpha_pat, "", regex=True)
# val_df was split off from train_df before this cell, so it must be cleaned
# separately; otherwise validation inputs differ from the training and test inputs.
val_df["Contents"] = val_df["Contents"].str.replace(alpha_pat, "", regex=True)

In [None]:
# Wrap the training and validation frames as Hugging Face datasets.
ds_train, ds_eval = (DS.from_pandas(frame) for frame in (train_df, val_df))

In [None]:
def prepare_dataset(sample):
    """Tokenize one dataset row into model inputs and target ids.

    The source text is the row's ``District`` wrapped in angle brackets,
    a space, then the row's ``Contents``. The returned encoding also carries
    ``labels`` (the token ids of the row's ``IPA`` string) and ``length``
    (the number of label ids).
    """
    source_text = f"<{sample['District']}> {sample['Contents']}"
    encoded = tokenizer(source_text)
    target_ids = tokenizer(sample["IPA"])["input_ids"]
    encoded["labels"] = target_ids
    encoded["length"] = len(target_ids)
    return encoded


# Tokenize every row and drop the original columns, leaving only what
# prepare_dataset returns.
ds_train = ds_train.map(function=prepare_dataset, remove_columns=ds_train.column_names)
ds_eval = ds_eval.map(function=prepare_dataset, remove_columns=ds_eval.column_names)

In [None]:
# Shared metric objects: used by compute_metrics during training and again for
# the final test-set scores.
cer, wer = CharErrorRate(), WordErrorRate()


def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with WER and CER.

    ``eval_preds`` unpacks into ``(predictions, labels)``. When predictions
    arrive as a tuple, only its first element is used. Returns a dict with the
    keys ``"wer"`` and ``"cer"``.
    """
    predictions, references = eval_preds

    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    def _to_text(token_ids):
        # -100 entries are swapped for the pad id so the batch can be decoded.
        cleaned = np.where(token_ids != -100, token_ids, pad_id)
        return tokenizer.batch_decode(cleaned, skip_special_tokens=True)

    decoded_preds = _to_text(predictions)
    decoded_labels = _to_text(references)

    scores = {
        "wer": wer(decoded_preds, decoded_labels).item(),
        "cer": cer(decoded_preds, decoded_labels).item(),
    }

    # Release cached GPU memory after each evaluation pass.
    torch.cuda.empty_cache()

    return scores

In [None]:
# Output directory for checkpoints; the same string is later passed to
# trainer.save_model and to the inference pipeline.
model_id = "mt5-base-bangla-text-to-reg-ipa"

training_args = Seq2SeqTrainingArguments(
    output_dir=model_id,
    # Batch samples of similar length, using the "length" column that
    # prepare_dataset writes (the number of label ids).
    group_by_length=True,
    length_column_name="length",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Evaluate, save and log on the same step interval so the best checkpoint
    # (lowest WER) can be restored when training ends.
    evaluation_strategy="steps",
    metric_for_best_model="wer",
    greater_is_better=False,
    load_best_model_at_end=True,
    num_train_epochs=10,
    save_steps=10000,
    eval_steps=10000,
    logging_steps=10000,
    learning_rate=3e-4,
    weight_decay=1e-2,
    warmup_steps=1000,
    save_total_limit=1,
    # Score with generated sequences so WER/CER are computed on decoded text.
    predict_with_generate=True,
    generation_max_length=512,
    push_to_hub=False,
    report_to="none",
    # Mixed precision is only valid on CUDA; enabling it unconditionally makes
    # this cell raise on a CPU-only session, which the device cell allows for.
    # NOTE(review): T5-family checkpoints are reported to overflow in fp16 —
    # consider bf16 where the hardware supports it.
    fp16=torch.cuda.is_available(),
)

In [None]:
# Release unused cached GPU memory before the trainer is built.
torch.cuda.empty_cache()

In [None]:
# Assemble the trainer from the objects prepared in the cells above.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=ds_train,
    eval_dataset=ds_eval,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run fine-tuning, then release cached GPU memory left over from training.
trainer.train()
torch.cuda.empty_cache()

In [None]:
# Save the trained model under the model_id directory; the inference pipeline
# cell later loads from this same path.
trainer.save_model(model_id)

In [None]:
# Keep the trainer's log history for export and display it in full.
logs = trainer.state.log_history
logs

In [None]:
# Tabulate the log entries and write them next to the notebook.
logsdf = pd.DataFrame(data=logs)
logsdf.to_csv(path_or_buf="logs.csv", index=False)

In [None]:
# Release unused cached GPU memory before inference starts.
torch.cuda.empty_cache()

In [None]:
# Apply the same alpha_pat stripping to the test sentences that the training
# sentences received.
test_df = test_df.assign(
    Contents=test_df["Contents"].str.replace(alpha_pat, "", regex=True)
)

In [None]:
# Inference pipeline loaded from the directory written by trainer.save_model.
pipe = pipeline(task="text2text-generation", model=model_id, device=device)

In [None]:
# Pull the cleaned sentences and their districts out as parallel lists.
texts, dists = (test_df[column].tolist() for column in ("Contents", "District"))

In [None]:
# Prefix each test sentence with its district tag, matching the input format
# built by prepare_dataset.
reformed_texts = [f"<{dist}> {text}" for dist, text in zip(dists, texts)]

In [None]:
# Accumulator that the next cell fills with the generated strings.
gen_txt = []
# Generate for every prefixed test sentence, 128 inputs per batch.
ipas = pipe(reformed_texts, batch_size=128, max_length=2048)

In [None]:
# Keep only the generated string from each pipeline result. The per-item
# torch.cuda.empty_cache() call was hoisted out of the loop: extracting strings
# from the results allocates no GPU memory, so one call afterwards is enough.
gen_txt.extend(ipa["generated_text"] for ipa in ipas)
torch.cuda.empty_cache()

# Rebind ipas to the plain list of strings and drop the temporary name.
ipas = gen_txt
del gen_txt
gc.collect()

In [None]:
# Release unused cached GPU memory now that generation has finished.
torch.cuda.empty_cache()

In [None]:
# Store the generated strings in a "string" column, then order rows by index.
test_df = test_df.assign(string=ipas).sort_index()

In [None]:
# Parallel lists of predictions and ground-truth IPA strings for scoring.
preds, gts = (test_df[column].tolist() for column in ("string", "IPA"))

In [None]:
# Score the test predictions with the same metric objects used in training:
# word error rate first, then character error rate.
wer_res, cer_res = (metric(preds, gts).item() for metric in (wer, cer))

In [None]:
# Print the final test-set scores. The f-string's leading newline and
# indentation are part of the printed text.
print(f"""
    Word error rate: {wer_res},
    Char error rate: {cer_res},
""")