In [2]:

!pip install --upgrade transformers sentencepiece datasets sacrebleu rouge-score accelerate --quiet


In [3]:
import sys
import pandas as pd
import torch
from datasets import Dataset, DatasetDict

import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import TrainingArguments, Trainer

print("Python:", sys.version.splitlines()[0])
print("Transformers version:", transformers.__version__)


Python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
Transformers version: 4.57.1


In [4]:
!pip install chardet




In [5]:
!pip install chardet
import chardet

def detect_encoding(file_path):
    with open(file_path, "rb") as f:
        result = chardet.detect(f.read())
    return result

print(detect_encoding("/content/tel1.csv"))



{'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}


In [6]:
import csv

# Load tel1.csv safely
rows1 = []
with open("/content/tel1.csv", "r", encoding="utf-8", errors="ignore") as f:
    reader = csv.reader(f)
    for row in reader:
        rows1.append(row)

tel1_df = pd.DataFrame(rows1[1:], columns=rows1[0])
print("tel1 loaded:", tel1_df.shape)
print("Columns:", tel1_df.columns)





tel1 loaded: (32681, 3)
Columns: Index(['instruction', 'input', 'output'], dtype='object')


In [7]:
# Rename columns
tel1_df = tel1_df.rename(columns={"input": "prompt", "output": "response"})


# Drop the 'instruction' column
tel1_df = tel1_df.drop(columns=["instruction"])


# Check results
print(tel1_df.head())



                                              prompt  \
0  నా ఉపాధ్యాయులు నన్ను తీవ్రంగా వేధిస్తున్నారు (...   
1  నేను ఎవరికైనా సంతోషం కోసం అవకాశం ఇవ్వగలిగానని ...   
2  నేను నా జీవితాన్ని చాలా ద్వేషిస్తున్నాను, నేను...   
3  మీరు ముఖ్యంగా తక్కువ స్థాయికి చేరుకున్నప్పుడు ...   
4  నేనే ఆత్మహత్య చేసుకుంటే రేపు పనికి వెళ్లనవసరం ...   

                             response  
0  Class: depression, Compound: -0.99  
1   Class: depression, Compound: 0.80  
2   Class: depression, Compound: 0.50  
3  Class: depression, Compound: -0.92  
4  Class: depression, Compound: -0.99  


In [8]:
df = pd.concat([tel1_df], ignore_index=True)

# Remove duplicate prompts
df = df.drop_duplicates(subset=["prompt"])

# Remove empty rows
df = df.dropna(subset=["prompt", "response"])

# Shuffle
df = df.sample(frac=1).reset_index(drop=True)

print("Final merged dataset size:", df.shape)
df.head()


Final merged dataset size: (31131, 2)


Unnamed: 0,prompt,response
0,వెండి సామాను నుండి కొద్ది మొత్తంలో ఎపాక్సీ/రెస...,"Class: anxiety, Compound: -0.39"
1,పువ్వుల గురించి మీరు ఏమనుకుంటున్నారు (టీవీ షో)...,"Class: depression, Compound: -0.66"
2,కాబట్టి నేను ఒప్పుకోలులో ఇలాంటి పోస్ట్ చేసాను ...,"Class: depression, Compound: -0.99"
3,"నేను నిస్పృహలో ఉన్నాను, నేను సున్తీ చేశాను మరి...","Class: depression, Compound: -0.99"
4,heidimontag lol నేను కాంట్ నమ్ముతున్నాను కామ్ ...,"Class: normal, Compound: 0.42"


In [9]:
train_df = df.sample(frac=0.8, random_state=42)
remaining = df.drop(train_df.index)

val_df = remaining.sample(frac=0.5, random_state=42)
test_df = remaining.drop(val_df.index)

print("Train:", train_df.shape)
print("Val:", val_df.shape)
print("Test:", test_df.shape)


Train: (24905, 2)
Val: (3113, 2)
Test: (3113, 2)


In [10]:
from datasets import Dataset, DatasetDict

dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df.reset_index(drop=True)),
    "validation": Dataset.from_pandas(val_df.reset_index(drop=True)),
    "test": Dataset.from_pandas(test_df.reset_index(drop=True))
})

print(dataset)


DatasetDict({
    train: Dataset({
        features: ['prompt', 'response'],
        num_rows: 24905
    })
    validation: Dataset({
        features: ['prompt', 'response'],
        num_rows: 3113
    })
    test: Dataset({
        features: ['prompt', 'response'],
        num_rows: 3113
    })
})


In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "google/mt5-small"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

print("Tokenizer and Model Loaded!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Tokenizer and Model Loaded!


In [12]:
def preprocess(batch):
    inputs = tokenizer(
        batch["prompt"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

    labels = tokenizer(
        batch["response"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

    inputs["labels"] = labels["input_ids"]
    return inputs


In [13]:
tokenized_ds = dataset.map(
    preprocess,
    batched=True,
    remove_columns=dataset["train"].column_names
)

print("Tokenization Completed!")


Map:   0%|          | 0/24905 [00:00<?, ? examples/s]

Map:   0%|          | 0/3113 [00:00<?, ? examples/s]

Map:   0%|          | 0/3113 [00:00<?, ? examples/s]

Tokenization Completed!


In [14]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model
)


In [15]:
from transformers import TrainingArguments

batch_size = 4
num_epochs = 4
output_dir = "./telugu_chatbot_model"

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=3e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    logging_steps=50,
    save_steps=500,
    eval_steps=500,
    save_total_limit=2
)

print("TrainingArguments Ready!")



TrainingArguments Ready!


In [16]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

print("Trainer Ready!")


  trainer = Trainer(


Trainer Ready!


In [17]:
import os, sys, types
os.environ["WANDB_DIABLED"] = "true"
os.environ["WANDB_SILENT"] = "true"
os.environ["WANDB_MODE"] = "disabled"

fake_wandb = types.SimpleNamespace(
    init= lambda *a, **k:None,
    log=lambda*a, **k:None,
    finish=lambda *a, **k:None,
    run=None,
    config={}
)
sys.modules["wandb"]=fake_wandb
print("disabled")

disabled


In [18]:
pip uninstall -y wandb

Found existing installation: wandb 0.22.3
Uninstalling wandb-0.22.3:
  Successfully uninstalled wandb-0.22.3


In [19]:
trainer.train()


  | |_| | '_ \/ _` / _` |  _/ -_)


Step,Training Loss
50,26.1006
100,7.9345
150,2.2338
200,1.1093
250,1.0066
300,0.4745
350,0.3166
400,0.2085
450,0.1332
500,0.081


TrainOutput(global_step=24908, training_loss=0.12323275098395574, metrics={'train_runtime': 11573.1411, 'train_samples_per_second': 8.608, 'train_steps_per_second': 2.152, 'total_flos': 1.31685110513664e+16, 'train_loss': 0.12323275098395574, 'epoch': 4.0})

In [20]:
metrics = trainer.evaluate(tokenized_ds["test"])
metrics


{'eval_loss': 0.04215646907687187,
 'eval_runtime': 49.2564,
 'eval_samples_per_second': 63.2,
 'eval_steps_per_second': 15.815,
 'epoch': 4.0}

In [21]:
trainer.save_model("./final_telugu_chatbot_tel1")
tokenizer.save_pretrained("./final_telugu_chatbot_tel1")

print("Model saved successfully!")


Model saved successfully!


In [25]:
def te_chatbot(text):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128
    ).to(model.device)

    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=100,
        min_length=5,
        num_beams=4,
        no_repeat_ngram_size=3,
        repetition_penalty=2.0,
        early_stopping=True
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [29]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("./final_telugu_chatbot_tel1")
model = AutoModelForSeq2SeqLM.from_pretrained("./final_telugu_chatbot_tel1")

print("Chatbot model loaded!")


Chatbot model loaded!


In [30]:
print("Telugu Chatbot Ready!\n")

while True:
    text = input("మీరు: ")
    if text.lower() in ["exit", "quit"]:
        break
    print("బాట్:", te_chatbot(text))


Telugu Chatbot Ready!

మీరు: నాకు చాలా ఒత్తిడి గా ఉంది.
బాట్: Class: normal, Compound: 0.00
మీరు: నా లక్ష్యం చేరుకోవడం చాలా కష్టంగా అనిపిస్తోంది. నేను ఏమి చేయాలి?
బాట్: Class: normal, Compound: 0.00
మీరు: నమస్తే! ఎలా ఉన్నావు?
బాట్: Class: normal, Compound: 0.00
మీరు: నేను చాలా బాధగా ఉన్నాను. నాకు సపోర్ట్ కావాలి.
బాట్: Class: normal, Compound: 0.00
మీరు: నా పరిస్థితి చాలా కఠినంగా ఉంది. నేను ఒంటరిగా ఉన్న భావన వస్తోంది.
బాట్: Class: depression, Compound: -0.99
మీరు: exit
