In [None]:
### Install Dependencies
!pip install torch transformers datasets bitsandbytes unsloth peft

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Collecting unsloth
  Downloading unsloth-2025.2.15-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.8/57.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metad

In [1]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
import pandas as pd
from datasets import Dataset

# Disable wandb logging
os.environ["WANDB_DISABLED"] = "true"

### Enable CPU (since Unsloth requires NVIDIA GPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

### Load Pretrained Model
MODEL_NAME = "Helsinki-NLP/opus-mt-en-hi"
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


### Manually Created Hindi Colloquial Dataset
hindi_data = [
    {"input": "How are you?", "output": "Tu kaisa hai?"},
    {"input": "Where are you?", "output": "Tu kahan hai?"},
    {"input": "What’s up?", "output": "Kya chal raha hai?"},
    {"input": "Let’s go!", "output": "Chal nikal!"},
    {"input": "Don’t stress.", "output": "Tension mat le."},
    {"input": "Are you coming?", "output": "Tu aa raha hai?"},
    {"input": "What’s the plan?", "output": "Plan kya hai?"},
    {"input": "Can’t believe it!", "output": "Yeh nahi ho sakta!"},
    {"input": "It’s too late.", "output": "Bahut der ho gayi hai."},
    {"input": "I’ll just check.", "output": "Main dekh leta hoon."},
    {"input": "I’m broke.", "output": "Mere paas paise nahi hain."},
    {"input": "This is crazy.", "output": "Yeh bilkul pagalpan hai."},
    {"input": "Stop messing with me.", "output": "Mujhse khelna band kar."},
    {"input": "You’re annoying.", "output": "Tu tang kar raha hai."},
    {"input": "Mind your business.", "output": "Apna kaam dekh."},
    {"input": "I was just joking.", "output": "Main bas mazaak kar raha tha."},
    {"input": "What’s wrong with you?", "output": "Tere saath kya ho raha hai?"},
    {"input": "Leave me alone.", "output": "Mujhe akela chhod de."},
    {"input": "Don’t make excuses.", "output": "Bahane mat bana."},
    {"input": "What’s the point?", "output": "Kya faayda?"},
    {"input": "I don’t care.", "output": "Mujhe farak nahi padta."},
    {"input": "No way!", "output": "Bilkul nahi!"},
    {"input": "Let it be.", "output": "Jaane de."},
    {"input": "Hurry up!", "output": "Jaldi kar!"},
    {"input": "What’s going on?", "output": "Kya ho raha hai?"},
    {"input": "I’m hungry.", "output": "Mujhe bhookh lagi hai."},
    {"input": "I’m tired.", "output": "Main thak gaya hoon."},
    {"input": "It’s not my problem.", "output": "Yeh mera problem nahi hai."},
    {"input": "Don’t worry.", "output": "Fikar mat kar."},
    {"input": "Let’s chill.", "output": "Aaram se baithte hain."},
    {"input": "He’s acting smart.", "output": "Woh bada tez ban raha hai."},
    {"input": "Forget it.", "output": "Bhool ja."},
    {"input": "You deserve it.", "output": "Tujhe yeh milna hi chahiye."},
    {"input": "Keep it up!", "output": "Aise hi karte raho!"},
    {"input": "That’s awesome!", "output": "Wah! Zabardast!"},
    {"input": "It’s so boring.", "output": "Yeh bohot bore kar raha hai."},
    {"input": "You did well.", "output": "Tune accha kiya."},
    {"input": "It’s too expensive.", "output": "Yeh bohot mehenga hai."},
    {"input": "Don’t be shy.", "output": "Sharma mat."},
    {"input": "You never listen.", "output": "Tu kabhi nahi sunta."},
    {"input": "Come here!", "output": "Idhar aa!"},
    {"input": "Don’t touch it.", "output": "Usse mat chho."},
    {"input": "It’s your turn.", "output": "Ab teri baari hai."},
    {"input": "Take care.", "output": "Apna khayal rakh."},
    {"input": "I’m feeling sleepy.", "output": "Mujhe neend aa rahi hai."},
    {"input": "Stop shouting.", "output": "Chillane band kar."},
    {"input": "I’m just kidding.", "output": "Main mazaak kar raha hoon."},
    {"input": "Listen to me.", "output": "Meri baat sun."},
    {"input": "Tell me the truth.", "output": "Sach bata."},
    {"input": "This is ridiculous!", "output": "Yeh to hadd ho gayi!"},
    {"input": "Everything will be fine.", "output": "Sab theek ho jayega."},
    {"input": "It’s not a big deal.", "output": "Koi badi baat nahi hai."},
    {"input": "I don’t get it.", "output": "Mujhe samajh nahi aaya."},
    {"input": "Don’t overthink.", "output": "Zyada mat soch."},
    {"input": "It’s all good.", "output": "Sab theek hai."},
    {"input": "I knew it!", "output": "Mujhe pehle se pata tha!"},
    {"input": "What nonsense!", "output": "Kya bakwaas hai!"},
    {"input": "I’ll call you later.", "output": "Baad mein call karunga."},
    {"input": "Come fast!", "output": "Jaldi aa!"},
    {"input": "It’s not that hard.", "output": "Yeh itna mushkil nahi hai."},
    {"input": "That’s not my fault.", "output": "Yeh meri galti nahi hai."},
    {"input": "I’m getting late.", "output": "Main late ho raha hoon."},
    {"input": "It’s not a big deal.", "output": "Koi badi baat nahi hai."},
    {"input": "Long time no see!", "output": "Kaafi waqt ho gaya milke!"},
    {"input": "What are you up to?", "output": "Kya kar raha hai?"},
    {"input": "Just kidding.", "output": "Bas mazaak kar raha tha."},
    {"input": "You rock!", "output": "Tu kamaal hai!"},
    {"input": "That’s nuts!", "output": "Yeh to paagalpan hai!"},
    {"input": "Good luck!", "output": "Shubh kaamnaayein!"},
    {"input": "Let me know.", "output": "Mujhe bata dena."},
    {"input": "See you soon.", "output": "Jaldi milte hain."},
    {"input": "Take it easy.", "output": "Dheere rehna."},
    {"input": "Hang on a sec.", "output": "Ek minute ruk."},
    {"input": "I got this.", "output": "Main sambhal lunga."},
    {"input": "You bet!", "output": "Bilkul!"},
    {"input": "Chill out.", "output": "Shaant ho ja."},
    {"input": "I’m on my way.", "output": "Main aa raha hoon."},
    {"input": "Hold on.", "output": "Ruk ja."},
    {"input": "I’ll be there.", "output": "Main wahan hoon jaaunga."},
    {"input": "Sounds good.", "output": "Achha lag raha hai."},
    {"input": "My bad.", "output": "Meri galti hai."},
    {"input": "No problem.", "output": "Koi baat nahi."},
    {"input": "What a pity.", "output": "Kitni pacht!"},
    {"input": "That’s enough.", "output": "Yeh kafi hai."},
    {"input": "Calm down.", "output": "Shaant ho ja."},
    {"input": "Good job!", "output": "Shabash!"},
    {"input": "Way to go!", "output": "Yeh hui na baat!"},
    {"input": "Nice one!", "output": "Wah bhai!"},
    {"input": "That’s weird.", "output": "Yeh ajeeb hai."},
    {"input": "Don’t mind me.", "output": "Mere baare mein mat soch."},
    {"input": "Just saying.", "output": "Bas keh raha tha."},
    {"input": "You know what?", "output": "Pata hai kya?"},
    {"input": "No biggie.", "output": "Koi badi baat nahi."},
    {"input": "Looking forward.", "output": "Umeed hai milne ki."},
    {"input": "Tell me more.", "output": "Aur batao."},
    {"input": "Will do.", "output": "Kar dunga."},
    {"input": "Count me in.", "output": "Mujhe bhi ginto."},
    {"input": "My treat.", "output": "Mera kharch."},
    {"input": "Make yourself at home.", "output": "Apne ghar jaisa samajh lo."},
    {"input": "Good to see you.", "output": "Tumhe dekh kar accha laga."},
    {"input": "I’m proud of you.", "output": "Mujhe tum par garv hai."},
    {"input": "It’s a deal.", "output": "Pakki baat hai."},
    {"input": "Stay safe.", "output": "Surakshit rehna."},
    {"input": "No way, José!", "output": "Bilkul nahi!"},
    {"input": "You nailed it.", "output": "Tumne kamaal kar diya."},
    {"input": "Don’t be late.", "output": "Der mat hona."},
    {"input": "See you tomorrow.", "output": "Kal milte hain."},
    {"input": "Break a leg!", "output": "All the best!"},
    {"input": "Take a break.", "output": "Aram kar."},
    {"input": "Don’t give up.", "output": "Himmat mat haarna."},
    {"input": "You got it.", "output": "Samajh gaya."},
    {"input": "I owe you one.", "output": "Maine tera farz ada kiya."},
    {"input": "Right on!", "output": "Bilkul sahi!"},
    {"input": "You’re kidding me!", "output": "Tu mazaak kar raha hai!"},
    {"input": "Get over it.", "output": "Chhodo yaar."},
    {"input": "That’s enough.", "output": "Bas bahut ho gaya."},
    {"input": "Let’s bounce.", "output": "Chalte hain yahan se."},
    {"input": "You got this!", "output": "Tu kar lega!"},
    {"input": "You wish!", "output": "Sochna bhi mat!"},
    {"input": "Deal with it.", "output": "Jhelo ise."},
    {"input": "Zip it!", "output": "Chup ho ja!"},
    {"input": "Not a chance.", "output": "Bilkul bhi nahi."},
    {"input": "You tell me.", "output": "Tu hi bata."},
    {"input": "For real?", "output": "Sach mein?"},
    {"input": "Exactly!", "output": "Bilkul sahi!"},
    {"input": "You okay?", "output": "Sab thik hai?"},
    {"input": "That sucks!", "output": "Kitna bekar hai!"},
    {"input": "I messed up.", "output": "Mujhse galti ho gayi."},
    {"input": "Chill, bro.", "output": "Shaant ho ja bhai."},
    {"input": "That’s wild!", "output": "Wah kya baat hai!"},
    {"input": "I don’t buy it.", "output": "Main ispe vishwas nahi karta."},
    {"input": "Seriously?", "output": "Sach mein?"},
    {"input": "You’re the best!", "output": "Tu best hai!"},
    {"input": "You scared me.", "output": "Tu ne dara diya."},
    {"input": "I’m in.", "output": "Main ready hoon."},
    {"input": "You’re lucky.", "output": "Tu lucky hai."},
    {"input": "I’m listening.", "output": "Main sun raha hoon."},
    {"input": "Give me a sec.", "output": "Ek second de."},
    {"input": "Trust me.", "output": "Mujh par bharosa kar."},
    {"input": "Just a minute.", "output": "Ek minute."},
    {"input": "I can’t wait.", "output": "Mujhse intezar nahi ho raha."},
    {"input": "Fingers crossed!", "output": "Umeed hai sab accha hoga!"},
    {"input": "Take a seat.", "output": "Baith ja."},
    {"input": "That’s the spirit!", "output": "Yeh hui na baat!"},
    {"input": "I get it.", "output": "Samajh gaya."},
    {"input": "That’s the point.", "output": "Yahi baat hai."},
    {"input": "Got it!", "output": "Samajh gaya!"},
    {"input": "I’m out.", "output": "Main nikal raha hoon."},
    {"input": "No offense.", "output": "Bura mat manna."},
    {"input": "I’m done.", "output": "Main ho gaya."},
    {"input": "That’s lit!", "output": "Wah, kamaal hai!"},
    {"input": "No kidding!", "output": "Mazak nahi kar raha!"},
    {"input": "He’s so full of himself.", "output": "Usko bahut ghamand hai."},
    {"input": "That’s insane!", "output": "Yeh to pagalpan hai!"},
    {"input": "Don’t jinx it.", "output": "Kaala na kar de."},
    {"input": "I got lucky.", "output": "Meri kismat achhi thi."},
    {"input": "Let me think.", "output": "Sochne de."},
    {"input": "Speak up!", "output": "Zor se bol!"},
    {"input": "Get a life!", "output": "Kuch aur kar le zindagi mein!"},
    {"input": "Whatever!", "output": "Jo bhi!"},
    {"input": "No clue.", "output": "Bilkul idea nahi."},
    {"input": "Don’t push it.", "output": "Zyada mat daba."},
    {"input": "That’s deep.", "output": "Yeh gehri baat hai."},
    {"input": "It happens.", "output": "Kabhi kabhi hota hai."},
    {"input": "Don’t even ask.", "output": "Mat puchh."},
    {"input": "I’ll think about it.", "output": "Main sochta hoon."},
    {"input": "I’m on it.", "output": "Main kaam pe lag gaya hoon."},
    {"input": "You talk too much.", "output": "Tu bahut bolta hai."},
    {"input": "Wait a second.", "output": "Ek second ruk."},
    {"input": "That’s the worst.", "output": "Yeh to sabse bekar hai."},
    {"input": "He’s always late.", "output": "Woh hamesha late hota hai."},
    {"input": "Don’t stare.", "output": "Ghoor mat."},
    {"input": "It’s my bad.", "output": "Meri galti hai."},
    {"input": "Don't be rude.", "output": "Badtameezi mat kar."},
    {"input": "Move aside.", "output": "Hato zara."},
    {"input": "Not happening.", "output": "Nahi hone wala."},
    {"input": "It’s useless.", "output": "Koi faayda nahi."},
    {"input": "Get in line.", "output": "Line mein lag ja."},
    {"input": "You again?", "output": "Phir tu?"},
    {"input": "Stop it right now!", "output": "Abhi band kar!"},
    {"input": "That was close.", "output": "Thoda bach gaye."},
    {"input": "Who cares?", "output": "Kise farak padta hai?"},
    {"input": "Seriously man?", "output": "Sach mein yaar?"},
    {"input": "What a mess!", "output": "Kitna gandh hai!"},
    {"input": "Don’t drag it.", "output": "Zyada kheench mat."},
    {"input": "I’m impressed.", "output": "Main prabhavit hoon."},
    {"input": "That’s hilarious!", "output": "Wah! Bohot mazedaar!"},
    {"input": "Don’t show off.", "output": "Dikhawa mat kar."},
    {"input": "Let me handle it.", "output": "Mujhe sambhalne de."},
    {"input": "You’ll see.", "output": "Dekhle tu khud."},
    {"input": "Let it go.", "output": "Chhod de yaar."},
    {"input": "I give up.", "output": "Main haar maan gaya."},
    {"input": "Back off!", "output": "Door ho ja!"},
    {"input": "He’s weird.", "output": "Woh ajeeb hai."},
    {"input": "Let’s party!", "output": "Party karte hain!"},
    {"input": "Why not?", "output": "Kyun nahi?"},
    {"input": "Calm your mind.", "output": "Dimaag shaant kar."},
    {"input": "Speak your mind.", "output": "Jo sochta hai, bol."},
    {"input": "I’m not okay.", "output": "Main thik nahi hoon."},
    {"input": "That’s crazy talk!", "output": "Yeh kya bakwaas hai!"},
    {"input": "Laugh it off.", "output": "Hanso aur bhool jao."},
    {"input": "No big deal.", "output": "Koi khaas baat nahi."},
    {"input": "You made it!", "output": "Tu ne kar dikhaya!"},
    {"input": "Watch your tone.", "output": "Zubaan sambhal ke."},
    {"input": "I was shocked!", "output": "Main hairaan ho gaya!"},
    {"input": "Guess what?", "output": "Soch kya hua?"},
    {"input": "You said it!", "output": "Bilkul sahi bola!"},
    {"input": "I owe you.", "output": "Main tera ehsaanmand hoon."},
    {"input": "Think before you speak.", "output": "Baat karne se pehle soch."},
    {"input": "Don't act smart.", "output": "Zyada chatur mat ban."},
    {"input": "Just go with it.", "output": "Bas chalte raho."},
    {"input": "It’s not fair.", "output": "Yeh theek nahi hai."},
    {"input": "I’m speechless.", "output": "Mere paas shabd nahi hain."},
    {"input": "Don’t interrupt.", "output": "Beech mein mat bol."},
    {"input": "That’s dope!", "output": "Wah kya baat hai!"},
    {"input": "He’s showing off.", "output": "Woh dikhawa kar raha hai."},
    {"input": "Make it quick.", "output": "Jaldi nipta le."},
    {"input": "You scared?", "output": "Darr gaya kya?"},
    {"input": "Let me guess.", "output": "Mujhe andaza lagane de."},
    {"input": "What are you saying?", "output": "Tu kya bol raha hai?"},
    {"input": "That’s enough for today.", "output": "Aaj ke liye kaafi hai."},
    {"input": "You lied!", "output": "Tu jhooth bola!"},
    {"input": "Why so serious?", "output": "Itna serious kyun hai?"},
    {"input": "Relax bro!", "output": "Aaram se bhai!"},
    {"input": "He’s not worth it.", "output": "Uski koi aukaat nahi."},
    {"input": "You better not!", "output": "Behtar hoga tu na kare."},
    {"input": "Stay back.", "output": "Peeche raho."},
    {"input": "Leave it to me.", "output": "Mujh par chhod do."},
    {"input": "Stop lying.", "output": "Jhooth bolna band kar."},
    {"input": "I’m so done!", "output": "Ab aur nahi hota!"},
    {"input": "Don’t play games.", "output": "Chaalakiyan mat kar."},
    {"input": "Mind your words.", "output": "Zubaan sambhal ke."},
    {"input": "What’s your problem?", "output": "Teri dikkat kya hai?"},
    {"input": "I’m serious.", "output": "Main sach keh raha hoon."},
    {"input": "Let’s move on.", "output": "Aage badhte hain."},
    {"input": "That’s ridiculous.", "output": "Yeh to hadd ho gayi."},
    {"input": "You forgot again?", "output": "Phir se bhool gaya?"},
    {"input": "Don’t test my patience.", "output": "Mera sabr mat azma."},
    {"input": "It happens.", "output": "Aisa hota hai."},
    {"input": "Go for it!", "output": "Kar daal!"},
    {"input": "Let’s fix this.", "output": "Isse theek karte hain."},
    {"input": "Be yourself.", "output": "Khud par vishwas rakho."},
    {"input": "I messed up.", "output": "Mere se galti ho gayi."},
    {"input": "You tell me.", "output": "Tu hi bata."},
    {"input": "You okay?", "output": "Sab theek hai kya?"},
    {"input": "He’s gone mad.", "output": "Woh pagal ho gaya hai."},
    {"input": "Watch this.", "output": "Yeh dekh."},
    {"input": "Try harder.", "output": "Aur mehnat kar."},
    {"input": "I knew this would happen.", "output": "Mujhe pata tha aisa hoga."},
    {"input": "Let’s wrap it up.", "output": "Chalo khatam karte hain."},
    {"input": "You’re impossible.", "output": "Tu to na mumkin hai."},
    {"input": "Guess again.", "output": "Dobara andaza laga."},
    {"input": "Don’t blame me.", "output": "Mujh par ilzaam mat laga."},
    {"input": "I'm not in the mood.", "output": "Mera mood nahi hai."},
    {"input": "Give me a break.", "output": "Thoda araam de."},
    {"input": "You crossed the line.", "output": "Tu hadh paar kar gaya."},
    {"input": "Enough already!", "output": "Bas bahut ho gaya!"},
    {"input": "Don’t be so mean.", "output": "Itna bhi rude mat ho."},
    {"input": "That’s not fair.", "output": "Yeh galat baat hai."},
    {"input": "Try to understand.", "output": "Samajhne ki koshish kar."},
    {"input": "I’m not convinced.", "output": "Main nahi maana."},
    {"input": "What’s your deal?", "output": "Tera scene kya hai?"},
    {"input": "Get over it.", "output": "Bhool ja us baat ko."},
    {"input": "Don’t push it.", "output": "Zyada mat chadha."},
    {"input": "It’s not working.", "output": "Yeh kaam nahi kar raha."},
    {"input": "I’ve had enough.", "output": "Bas ab aur nahi."},
    {"input": "Don’t forget.", "output": "Mat bhoolna."},
    {"input": "You promised!", "output": "Tu ne wada kiya tha!"},
    {"input": "I’m disappointed.", "output": "Main niraash hoon."},
    {"input": "Take a wild guess.", "output": "Aandaza maar le."},
    {"input": "You ruined it.", "output": "Tu ne sab bigaad diya."},
    {"input": "Chill, dude!", "output": "Shaant ho ja yaar!"},
    {"input": "What now?", "output": "Ab kya?"},
    {"input": "Don’t confuse me.", "output": "Mujhe confuse mat kar."},
    {"input": "I didn’t expect this.", "output": "Mujhe yeh ummeed nahi thi."},
    {"input": "Can you believe that?", "output": "Tu soch sakta hai yeh?"},
    {"input": "No chance!", "output": "Koi mauka nahi!"},
    {"input": "Let’s bounce.", "output": "Chalte hain yahan se."},
    {"input": "Speak louder.", "output": "Zor se bol."},
    {"input": "You're too much!", "output": "Tu to kamaal hai!"},
    {"input": "I’m not buying it.", "output": "Main nahi maan raha."},
    {"input": "Why so late?", "output": "Itni der kyun ho gayi?"},
    {"input": "That’s shocking!", "output": "Woh to chaukane wala tha!"},
    {"input": "You’re overthinking.", "output": "Tu zyada soch raha hai."},
    {"input": "Let’s talk later.", "output": "Baad mein baat karte hain."},
    {"input": "I’m counting on you.", "output": "Main tere bharose hoon."},
    {"input": "You started it.", "output": "Tu ne shuru kiya tha."},
    {"input": "Mark my words.", "output": "Meri baat yaad rakhna."},
    {"input": "What are friends for?", "output": "Dost kis din kaam aayenge?"},
    {"input": "Trust me.", "output": "Mujh par bharosa kar."},
    {"input": "I’m with you.", "output": "Main tere saath hoon."},
    {"input": "Stay with me.", "output": "Mere saath rehna."},
    {"input": "What luck!", "output": "Kya naseeb hai!"},
    {"input": "Let me try.", "output": "Mujhe koshish karne de."},
    {"input": "You're so lazy.", "output": "Tu bohot sust hai."},
    {"input": "Enough drama!", "output": "Zyada nautanki mat kar."},
]

### Convert to DataFrame
df = pd.DataFrame(hindi_data)

### Save Dataset to CSV
df.to_csv("hindi_colloquial_dataset.csv", index=False)

### Load Dataset into Hugging Face format
dataset = Dataset.from_pandas(df)

def format_data(example):
    # Tokenize inputs and labels
    inputs = tokenizer(example["input"], truncation=True, padding="max_length", return_tensors="pt")
    labels = tokenizer(example["output"], truncation=True, padding="max_length", return_tensors="pt")

    # Get input_ids and labels and ensure they are not empty
    input_ids = inputs["input_ids"].squeeze()
    labels_ids = labels["input_ids"].squeeze()

    # Pad to the maximum length if necessary
    max_length = max(len(input_ids), len(labels_ids))
    input_ids = torch.cat([input_ids, torch.zeros(max_length - len(input_ids), dtype=torch.long)])
    labels_ids = torch.cat([labels_ids, torch.zeros(max_length - len(labels_ids), dtype=torch.long)])


    return {
        "input_ids": input_ids,
        "labels": labels_ids,
    }

dataset = dataset.map(format_data)

dataset = dataset.train_test_split(test_size=0.2)

### Fine-tune Model on CPU
training_args = TrainingArguments(
    output_dir="fine_tuned_hindi_model",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    save_strategy="epoch",
    report_to=None
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"]
)

trainer.train()

Using device: cpu


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]



Map:   0%|          | 0/296 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss




TrainOutput(global_step=177, training_loss=0.18819899478201138, metrics={'train_runtime': 3813.8502, 'train_samples_per_second': 0.186, 'train_steps_per_second': 0.046, 'total_flos': 96000169476096.0, 'train_loss': 0.18819899478201138, 'epoch': 3.0})

In [2]:
def translate_text(input_text):
    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(device)

    # Generate translation
    translated = model.generate(**inputs)

    # Decode the generated tokens
    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
    return translated_text

In [3]:
# Sample input sentences to test
test_sentences = [
    "How are you?",
    "Where are you?",
    "What’s up?",
    "Let’s go!"
]

for sentence in test_sentences:
    translated_sentence = translate_text(sentence)
    print(f"Input: {sentence}")
    print(f"Translated: {translated_sentence}")
    print("-" * 50)

Input: How are you?
Translated: आप कैसे हैं?
--------------------------------------------------
Input: Where are you?
Translated: तुम कहाँ हो?
--------------------------------------------------
Input: What’s up?
Translated: क्या हो रहा है?
--------------------------------------------------
Input: Let’s go!
Translated: चलो!
--------------------------------------------------


In [4]:
# Evaluate on test dataset
results = trainer.evaluate()

print("Evaluation Results:", results)

Evaluation Results: {'eval_loss': 0.07215041667222977, 'eval_runtime': 106.1737, 'eval_samples_per_second': 0.565, 'eval_steps_per_second': 0.141, 'epoch': 3.0}


In [5]:
# Define some unseen test data
unseen_data = [
    {"input": "Can we talk?", "expected_output": "Kya hum baat kar sakte hain?"},
    {"input": "I don’t have time.", "expected_output": "Mere paas time nahi hai."},
    {"input": "Let’s eat something.", "expected_output": "Chalo kuch kha lete hain."},
    {"input": "I’m feeling cold.", "expected_output": "Mujhe thand lag rahi hai."},
    {"input": "Where are we going?", "expected_output": "Hum kahan jaa rahe hain?"},
    {"input": "You look happy today.", "expected_output": "Aaj tu bohot khush lag raha hai."},
    {"input": "Don’t lie to me.", "expected_output": "Mujhse jhooth mat bol."},
    {"input": "I forgot my phone.", "expected_output": "Main apna phone bhool gaya."},
    {"input": "Tell me something new.", "expected_output": "Kuch naya bata."},
    {"input": "It’s raining outside.", "expected_output": "Bahar baarish ho rahi hai."},
    {"input": "I’ll wait for you.", "expected_output": "Main tera intezaar karunga."},
    {"input": "Don’t be scared.", "expected_output": "Darna mat."},
    {"input": "Wake up!", "expected_output": "Uth ja!"},
    {"input": "Don’t disturb me.", "expected_output": "Mujhe tang mat kar."},
    {"input": "I’m not joking.", "expected_output": "Main mazaak nahi kar raha hoon."},
    {"input": "You look tired.", "expected_output": "Tu thaka hua lag raha hai."},
    {"input": "She is very sweet.", "expected_output": "Woh bohot pyaari hai."},
    {"input": "What time is it?", "expected_output": "Kitne baje hain?"},
    {"input": "Give me a minute.", "expected_output": "Ek minute de."},
    {"input": "I missed you.", "expected_output": "Mujhe teri yaad aayi."},
    {"input": "I’ll be right back.", "expected_output": "Main abhi aata hoon."},
    {"input": "He’s not at home.", "expected_output": "Woh ghar par nahi hai."},
    {"input": "You’re always late.", "expected_output": "Tu hamesha late hota hai."},
    {"input": "Let’s watch a movie.", "expected_output": "Chalo koi movie dekhte hain."},
    {"input": "Turn off the lights.", "expected_output": "Light band kar do."},
    {"input": "I am not ready.", "expected_output": "Main tayyar nahi hoon."},
    {"input": "That’s your choice.", "expected_output": "Woh teri marzi hai."},
    {"input": "It’s getting dark.", "expected_output": "Andhera ho raha hai."},
    {"input": "Keep your phone silent.", "expected_output": "Apna phone silent pe rakh."},
    {"input": "Let’s go outside.", "expected_output": "Chalo bahar chalte hain."},
]

# Run the model on the unseen test data
for example in unseen_data:
    input_text = example["input"]
    expected_output = example["expected_output"]

    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(device)

    # Generate translation
    output = model.generate(**inputs, max_length=50, num_beams=5, early_stopping=True)

    # Decode the output
    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

    print(f"Input: {input_text}")
    print(f"Expected Output: {expected_output}")
    print(f"Model Output: {decoded_output}")
    print("-" * 50)

Input: Can we talk?
Expected Output: Kya hum baat kar sakte hain?
Model Output: क्या हम बात कर सकते हैं?
--------------------------------------------------
Input: I don’t have time.
Expected Output: Mere paas time nahi hai.
Model Output: मेरे पास समय नहीं है.
--------------------------------------------------
Input: Let’s eat something.
Expected Output: Chalo kuch kha lete hain.
Model Output: के कुछ खाते हैं.
--------------------------------------------------
Input: I’m feeling cold.
Expected Output: Mujhe thand lag rahi hai.
Model Output: मैं ठंड महसूस कर रहा हूँ.
--------------------------------------------------
Input: Where are we going?
Expected Output: Hum kahan jaa rahe hain?
Model Output: हम कहाँ जा रहे हैं?
--------------------------------------------------
Input: You look happy today.
Expected Output: Aaj tu bohot khush lag raha hai.
Model Output: आप आज खुश लग रहे हैं.
--------------------------------------------------
Input: Don’t lie to me.
Expected Output: Mujhse jhooth ma

In [None]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
model.push_to_hub("Saryu10/fine-tuned-hindi-model")
tokenizer.push_to_hub("Saryu10/fine-tuned-hindi-model")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/304M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/Saryu10/fine-tuned-hindi-model/commit/2fcaadfad991f1e34389588b84727bc14b930f74', commit_message='Upload tokenizer', commit_description='', oid='2fcaadfad991f1e34389588b84727bc14b930f74', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Saryu10/fine-tuned-hindi-model', endpoint='https://huggingface.co', repo_type='model', repo_id='Saryu10/fine-tuned-hindi-model'), pr_revision=None, pr_num=None)

In [None]:
from datasets import Dataset
dataset.push_to_hub("Saryu10/colloquial-hindi-dataset")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/479 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Saryu10/colloquial-hindi-dataset/commit/f64547692ca2840d6e339d97ee741e363457a2e7', commit_message='Upload dataset', commit_description='', oid='f64547692ca2840d6e339d97ee741e363457a2e7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Saryu10/colloquial-hindi-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Saryu10/colloquial-hindi-dataset'), pr_revision=None, pr_num=None)