In [128]:
import numpy as np 
import pandas as pd 
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/hinge-english-to-hinglish-machine-translation/no_label.csv
/kaggle/input/hinge-english-to-hinglish-machine-translation/synthetic-dataset/valid.csv
/kaggle/input/hinge-english-to-hinglish-machine-translation/synthetic-dataset/train.csv
/kaggle/input/hinge-english-to-hinglish-machine-translation/synthetic-dataset/.ipynb_checkpoints/train-checkpoint.csv
/kaggle/input/hinge-english-to-hinglish-machine-translation/human-generated-dataset/valid_human_generated.pkl
/kaggle/input/hinge-english-to-hinglish-machine-translation/human-generated-dataset/train_human_generated.pkl


In [129]:
import warnings
warnings.filterwarnings("ignore")

In [130]:
df = pd.read_json("hf://datasets/findnitai/english-to-hinglish/hinglish_upload_v1.json", lines=True)
df.head()

Unnamed: 0,translation
0,"{'en': 'What's the name of the movie', 'hi_ng'..."
1,"{'en': 'Hi, the rotten tomatoes score is great..."
2,"{'en': 'Do you think you will like the movie',..."
3,"{'en': 'What kind of movie is it', 'hi_ng': 'y..."
4,"{'en': 'when was the movie made?', 'hi_ng': 'f..."


In [131]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Step 1: Keep only rows where 'translation' is a dictionary
df = df[df['translation'].apply(lambda x: isinstance(x, dict))].copy()

# Step 2: Normalize the dictionary to separate columns
normalized_df = pd.json_normalize(df['translation'])

# Step 3: Rename columns
normalized_df = normalized_df.rename(columns={'en': 'English', 'hi_ng': 'Hinglish'})

# Step 4: Drop the original 'translation' column and merge
df = df.drop(columns=['translation']).join(normalized_df)

# Step 5: Drop any rows with missing English or Hinglish translations
df = df.dropna(subset=['English', 'Hinglish'])

# Step 6: Split into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

# Optional: Display results
print("Train set:\n", train_df.head())
print("Validation set:\n", val_df.head())


Train set:
                                              English  \
0                             has my timer started ?   
1                      Set a weekend alarm for 10 am   
2  Show Christmas lighting events near me this month   
3  Should I avoid any highways on the way to New ...   
4                         how sunny will it be today   

                                            Hinglish  source  
0                            mera timer shuru hoga ?       0  
1           10 am ke liye weekend alarm ko set kardo       1  
2  is mahine mere aas pas ke Christmas lighting e...       0  
3  Kya muje New York jaate samay raaste me koi hi...       0  
4                       Aaj kitna sunny hone wala he       0  
Validation set:
                                              English  \
0  tell my boyfriend to meet for drinks at Outbac...   
1  i need to get rid of all of my weekend reminde...   
2  Please show me alternative routes to Northeast...   
3  how long will it take me to g

In [132]:
print(len(df))

189102


In [133]:
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)


In [134]:
import torch

# Force use of only one CUDA device (cuda:0)
if torch.cuda.is_available():
    torch.cuda.set_device(0)  # Ensures only CUDA device 0 is used
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")


Using device: cuda:0


In [135]:
# Load CSV containing 'English' and 'Hinglish' columns
train_dataset = Dataset.from_pandas(train_df[['English', 'Hinglish']])
test_dataset = Dataset.from_pandas(val_df[['English', 'Hinglish']])

In [136]:
print(train_dataset)

Dataset({
    features: ['English', 'Hinglish'],
    num_rows: 151281
})


In [137]:
# Load tokenizer and model
model_name = "ai4bharat/INDICBART"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--ai4bharat--INDICBART/snapshots/78466a0c0e29f9229f7005623ecd6bc4243c0ae0/config.json
Model config MBartConfig {
  "_name_or_path": "ai4bharat/INDICBART",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "architectures": [
    "MBartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 64000,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 64001,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "max_position_embeddings": 1024,
  "model_type": "mbart",
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "scale_embedding": false,
  "tokenizer_class"

In [138]:
def preprocess(example):
    inputs = tokenizer(
        example["English"], padding="max_length", truncation=True, max_length=128
    )
    targets = tokenizer(
        example["Hinglish"], padding="max_length", truncation=True, max_length=128
    )
    inputs["labels"] = targets["input_ids"]

    # Only return relevant fields
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": inputs["labels"]
    }


In [139]:
# Apply preprocessing
tokenized_train_dataset = train_dataset.map(preprocess, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess, batched=True)

Map:   0%|          | 0/151281 [00:00<?, ? examples/s]

Map:   0%|          | 0/37821 [00:00<?, ? examples/s]

In [140]:
# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [141]:
from transformers import logging

# Suppress extra warnings (optional)
logging.set_verbosity_info()

# Show tqdm progress bar in Kaggle
training_args = Seq2SeqTrainingArguments(
    output_dir="./indicbart_hinglish",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=1e-3,
    num_train_epochs=1,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    predict_with_generate=True,
    save_total_limit=2,
    logging_steps=10,       
    report_to="none",       
)


PyTorch: setting up devices


In [142]:
# Trainer (updated)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    data_collator=data_collator,
)

In [143]:
# Start training
trainer.train()

The following columns in the training set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: Hinglish, English. If Hinglish, English are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 151,281
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 9,456
  Number of trainable parameters = 244,017,152


Epoch,Training Loss,Validation Loss
1,0.0904,0.085314


Saving model checkpoint to ./indicbart_hinglish/checkpoint-9456
Configuration saved in ./indicbart_hinglish/checkpoint-9456/config.json
Configuration saved in ./indicbart_hinglish/checkpoint-9456/generation_config.json
Model weights saved in ./indicbart_hinglish/checkpoint-9456/model.safetensors
The following columns in the evaluation set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: Hinglish, English. If Hinglish, English are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 37821
  Batch size = 16
Saving model checkpoint to ./indicbart_hinglish/checkpoint-9456
Configuration saved in ./indicbart_hinglish/checkpoint-9456/config.json
Configuration saved in ./indicbart_hinglish/checkpoint-9456/generation_config.json
Model weights saved in ./indicbart_hinglish/checkpoint-9456/model.safetensors


Training completed. Do not forget to share your

TrainOutput(global_step=9456, training_loss=0.14130312286304741, metrics={'train_runtime': 5597.0777, 'train_samples_per_second': 27.029, 'train_steps_per_second': 1.689, 'total_flos': 2.0490823290322944e+16, 'train_loss': 0.14130312286304741, 'epoch': 1.0})

In [144]:
# Save model and tokenizer
model.save_pretrained("./indicbart_hinglish_final")
tokenizer.save_pretrained("./indicbart_hinglish_final")

Configuration saved in ./indicbart_hinglish_final/config.json
Configuration saved in ./indicbart_hinglish_final/generation_config.json
Model weights saved in ./indicbart_hinglish_final/model.safetensors
tokenizer config file saved in ./indicbart_hinglish_final/tokenizer_config.json
Special tokens file saved in ./indicbart_hinglish_final/special_tokens_map.json


('./indicbart_hinglish_final/tokenizer_config.json',
 './indicbart_hinglish_final/special_tokens_map.json',
 './indicbart_hinglish_final/spiece.model',
 './indicbart_hinglish_final/added_tokens.json',
 './indicbart_hinglish_final/tokenizer.json')

In [145]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("./indicbart_hinglish_final")
tokenizer = AutoTokenizer.from_pretrained("./indicbart_hinglish_final")


loading configuration file ./indicbart_hinglish_final/config.json
Model config MBartConfig {
  "_name_or_path": "./indicbart_hinglish_final",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "architectures": [
    "MBartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 64000,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 64001,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "max_position_embeddings": 1024,
  "model_type": "mbart",
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "scale_embedding": false,
  "tokenizer_class": "AlbertTokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.47.0",
  "use_cache": t

In [146]:
model = model.to(device)

In [147]:
def translate(text):
    model.eval()

    # Move tokenizer inputs to the same device
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Generate translation
    with torch.no_grad():
        output_ids = model.generate(inputs["input_ids"], max_length=128, num_beams=5)

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


In [148]:
print("has my timer started ?")
print(translate("has my timer started ?"))
print("\nset an alarm for me")
print(translate("set an alarm for me"))
print("\nDid I get new messages ?")
print(translate("Did I get new messages ?"))
print("\nWhat is the time right now ?")
print(translate("What is the time right now ?"))
print("\nIt will be sunny today")
print(translate("It will be sunny today"))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


has my timer started ?
kya mera timer shuru hoga ?

set an alarm for me
mere liye ek alarm set karen

Did I get new messages ?
kya maine naye messages milgaye hai ?

What is the time right now ?
abhi ka time kya hai ?

It will be sunny today
Aaj dhoop hogi


In [151]:
print("My smartwatch just died in the middle of a workout.")
print(translate("My smartwatch just died in the middle of a workout."))

print("\nThe Wi-Fi router is acting up again.")
print(translate("The Wi-Fi router is acting up again."))

print("\nI need to clear my browser history.")
print(translate("I need to clear my browser history."))

print("\nI forgot to cancel my subscription to that streaming service.")
print(translate("I forgot to cancel my subscription to that streaming service."))

print("\nThe game crashed right before I reached the final boss.")
print(translate("The game crashed right before I reached the final boss."))

print("\nMy phone froze when I was about to check an important message.")
print(translate("My phone froze when I was about to check an important message."))

print("\nI need to fix the bug in my code before the deadline.")
print(translate("I need to fix the bug in my code before the deadline."))

print("\nI'm trying to get my hands on the new gaming console.")
print(translate("I'm trying to get my hands on the new gaming console."))

print("\nI ordered food online, but they gave me the wrong item.")
print(translate("I ordered food online, but they gave me the wrong item."))

print("\nI need a caffeine boost to survive this meeting.")
print(translate("I need a caffeine boost to survive this meeting."))


My smartwatch just died in the middle of a workout.
meri smartwatch just died in the middle of a workout.

The Wi-Fi router is acting up again.
Wi-Fi router fir se chal raha hai.

I need to clear my browser history.
mujhe mere browser history clear karna hai

I forgot to cancel my subscription to that streaming service.
I forgot to cancel my subscription to that streaming service.

The game crashed right before I reached the final boss.
The game crashed right before I reached the final boss.

My phone froze when I was about to check an important message.
mera phone froze kab tha jab mujhe important message check karne ke liye

I need to fix the bug in my code before the deadline.
mujhe deadline se pehle mere code me bug fix karna hai

I'm trying to get my hands on the new gaming console.
mai new gaming console par apne hands laana chahta hoon

I ordered food online, but they gave me the wrong item.
I ordered food online, but they gave me the wrong item.

I need a caffeine boost to surv

In [149]:
!pip install evaluate sacrebleu nltk



In [155]:
model.eval()

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): MBartScaledWordEmbedding(64014, 1024, padding_idx=0)
    (encoder): MBartEncoder(
      (embed_tokens): MBartScaledWordEmbedding(64014, 1024, padding_idx=0)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-5): 6 x MBartEncoderLayer(
          (self_attn): MBartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True

In [None]:
# import pandas as pd
# import torch
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# # Load CSV
# df = pd.read_csv("/kaggle/input/hinge-english-to-hinglish-machine-translation/synthetic-dataset/valid.csv")
# df_sample = df.sample(n=50, random_state=42)  # random_state ensures reproducibility

# # Choose source column (e.g., "English")
# sentences = df["English"].astype(str).tolist()

# # Tokenize
# inputs = tokenizer(
#     sentences,
#     return_tensors="pt",
#     padding=True,
#     truncation=True,
#     max_length=100
# )
# inputs = {k: v.to(device) for k, v in inputs.items()}

# device = torch.device("cpu")
# model = model.to(device)
# for k in inputs:
#     inputs[k] = inputs[k].to(device)

# inputs.pop("token_type_ids", None)

# with torch.no_grad():
#     outputs = model.generate(**inputs, max_length=128, num_beams=5)

# # Remove token_type_ids if present
# if 'token_type_ids' in inputs:
#     inputs.pop('token_type_ids')

# # Generate translations
# with torch.no_grad():
#     outputs = model.generate(**inputs, max_length=128, num_beams=5)

# # Decode predictions
# preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# # Optionally, add to DataFrame and save
# df["Translated"] = preds
# df.to_csv("translated_output.csv", index=False)

# print("Translations saved to translated_output.csv")


In [None]:
# pip install rouge_score

In [None]:
# import evaluate

# # Get references from your CSV
# references = df["Hinglish"].astype(str).tolist()

# # Make sure both preds and references are aligned
# assert len(preds) == len(references), "Mismatch between predictions and references."

# # BLEU
# bleu = evaluate.load("bleu")
# print("BLEU:", bleu.compute(predictions=preds, references=[[ref] for ref in references]))

# # SacreBLEU
# sacrebleu = evaluate.load("sacrebleu")
# print("SacreBLEU:", sacrebleu.compute(predictions=preds, references=[[ref] for ref in references]))

# # chrF
# chrf = evaluate.load("chrf")
# print("chrF:", chrf.compute(predictions=preds, references=[[ref] for ref in references]))

# # ROUGE
# rouge = evaluate.load("rouge")
# print("ROUGE:", rouge.compute(predictions=preds, references=references))

# # Exact Match Accuracy
# exact_match = [int(p.strip() == r.strip()) for p, r in zip(preds, references)]
# print("Exact Match Accuracy:", sum(exact_match) / len(exact_match))
