In [10]:
import torch
from transformers import (
    AutoTokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    BitsAndBytesConfig,
)
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType

# Load dataset
dataset = load_dataset("findnitai/english-to-hinglish")

# Load tokenizer
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Configure QLoRA (4-bit quantization)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load model in 4-bit
model = T5ForConditionalGeneration.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Add LoRA adapter
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],  # LoRA will be injected into T5 attention layers
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(model, peft_config)

# Preprocessing function
def preprocess(sample):
    source_texts = ["Translate English to Hinglish: " + ex["en"] for ex in sample["translation"]]
    target_texts = [ex["hi_ng"] for ex in sample["translation"]]

    model_inputs = tokenizer(source_texts, max_length=128, padding="max_length", truncation=True)
    labels = tokenizer(target_texts, max_length=128, padding="max_length", truncation=True)

    labels["input_ids"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
    ]
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Preprocess the dataset
train_dataset = dataset["train"].map(preprocess, batched=True, remove_columns=["translation"])

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Training arguments
# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="t5-hinglish-qlora",
    per_device_train_batch_size=32,
    num_train_epochs=3,
    learning_rate=1e-5,  # <-- Lowered learning rate
    save_steps=500,
    save_total_limit=2,
    logging_dir="logs",
    logging_steps=100,
    report_to="none",
    max_steps=100000
)


# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train
trainer.train()

# Save model
trainer.save_model("t5-hinglish-qlora")


  trainer = Seq2SeqTrainer(


Step,Training Loss
100,5.5306
200,5.3527
300,5.1791
400,4.9766
500,4.8608
600,4.7289
700,4.5678
800,4.5084
900,4.391
1000,4.3225


In [None]:
from transformers import AutoTokenizer, T5ForConditionalGeneration
import torch
print("k")
# Load the fine-tuned model and tokenizer
model_path = "t5-hinglish-qlora"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Function to translate English to Hinglish
def translate_to_hinglish(text):
    prompt = "Translate English to Hinglish: " + text
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)
    outputs = model.generate(**inputs, max_length=128)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test sentences
test_sentences = [
    "I know that go home today and come back later in the day",
    "Where are you going right now?",
    "Please call me when you reach there.",
    "What are you doing tomorrow?",
    "I will meet you at the station at 5 PM.",
    "She is not feeling well today.",
    "Don't forget to bring the documents.",
    "It is very hot outside, drink water.",
    "We will go to the market after lunch.",
    "Can you help me with this work?",
    "He said he will be late for the party.",
    "I think I lost my phone in the cab.",
    "Let's watch a movie tonight.",
    "Your performance was really good!",
    "Why are you so silent today?",
]

# Translate and print results
for sentence in test_sentences:
    translated = translate_to_hinglish(sentence)
    print(f"English: {sentence}")
    print(f"Hinglish: {translated}\n")


In [None]:
kaggle kernels output shravyaponugoti/indicbart -p /path/to/dest