In [1]:
!pip install transformers datasets sacrebleu nltk sentencepiece --root-user-action=ignore


Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.0.0-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-3.0.0 sacrebleu-2.4.3


In [2]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import MarianMTModel, MarianTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from sacrebleu.metrics import BLEU
import nltk

nltk.download('punkt')


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load dataset
path = '/kaggle/input/hindi-english-dataset/hindi_english_parallel_reduced (1).csv'  # Replace with your dataset path
data = pd.read_csv(path)

# Rename columns for HuggingFace compatibility
data = data.rename(columns={"english": "en", "hindi": "hi"})

# Preview the dataset
print(data.head())


                                                  hi  \
0    अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें   
1                    एक्सेर्साइसर पहुंचनीयता अन्वेषक   
2              निचले पटल के लिए डिफोल्ट प्लग-इन खाका   
3               ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका   
4  उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...   

                                               en  
0  Give your application an accessibility workout  
1               Accerciser Accessibility Explorer  
2  The default plugin layout for the bottom panel  
3     The default plugin layout for the top panel  
4  A list of plugins that are disabled by default  


In [4]:
# Drop rows with missing values
data = data.dropna(subset=['en', 'hi'])

# Ensure all entries are strings
data = data[data['en'].apply(lambda x: isinstance(x, str))]
data = data[data['hi'].apply(lambda x: isinstance(x, str))]

print(f"Cleaned dataset contains {len(data)} rows.")


Cleaned dataset contains 9993 rows.


In [5]:
# Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(data)

# Split into train, validation, and test sets
train_test_split = dataset.train_test_split(test_size=0.1)
train_valid_split = train_test_split['train'].train_test_split(test_size=0.1)
dataset = DatasetDict({
    'train': train_valid_split['train'],
    'validation': train_valid_split['test'],
    'test': train_test_split['test']
})

print(dataset)


DatasetDict({
    train: Dataset({
        features: ['hi', 'en', '__index_level_0__'],
        num_rows: 8093
    })
    validation: Dataset({
        features: ['hi', 'en', '__index_level_0__'],
        num_rows: 900
    })
    test: Dataset({
        features: ['hi', 'en', '__index_level_0__'],
        num_rows: 1000
    })
})


In [6]:
model_name = "Helsinki-NLP/opus-mt-en-hi"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

print("Model and tokenizer loaded successfully!")


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Model and tokenizer loaded successfully!


In [7]:
def preprocess_data(batch):
    return tokenizer(
        batch['en'],
        text_target=batch['hi'],
        truncation=True,
        padding="max_length",
        max_length=128
    )

# Tokenize the dataset
tokenized_dataset = dataset.map(preprocess_data, batched=True)


Map:   0%|          | 0/8093 [00:00<?, ? examples/s]

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True
)




In [9]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


In [10]:
bleu_metric = BLEU()

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Align references
    decoded_labels = [[label] for label in decoded_labels]

    # Compute BLEU score
    bleu = bleu_metric.corpus_score(decoded_preds, decoded_labels)
    return {"bleu": bleu.score}


In [11]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


UsageError: api_key not configured (no-tty). call wandb.login(key=[your_api_key])

In [None]:
predictions = trainer.predict(test_dataset=tokenized_dataset['test'])
decoded_preds = [tokenizer.decode(pred, skip_special_tokens=True) for pred in predictions.predictions]
decoded_refs = [[ref] for ref in tokenized_dataset['test']['hi']]

# Compute BLEU score
bleu_score = bleu_metric.corpus_score(decoded_preds, decoded_refs).score
print("BLEU Score:", bleu_score)


In [None]:
model.save_pretrained('./trained_nmt_model')
tokenizer.save_pretrained('./trained_nmt_model')


In [None]:
# Check and set the device (CPU or GPU)
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move the model to the appropriate device
model = model.to(device)

# Translation function with device compatibility
def translate(text):
    # Tokenize the input and move to the same device as the model
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to the same device
    
    # Generate translation
    outputs = model.generate(**inputs)
    
    # Decode and return the translation
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test the function with an example
example_text = "what are you doing?"
print("Translation:", translate(example_text))
