In [None]:
# 1️⃣ Install packages (optional for local users)
!pip install transformers sentencepiece nltk gradio --quiet

# 2️⃣ Import required libraries
from transformers import MarianTokenizer, MarianMTModel
import torch
import zipfile
import os
from nltk.translate.bleu_score import corpus_bleu
import gradio as gr

# 3️⃣ Load pre-trained MarianMT model and tokenizer
# Both objects are created from the same checkpoint id so that the token ids
# produced by the tokenizer are the ones the model was trained with.
model_name = "Helsinki-NLP/opus-mt-en-hi"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
# Only reached when both from_pretrained() calls returned without raising.
print("Model and tokenizer loaded successfully!")

# 4️⃣ Extract dataset from zip
zip_path = "parallel.zip"  # User should place the zip in repo folder
extract_path = "parallel_dataset"
os.makedirs(extract_path, exist_ok=True)

# Unpack every archive member underneath extract_path.
with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(extract_path)

# Build the corpus paths from extract_path instead of repeating the folder
# name, so changing extract_path above cannot leave these pointing elsewhere.
corpus_dir = os.path.join(extract_path, "parallel-n")
en_file = os.path.join(corpus_dir, "IITB.en-hi.en")  # English side
hi_file = os.path.join(corpus_dir, "IITB.en-hi.hi")  # Hindi side

# 5️⃣ Load sentences
# The two files are read in lockstep: line i of the English file is treated as
# the translation pair of line i of the Hindi file (assumed line-aligned
# parallel corpus -- confirm against the dataset). A pair is dropped as a whole
# when either side is blank; filtering each file on its own would shift every
# later pair out of alignment as soon as one side had a blank line.
# zip() stops at the shorter file, so unpaired trailing lines are ignored.
en_sentences = []
hi_sentences = []
with open(en_file, "r", encoding="utf-8") as f_en, open(hi_file, "r", encoding="utf-8") as f_hi:
    for en_line, hi_line in zip(f_en, f_hi):
        en_text = en_line.strip()
        hi_text = hi_line.strip()
        if en_text and hi_text:
            en_sentences.append(en_text)
            hi_sentences.append(hi_text)

# 6️⃣ Demo translation (first 5 sentences)
sample_sentences = en_sentences[:5]

# Encode the samples as one padded batch of PyTorch tensors, generate the
# output ids for the whole batch, then decode every row in a single call.
encoded_batch = tokenizer(sample_sentences, padding=True, return_tensors="pt")
generated_ids = model.generate(**encoded_batch)
translated_sentences = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

# Show each source sentence next to its prediction.
for source_text, predicted_text in zip(sample_sentences, translated_sentences):
    print("English:", source_text)
    print("Predicted Hindi:", predicted_text)
    print("-" * 50)

# 7️⃣ Evaluation (optional)
# Pair every prediction with the Hindi sentence at the same index. zip() stops
# at the shorter sequence, so this also works when fewer than five sentences
# were translated instead of raising IndexError on a fixed range(5).
eval_pairs = list(zip(hi_sentences, translated_sentences))

# corpus_bleu takes, for each candidate, a list of tokenized references; there
# is exactly one reference per sentence here. Tokens are whitespace-separated.
references = [[ref.split()] for ref, _ in eval_pairs]
candidates = [hyp.split() for _, hyp in eval_pairs]
bleu_score = corpus_bleu(references, candidates)
print("BLEU score:", bleu_score)

# 8️⃣ Gradio UI
def translate_en_to_hi(text):
    """Translate English text into Hindi using the module-level model.

    Args:
        text: English text from the UI. May be empty, whitespace-only or
            ``None``.

    Returns:
        The decoded translation of ``text``, or an empty string when there
        is nothing to translate.
    """
    # Nothing to translate: return early instead of asking the model to
    # generate output for an empty prompt.
    if text is None or not text.strip():
        return ""

    # truncation=True cuts over-long input down to the tokenizer's configured
    # maximum length rather than passing an oversized sequence to the model.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    output_tokens = model.generate(**inputs)
    # A single input yields a single output row; decode that row.
    return tokenizer.decode(output_tokens[0], skip_special_tokens=True)

# Input and output widgets for the translation demo.
english_input = gr.Textbox(lines=3, placeholder="Enter English sentence here...", label="English Input")
hindi_output = gr.Textbox(lines=3, label="Hindi Translation")

# Wire the widgets to translate_en_to_hi and describe the app.
interface = gr.Interface(
    fn=translate_en_to_hi,
    inputs=english_input,
    outputs=hindi_output,
    title="English to Hindi Translation",
    description="This app uses a pre-trained MarianMT Transformer model to translate English text into Hindi.",
)

# Start the web UI.
interface.launch()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Model and tokenizer loaded successfully!
English: Give your application an accessibility workout
Predicted Hindi: अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें
--------------------------------------------------
English: Accerciser Accessibility Explorer
Predicted Hindi: एक्सेर्साइसर पहुंचनीयता अन्वेषक
--------------------------------------------------
English: The default plugin layout for the bottom panel
Predicted Hindi: निचले पटल के लिए डिफोल्ट प्लग-इन खाका
--------------------------------------------------
English: The default plugin layout for the top panel
Predicted Hindi: ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका
--------------------------------------------------
English: A list of plugins that are disabled by default
Predicted Hindi: उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है
--------------------------------------------------
BLEU score: 0.9889485799795921
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automat

