<a href="https://colab.research.google.com/github/Rishabhsrivastav21/AI-powered-Multilingual-Translator/blob/main/AI_powered_Multilingual_Translator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# == CELL 1: Install Dependencies ==
!pip install -q torch transformers gradio langdetect sentencepiece

In [None]:
# == CELL 2: Load Optimized Model ==
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import gradio as gr
from langdetect import detect as detect_lang, LangDetectException

# Announce the load step; fetching the checkpoint can take a while.
print("Loading optimized NLLB-200 model for 100 languages...")

model_name = "facebook/nllb-200-distilled-600M"

# Probe CUDA once and derive both the dtype and the device from the answer:
# float16 on GPU, float32 on CPU.
_has_cuda = torch.cuda.is_available()
_model_dtype = torch.float16 if _has_cuda else torch.float32
_model_device = "cuda" if _has_cuda else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=_model_dtype)
model = model.to(_model_device)

# Configure for 100-language support
tokenizer.model_max_length = 1024
print(f"✅ Model ready for 100 languages (max {tokenizer.model_max_length} tokens)")

Loading optimized NLLB-200 model for 100 languages...


In [None]:
 #== CELL 3: Optimized Language Support ==
# Display name -> NLLB-200 (FLORES-200) language code.
# Every value must be a code the NLLB tokenizer knows; a code outside that set
# is not a language token and cannot be used as source or target.
# Fixed codes: arabic (arb_Arab), malay (zsm_Latn), filipino (tgl_Latn).
# Removed: breton, frisian, hawaiian -- their codes (bre_Latn, fry_Latn,
# haw_Latn) are not part of the FLORES-200 code set.
lang_codes = {
    # European Languages (20)
    "english": "eng_Latn", "spanish": "spa_Latn", "french": "fra_Latn",
    "german": "deu_Latn", "italian": "ita_Latn", "portuguese": "por_Latn",
    "russian": "rus_Cyrl", "dutch": "nld_Latn", "polish": "pol_Latn",
    "ukrainian": "ukr_Cyrl", "romanian": "ron_Latn", "greek": "ell_Grek",
    "hungarian": "hun_Latn", "bulgarian": "bul_Cyrl", "czech": "ces_Latn",
    "swedish": "swe_Latn", "danish": "dan_Latn", "finnish": "fin_Latn",
    "norwegian": "nob_Latn", "croatian": "hrv_Latn",

    # Asian Languages (25)
    "chinese": "zho_Hans", "japanese": "jpn_Jpan", "korean": "kor_Hang",
    "hindi": "hin_Deva", "arabic": "arb_Arab", "bengali": "ben_Beng",
    "tamil": "tam_Taml", "telugu": "tel_Telu", "marathi": "mar_Deva",
    "urdu": "urd_Arab", "thai": "tha_Thai", "vietnamese": "vie_Latn",
    "indonesian": "ind_Latn", "malay": "zsm_Latn", "filipino": "tgl_Latn",
    "persian": "pes_Arab", "punjabi": "pan_Guru", "gujarati": "guj_Gujr",
    "kannada": "kan_Knda", "malayalam": "mal_Mlym", "sinhala": "sin_Sinh",
    "nepali": "npi_Deva", "khmer": "khm_Khmr", "lao": "lao_Laoo",
    "burmese": "mya_Mymr",

    # African Languages (15)
    "swahili": "swh_Latn", "yoruba": "yor_Latn", "hausa": "hau_Latn",
    "igbo": "ibo_Latn", "amharic": "amh_Ethi", "somali": "som_Latn",
    "zulu": "zul_Latn", "xhosa": "xho_Latn", "shona": "sna_Latn",
    "afrikaans": "afr_Latn", "kinyarwanda": "kin_Latn", "luganda": "lug_Latn",
    "tigrinya": "tir_Ethi", "oromo": "gaz_Latn", "sesotho": "sot_Latn",

    # Other Key Languages (36; "farsi" is an alias of "persian")
    "turkish": "tur_Latn", "hebrew": "heb_Hebr", "farsi": "pes_Arab",
    "pashto": "pbt_Arab", "kazakh": "kaz_Cyrl", "uzbek": "uzn_Latn",
    "azerbaijani": "azj_Latn", "armenian": "hye_Armn", "georgian": "kat_Geor",
    "mongolian": "khk_Cyrl", "tibetan": "bod_Tibt", "sanskrit": "san_Deva",
    "albanian": "als_Latn", "belarusian": "bel_Cyrl", "bosnian": "bos_Latn",
    "catalan": "cat_Latn", "estonian": "est_Latn", "galician": "glg_Latn",
    "icelandic": "isl_Latn", "irish": "gle_Latn", "latvian": "lvs_Latn",
    "lithuanian": "lit_Latn", "macedonian": "mkd_Cyrl", "maltese": "mlt_Latn",
    "serbian": "srp_Cyrl", "slovak": "slk_Latn", "slovenian": "slv_Latn",
    "welsh": "cym_Latn", "basque": "eus_Latn", "luxembourgish": "ltz_Latn",
    "scottish gaelic": "gla_Latn", "yiddish": "ydd_Hebr", "sindhi": "snd_Arab",
    "kurdish": "kmr_Latn", "tajik": "tgk_Cyrl", "turkmen": "tuk_Latn",
}

# ISO 639 code (as produced by language detection) -> NLLB-200 (FLORES-200) code.
# Values must be codes the NLLB tokenizer knows. Corrected entries:
# ar, az, be, fil, mg, mn, ms, no, ps, tl. Removed fy, hmn and la, whose
# former values (fry_Latn, hmn_Latn, lat_Latn) are not FLORES-200 codes.
iso_to_nllb = {
    "af": "afr_Latn", "am": "amh_Ethi", "ar": "arb_Arab", "az": "azj_Latn",
    "be": "bel_Cyrl", "bg": "bul_Cyrl", "bn": "ben_Beng", "bs": "bos_Latn",
    "ca": "cat_Latn", "ceb": "ceb_Latn", "cs": "ces_Latn", "cy": "cym_Latn",
    "da": "dan_Latn", "de": "deu_Latn", "el": "ell_Grek", "en": "eng_Latn",
    "eo": "epo_Latn", "es": "spa_Latn", "et": "est_Latn", "eu": "eus_Latn",
    "fa": "pes_Arab", "fi": "fin_Latn", "fil": "tgl_Latn", "fr": "fra_Latn",
    "ga": "gle_Latn", "gd": "gla_Latn", "gl": "glg_Latn", "gu": "guj_Gujr",
    "he": "heb_Hebr", "hi": "hin_Deva", "hr": "hrv_Latn", "ht": "hat_Latn",
    "hu": "hun_Latn", "hy": "hye_Armn", "id": "ind_Latn", "ig": "ibo_Latn",
    "is": "isl_Latn", "it": "ita_Latn", "ja": "jpn_Jpan", "jv": "jav_Latn",
    "ka": "kat_Geor", "kk": "kaz_Cyrl", "km": "khm_Khmr", "kn": "kan_Knda",
    "ko": "kor_Hang", "ku": "kmr_Latn", "ky": "kir_Cyrl", "lb": "ltz_Latn",
    "lo": "lao_Laoo", "lt": "lit_Latn", "lv": "lvs_Latn", "mg": "plt_Latn",
    "mi": "mri_Latn", "mk": "mkd_Cyrl", "ml": "mal_Mlym", "mn": "khk_Cyrl",
    "mr": "mar_Deva", "ms": "zsm_Latn", "mt": "mlt_Latn", "my": "mya_Mymr",
    "ne": "npi_Deva", "nl": "nld_Latn", "no": "nob_Latn", "ny": "nya_Latn",
    "pa": "pan_Guru", "pl": "pol_Latn", "ps": "pbt_Arab", "pt": "por_Latn",
    "ro": "ron_Latn", "ru": "rus_Cyrl", "sd": "snd_Arab", "si": "sin_Sinh",
    "sk": "slk_Latn", "sl": "slv_Latn", "sm": "smo_Latn", "sn": "sna_Latn",
    "so": "som_Latn", "sq": "als_Latn", "sr": "srp_Cyrl", "st": "sot_Latn",
    "su": "sun_Latn", "sv": "swe_Latn", "sw": "swh_Latn", "ta": "tam_Taml",
    "te": "tel_Telu", "tg": "tgk_Cyrl", "th": "tha_Thai", "tk": "tuk_Latn",
    "tl": "tgl_Latn", "tr": "tur_Latn", "tt": "tat_Cyrl", "ug": "uig_Arab",
    "uk": "ukr_Cyrl", "ur": "urd_Arab", "uz": "uzn_Latn", "vi": "vie_Latn",
    "xh": "xho_Latn", "yi": "ydd_Hebr", "yo": "yor_Latn", "zh": "zho_Hans",
    "zu": "zul_Latn",
}


# Dropdown choices: the source list offers "auto" first, followed by every
# language name in alphabetical order; the target list never offers "auto".
_sorted_language_names = sorted(lang_codes)
available_languages = ["auto", *_sorted_language_names]
target_languages = [name for name in available_languages if name != "auto"]

In [None]:
# == CELL 4: Robust Translation Function ==
def translate_text_100(text, source_lang, target_lang, max_length=5000):
    """Translate ``text`` with the notebook's NLLB model and tokenizer.

    Parameters
    ----------
    text : str or None
        Text to translate. ``None`` and whitespace-only input are rejected
        with a warning message.
    source_lang : str
        A key of ``lang_codes`` (case-insensitive), or ``"auto"`` to detect
        the language with ``detect_lang`` and map it through ``iso_to_nllb``.
    target_lang : str
        A key of ``lang_codes`` (case-insensitive).
    max_length : int, optional
        Maximum accepted length of ``text`` in characters.

    Returns
    -------
    str
        The decoded translation, or a human-readable message prefixed with
        "⚠️" (recoverable input problem) or "❌" (unsupported language or
        runtime failure). Errors raised while resolving languages or
        generating are converted to such messages instead of propagating.
    """
    # Callers (e.g. a UI callback) may hand over None; normalise before
    # calling str methods so the validation below cannot raise.
    text = "" if text is None else str(text)
    if not text.strip():
        return "⚠️ Please enter text to translate"

    if len(text) > max_length:
        return f"⚠️ Text exceeds {max_length} character limit"

    # Normalised lookup keys; the original values are kept for messages.
    source_key = str(source_lang or "").strip().lower()
    target_key = str(target_lang or "").strip().lower()

    # Token budget for both the encoder input and the generated output.
    # NOTE: inputs longer than this many tokens are truncated by the tokenizer.
    max_tokens = 1024

    try:
        # Language resolution
        if source_key == "auto":
            if len(text.strip()) < 10:
                return "⚠️ Enter at least 10 characters for auto-detection"
            try:
                detected = detect_lang(text)
            except LangDetectException:
                return "⚠️ Language detection failed. Please specify source"
            # Detected codes may carry a region suffix (e.g. "zh-cn"); only
            # the part before the dash is used as the iso_to_nllb key.
            src_code = iso_to_nllb.get(detected.split('-')[0].lower())
            if not src_code:
                return f"⚠️ Detected language '{detected}' not supported"
        else:
            src_code = lang_codes.get(source_key)
            if not src_code:
                return f"❌ Source language '{source_lang}' not supported"

        tgt_code = lang_codes.get(target_key)
        if not tgt_code:
            return f"❌ Target language '{target_lang}' not supported"

        # Language token handling. convert_tokens_to_ids maps an unknown token
        # to the <unk> id rather than None, so compare against unk_token_id;
        # otherwise a bad code would silently be used as the language token.
        unk_id = tokenizer.unk_token_id
        for code in (src_code, tgt_code):
            code_id = tokenizer.convert_tokens_to_ids(code)
            if code_id is None or code_id == unk_id:
                return f"❌ Language code '{code}' not in vocabulary"
        bos_token_id = tokenizer.convert_tokens_to_ids(tgt_code)

        # Tokenization and translation
        tokenizer.src_lang = src_code
        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=max_tokens
        ).to(model.device)

        # Generation with optimized parameters; forced_bos_token_id makes the
        # decoder start with the target-language token.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                forced_bos_token_id=bos_token_id,
                max_new_tokens=max_tokens,
                num_beams=5,
                early_stopping=True,
                no_repeat_ngram_size=3,
                length_penalty=1.2
            )

        return tokenizer.decode(outputs[0], skip_special_tokens=True)

    except RuntimeError as e:
        if "CUDA out of memory" in str(e):
            return "⚠️ GPU memory full. Try shorter text or restart Colab"
        return f"❌ Runtime error: {str(e)}"
    except Exception as e:
        # Deliberate catch-all: this function feeds a UI text box, so any
        # failure is reported as a message rather than raised.
        return f"❌ Error: {str(e)}"
    finally:
        # Release cached GPU blocks after every attempt; skipped on CPU-only hosts.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


In [22]:
# == CELL 5: Enhanced Interface ==
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🌐 AI Translator (100 Languages)")

    with gr.Row():
        # Left column: the text to translate, the language pickers and the button.
        with gr.Column():
            text_input = gr.Textbox(
                lines=8,
                max_length=5000,
                label="📝 Input Text",
                placeholder="Enter text (max 5000 chars)...",
            )
            with gr.Row():
                src_lang = gr.Dropdown(
                    label="Source Language",
                    choices=available_languages,
                    value="auto",
                )
                tgt_lang = gr.Dropdown(
                    label="Target Language",
                    choices=target_languages,
                    value="arabic",
                )
            btn = gr.Button("🔁 Translate", variant="primary")

        # Right column: read-only result box with a copy button.
        with gr.Column():
            output = gr.Textbox(
                lines=8,
                label="✅ Translation",
                interactive=False,
                show_copy_button=True,
            )

    # The same three components feed both the example rows and the click handler.
    translation_inputs = [text_input, src_lang, tgt_lang]

    # Add examples
    example_rows = [
        ["Hello, how are you?", "english", "spanish"],
        ["This is a test of the translation system", "auto", "french"],
        ["اللغة العربية جميلة", "auto", "english"],
    ]
    examples = gr.Examples(examples=example_rows, inputs=translation_inputs)

    btn.click(
        translate_text_100,
        translation_inputs,
        output,
        show_progress="full",
    )

In [21]:
# == CELL 6: Launch ==
# Start the Gradio app. share=True additionally requests a temporary public
# URL for the app (the recorded output below shows a *.gradio.live link).
# The recorded output also notes that debug=True in launch() shows errors
# in the Colab notebook.
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1d5954c1574e068971.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


