In [1]:
from huggingface_hub import login
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

In [2]:
!pip -q install -U \
  "opentelemetry-api==1.38.0" \
  "opentelemetry-sdk==1.38.0" \
  "opentelemetry-semantic-conventions==0.59b0" \
  "opentelemetry-exporter-otlp-proto-http==1.38.0"


In [3]:
import torch
import transformers
import mistralai

print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
print("mistralai:", getattr(mistralai, "__version__", "ok"))

# sanity check: can we import the client class?
from mistralai.client import MistralClient
print("‚úÖ MistralClient import OK")
""

torch: 2.10.0+cu128
transformers: 4.38.2
mistralai: ok
‚úÖ MistralClient import OK


''

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from IndicTransToolkit import IndicProcessor

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("DEVICE:", DEVICE)

INDICTRANS_MODEL_ID = "ai4bharat/indictrans2-en-indic-1B"

indic_tokenizer = AutoTokenizer.from_pretrained(
    INDICTRANS_MODEL_ID,
    trust_remote_code=True
)

indic_model = AutoModelForSeq2SeqLM.from_pretrained(
    INDICTRANS_MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
    device_map="auto" if DEVICE == "cuda" else None
)

# NOTE: For non-quantized models, .to() is fine:
if DEVICE != "cuda":
    indic_model = indic_model.to(DEVICE)

indic_model.eval()

ip = IndicProcessor(inference=True)

print("‚úÖ IndicTrans2 loaded")


DEVICE: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


‚úÖ IndicTrans2 loaded




In [5]:
LANG_CODE = {
    "hindi": "hin_Deva",
    "kannada": "kan_Knda",
    "tamil": "tam_Taml",
    "telugu": "tel_Telu",
    "malayalam": "mal_Mlym",
    "marathi": "mar_Deva",
    "bengali": "ben_Beng",
    "gujarati": "guj_Gujr",
    "punjabi": "pan_Guru",
    "odia": "ory_Orya",
}

def translate_with_indictrans(
    sentences,
    src_lang="eng_Latn",
    tgt_lang="hin_Deva",
    max_length=256,
    num_beams=5
):
    # 1) preprocess (required)
    batch = ip.preprocess_batch(sentences, src_lang=src_lang, tgt_lang=tgt_lang, visualize=False)

    # 2) tokenize
    inputs = indic_tokenizer(
        batch,
        truncation=True,
        padding="longest",
        max_length=max_length,
        return_tensors="pt",
        return_attention_mask=True
    ).to(DEVICE)

    # 3) generate
    with torch.inference_mode():
        generated = indic_model.generate(
            **inputs,
            use_cache=True,
            min_length=0,
            max_length=max_length,
            num_beams=num_beams,
            num_return_sequences=1
        )

    # 4) decode + postprocess
    decoded = indic_tokenizer.batch_decode(
        generated,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True
    )
    out = ip.postprocess_batch(decoded, lang=tgt_lang)
    return out


In [6]:
!pip -q install -U "mistralai==0.4.2"


In [8]:
import re
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

# --- API key (recommended: Colab Secret) ---
try:
    from google.colab import userdata
    MISTRAL_API_KEY = userdata.get("MISTRAL_API_KEY")
except Exception:
    MISTRAL_API_KEY = None

if not MISTRAL_API_KEY:
    raise ValueError("‚ùå Set MISTRAL_API_KEY in Colab Secrets")

mistral = MistralClient(api_key=MISTRAL_API_KEY)
MISTRAL_MODEL = "mistral-large-latest"

# --- strict, rewrite-only prompt ---
NORMALIZER_SYSTEM = """You are a semantic normalizer for translation.

Rewrite the user's message into clear, grammatical English.

Hard rules:
- Never refuse. Never ask questions.
- Do NOT add new facts, times, places, names, or numbers.
- Expand slang/abbreviations/idioms into standard English.
- Keep any non-English words exactly as they appear (do not translate them).
- Preserve question/statement form and sentiment (anger/sarcasm).
- Output ONLY the rewritten text. No explanations, no labels.
"""

def _clean_out(text: str) -> str:
    text = text.strip()
    text = re.sub(r"^(assistant|Assistant)\s*[:\-]?\s*", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def normalize_with_mistral_api(user_text: str) -> str:
    resp = mistral.chat(
        model=MISTRAL_MODEL,
        messages=[
            ChatMessage(role="system", content=NORMALIZER_SYSTEM),
            ChatMessage(role="user", content=user_text),
        ],
        temperature=0.0,
        max_tokens=128,
    )
    return _clean_out(resp.choices[0].message.content)

print("‚úÖ Stage-1 normalizer ready (mistral-large-latest via API)")


‚úÖ Stage-1 normalizer ready (mistral-large-latest via API)


In [9]:
def agentic_translate(user_text: str, target_language: str):
    target_language = target_language.lower()
    if target_language not in LANG_CODE:
        raise ValueError(f"Unsupported target_language: {target_language}. Choose from: {list(LANG_CODE.keys())}")

    # Stage 1: normalize via Mistral API
    normalized = normalize_with_mistral_api(user_text)

    # Stage 2: translate locally via IndicTrans2
    tgt_code = LANG_CODE[target_language]
    translated = translate_with_indictrans(
        [normalized],
        src_lang="eng_Latn",
        tgt_lang=tgt_code
    )[0]

    return {
        "input": user_text,
        "normalized_english": normalized,
        "target_language": target_language,
        "translation": translated
    }

# -----------------------
# End-to-end stress tests
# -----------------------
stress_tests = [
    ("idk bro, this weather is wild af, raining cats and dogs fr", "kannada"),
    ("ivathu office hogbeku illa antha doubt, manager call madthara?", "kannada"),
    ("wah bhai, server firse down hai, amazing service üëèüëè", "hindi"),
    ("why this app always crash when i need it most üò°", "tamil"),
    ("ngl this update kinda slaps but battery drain is crazy", "telugu"),
    ("can u pls check and let me know by eod, super urgent", "marathi"),
    ("meeting today or tomorrow?", "kannada"),
    ("kal meeting 3pm irukku right?", "tamil"),
    ("please do the needful asap", "hindi"),
]

for text, lang in stress_tests:
    print("=" * 90)
    out = agentic_translate(text, lang)
    print("INPUT:      ", out["input"])
    print("NORMALIZED: ", out["normalized_english"])
    print("TRANSLATED: ", out["translation"])


INPUT:       idk bro, this weather is wild af, raining cats and dogs fr
NORMALIZED:  I don't know, this weather is extremely unpredictable‚Äîit's raining very heavily right now.
TRANSLATED:  ‡≤®‡≤®‡≤ó‡≥Ü ‡≤ó‡≥ä‡≤§‡≥ç‡≤§‡≤ø‡≤≤‡≥ç‡≤≤, ‡≤à ‡≤π‡≤µ‡≤æ‡≤Æ‡≤æ‡≤®‡≤µ‡≥Å ‡≤Ö‡≤§‡≥ç‡≤Ø‡≤Ç‡≤§ ‡≤Ö‡≤®‡≤ø‡≤∞‡≥Ä‡≤ï‡≥ç‡≤∑‡≤ø‡≤§‡≤µ‡≤æ‡≤ó‡≤ø‡≤¶‡≥Ü-‡≤à‡≤ó ‡≤≠‡≤æ‡≤∞‡≥Ä ‡≤Æ‡≤≥‡≥Ü‡≤Ø‡≤æ‡≤ó‡≥Å‡≤§‡≥ç‡≤§‡≤ø‡≤¶‡≥Ü.
INPUT:       ivathu office hogbeku illa antha doubt, manager call madthara?
NORMALIZED:  I don't need to come to the office, right? That was my doubt, so I called the manager.
TRANSLATED:  ‡≤®‡≤æ‡≤®‡≥Å ‡≤ï‡≤ö‡≥á‡≤∞‡≤ø‡≤ó‡≥Ü ‡≤¨‡≤∞‡≤¨‡≥á‡≤ï‡≤æ‡≤ó‡≤ø‡≤≤‡≥ç‡≤≤, ‡≤Ö‡≤≤‡≥ç‡≤≤‡≤µ‡≥á? ‡≤Ö‡≤¶‡≥Å ‡≤®‡≤®‡≥ç‡≤® ‡≤Ö‡≤®‡≥Å‡≤Æ‡≤æ‡≤®‡≤µ‡≤æ‡≤ó‡≤ø‡≤§‡≥ç‡≤§‡≥Å, ‡≤Ü‡≤¶‡≥ç‡≤¶‡≤∞‡≤ø‡≤Ç‡≤¶ ‡≤®‡≤æ‡≤®‡≥Å ‡≤Æ‡≥ç‡≤Ø‡≤æ‡≤®‡≥á‡≤ú‡≤∞‡≥ç‡≤ó‡≥Ü ‡≤ï‡≤∞‡≥Ü ‡≤Æ‡≤æ‡≤°‡≤ø‡≤¶‡≥Ü.
INPUT:       wah bhai, server firse down hai, amazing service üëèüëè
NORMALIZED:  Oh man, the server is down again. What amazing service.
TRANSLATED:  ‡§ì‡§π ‡

In [10]:
import random
import time

# -----------------------------
# 120+ Stress Test Cases
# Format: (input_text, target_language)
# -----------------------------
STRESS_TESTS_120 = [
    # --- slang + idioms ---
    ("idk man, it's raining cats and dogs", "hindi"),
    ("bruh this is wild af", "kannada"),
    ("ngl that was kinda mid", "tamil"),
    ("fr this app slaps", "telugu"),
    ("lowkey I'm done with this", "marathi"),
    ("highkey this is annoying", "bengali"),
    ("I'm over the moon right now", "hindi"),
    ("this is the last straw", "kannada"),
    ("spill the tea, what happened?", "tamil"),
    ("no cap, that's impressive", "telugu"),
    ("that‚Äôs a piece of cake", "marathi"),
    ("break a leg for your interview", "bengali"),
    ("hit the sack, I'm tired", "hindi"),
    ("it's not my cup of tea", "kannada"),
    ("once in a blue moon", "tamil"),
    ("costs an arm and a leg", "telugu"),
    ("I'm feeling under the weather", "marathi"),
    ("let's call it a day", "bengali"),
    ("that's the icing on the cake", "hindi"),
    ("I'm on the same page as you", "kannada"),

    # --- sarcasm / passive aggression ---
    ("wow amazing service, server down again üëè", "hindi"),
    ("great, another bug. just what I needed.", "tamil"),
    ("nice. very nice. nothing works.", "kannada"),
    ("sure, take your time... it's not urgent at all üôÉ", "telugu"),
    ("love how it crashes exactly when I click submit", "marathi"),
    ("fantastic, payment failed for the third time", "bengali"),
    ("yeah right, 'quick fix' my foot", "hindi"),
    ("super smooth experience... said no one ever", "kannada"),
    ("oh perfect, now it's stuck on loading forever", "tamil"),
    ("thanks for nothing", "telugu"),

    # --- broken English / typos / phonetic ---
    ("plz do needful asap", "hindi"),
    ("cant login, pw reset not working", "kannada"),
    ("u check once and tell", "tamil"),
    ("i no understand why error coming", "telugu"),
    ("app crash when open only", "marathi"),
    ("net slow so msg late", "bengali"),
    ("its showing invalid otp again n again", "hindi"),
    ("payment done but not reflecting", "kannada"),
    ("order cancelled but money not back", "tamil"),
    ("screen freeze after update", "telugu"),
    ("why my account lock??", "marathi"),
    ("pls share update by eod", "bengali"),

    # --- time / constraints (important: no hallucinations) ---
    ("please respond by EOD", "hindi"),
    ("need this done by 5 pm today", "kannada"),
    ("schedule it for tomorrow morning", "tamil"),
    ("move the meeting to next Monday", "telugu"),
    ("remind me in 2 hours", "marathi"),
    ("call me at 10:30 am", "bengali"),
    ("submit before 23:59 today", "hindi"),
    ("we need the report by end of day", "kannada"),
    ("deliver it in 15 minutes", "tamil"),
    ("do it ASAP", "telugu"),

    # --- code-mix: Hinglish ---
    ("bhai kal meeting hai kya?", "hindi"),
    ("yaar server firse down ho gaya", "hindi"),
    ("pls thoda jaldi karo, urgent hai", "hindi"),
    ("mera payment stuck hai, check karo", "hindi"),
    ("aaj call kar sakte ho kya?", "hindi"),
    ("thanks bhai, but issue abhi bhi hai", "hindi"),
    ("boss ko message bhej du kya?", "hindi"),
    ("OTP nahi aa raha, kya karu?", "hindi"),
    ("kal 3 baje meeting fix hai na?", "hindi"),
    ("kya scene hai? app open nahi ho raha", "hindi"),

    # --- code-mix: Kanglish ---
    ("naale meeting reschedule madana? please confirm", "kannada"),
    ("ivathu office hogbeku illa antha doubt, manager call madthara?", "kannada"),
    ("bro swalpa adjust maadi, urgent ide", "kannada"),
    ("payment aagide but receipt baralla", "kannada"),
    ("ninna night app hang aitu", "kannada"),
    ("data load aagtha illa, yen madbeku?", "kannada"),
    ("meeting ge late aagutte, sorry", "kannada"),
    ("kelsa complete madidini, pls review", "kannada"),
    ("naan barthini, but swalpa late", "kannada"),
    ("call maadi, important", "kannada"),

    # --- code-mix: Tanglish ---
    ("kal meeting 3pm irukku right?", "tamil"),
    ("innaiku office poganuma? manager call pannuvana?", "tamil"),
    ("bro konjam wait pannunga, net slow", "tamil"),
    ("payment pannen but status update aagala", "tamil"),
    ("app open panna crash aaguthu", "tamil"),
    ("naan late aaguren, sorry", "tamil"),
    ("please confirm meeting naale iruka?", "tamil"),
    ("enaku OTP varala, help pannunga", "tamil"),
    ("ticket raise panniten, reply varala", "tamil"),
    ("server down ah? romba worst", "tamil"),

    # --- mixed punctuation / noise ---
    ("HELLO??? any update??!!", "hindi"),
    ("whattttt is thisssss üò≠üò≠", "kannada"),
    (".............", "tamil"),
    ("ok fine whatever", "telugu"),
    ("BROOOOOOOOOOO", "marathi"),
    ("??!!??", "bengali"),
    ("pls..... fix..... now.....", "hindi"),
    ("why??????", "kannada"),
    ("it works... then it doesn't...", "tamil"),
    ("hmmm idk", "telugu"),

    # --- support / business-style ---
    ("Could you please provide an update on the ticket status?", "hindi"),
    ("We are blocked due to this issue. Please prioritize.", "kannada"),
    ("Kindly share the RCA once available.", "tamil"),
    ("Please confirm the deployment timeline.", "telugu"),
    ("We need the ETA for resolution.", "marathi"),
    ("The issue is reproducible on version 2.1.3.", "bengali"),
    ("Please escalate this to the engineering team.", "hindi"),
    ("We observed intermittent failures in production.", "kannada"),
    ("The customer is waiting; please respond ASAP.", "tamil"),
    ("Let‚Äôs align on next steps.", "telugu"),

    # --- negation / tricky semantics ---
    ("I didn't say it was not working.", "hindi"),
    ("It's not that I can't login, it's that it logs me out.", "kannada"),
    ("Don't reschedule the meeting.", "tamil"),
    ("Please don't cancel the order.", "telugu"),
    ("I never received the OTP, not even once.", "marathi"),
    ("This isn't good, but it's not terrible either.", "bengali"),
    ("I don't want a refund, I want a fix.", "hindi"),
    ("Not today, maybe tomorrow.", "kannada"),
    ("No, I didn't approve that change.", "tamil"),
    ("I can't not respond by EOD.", "telugu"),

    # --- edge: very short ---
    ("ok", "hindi"),
    ("no", "kannada"),
    ("yes", "tamil"),
    ("urgent", "telugu"),
    ("done", "marathi"),
    ("help", "bengali"),
]

# Ensure at least 120 (we already have >120, but keep deterministic)
random.seed(42)
random.shuffle(STRESS_TESTS_120)

print("Total test cases:", len(STRESS_TESTS_120))

# -----------------------------
# Runner: batch + rate-friendly
# -----------------------------
def run_stress_tests(tests, batch_size=10, sleep_s=1.0):
    """
    Stage-1 uses Mistral API (per item), Stage-2 is local.
    We batch only for display pacing, not for API batching.
    """
    results = []
    for i in range(0, len(tests), batch_size):
        batch = tests[i:i+batch_size]
        print("\n" + "#" * 100)
        print(f"Batch {i//batch_size + 1} | cases {i+1}-{min(i+batch_size, len(tests))}")
        print("#" * 100)

        for text, lang in batch:
            try:
                out = agentic_translate(text, lang)
                results.append(out)
                print("-" * 90)
                print("INPUT:      ", out["input"])
                print("NORMALIZED: ", out["normalized_english"])
                print("LANG:       ", out["target_language"])
                print("TRANSLATED: ", out["translation"])
            except Exception as e:
                results.append({"input": text, "target_language": lang, "error": str(e)})
                print("-" * 90)
                print("INPUT: ", text)
                print("LANG:  ", lang)
                print("ERROR: ", e)

        # small pause to reduce API burstiness
        time.sleep(sleep_s)

    return results

# Run all 120+ tests
stress_results = run_stress_tests(STRESS_TESTS_120, batch_size=8, sleep_s=0.6)

print("\n‚úÖ Finished. Results stored in: stress_results")


Total test cases: 118

####################################################################################################
Batch 1 | cases 1-8
####################################################################################################
------------------------------------------------------------------------------------------
INPUT:       meeting ge late aagutte, sorry
NORMALIZED:  I arrived late to the meeting, I apologize.
LANG:        kannada
TRANSLATED:  ‡≤®‡≤æ‡≤®‡≥Å ‡≤∏‡≤≠‡≥Ü‡≤ó‡≥Ü ‡≤§‡≤°‡≤µ‡≤æ‡≤ó‡≤ø ‡≤¨‡≤Ç‡≤¶‡≤ø‡≤¶‡≥ç‡≤¶‡≥á‡≤®‡≥Ü, ‡≤®‡≤æ‡≤®‡≥Å ‡≤ï‡≥ç‡≤∑‡≤Æ‡≥Ü‡≤Ø‡≤æ‡≤ö‡≤ø‡≤∏‡≥Å‡≤§‡≥ç‡≤§‡≥á‡≤®‡≥Ü.
------------------------------------------------------------------------------------------
INPUT:       ok
NORMALIZED:  Alright.
LANG:        hindi
TRANSLATED:  ‡§†‡•Ä‡§ï ‡§π‡•à‡•§
------------------------------------------------------------------------------------------
INPUT:       HELLO??? any update??!!
NORMALIZED:  Hello? Is there any update?
LANG:        hindi
TRANSLATED:  ‡