# IndicTrans2 HF Inference

This notebook shows how to run inference with our IndicTrans2 models, which were originally trained with fairseq, using HuggingFace Transformers.


## Setup

Please run the cells below to install the necessary dependencies.


In [None]:
%%capture
# Clone the IndicTrans2 repository; %%capture hides the command output.
!git clone https://github.com/AI4Bharat/IndicTrans2.git

In [None]:

%%capture
# Switch into the HuggingFace interface folder of the cloned repository (absolute path under /content).
%cd /content/IndicTrans2/huggingface_interface

In [None]:
%%capture
# Python dependencies used by the notebook; only transformers is version-pinned.
!python3 -m pip install nltk sacremoses pandas regex mock transformers==4.53.2 mosestokenizer
# Download the NLTK 'punkt' data.
!python3 -c "import nltk; nltk.download('punkt')"
# Quantization (bitsandbytes), accelerate and the datasets library.
!python3 -m pip install bitsandbytes scipy accelerate datasets
!python3 -m pip install sentencepiece

# Install IndicTransToolkit from source in editable mode, then return to the parent directory.
!git clone https://github.com/VarunGumma/IndicTransToolkit.git
%cd IndicTransToolkit
!python3 -m pip install --editable ./
%cd ..

**IMPORTANT : Restart your run-time first and then run the cells below.**

## Inference


In [None]:
# Hugging Face Authentication
# IMPORTANT: Never commit your Hugging Face token to the repository!
# Get your token from: https://huggingface.co/settings/tokens
# For Google Colab, you can also use Colab secrets to store the token securely

# Run this cell BEFORE you import or call transformers.from_pretrained
!pip install -q huggingface_hub

from getpass import getpass
from huggingface_hub import login, whoami
import os

# Prompt user for token (secure method - token won't be displayed)
hf_token = getpass("Paste your Hugging Face token (hidden): ")
# set env vars so transformers/huggingface_hub will find it
os.environ["HF_TOKEN"] = hf_token
os.environ["HUGGINGFACE_HUB_TOKEN"] = hf_token
# record login (optional but helpful)
login(token=hf_token)

# sanity check
print("Logged in as:", whoami(token=hf_token)["name"])


Paste your Hugging Face token (hidden): ··········


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Logged in as: singhsa6


In [None]:
import torch
import os
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig, AutoTokenizer
from IndicTransToolkit.processor import IndicProcessor

# Inference settings read by the translation helpers defined below.
BATCH_SIZE = 4  # sentences processed per generate() call
DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"  # use the GPU when one is available
quantization = None  # None, "4-bit" or "8-bit"

In [None]:
##---------------------------------------------------------------------------------------------------------
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
def prepare_data(input_csv='eng-dogri.csv', output_dir="data"):
    """
    Prepare parallel text files from a CSV, or validate already-prepared files.

    When ``input_csv`` exists it is loaded, cleaned and split 80/10/10 into
    train/dev/test (fixed ``random_state=42``). Each split is written to
    ``{output_dir}/{split}.en`` and ``{output_dir}/{split}.doi`` with one
    sentence per line. When ``input_csv`` is empty or missing, nothing is
    written and the files already present in ``output_dir`` are used.

    In both cases the train and dev files must exist afterwards; the test
    files are optional.

    Args:
        input_csv: Path to a CSV with ``eng_Latn`` and ``doi_Deva`` columns.
        output_dir: Directory holding (or receiving) the split files.

    Returns:
        ``output_dir``.

    Raises:
        ValueError: The CSV lacks the ``eng_Latn`` or ``doi_Deva`` column.
        FileNotFoundError: A train or dev file is missing from ``output_dir``.
    """
    if input_csv and os.path.exists(input_csv):
        # Load dataset from CSV
        df = pd.read_csv(input_csv)

        # Ensure correct column names
        if "eng_Latn" not in df.columns or "doi_Deva" not in df.columns:
            raise ValueError("Dataset must contain 'eng_Latn' and 'doi_Deva' columns.")

        # Clean data. Work on a copy so the column assignments below never
        # write into a view of the frame returned by read_csv.
        text_columns = ["eng_Latn", "doi_Deva"]
        df = df.dropna(subset=text_columns).copy()
        for column in text_columns:
            # The output format is one sentence per line, so a line break
            # inside a cell would shift every following source/target pair.
            # astype(str) keeps purely numeric cells from breaking `.str`.
            df[column] = (
                df[column]
                .astype(str)
                .str.replace(r"\s*[\r\n]+\s*", " ", regex=True)
                .str.strip()
            )
        # An empty sentence would produce a blank (or missing trailing) line
        # and desynchronise the two files, so drop such pairs.
        df = df[(df["eng_Latn"] != "") & (df["doi_Deva"] != "")]

        # Split dataset
        train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
        dev_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

        # Create output directory
        os.makedirs(output_dir, exist_ok=True)

        # Save data to files
        split_frames = {"train": train_df, "dev": dev_df, "test": test_df}
        for split, frame in split_frames.items():
            for extension, column in (("en", "eng_Latn"), ("doi", "doi_Deva")):
                with open(f"{output_dir}/{split}.{extension}", "w", encoding="utf-8") as f:
                    f.write("\n".join(frame[column].tolist()))

        print(f" Dataset processed and saved in '{output_dir}/' directory")
        print(f"   Train: {len(train_df)} samples")
        print(f"   Dev: {len(dev_df)} samples")
        print(f"   Test: {len(test_df)} samples")

    # Match the file extensions actually used. This check runs for both the
    # freshly written files and the "use existing files" path, so a missing
    # dataset is reported here instead of surfacing later during loading.
    for split in ["train", "dev", "test"]:
        src_file = f"{output_dir}/{split}.en"
        tgt_file = f"{output_dir}/{split}.doi"

        if not os.path.exists(src_file) or not os.path.exists(tgt_file):
            if split == "test":
                continue
            raise FileNotFoundError(f"Required data files not found: {src_file} or {tgt_file}")

    return output_dir

def load_dataset_from_files(data_dir, split, src_lang=None, tgt_lang=None):
    """
    Load one split of a line-aligned parallel corpus into a ``Dataset``.

    Reads ``{data_dir}/{split}.{src_lang}`` and ``{data_dir}/{split}.{tgt_lang}``
    and pairs the stripped lines by position.

    Args:
        data_dir: Directory containing the split files.
        split: Split name used as the file stem (e.g. ``"train"``).
        src_lang: Source file suffix. Defaults to the module-level ``SRC_LANG``
            when it is defined, otherwise ``"en"`` (the suffix written by
            ``prepare_data``).
        tgt_lang: Target file suffix. Defaults to the module-level ``TGT_LANG``
            when it is defined, otherwise ``"doi"``.

    Returns:
        A ``Dataset`` with a single ``translation`` column whose items are
        ``{"en": <source line>, "doi": <target line>}`` dicts.

    Raises:
        FileNotFoundError: Either file is missing.
        ValueError: The two files have a different number of lines.
    """
    # SRC_LANG / TGT_LANG are optional globals; resolving them lazily avoids a
    # NameError when the notebook never defines them.
    if src_lang is None:
        src_lang = globals().get("SRC_LANG", "en")
    if tgt_lang is None:
        tgt_lang = globals().get("TGT_LANG", "doi")

    src_path = os.path.join(data_dir, f"{split}.{src_lang}")
    tgt_path = os.path.join(data_dir, f"{split}.{tgt_lang}")

    if not os.path.exists(src_path) or not os.path.exists(tgt_path):
        raise FileNotFoundError(f"Data files not found: {src_path} or {tgt_path}")

    with open(src_path, encoding="utf-8") as src_file, open(tgt_path, encoding="utf-8") as tgt_file:
        src_lines = [line.strip() for line in src_file]
        tgt_lines = [line.strip() for line in tgt_file]

    if len(src_lines) != len(tgt_lines):
        raise ValueError(f"Mismatch in source and target file lengths for {split}")

    print(f"Loaded {len(src_lines)} examples for {split}")

    return Dataset.from_dict({
        "translation": [
            {"en": src, "doi": tgt}
            for src, tgt in zip(src_lines, tgt_lines)
        ]
    })

In [None]:
def initialize_model_and_tokenizer(ckpt_dir, quantization):
    """
    Load the tokenizer and seq2seq model for a checkpoint.

    Args:
        ckpt_dir: Checkpoint name or path passed to ``from_pretrained``.
        quantization: ``"4-bit"`` or ``"8-bit"`` to load with a
            ``BitsAndBytesConfig``; any other value loads without quantization.

    Returns:
        ``(tokenizer, model)`` with the model in eval mode. Without
        quantization the model is moved to ``DEVICE`` and, on CUDA, converted
        to half precision.
    """
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        # BitsAndBytesConfig has no bnb_8bit_* options (double quantization and
        # compute dtype are 4-bit settings), so only the 8-bit flag is passed.
        qconfig = BitsAndBytesConfig(load_in_8bit=True)
    else:
        qconfig = None

    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    # Only unquantized models are moved/cast here; the quantized path leaves
    # placement to from_pretrained.
    if qconfig is None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()

    model.eval()

    return tokenizer, model


def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
    """
    Translate ``input_sentences`` in chunks of ``BATCH_SIZE``.

    Each chunk is preprocessed with ``ip``, tokenized, sent to ``DEVICE``,
    decoded with beam search (5 beams, at most 256 tokens) and postprocessed
    for ``tgt_lang``.

    Returns:
        The postprocessed translations of all chunks, concatenated in order.
    """
    results = []

    for start in range(0, len(input_sentences), BATCH_SIZE):
        # Preprocess the current slice for the given language pair.
        prepared = ip.preprocess_batch(
            input_sentences[start : start + BATCH_SIZE],
            src_lang=src_lang,
            tgt_lang=tgt_lang,
        )

        # Encode, padding to the longest sentence of the slice.
        inputs = tokenizer(
            prepared,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Gradients are not needed for generation.
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        # Turn token ids back into text, then undo the preprocessing.
        decoded = tokenizer.batch_decode(
            output_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        results.extend(ip.postprocess_batch(decoded, lang=tgt_lang))

        # Release the encoded inputs before the next slice.
        del inputs
        torch.cuda.empty_cache()

    return results

### English to Indic Example


In [None]:
# Cell 1 — load_en_indic_model(): run once. Provides unload_en_indic_model() to free memory.


DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"
hf_token = os.getenv("HF_TOKEN")  # must be set by your login cell
quantization = None  # set "8-bit" or "4-bit" if you prefer quantization

def load_en_indic_model(ckpt_dir="ai4bharat/indictrans2-en-indic-1B", quantization=None, token=None):
    """
    Load tokenizer, model and IndicProcessor into the global namespace as
    ``en_indic_tokenizer``, ``en_indic_model`` and ``ip``.

    Safe to call repeatedly: if a model and tokenizer are already present
    nothing is reloaded (only a missing ``ip`` is recreated).

    Args:
        ckpt_dir: Checkpoint name or path passed to ``from_pretrained``.
        quantization: ``"4-bit"`` or ``"8-bit"`` to load with a
            ``BitsAndBytesConfig``; any other value loads without quantization.
        token: Hugging Face access token forwarded to ``from_pretrained``.
    """
    global en_indic_tokenizer, en_indic_model, ip

    if 'en_indic_model' in globals() and 'en_indic_tokenizer' in globals():
        # The processor holds no weights, so recreate it if only it is missing
        # instead of reporting a fully loaded state that would fail later.
        if 'ip' not in globals():
            ip = IndicProcessor(inference=True)
        print("Model already loaded. To reload, call unload_en_indic_model() first.")
        return

    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        # BitsAndBytesConfig has no bnb_8bit_* options (double quantization and
        # compute dtype are 4-bit settings), so only the 8-bit flag is passed.
        qconfig = BitsAndBytesConfig(load_in_8bit=True)
    else:
        qconfig = None

    # `token` replaces the deprecated `use_auth_token` argument.
    print("Loading tokenizer (this may download code files)...")
    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True, token=token)

    print("Loading model (this may take a while)...")
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
        token=token,
    )

    # Only unquantized models are moved/cast here.
    if qconfig is None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()

    model.eval()
    processor = IndicProcessor(inference=True)

    # Publish the globals only after every step succeeded, so a failed load
    # never leaves a half-initialised state behind.
    en_indic_tokenizer, en_indic_model, ip = tokenizer, model, processor

    print("Model loaded -> device:", next(en_indic_model.parameters()).device)

def unload_en_indic_model():
    """Remove the global model, tokenizer and processor, then release cached memory."""
    import gc

    namespace = globals()
    present = [
        name
        for name in ("en_indic_model", "en_indic_tokenizer", "ip")
        if name in namespace
    ]
    for name in present:
        del namespace[name]

    gc.collect()
    try:
        torch.cuda.empty_cache()
    except Exception:
        pass

    # Report whether anything was actually removed.
    message = (
        "Model, tokenizer and processor unloaded; cache cleared."
        if present
        else "Nothing to unload."
    )
    print(message)


In [None]:
# Load the English -> Indic checkpoint without quantization, using the token read from HF_TOKEN.
load_en_indic_model(token=hf_token, quantization=None)

Loading tokenizer (this may download code files)...




tokenizer_config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

tokenization_indictrans.py:   0%|          | 0.00/8.04k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-en-indic-1B:
- tokenization_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


dict.SRC.json:   0%|          | 0.00/645k [00:00<?, ?B/s]

dict.TGT.json:   0%|          | 0.00/3.39M [00:00<?, ?B/s]

model.SRC:   0%|          | 0.00/759k [00:00<?, ?B/s]

model.TGT:   0%|          | 0.00/3.26M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

Loading model (this may take a while)...




config.json:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

configuration_indictrans.py:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-en-indic-1B:
- configuration_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_indictrans.py:   0%|          | 0.00/79.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-en-indic-1B:
- modeling_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/4.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

Model loaded -> device: cuda:0


In [None]:
import torch

BATCH_SIZE = 4  # sentences processed per generate() call
DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"  # use the GPU when one is available

def translate_sentences(input_sentences, src_lang="eng_Latn", tgt_lang="doi_Deva"):
    """
    Translate a list of sentences with the globally loaded ``en_indic_model``,
    ``en_indic_tokenizer`` and ``ip``, working in chunks of ``BATCH_SIZE``.

    Raises:
        RuntimeError: Any of the three globals is missing.
    """
    namespace = globals()
    required = ("en_indic_model", "en_indic_tokenizer", "ip")
    if not all(name in namespace for name in required):
        raise RuntimeError("Model not loaded. Run the Load model cell and call load_en_indic_model() first.")

    generation_options = dict(
        use_cache=True,
        min_length=0,
        max_length=256,
        num_beams=5,
        num_return_sequences=1,
    )

    results = []
    for offset in range(0, len(input_sentences), BATCH_SIZE):
        prepared = ip.preprocess_batch(
            input_sentences[offset : offset + BATCH_SIZE],
            src_lang=src_lang,
            tgt_lang=tgt_lang,
        )

        inputs = en_indic_tokenizer(
            prepared,
            truncation=True,
            padding="longest",
            return_tensors="pt",
        ).to(DEVICE)
        with torch.no_grad():
            output_ids = en_indic_model.generate(**inputs, **generation_options)

        decoded = en_indic_tokenizer.batch_decode(
            output_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        results.extend(ip.postprocess_batch(decoded, lang=tgt_lang))

        del inputs
        # free small amounts of intermediate GPU mem
        try:
            torch.cuda.empty_cache()
        except Exception:
            pass

    return results

# Example usage — run this cell each time you want translations (model stays loaded)
en_sents = [
    "When tejas was young, tejas used to go to the park every day.",
    "He has many old books, which he inherited from his ancestors.",
    "I can't figure out how to solve my problem.",
    "She is very hardworking and intelligent, which is why she got all the good marks.",
    "We watched a new movie last week, which was very inspiring.",
    "If you had met me at that time, we would have gone out to eat.",
    "She went to the market with her sister to buy a new sari.",
    "Raj told me that he is going to his grandmother's house next month.",
    "All the kids were having fun at the party and were eating lots of sweets.",
    "My friend has invited me to his birthday party, and I will give him a gift.",
]

# Run translation (ensure load_en_indic_model() has been executed earlier in the session)
translations = translate_sentences(en_sents, src_lang="eng_Latn", tgt_lang="doi_Deva")

# Show each source sentence next to its translation, separated by a rule.
print("\neng_Latn - doi_Deva")
for source_text, translated_text in zip(en_sents, translations):
    print("eng_Latn:", source_text)
    print("doi_Deva:", translated_text)
    print("-" * 90)



eng_Latn - doi_Deva
eng_Latn: When tejas was young, tejas used to go to the park every day.
doi_Deva: जिसलै तेजा लौह्के हे, ते तेजा हर रोज बगीचे च जंदे हे।
------------------------------------------------------------------------------------------
eng_Latn: He has many old books, which he inherited from his ancestors.
doi_Deva: उंʼदे कोल केईं पुरानी किताबां न, जेह्ड़ियां उʼनें गी अपने पूर्वजें कोला विरासत च मिलियां न।
------------------------------------------------------------------------------------------
eng_Latn: I can't figure out how to solve my problem.
doi_Deva: में समझी नेईं सकनां जे अपनी समस्या दा समाधान किʼयां करां।
------------------------------------------------------------------------------------------
eng_Latn: She is very hardworking and intelligent, which is why she got all the good marks.
doi_Deva: ओह् बड़ी मेहनती ते बुद्घिमान ऐ, इस करियै उʼनें सारे अच्छे अंक हासल कीते।
------------------------------------------------------------------------------------------
eng_Latn

### Gradio Demo: English to Indic Translation

In [None]:
%%capture
# Install Gradio for the demo UI built in the next cell; %%capture hides the installer output.
!pip install gradio
import gradio as gr

In [None]:
# Gradio UI cell — run this after you've set HF token and loaded (or allow it to load) the model.

def ensure_model_loaded():
    """
    Ensure ``en_indic_model``, ``en_indic_tokenizer`` and ``ip`` exist in globals.

    If any of them is missing, ``load_en_indic_model()`` (defined in an earlier
    cell) is called with the token from the ``HF_TOKEN`` environment variable.
    Afterwards the model is moved to ``DEVICE`` when it lives on a different
    device; a failed move is reported but does not abort.

    Raises:
        RuntimeError: ``load_en_indic_model`` has not been defined yet.
    """
    namespace = globals()
    required = ("en_indic_model", "en_indic_tokenizer", "ip")

    if any(name not in namespace for name in required):
        # Look the loader up explicitly: wrapping the call in `except NameError`
        # would also hide NameErrors raised inside the loader itself and
        # misreport them as "loader not found".
        loader = namespace.get("load_en_indic_model")
        if loader is None:
            raise RuntimeError("load_en_indic_model() not found — please run the model load cell first.")
        loader(quantization=None, token=os.environ.get("HF_TOKEN"))

        # The loader returns early when model and tokenizer already exist, so
        # a missing processor has to be recreated here.
        if "ip" not in namespace:
            namespace["ip"] = IndicProcessor(inference=True)

    # sanity: move model to device if needed
    try:
        current = next(en_indic_model.parameters()).device
        target = torch.device(DEVICE)
        # Compare device types, and indices only when DEVICE names one:
        # "cuda:0" and "cuda" are unequal as torch.device objects although the
        # model is already where it should be.
        needs_move = current.type != target.type or (
            target.index is not None and current.index != target.index
        )
        if needs_move:
            en_indic_model.to(DEVICE)
    except Exception as exc:
        # Best effort: keep going, but make the problem visible.
        print(f"Warning: could not verify/move model device: {type(exc).__name__}: {exc}")

def infer_lines(lines, src_lang, tgt_lang):
    """
    Translate ``lines`` with the globally loaded model, tokenizer and processor.

    ``ensure_model_loaded()`` is called first; the lines are then handled in
    chunks of ``BATCH_SIZE``.

    Returns:
        The postprocessed translations of all chunks, concatenated in order.
    """
    ensure_model_loaded()

    generation_options = dict(
        use_cache=True,
        min_length=0,
        max_length=256,
        num_beams=5,
        num_return_sequences=1,
    )

    collected = []
    for begin in range(0, len(lines), BATCH_SIZE):
        chunk = lines[begin: begin + BATCH_SIZE]

        # preprocess, then encode and send to the target device
        proc_batch = ip.preprocess_batch(chunk, src_lang=src_lang, tgt_lang=tgt_lang)
        inputs = en_indic_tokenizer(
            proc_batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
        ).to(DEVICE)

        with torch.no_grad():
            gen = en_indic_model.generate(**inputs, **generation_options)

        decoded = en_indic_tokenizer.batch_decode(
            gen,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        collected.extend(ip.postprocess_batch(decoded, lang=tgt_lang))

        # free some intermediate mem
        try:
            del inputs, gen
            torch.cuda.empty_cache()
        except Exception:
            pass

    return collected

def gradio_translate(text_block, src_lang, tgt_lang):
    """
    Gradio callback: translate a multiline text block, one sentence per line.

    Blank lines are dropped and the remaining lines are stripped before being
    passed to ``infer_lines``. Returns the translations joined by newlines, a
    notice when there is nothing to translate, or an error description if
    translation raises.
    """
    sentences = []
    for raw_line in text_block.splitlines():
        cleaned = raw_line.strip()
        if cleaned:
            sentences.append(cleaned)

    if len(sentences) == 0:
        return "No input provided."

    try:
        return "\n".join(infer_lines(sentences, src_lang, tgt_lang))
    except Exception as err:
        # show useful error to the user (no token printed)
        return f"Error during translation: {type(err).__name__}: {err}"

# Build Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## IndicTrans2 — English → Indic translation")
    with gr.Row():
        inp = gr.Textbox(label="Input sentences (one per line)", lines=6, value=""" Madeira, Manchester, Madrid, Turin and Manchester again.

Wreathed in red.

Restored to this great gallery of the game.

A walking work of art.

Vintage.

Beyond valuation, beyond forgery or imitation.

18 years since that trembling teenager of touch and tease, first tiptoed on this storied stage;

Now in his immaculate maturity;

CR7 - Reunited """)
        with gr.Column():
            # Choices are (label, value) pairs: the label is what the user sees,
            # the value is the IndicTrans2 language code handed to
            # gradio_translate. Plain display strings as choices would pass the
            # label itself (not a language code) to the translator as soon as
            # the user picks an option.
            src = gr.Dropdown(
                label="Source",
                choices=[("English", "eng_Latn")],
                value="eng_Latn",
            )
            tgt = gr.Dropdown(
                label="Target",
                choices=[
                    ("Dogri", "doi_Deva"),
                    ("Kashmiri (Arabic)", "kas_Arab"),
                    ("Kashmiri (Devanagari)", "kas_Deva"),
                ],
                value="doi_Deva",
            )
            btn = gr.Button("Translate")
    out = gr.Textbox(label="Translations", lines=6)
    btn.click(fn=gradio_translate, inputs=[inp, src, tgt], outputs=out)

# Launch the app. In Colab, set share=True if you want a public URL.
# NOTE: share=True exposes the demo to anyone who has the link.
demo.launch(share=True)




Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9e61b374ed9ee39e5b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


