In [1]:
import json
import pandas as pd
import os

# Mount Google Drive at /content/drive; the dataset and output paths used by
# the cells below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
!pip install transformers accelerate sentencepiece



In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch, json, csv
from tqdm import tqdm

# Checkpoint shared by every language section of this notebook.
model_id = "sarvamai/sarvam-1"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Batched tokenization with padding=True needs a pad token; reuse EOS for it
# (presumably the checkpoint does not define one - TODO confirm).
tokenizer.pad_token = tokenizer.eos_token
# The generation cells feed padded batches to a causal LM. Pad on the left so
# every prompt ends at the last input position, and truncate on the left so an
# over-long prompt keeps its trailing "Question: ... Answer:" cue. Both are set
# explicitly instead of relying on whatever the checkpoint's config defaults to.
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left"

# Device the tokenized batches are moved to before generation.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the weights in half precision and let accelerate place them.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=torch.float16,
    device_map="auto"
)
# Inference only: switch off dropout etc. (last expression, so the cell also
# displays the module tree).
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/717 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.77G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/279M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/193 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(68096, 2048)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=11008, bias=False)
          (up_proj): Linear(in_features=2048, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-06)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-06)
   

**ENGLISH**

In [3]:
import json
import csv
import torch
from tqdm import tqdm

# Input dataset (nested SQuAD-style JSON) and the CSV the generation cell writes.
json_path = "/content/drive/MyDrive/Project/Dataset/IndicSquad/cleaned_data/english_cleaned.json"
output_csv = "/content/drive/MyDrive/Project/Dataset/IndicSquad/OUTPUTS/sarvam_english_batched-1f.csv"

# Device the tokenized batches are moved to before generation.
device = "cuda" if torch.cuda.is_available() else "cpu"

# ------------------------
# 1. Load dataset
# ------------------------
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Flatten data -> paragraphs -> qas into one record per question, carrying the
# paragraph's context along with each of its questions.
all_qas = []
for article in data["data"]:
    for para in article["paragraphs"]:
        context = para["context"]
        for qa in para["qas"]:
            # A question with a missing or empty "answers" list gets "" as its
            # reference instead of raising IndexError; this is the same
            # fallback the Indic loaders in this notebook use.
            answers = qa.get("answers") or []
            all_qas.append({
                "id": qa["id"],
                "question": qa["question"],
                "context": context,
                "expected": answers[0]["text"] if answers else ""
            })

print(f"Total questions: {len(all_qas)}")




Total questions: 5928


In [4]:
# ------------------------
# 2. Batched Generation
# ------------------------
# Prompts per generate() call; lower this if the GPU runs out of memory.
BATCH_SIZE = 64

# Batched generation with a causal LM: pad on the left so every prompt ends at
# the last input position, and truncate on the left so an over-long prompt
# keeps its trailing "Question: ... Answer:" cue. Set here explicitly so this
# cell does not depend on the tokenizer's defaults.
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left"

with open(output_csv, "w", newline='', encoding="utf-8") as f_out:
    writer = csv.writer(f_out)
    writer.writerow(["id", "model_answer", "expected_answer"])

    # Walk the dataset in consecutive slices of BATCH_SIZE questions.
    for start in tqdm(range(0, len(all_qas), BATCH_SIZE), desc="Generating English QA (batched)"):

        batch = all_qas[start : start + BATCH_SIZE]

        prompts = [
            f"Context: {entry['context']}\nQuestion: {entry['question']}\nAnswer:"
            for entry in batch
        ]

        # Tokenize batch
        inputs = tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=1024
        ).to(device)

        # Greedy decoding, no gradients needed.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,
                do_sample=False,
                pad_token_id=tokenizer.pad_token_id
            )

        # generate() returns each padded prompt followed by its continuation,
        # so the new tokens start at the input width for every row. Slicing by
        # token position (instead of by len(prompt) on the decoded string)
        # stays correct when the prompt was truncated or does not round-trip
        # through the tokenizer character-for-character.
        prompt_width = inputs["input_ids"].shape[1]
        continuations = tokenizer.batch_decode(
            outputs[:, prompt_width:],
            skip_special_tokens=True
        )

        for entry, continuation in zip(batch, continuations):
            # Keep only the first line of the continuation as the answer.
            generated = continuation.strip().split("\n")[0].strip()

            writer.writerow([entry["id"], generated, entry["expected"]])

print("Done! CSV saved at:", output_csv)


Generating English QA (batched):   0%|          | 0/93 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating English QA (batched): 100%|██████████| 93/93 [09:43<00:00,  6.27s/it]

Done! CSV saved at: /content/drive/MyDrive/Project/Dataset/IndicSquad/OUTPUTS/sarvam_english_batched-1f.csv





**HINDI**

In [7]:
import json
import csv
import torch
from tqdm import tqdm

# ---------------------------------------
# Language for this section; both paths below are derived from it.
# ---------------------------------------
lang = "hindi"
json_path = f"/content/drive/MyDrive/Project/Dataset/IndicSquad/cleaned_data/{lang}_cleaned.json"
output_csv = f"/content/drive/MyDrive/Project/Dataset/IndicSquad/OUTPUTS/sarvam_{lang}_batched.csv"

# Device the tokenized batches are moved to before generation.
device = "cuda" if torch.cuda.is_available() else "cpu"

# ---------------------------------------
# Read the flat-format file: data["data"] is a list with one record per question.
# ---------------------------------------
with open(json_path, "r", encoding="utf-8") as fh:
    data = json.load(fh)


def _first_answer_text(record):
    """Return the first string in record["answers"]["text"], or "" if absent/empty."""
    texts = record.get("answers", {}).get("text", [])
    return texts[0] if len(texts) > 0 else ""


# Normalise every record to the four fields the generation cell reads;
# missing keys become empty strings.
all_qas = [
    {
        "id": record.get("id", ""),
        "question": record.get("question", ""),
        "context": record.get("context", ""),
        "expected": _first_answer_text(record),
    }
    for record in data["data"]
]


print(f"Total questions: {len(all_qas)}")



Total questions: 11873


In [8]:
# ---------------------------------------
# Batched generation for the language selected in `lang`
# ---------------------------------------
# Prompts per generate() call; lower this if the GPU runs out of memory.
BATCH_SIZE = 64

# Batched generation with a causal LM: pad on the left so every prompt ends at
# the last input position, and truncate on the left so an over-long prompt
# keeps its trailing "Question: ... Answer:" cue. Set here explicitly so this
# cell does not depend on the tokenizer's defaults.
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left"

with open(output_csv, "w", newline='', encoding="utf-8") as f_out:
    writer = csv.writer(f_out)
    writer.writerow(["id", "model_answer", "expected_answer"])

    # Walk the dataset in consecutive slices of BATCH_SIZE questions.
    for start in tqdm(range(0, len(all_qas), BATCH_SIZE), desc=f"Generating {lang} QA (batched)"):

        batch = all_qas[start : start + BATCH_SIZE]

        prompts = [
            f"Context: {entry['context']}\nQuestion: {entry['question']}\nAnswer:"
            for entry in batch
        ]

        inputs = tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=1024
        ).to(device)

        # Greedy decoding, no gradients needed.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,
                do_sample=False,
                pad_token_id=tokenizer.pad_token_id
            )

        # generate() returns each padded prompt followed by its continuation,
        # so the new tokens start at the input width for every row. Slicing by
        # token position (instead of by len(prompt) on the decoded string)
        # stays correct when the prompt was truncated or does not round-trip
        # through the tokenizer character-for-character.
        prompt_width = inputs["input_ids"].shape[1]
        continuations = tokenizer.batch_decode(
            outputs[:, prompt_width:],
            skip_special_tokens=True
        )

        for entry, continuation in zip(batch, continuations):
            # Keep only the first line of the continuation as the answer.
            generated = continuation.strip().split("\n")[0].strip()

            writer.writerow([entry["id"], generated, entry["expected"]])

print("Done! CSV saved at:", output_csv)


Generating hindi QA (batched): 100%|██████████| 186/186 [19:30<00:00,  6.29s/it]

Done! CSV saved at: /content/drive/MyDrive/Project/Dataset/IndicSquad/OUTPUTS/sarvam_hindi_batched.csv





**TAMIL**

In [9]:
import json
import csv
import torch
from tqdm import tqdm

# ---------------------------------------
# Language for this section; both paths below are derived from it.
# ---------------------------------------
lang = "tamil"
json_path = f"/content/drive/MyDrive/Project/Dataset/IndicSquad/cleaned_data/{lang}_cleaned.json"
output_csv = f"/content/drive/MyDrive/Project/Dataset/IndicSquad/OUTPUTS/sarvam_{lang}_batched.csv"

# Device the tokenized batches are moved to before generation.
device = "cuda" if torch.cuda.is_available() else "cpu"

# ---------------------------------------
# Read the flat-format file: data["data"] is a list with one record per question.
# ---------------------------------------
with open(json_path, "r", encoding="utf-8") as src:
    data = json.load(src)

all_qas = []
for rec in data["data"]:
    # First reference answer if the record has one, otherwise an empty string.
    texts = rec.get("answers", {}).get("text", [])
    reference = texts[0] if len(texts) > 0 else ""

    # Missing keys fall back to empty strings.
    all_qas.append(
        dict(
            id=rec.get("id", ""),
            question=rec.get("question", ""),
            context=rec.get("context", ""),
            expected=reference,
        )
    )


print(f"Total questions: {len(all_qas)}")



Total questions: 11873


In [10]:
# ---------------------------------------
# Batched generation for the language selected in `lang`
# ---------------------------------------
# Prompts per generate() call; lower this if the GPU runs out of memory.
BATCH_SIZE = 64

# Batched generation with a causal LM: pad on the left so every prompt ends at
# the last input position, and truncate on the left so an over-long prompt
# keeps its trailing "Question: ... Answer:" cue. Set here explicitly so this
# cell does not depend on the tokenizer's defaults.
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left"

with open(output_csv, "w", newline='', encoding="utf-8") as f_out:
    writer = csv.writer(f_out)
    writer.writerow(["id", "model_answer", "expected_answer"])

    # Walk the dataset in consecutive slices of BATCH_SIZE questions.
    for start in tqdm(range(0, len(all_qas), BATCH_SIZE), desc=f"Generating {lang} QA (batched)"):

        batch = all_qas[start : start + BATCH_SIZE]

        prompts = [
            f"Context: {entry['context']}\nQuestion: {entry['question']}\nAnswer:"
            for entry in batch
        ]

        inputs = tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=1024
        ).to(device)

        # Greedy decoding, no gradients needed.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,
                do_sample=False,
                pad_token_id=tokenizer.pad_token_id
            )

        # generate() returns each padded prompt followed by its continuation,
        # so the new tokens start at the input width for every row. Slicing by
        # token position (instead of by len(prompt) on the decoded string)
        # stays correct when the prompt was truncated or does not round-trip
        # through the tokenizer character-for-character.
        prompt_width = inputs["input_ids"].shape[1]
        continuations = tokenizer.batch_decode(
            outputs[:, prompt_width:],
            skip_special_tokens=True
        )

        for entry, continuation in zip(batch, continuations):
            # Keep only the first line of the continuation as the answer.
            generated = continuation.strip().split("\n")[0].strip()

            writer.writerow([entry["id"], generated, entry["expected"]])

print("Done! CSV saved at:", output_csv)


Generating tamil QA (batched): 100%|██████████| 186/186 [19:52<00:00,  6.41s/it]

Done! CSV saved at: /content/drive/MyDrive/Project/Dataset/IndicSquad/OUTPUTS/sarvam_tamil_batched.csv





**KANNADA**

In [11]:
import json
import csv
import torch
from tqdm import tqdm

# Language for this section; both paths below are derived from it.
lang = "kannada"
json_path = f"/content/drive/MyDrive/Project/Dataset/IndicSquad/cleaned_data/{lang}_cleaned.json"
output_csv = f"/content/drive/MyDrive/Project/Dataset/IndicSquad/OUTPUTS/sarvam_{lang}_batched.csv"

# Device the tokenized batches are moved to before generation.
device = "cuda" if torch.cuda.is_available() else "cpu"

# ---------------------------------------
# Read the flat-format file: data["data"] is a list with one record per question.
# ---------------------------------------
with open(json_path, "r", encoding="utf-8") as json_file:
    data = json.load(json_file)

all_qas = []
for record in data["data"]:
    # Pick the first reference answer when one exists.
    answer_texts = record.get("answers", {}).get("text", [])
    if len(answer_texts) > 0:
        first_answer = answer_texts[0]
    else:
        first_answer = ""

    # Missing keys fall back to empty strings.
    qa_row = {
        "id": record.get("id", ""),
        "question": record.get("question", ""),
        "context": record.get("context", ""),
        "expected": first_answer,
    }
    all_qas.append(qa_row)


print(f"Total questions: {len(all_qas)}")



Total questions: 11873


In [12]:
# Prompts per generate() call; lower this if the GPU runs out of memory.
BATCH_SIZE = 64

# Batched generation with a causal LM: pad on the left so every prompt ends at
# the last input position, and truncate on the left so an over-long prompt
# keeps its trailing "Question: ... Answer:" cue. Set here explicitly so this
# cell does not depend on the tokenizer's defaults.
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left"

with open(output_csv, "w", newline='', encoding="utf-8") as f_out:
    writer = csv.writer(f_out)
    writer.writerow(["id", "model_answer", "expected_answer"])

    # Walk the dataset in consecutive slices of BATCH_SIZE questions.
    for start in tqdm(range(0, len(all_qas), BATCH_SIZE), desc=f"Generating {lang} QA (batched)"):

        batch = all_qas[start : start + BATCH_SIZE]

        prompts = [
            f"Context: {entry['context']}\nQuestion: {entry['question']}\nAnswer:"
            for entry in batch
        ]

        inputs = tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=1024
        ).to(device)

        # Greedy decoding, no gradients needed.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,
                do_sample=False,
                pad_token_id=tokenizer.pad_token_id
            )

        # generate() returns each padded prompt followed by its continuation,
        # so the new tokens start at the input width for every row. Slicing by
        # token position (instead of by len(prompt) on the decoded string)
        # stays correct when the prompt was truncated or does not round-trip
        # through the tokenizer character-for-character.
        prompt_width = inputs["input_ids"].shape[1]
        continuations = tokenizer.batch_decode(
            outputs[:, prompt_width:],
            skip_special_tokens=True
        )

        for entry, continuation in zip(batch, continuations):
            # Keep only the first line of the continuation as the answer.
            generated = continuation.strip().split("\n")[0].strip()

            writer.writerow([entry["id"], generated, entry["expected"]])

print("Done! CSV saved at:", output_csv)


Generating kannada QA (batched): 100%|██████████| 186/186 [19:44<00:00,  6.37s/it]

Done! CSV saved at: /content/drive/MyDrive/Project/Dataset/IndicSquad/OUTPUTS/sarvam_kannada_batched.csv





**MARATHI**

In [15]:
import json
import csv
import torch
from tqdm import tqdm

# Language for this section; both paths below are derived from it.
lang = "marathi"
json_path = f"/content/drive/MyDrive/Project/Dataset/IndicSquad/cleaned_data/{lang}_cleaned.json"
output_csv = f"/content/drive/MyDrive/Project/Dataset/IndicSquad/OUTPUTS/sarvam_{lang}_batched.csv"

# Device the tokenized batches are moved to before generation.
device = "cuda" if torch.cuda.is_available() else "cpu"

# ------------------------
# Read the Marathi file
# ------------------------
with open(json_path, "r", encoding="utf-8") as json_file:
    data = json.load(json_file)

# Unlike the other language files, the record list sits one level deeper,
# under data["data"]["data"].
items = data["data"]["data"]


def _reference_answer(record):
    """Return record["answers"]["text"][0] when that path exists and is non-empty, else ""."""
    has_texts = "answers" in record and "text" in record["answers"]
    if has_texts and len(record["answers"]["text"]) > 0:
        return record["answers"]["text"][0]
    return ""


# Normalise every record to the four fields the generation cell reads;
# missing keys become empty strings.
all_qas = [
    {
        "id": record.get("id", ""),
        "question": record.get("question", ""),
        "context": record.get("context", ""),
        "expected": _reference_answer(record),
    }
    for record in items
]

print("Total QAs:", len(all_qas))


Total QAs: 11873


In [16]:
# Prompts per generate() call; lower this if the GPU runs out of memory.
BATCH_SIZE = 64

# Batched generation with a causal LM: pad on the left so every prompt ends at
# the last input position, and truncate on the left so an over-long prompt
# keeps its trailing "Question: ... Answer:" cue. Set here explicitly so this
# cell does not depend on the tokenizer's defaults.
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left"

with open(output_csv, "w", newline='', encoding="utf-8") as f_out:
    writer = csv.writer(f_out)
    writer.writerow(["id", "model_answer", "expected_answer"])

    # Walk the dataset in consecutive slices of BATCH_SIZE questions.
    for start in tqdm(range(0, len(all_qas), BATCH_SIZE), desc=f"Generating {lang} QA (batched)"):

        batch = all_qas[start : start + BATCH_SIZE]

        prompts = [
            f"Context: {entry['context']}\nQuestion: {entry['question']}\nAnswer:"
            for entry in batch
        ]

        inputs = tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=1024
        ).to(device)

        # Greedy decoding, no gradients needed.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,
                do_sample=False,
                pad_token_id=tokenizer.pad_token_id
            )

        # generate() returns each padded prompt followed by its continuation,
        # so the new tokens start at the input width for every row. Slicing by
        # token position (instead of by len(prompt) on the decoded string)
        # stays correct when the prompt was truncated or does not round-trip
        # through the tokenizer character-for-character.
        prompt_width = inputs["input_ids"].shape[1]
        continuations = tokenizer.batch_decode(
            outputs[:, prompt_width:],
            skip_special_tokens=True
        )

        for entry, continuation in zip(batch, continuations):
            # Keep only the first line of the continuation as the answer.
            generated = continuation.strip().split("\n")[0].strip()

            writer.writerow([entry["id"], generated, entry["expected"]])

print("Done! CSV saved at:", output_csv)


Generating marathi QA (batched): 100%|██████████| 186/186 [19:24<00:00,  6.26s/it]

Done! CSV saved at: /content/drive/MyDrive/Project/Dataset/IndicSquad/OUTPUTS/sarvam_marathi_batched.csv



