In [1]:
!pip install -U transformers accelerate bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.2


In [2]:
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
import json
from huggingface_hub import login

In [3]:
try:
    login(new_session=False)
    print("✅ Successfully logged in to Hugging Face!")
except Exception as e:
    print(f"⚠️ Warning: Could not login to Hugging Face: {e}")
    print("You may need to authenticate manually or some models might not be accessible.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

✅ Successfully logged in to Hugging Face!


In [4]:
# 1. Load Gemma
model_id = "google/gemma-2-2b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map={"": 0}) # Prevents RAM spikes during load

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

In [5]:
def generate_gemma_quest(question):

    prompt = (
        "You are a great language model.\n"
        "You will answer the following questions in the manner shown\n"
        "Question: What is the capital of India?\nAnswer: New Delhi\n"
        "Now answer the following\n"
        f"Question: {question}\nAnswer:"
    )
    messages = [
            {"role": "user", "content": f"Context: ...\nQuestion: {prompt}\nAnswer in English."}
        ]
        # Convert to string using the official template
    full_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    inputs = tokenizer(full_prompt, return_tensors="pt").to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100, # Keep short for Factoid QA
            min_length=1,
            do_sample=False,   # Greedy decoding = Reproducible
            pad_token_id=tokenizer.eos_token_id
        )

    # Cut off prompt, keep only new text
    input_len = inputs.input_ids.shape[1]
    generated_ids = outputs[:, input_len:]
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return generated_text.strip()


In [6]:
# Sanity check. See if this works before running other cells
generate_gemma_quest("What is the financial capital of India?")

'Answer: Mumbai'

In [7]:
from tqdm import tqdm
import math

In [8]:
languages = ["English", "Hindi", "Kannada", "Marathi", "Tamil"]

In [9]:
TEMPLATES = {
    "Hindi": "आप एक बेहतरीन भाषा मॉडल हैं।\nप्रश्न: भारत की राजधानी क्या है?\nउत्तर: नई दिल्ली\nप्रश्न: {q}\nउत्तर:",
    "Tamil": "நீங்க ஒரு சிறந்த மொழி மாதிரி.\nகேள்வி: இந்தியாவின் தலைநகரம் என்ன?\nபதில்: புது தில்லி\nகேள்வி: {q}\nபதில்:",
    "Kannada": "ನೀವು ಉತ್ತಮ ಭಾಷಾ ಮಾದರಿ.\nಪ್ರಶ್ನೆ: ಭಾರತದ ರಾಜಧಾನಿ ಯಾವುದು?\nಉತ್ತರ: ನವದೆಹಲಿ\nಪ್ರಶ್ನೆ: {q}\nಉತ್ತರ:",
    "Marathi": "तुम्ही एक उत्तम भाषेचे मॉडेल आहात.\nप्रश्न: भारताची राजधानी कोणती आहे?\nउत्तर: नवी दिल्ली\nप्रश्न: {q}\nउत्तर:",
    "English": "You are a great language model.\nQuestion: What is the capital of India?\nAnswer: New Delhi\nQuestion: {q}\nAnswer:"
    }

In [10]:
def gemma_e2e(language):
  print(f"Processing for {language}")
  # --- 1. CONFIGURATION ---
  BATCH_SIZE = 32  # Sarvam-1 (2B) is tiny. T4 can handle 32 or 64 easily.


  # --- 2. CRITICAL: SET PADDING SIDE ---
  # For generation, padding must be on the LEFT so it doesn't mess up the output
  tokenizer.padding_side = "left"
  # Set pad token if missing (common in base models)
  if tokenizer.pad_token is None:
      tokenizer.pad_token = tokenizer.eos_token

  # --- 3. BATCH GENERATION FUNCTION ---
  def generate_batch(questions):
      # A. Apply Prompt Template to the whole batch
      prompts = []
      for q in questions:
          prompt = TEMPLATES[language].format(q=q) + f"\nAnswer in {language}"
          messages = [
            {"role": "user", "content": prompt}
          ]
          # Convert to string using the official template
          full_prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
          )
          prompts.append(full_prompt)

      # B. Tokenize the whole batch (Pad to longest in batch)
      inputs = tokenizer(
          prompts,
          return_tensors="pt",
          padding=True,
          truncation=True,
          max_length=256
      ).to("cuda")

      # C. Generate for the whole batch at once
      with torch.no_grad():
          outputs = model.generate(
              **inputs,
              max_new_tokens=100,
              min_length=1,
              do_sample=False,
              pad_token_id=tokenizer.eos_token_id
          )

      input_len = inputs.input_ids.shape[1]
      generated_ids = outputs[:, input_len:]

      decoded_batch = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

      return [ans.strip() for ans in decoded_batch]

  # --- 4. THE LOOP (Real-time Progress) ---


  results = []

  data_indic_quest = pd.read_csv(f"/content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/{language}.csv")
  print(f"🚀 Processing {len(data_indic_quest)} rows in batches of {BATCH_SIZE}...")
  questions = data_indic_quest['Question'].tolist()
  total_batches = math.ceil(len(questions) / BATCH_SIZE)

  # TQDM gives you the progress bar
  for i in tqdm(range(0, len(questions), BATCH_SIZE), total=total_batches, desc="Generating"):
      # Get batch of questions
      batch_q = questions[i : i + BATCH_SIZE]

      # Run Inference
      batch_answers = generate_batch(batch_q)
      results.extend(batch_answers)

  # --- 5. FINAL SAVE ---
  data_indic_quest['model_response'] = results
  final_path = f"/content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/gemma/gemma_results_{language}.csv"
  data_indic_quest.to_csv(final_path, index=False)
  print(f"✅ Done! Saved to {final_path}")

In [11]:
for l in languages:
  gemma_e2e(l)

Processing for English
🚀 Processing 200 rows in batches of 32...


Generating: 100%|██████████| 7/7 [02:39<00:00, 22.80s/it]


✅ Done! Saved to /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/gemma/gemma_results_English.csv
Processing for Hindi
🚀 Processing 200 rows in batches of 32...


Generating: 100%|██████████| 7/7 [02:54<00:00, 24.97s/it]


✅ Done! Saved to /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/gemma/gemma_results_Hindi.csv
Processing for Kannada
🚀 Processing 200 rows in batches of 32...


Generating: 100%|██████████| 7/7 [03:27<00:00, 29.65s/it]


✅ Done! Saved to /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/gemma/gemma_results_Kannada.csv
Processing for Marathi
🚀 Processing 200 rows in batches of 32...


Generating: 100%|██████████| 7/7 [03:02<00:00, 26.03s/it]


✅ Done! Saved to /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/gemma/gemma_results_Marathi.csv
Processing for Tamil
🚀 Processing 200 rows in batches of 32...


Generating: 100%|██████████| 7/7 [03:13<00:00, 27.70s/it]

✅ Done! Saved to /content/drive/MyDrive/685-Adv NLP/Project/Dataset/IndicQuest/Results/Vanilla/gemma/gemma_results_Tamil.csv



