In [None]:
#Code for training data (1384 entries)


import pandas as pd
import google.generativeai as genai
import time
import math

# Configure Gemini.
# The API key is taken from the GEMINI_API_KEY environment variable so the
# secret never has to be typed into (and saved with) the notebook. When the
# variable is not set the key is the empty string, as it was before.
import os

GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
genai.configure(api_key=GEMINI_API_KEY)
# Shared model handle used by get_intents() for every request.
model = genai.GenerativeModel('gemini-1.5-flash')  # or gemini-2.0 if needed

# Batching / checkpoint state used by the processing loop.
batch_size = 16                          # sentences placed in one prompt
file_idx = 1                             # suffix of the next gemini_intents_<n>.xlsx
file_sentences, file_intents = [], []    # rows gathered since the last save

# Load the workbook and keep every non-empty cell of column '001_pa'.
print("Reading Excel file...")
df = pd.read_excel('filtered_punjabi.xlsx')
sentences = (
    df['001_pa']
    .dropna()
    .tolist()
)
print(f"Total sentences to process: {len(sentences)}")

def build_prompt(batch):
    """Compose the instruction prompt for one batch of sentences.

    Args:
        batch: Iterable of sentences; each one is interpolated into the
            prompt with ordinary string formatting.

    Returns:
        The fixed English instruction, a blank line, and then the sentences
        as a 1-based numbered list with one sentence per line.
    """
    instruction = (
        "Extract the intent of each of the following Punjabi sentences in Punjabi. "
        "Respond with only one or two words for each sentence, in the same order, one per line:"
    )
    listing = []
    for position, sentence in enumerate(batch, start=1):
        listing.append(f"{position}. {sentence}")
    return instruction + "\n\n" + "\n".join(listing)

def get_intents(prompt, batch_num):
    """Send one batch prompt to Gemini and return the reply as a list of lines.

    Args:
        prompt: Complete prompt text (see ``build_prompt``).
        batch_num: Batch number shown in the error message; not otherwise used.

    Returns:
        A list holding each non-blank line of the reply with surrounding
        whitespace removed, or ``None`` if the request or reading the reply
        text raised an exception.
    """
    try:
        response = model.generate_content(prompt)
        # The caller validates a reply by comparing its line count with the
        # number of sentences. Blank separator lines or whitespace-only lines
        # in the reply would make an otherwise complete answer fail that
        # check, so they are dropped here and each kept line is trimmed.
        return [line.strip() for line in response.text.splitlines() if line.strip()]
    except Exception as e:
        # Best effort: report the failure and let the caller decide to retry.
        print(f"❌ Batch {batch_num}: Error during API call – {e}")
        return None

# Process in batches
num_batches = math.ceil(len(sentences) / batch_size)


def save_intents_file(chunk_sentences, chunk_intents, index, noun="entries"):
    """Write one chunk of results to ``gemini_intents_<index>.xlsx``.

    Rows are de-duplicated on the sentence text (first occurrence wins)
    before the workbook is written.

    Args:
        chunk_sentences: Sentences collected since the previous save.
        chunk_intents: Intents aligned one-to-one with ``chunk_sentences``.
        index: Number used in the output file name.
        noun: Wording used in the progress message after the row count.
    """
    df_out = pd.DataFrame(
        {'Sentence': chunk_sentences, 'Intent': chunk_intents}
    ).drop_duplicates(subset='Sentence')
    out_name = f"gemini_intents_{index}.xlsx"
    df_out.to_excel(out_name, index=False)
    print(f"📁 Saved {out_name} with {len(df_out)} {noun}.")


for batch_num in range(num_batches):
    batch = sentences[batch_num * batch_size : (batch_num + 1) * batch_size]
    print(f"\n🔄 Processing batch {batch_num+1}/{num_batches} ({len(batch)} sentences)")

    prompt = build_prompt(batch)
    results = get_intents(prompt, batch_num + 1)

    # Retry once if the call failed or the reply does not line up with the batch
    if results is None or len(results) != len(batch):
        print("⏳ Waiting 30 seconds before retrying batch...")
        time.sleep(30)
        results = get_intents(prompt, batch_num + 1)

    # Still unusable: keep the sentences but leave their intents blank so the
    # two columns stay aligned.
    if results is None or len(results) != len(batch):
        print(f"⚠️ Batch {batch_num+1}: Failed after retry. Filling with empty intents.")
        results = [""] * len(batch)

    file_sentences.extend(batch)
    file_intents.extend(results)
    print(f"✅ Batch {batch_num+1} processed.")

    # Save every 3 batches (48 sentences)
    if (batch_num + 1) % 3 == 0:
        save_intents_file(file_sentences, file_intents, file_idx)

        # Reset for next file
        file_idx += 1
        file_sentences.clear()
        file_intents.clear()

    # Respect rate limits between requests; nothing follows the final batch,
    # so there is no point in waiting after it.
    if batch_num + 1 < num_batches:
        time.sleep(30)

# Save remaining sentences, if any
if file_sentences:
    save_intents_file(file_sentences, file_intents, file_idx, noun="remaining entries")


Reading Excel file...
Total sentences to process: 1383

🔄 Processing batch 1/87 (16 sentences)
✅ Batch 1 processed.

🔄 Processing batch 2/87 (16 sentences)
✅ Batch 2 processed.

🔄 Processing batch 3/87 (16 sentences)
✅ Batch 3 processed.
📁 Saved gemini_intents_1.xlsx with 48 entries.

🔄 Processing batch 4/87 (16 sentences)
✅ Batch 4 processed.

🔄 Processing batch 5/87 (16 sentences)
✅ Batch 5 processed.

🔄 Processing batch 6/87 (16 sentences)
✅ Batch 6 processed.
📁 Saved gemini_intents_2.xlsx with 45 entries.

🔄 Processing batch 7/87 (16 sentences)
✅ Batch 7 processed.

🔄 Processing batch 8/87 (16 sentences)
✅ Batch 8 processed.

🔄 Processing batch 9/87 (16 sentences)
✅ Batch 9 processed.
📁 Saved gemini_intents_3.xlsx with 47 entries.

🔄 Processing batch 10/87 (16 sentences)
✅ Batch 10 processed.

🔄 Processing batch 11/87 (16 sentences)
✅ Batch 11 processed.

🔄 Processing batch 12/87 (16 sentences)
✅ Batch 12 processed.
📁 Saved gemini_intents_4.xlsx with 46 entries.

🔄 Processing batch

In [None]:
#Code for testing on unseen dataset


import pandas as pd
import time
import google.generativeai as genai

# ──────────────────────────────────────────────
# CONFIGURATION
# ──────────────────────────────────────────────
import os  # imported here so this cell stays runnable on its own

INPUT_FILE             = "unseen_data.xlsx"     # workbook containing the '001_pa' column
# The key comes from the GEMINI_API_KEY environment variable so the secret is
# never stored in the notebook; it is the empty string when the variable is unset.
API_KEY                = os.environ.get("GEMINI_API_KEY", "")
OUTPUT_PREFIX          = "gemini_unseen"        # part files are <prefix>_part<n>.xlsx
BATCH_SIZE             = 16                     # rows handled per batch
SAVE_EVERY_N_BATCHES   = 3                      # a part file is written after this many batches
RETRY_ATTEMPTS         = 5                      # default number of tries per sentence
RETRY_WAIT_SECONDS     = 10                     # default pause between tries

# ──────────────────────────────────────────────
# SYSTEM PROMPT
# ──────────────────────────────────────────────
# Instruction text (Punjabi with some English terms) that is placed in front of
# every sentence sent to the model. In summary it:
#   * tells the model to act as an intent classifier and to state the intent of
#     the sentence in Punjabi using 1 to 4 words;
#   * defines "intent" as the purpose behind the sentence;
#   * lists example sentence → intent pairs (greeting, request, information,
#     question, emotion, translation, tourist information, pronunciation);
#   * asks for the intent only: do not repeat the sentence, do not add an
#     "intent:" prefix, and reuse the same wording for similar sentences.
# The trailing .strip() removes the newline right after the opening quotes and
# the one before the closing quotes.
SYSTEM_PROMPT = """
ਤੁਸੀਂ ਇੱਕ ਪ੍ਰਸ਼ਿਖਤ intent classification ਮਾਡਲ ਹੋ।  
ਤੁਹਾਡਾ ਟਾਸਕ ਹੈ ਹਰੇਕ ਹੇਠ ਦਿੱਤੇ ਪੰਜਾਬੀ ਵਾਕ ਦਾ ਮੂਲ ਉਦੇਸ਼ (intent) ਨਿਰਧਾਰਤ ਕਰਨਾ — ਪੰਜਾਬੀ ਵਿੱਚ ਸਿਰਫ਼ 1 ਤੋਂ 4 ਸ਼ਬਦਾਂ ਵਿੱਚ।

👉 Intent ਦਾ ਅਰਥ ਹੈ: **ਇਸ ਵਾਕ ਦੇ ਪਿੱਛੇ ਲੁਕਿਆ ਹਵਾਲਾ ਜਾਂ ਉਦੇਸ਼ — ਲੇਖਕ/ਵਕਤਾ ਕਿਉਂ ਇਹ ਵਾਕ ਲਿਖ ਰਿਹਾ/ਬੋਲ ਰਿਹਾ ਹੈ?**  

ਉਦਾਹਰਨ:
- ਸਤ ਸ੍ਰੀ ਅਕਾਲ! → ਸੁਆਗਤ  
- ਕੀ ਤੁਸੀਂ ਮੇਰੀ ਮਦਦ ਕਰ ਸਕਦੇ ਹੋ? → ਬੇਨਤੀ  
- ਮੈਂ ਦਫ਼ਤਰ ਜਾ ਰਿਹਾ ਹਾਂ। → ਜਾਣਕਾਰੀ  
- ਤੁਸੀਂ ਕਿਵੇਂ ਹੋ? → ਸਵਾਲ  
- ਮੈਂ ਅੱਜ ਖੁਸ਼ ਹਾਂ। → ਭਾਵਨਾ  
- ਪਾਸਤਾ ਲਈ ਸਪੇਨੀ ਸ਼ਬਦ ਕੀ ਹੈ? → ਅਨੁਵਾਦ  
- ਚੀਨ ਵਿੱਚ ਸੈਲਾਨੀ ਆਕਰਸ਼ਣ ਕਿਹੜੇ ਹਨ? → ਸੈਲਾਨੀ-ਸੂਚਨਾ  
- ਇਹ ਸ਼ਬਦ ਕਿਵੇਂ ਉਚਾਰਿਆ ਜਾਂਦਾ ਹੈ? → ਉਚਾਰਣ  

✅ ਹਰ ਵਾਕ ਲਈ ਸਿਰਫ਼ intent ਲਿਖੋ — 1-4 ਸ਼ਬਦ (ਪੰਜਾਬੀ ਵਿੱਚ)  
✅ Sentence ਨਾ ਦੁਹਰਾਓ। "intent:" ਨਾ ਜੋੜੋ।  
✅ Similar ਵਾਕਾਂ ਲਈ same intent wording ਵਰਤੋ।  
✅ Output ਸਿਰਫ਼ intent ਹੀ ਹੋਣਾ ਚਾਹੀਦਾ ਹੈ।
""".strip()

# ──────────────────────────────────────────────
# SETUP GEMINI
# ──────────────────────────────────────────────
# Register the API key with the SDK, then build the shared model handle that
# get_intent_with_gemini() calls for every sentence.
genai.configure(api_key=API_KEY)
model = genai.GenerativeModel(model_name="gemini-2.0-flash")

# ──────────────────────────────────────────────
# FUNCTION TO GET INTENT (Single Try)
# ──────────────────────────────────────────────
def get_intent_with_gemini(sentence):
    """Make a single Gemini request for the intent of one sentence.

    The request text is ``SYSTEM_PROMPT`` followed by the sentence wrapped in
    the Punjabi "sentence:" / "intent:" labels.

    Args:
        sentence: Text to classify.

    Returns:
        The first four whitespace-separated words of the reply joined by
        single spaces, or ``None`` when the reply is empty or any exception
        is raised while calling the model.
    """
    try:
        request_text = f"{SYSTEM_PROMPT}\n\nਵਾਕ:\n{sentence}\nਉਦੇਸ਼:"
        reply = model.generate_content(request_text).text.strip()
        if not reply:
            return None
        words = reply.split()
        return " ".join(words[:4])
    except Exception as err:
        print(f"❌ Gemini Error: {err}")
        return None

# ──────────────────────────────────────────────
# RETRY WRAPPER FUNCTION
# ──────────────────────────────────────────────
def get_intent_with_retries(sentence, max_retries=RETRY_ATTEMPTS, wait_seconds=RETRY_WAIT_SECONDS):
    """Request the intent of one sentence, retrying when a request fails.

    Args:
        sentence: Text to classify.
        max_retries: Maximum number of requests to make.
        wait_seconds: Pause between one failed request and the next.

    Returns:
        The first non-``None`` result of ``get_intent_with_gemini``, or the
        string ``"ERROR"`` when every attempt failed.
    """
    for attempt in range(1, max_retries + 1):
        intent = get_intent_with_gemini(sentence)
        if intent is not None:
            return intent
        if attempt < max_retries:
            print(f"  ⚠️ Attempt {attempt} failed. Retrying in {wait_seconds}s...")
            time.sleep(wait_seconds)
        else:
            # No further attempt follows, so do not announce a retry or wait.
            print(f"  ⚠️ Attempt {attempt} failed.")
    print("  ❌ Max retries reached. Returning 'ERROR'")
    return "ERROR"

# ──────────────────────────────────────────────
# LOAD EXCEL DATA
# ──────────────────────────────────────────────
df = pd.read_excel(INPUT_FILE, engine='openpyxl')
if "001_pa" not in df.columns:
    raise KeyError("Column '001_pa' not found in the input file.")
# Empty cells are read as NaN. The processing loop converts every cell with
# str(), which would turn them into the literal text "nan" and send that to the
# model as if it were a sentence. Drop those rows and renumber the index
# 0..n-1, because the loop addresses rows by position through df.at.
df = df.dropna(subset=["001_pa"]).reset_index(drop=True)
# Result column, filled in row by row by the processing loop.
df["intent"] = None

# ──────────────────────────────────────────────
# PROCESSING LOOP
# ──────────────────────────────────────────────
# One request is made per row. The free-tier quota the API reports for this
# model is 15 requests per minute, so requests are spaced 60 / 15 = 4 seconds
# apart. The previous fixed 1-second pause ran into HTTP 429 quota errors.
REQUESTS_PER_MINUTE = 15
REQUEST_INTERVAL_SECONDS = 60 / REQUESTS_PER_MINUTE

total_rows = len(df)
num_batches = (total_rows + BATCH_SIZE - 1) // BATCH_SIZE  # ceiling division
file_counter = 1
sent_buffer, intent_buffer = [], []  # rows gathered since the last part file

for batch_idx in range(num_batches):
    start, end = batch_idx * BATCH_SIZE, min((batch_idx + 1) * BATCH_SIZE, total_rows)
    print(f"\n🔄 Batch {batch_idx + 1}/{num_batches} (rows {start}–{end - 1})")

    for row_idx in range(start, end):
        text = str(df.at[row_idx, "001_pa"]).strip()
        intent = get_intent_with_retries(text)
        print(f"  ✅ Row {row_idx}: '{intent}'")

        df.at[row_idx, "intent"] = intent
        sent_buffer.append(text)
        intent_buffer.append(intent)
        time.sleep(REQUEST_INTERVAL_SECONDS)  # stay within the per-minute quota

    # Save part files every few batches, and once more after the final batch
    if (batch_idx + 1) % SAVE_EVERY_N_BATCHES == 0 or end == total_rows:
        part_df = pd.DataFrame({"Sentence": sent_buffer, "Intent": intent_buffer})
        part_file = f"{OUTPUT_PREFIX}_part{file_counter}.xlsx"
        part_df.to_excel(part_file, index=False)
        print(f"📦 Saved → {part_file} ({len(part_df)} rows)")
        # Start a fresh buffer for the next part file.
        sent_buffer.clear()
        intent_buffer.clear()
        file_counter += 1

print("\n🎉 All batches complete and saved.")



🔄 Batch 1/13 (rows 0–15)
  ✅ Row 0: 'ਜਾਣਕਾਰੀ'
  ✅ Row 1: 'ਗੀਤ ਚਲਾਉਣਾ'
  ✅ Row 2: 'ਮਿਤੀ ਦੀ ਜਾਣਕਾਰੀ'
  ✅ Row 3: 'ਜਾਣਕਾਰੀ ਦੀ ਮੰਗ'
  ✅ Row 4: 'ਮਦਦ ਦੀ ਬੇਨਤੀ'
  ✅ Row 5: 'ਦੁਬਿਧਾ/ਅਨਿਸ਼ਚਿਤਤਾ'
  ✅ Row 6: 'ਦੁਹਰਾਉਣ ਲਈ ਬੇਨਤੀ'
  ✅ Row 7: 'ਸਮਾਂ ਪੁੱਛਣਾ'
  ✅ Row 8: 'ਇੱਛਾ'
  ✅ Row 9: 'ਸ਼ੱਕ/ਅਵਿਸ਼ਵਾਸ'
  ✅ Row 10: 'ਬੇਨਤੀ'
  ✅ Row 11: 'ਜੂਆ/ਸੰਭਾਵਨਾ'
  ✅ Row 12: 'ਜਾਣਕਾਰੀ ਦੀ ਮੰਗ'
  ✅ Row 13: 'ਕੈਲੋਰੀ ਜਾਣਕਾਰੀ'
  ✅ Row 14: 'ਮੁਬਾਰਕ/ਖੁਸ਼ੀ'
  ✅ Row 15: 'ਜਾਣਕਾਰੀ ਦੀ ਬੇਨਤੀ'

🔄 Batch 2/13 (rows 16–31)
❌ Gemini Error: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links