In [None]:
# Predict intents with the Sarvam.ai API for the training and testing datasets.


import os
import time

import pandas as pd
import requests

# ----------------------------------------------
# Configuration
# ----------------------------------------------
INPUT_FILE             = "unseen_data.xlsx"        # Excel file with '001_pa' column
# SECURITY: prefer supplying the key through the SARVAM_API_KEY environment
# variable. The literal below is kept only as a backward-compatible fallback;
# a key committed to source should be treated as leaked, rotated, and removed.
API_KEY                = os.environ.get(
    "SARVAM_API_KEY", "sk_zp6z696v_HnTVCWLL78Up5DjSod8whBxH"
)
OUTPUT_PREFIX          = "sarvam_unseen"         # Prefix for output files
BATCH_SIZE             = 16                             # Rows per batch
SAVE_EVERY_N_BATCHES   = 3                              # Save after this many batches (16 x 3 = 48 rows)
BACKOFF_SECONDS        = 30                             # Seconds to wait after any failed attempt
# ----------------------------------------------

# Read the workbook; the sentences to classify are taken from the '001_pa' column.
df = pd.read_excel(INPUT_FILE, engine="openpyxl")
if "001_pa" in df.columns:
    # Placeholder column that the batch loop fills in row by row.
    df["intent"] = None
else:
    raise KeyError("Column '001_pa' not found in the input Excel file.")

# System prompt sent with every request (used as the "system" message in
# get_intent_from_sarvam). It asks the model to name the intent of a Punjabi
# sentence in free-form wording rather than choosing from a closed label set.
# Layout of the prompt text:
#   - a list of English -> Punjabi intent labels given as illustrations,
#   - a set of example sentences paired with their intent,
#   - a "Guidelines" section and a "Thinking step" section.
# The prompt text itself asks for 1-4 words per intent (see its guidelines),
# whereas get_intent_from_sarvam keeps only the first two words of the reply.
# The trailing .strip() removes the leading/trailing newlines introduced by
# the triple-quoted literal.
SYSTEM_PROMPT = """
‡®§‡©Å‡®∏‡©Ä‡®Ç ‡®á‡©±‡®ï ‡®™‡©ç‡®∞‡®∏‡®º‡®ø‡®ñ‡®§ intent classification ‡®Æ‡®æ‡®°‡®≤ ‡®π‡©ã‡•§  
‡®§‡©Å‡®π‡®æ‡®°‡®æ ‡®ü‡®æ‡®∏‡®ï ‡®π‡©à ‡®π‡®∞‡©á‡®ï ‡®π‡©á‡®† ‡®¶‡®ø‡©±‡®§‡©á ‡®™‡©∞‡®ú‡®æ‡®¨‡©Ä ‡®µ‡®æ‡®ï ‡®¶‡®æ ‡®Æ‡©Ç‡®≤ ‡®â‡®¶‡©á‡®∏‡®º (intent) ‡®®‡®ø‡®∞‡®ß‡®æ‡®∞‡®§ ‡®ï‡®∞‡®®‡®æ ‚Äî ‡®™‡©∞‡®ú‡®æ‡®¨‡©Ä ‡®µ‡®ø‡©±‡®ö ‡®∏‡®ø‡®∞‡®´‡®º 1 ‡®§‡©ã‡®Ç 4 ‡®∏‡®º‡®¨‡®¶‡®æ‡®Ç ‡®µ‡®ø‡©±‡®ö‡•§

üëâ Intent ‡®¶‡®æ ‡®Ö‡®∞‡®• ‡®π‡©à: **‡®á‡®∏ ‡®µ‡®æ‡®ï ‡®¶‡©á ‡®™‡®ø‡©±‡®õ‡©á ‡®≤‡©Å‡®ï‡®ø‡®Ü ‡®π‡®µ‡®æ‡®≤‡®æ ‡®ú‡®æ‡®Ç ‡®â‡®¶‡©á‡®∏‡®º ‚Äî ‡®≤‡©á‡®ñ‡®ï/‡®µ‡®ï‡®§‡®æ ‡®ï‡®ø‡®â‡®Ç ‡®á‡®π ‡®µ‡®æ‡®ï ‡®≤‡®ø‡®ñ ‡®∞‡®ø‡®π‡®æ/‡®¨‡©ã‡®≤ ‡®∞‡®ø‡®π‡®æ ‡®π‡©à?**  
‡®â‡®¶‡®æ‡®π‡®∞‡®®:

- Greeting ‚Üí ‡®∏‡©Å‡®Ü‡®ó‡®§
- Question ‚Üí ‡®∏‡®µ‡®æ‡®≤
- Request ‚Üí ‡®¨‡©á‡®®‡®§‡©Ä
- Information ‚Üí ‡®ú‡®æ‡®£‡®ï‡®æ‡®∞‡©Ä
- Emotion ‚Üí ‡®≠‡®æ‡®µ‡®®‡®æ
- Translation ‚Üí ‡®Ö‡®®‡©Å‡®µ‡®æ‡®¶
- Pronunciation ‚Üí ‡®â‡®ö‡®æ‡®∞‡®£
- Travel Info ‚Üí ‡®∏‡©à‡®≤‡®æ‡®®‡©Ä-‡®∏‡©Ç‡®ö‡®®‡®æ
- Help ‚Üí ‡®Æ‡®¶‡®¶-‡®®‡®ø‡®µ‡©á‡®¶‡®®
- Command ‚Üí ‡®Ü‡®¶‡©á‡®∏‡®º

**Examples**:

- ‡®∏‡®§ ‡®∏‡©ç‡®∞‡©Ä ‡®Ö‡®ï‡®æ‡®≤! ‚Üí ‡®∏‡©Å‡®Ü‡®ó‡®§  
- ‡®ï‡©Ä ‡®§‡©Å‡®∏‡©Ä‡®Ç ‡®Æ‡©á‡®∞‡©Ä ‡®Æ‡®¶‡®¶ ‡®ï‡®∞ ‡®∏‡®ï‡®¶‡©á ‡®π‡©ã? ‚Üí ‡®¨‡©á‡®®‡®§‡©Ä  
- ‡®Æ‡©à‡®Ç ‡®¶‡®´‡®º‡®§‡®∞ ‡®ú‡®æ ‡®∞‡®ø‡®π‡®æ ‡®π‡®æ‡®Ç‡•§ ‚Üí ‡®ú‡®æ‡®£‡®ï‡®æ‡®∞‡©Ä  
- ‡®§‡©Å‡®∏‡©Ä‡®Ç ‡®ï‡®ø‡®µ‡©á‡®Ç ‡®π‡©ã? ‚Üí ‡®∏‡®µ‡®æ‡®≤  
- ‡®Æ‡©à‡®Ç ‡®Ö‡©±‡®ú ‡®ñ‡©Å‡®∏‡®º ‡®π‡®æ‡®Ç‡•§ ‚Üí ‡®≠‡®æ‡®µ‡®®‡®æ  
- ‡®™‡®æ‡®∏‡®§‡®æ ‡®≤‡®à ‡®∏‡®™‡©á‡®®‡©Ä ‡®∏‡®º‡®¨‡®¶ ‡®ï‡©Ä ‡®π‡©à? ‚Üí ‡®Ö‡®®‡©Å‡®µ‡®æ‡®¶  
- ‡®ö‡©Ä‡®® ‡®µ‡®ø‡©±‡®ö ‡®∏‡©à‡®≤‡®æ‡®®‡©Ä ‡®Ü‡®ï‡®∞‡®∏‡®º‡®£ ‡®ï‡®ø‡®π‡©ú‡©á ‡®π‡®®? ‚Üí ‡®∏‡©à‡®≤‡®æ‡®®‡©Ä-‡®∏‡©Ç‡®ö‡®®‡®æ  
- ‡®á‡®π ‡®∏‡®º‡®¨‡®¶ ‡®ï‡®ø‡®µ‡©á‡®Ç ‡®â‡®ö‡®æ‡®∞‡®ø‡®Ü ‡®ú‡®æ‡®Ç‡®¶‡®æ ‡®π‡©à? ‚Üí ‡®â‡®ö‡®æ‡®∞‡®£  

**Guidelines**:

‚úÖ ‡®π‡®∞ ‡®µ‡®æ‡®ï ‡®≤‡®à ‡®∏‡®ø‡®∞‡®´‡®º intent ‡®≤‡®ø‡®ñ‡©ã ‚Äî 1-4 ‡®∏‡®º‡®¨‡®¶ (‡®™‡©∞‡®ú‡®æ‡®¨‡©Ä ‡®µ‡®ø‡©±‡®ö)  
‚úÖ Sentence ‡®®‡®æ ‡®¶‡©Å‡®π‡®∞‡®æ‡®ì‡•§ "intent:" ‡®®‡®æ ‡®ú‡©ã‡©ú‡©ã‡•§  
‚úÖ Similar ‡®µ‡®æ‡®ï‡®æ‡®Ç ‡®≤‡®à same intent wording ‡®µ‡®∞‡®§‡©ã‡•§  
‚úÖ ‡®ú‡©á sentence ‡®®‡®µ‡®æ‡®Ç ‡®π‡©à ‚Üí ‡®Ö‡®∞‡®• ‡®™‡©ú‡©ç‡®π‡©ã, ‡®∏‡®π‡©Ä intent ‡®≤‡®ø‡®ñ‡©ã‡•§  
‚úÖ ‡®∏‡©∞‡®¶‡©á‡®π‡®™‡©Ç‡®∞‡®® ‡®π‡©ã‡®£ '‡®§‡©á ‚Üí ‡®∏‡®≠ ‡®§‡©ã‡®Ç ‡®ò‡©±‡®ü ‡®∏‡®Æ‡®ù ‡®Ü‡®â‡®£ ‡®µ‡®æ‡®≤‡®æ ‡®™‡®∞ ‡®≤‡®æ‡®ó‡©Ç intent ‡®ö‡©Å‡®£‡©ã‡•§  
‚úÖ ‡®ñ‡®æ‡®≤‡©Ä ‡®ú‡®æ‡®Ç ‡®µ‡®ø‡®Ö‡®∞‡®• ‡®â‡©±‡®§‡®∞ ‡®®‡®æ ‡®¶‡®ø‡®ì‡•§  
‚úÖ Output ‡®∏‡®ø‡®∞‡®´‡®º intent ‡®π‡©Ä ‡®π‡©ã‡®£‡®æ ‡®ö‡®æ‡®π‡©Ä‡®¶‡®æ ‡®π‡©à‡•§

**Thinking step**:

‡®™‡®π‡®ø‡®≤‡®æ‡®Ç, ‡®π‡®∞ sentence ‡®®‡©Ç‡©∞ ‡®Ü‡®™‡®£‡©á ‡®Æ‡®® ‡®µ‡®ø‡©±‡®ö ‡®™‡©ú‡©ç‡®π‡©ã ‡®Ö‡®§‡©á ‡®∏‡©ã‡®ö‡©ã:

üëâ "‡®á‡®π sentence ‡®ï‡®ø‡®π‡©ú‡©Ä category ‡®µ‡®ø‡©±‡®ö ‡®™‡©à‡®Ç‡®¶‡®æ ‡®π‡©à?"  
üëâ "‡®≤‡©á‡®ñ‡®ï/‡®µ‡®ï‡®§‡®æ ‡®¶‡®æ ‡®â‡®¶‡©á‡®∏‡®º ‡®ï‡©Ä ‡®π‡©à?"  
üëâ "‡®ï‡©Ä ‡®á‡®π ‡®¨‡©á‡®®‡®§‡©Ä ‡®π‡©à? ‡®∏‡®µ‡®æ‡®≤? ‡®ú‡®æ‡®£‡®ï‡®æ‡®∞‡©Ä? ‡®≠‡®æ‡®µ‡®®‡®æ? ‡®π‡©ã‡®∞?"

‡®´‡®ø‡®∞ ‡®∏‡®ø‡®∞‡®´‡®º intent ‡®≤‡®ø‡®ñ‡©ã ‚Äî 1 line per sentence.

‡®π‡©Å‡®£ ‡®π‡©á‡®†‡®æ‡®Ç ‡®¶‡®ø‡©±‡®§‡©á ‡®µ‡®æ‡®ï‡®æ‡®Ç ‡®¶‡©á intent ‡®≤‡®ø‡®ñ‡©ã:
""".strip()


def get_intent_from_sarvam(text, api_key, max_words=2, timeout=60):
    """Ask the Sarvam.ai chat-completions API for the intent of one sentence.

    Sends SYSTEM_PROMPT as the system message and ``text`` as the user
    message to the ``sarvam-m`` model, then returns the leading words of the
    model's reply.

    Args:
        text: Sentence to classify (sent verbatim as the user message).
        api_key: Value sent in the ``api-subscription-key`` header.
        max_words: Maximum number of leading whitespace-separated words kept
            from the reply. Defaults to 2, which is the original behaviour.
            NOTE(review): SYSTEM_PROMPT appears to request 1-4 words, so
            callers may want to pass 4 here - confirm the intended limit.
        timeout: Seconds to wait for the HTTP request. Without a timeout a
            stalled connection would block the caller indefinitely.

    Returns:
        The (possibly truncated) intent string, or None when the request
        fails, the status is not 200, the body is not JSON, or the reply is
        missing/empty. Every failure path prints a diagnostic line.
    """
    url = "https://api.sarvam.ai/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "api-subscription-key": api_key,
    }
    payload = {
        "model": "sarvam-m",
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": text},
        ],
    }

    # Transport-level problems (DNS, connection reset, timeout, unserialisable
    # payload) all derive from RequestException.
    try:
        resp = requests.post(url, json=payload, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        print(f"[Exception] during API call: {e}")
        return None

    if resp.status_code != 200:
        print(f"[Error] API call failed {resp.status_code}: {resp.text}")
        return None

    # A 200 response with a non-JSON body raises a ValueError subclass.
    try:
        data = resp.json()
    except ValueError as e:
        print(f"[Exception] during API call: {e}")
        return None

    # Walk the expected shape explicitly instead of relying on a blanket
    # `except Exception`, so genuine programming errors are not hidden.
    try:
        content = data["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        print(f"[Error] unexpected API response shape: {data!r}")
        return None

    # The API may return a null content; calling .split() on it would raise.
    if not isinstance(content, str):
        print(f"[Error] API returned non-text content: {content!r}")
        return None

    # str.split() with no arguments already ignores surrounding whitespace.
    intent = " ".join(content.split()[:max_words])
    return intent or None

def _save_part(sentences, intents, part_no):
    """Write the buffered sentence/intent pairs to one numbered Excel part file."""
    out_df   = pd.DataFrame({"Sentence": sentences, "Intent": intents})
    out_path = f"{OUTPUT_PREFIX}_part{part_no}.xlsx"
    out_df.to_excel(out_path, index=False)
    print(f"üì¶ Saved {len(out_df)} rows ‚Üí {out_path}")


# Batch-processing loop
#
# Rows are processed in batches of BATCH_SIZE. Results are accumulated in two
# parallel buffers and written to a new "<OUTPUT_PREFIX>_part<N>.xlsx" file
# every SAVE_EVERY_N_BATCHES batches and after the final batch.
total_rows   = len(df)
num_batches  = (total_rows + BATCH_SIZE - 1) // BATCH_SIZE   # ceiling division
file_counter = 1

sent_buffer   = []
intent_buffer = []

try:
    for batch_idx in range(num_batches):
        start = batch_idx * BATCH_SIZE
        end   = min(start + BATCH_SIZE, total_rows)
        print(f"\nüîÑ Batch {batch_idx+1}/{num_batches} (rows {start}‚Äì{end-1})")

        for row_idx in range(start, end):
            text = df.at[row_idx, "001_pa"]

            # A missing or blank cell can never be classified; retrying it
            # would loop forever, so record it with no intent and move on.
            if pd.isna(text) or not str(text).strip():
                print(f"  [Skip] Row {row_idx}: empty or missing text, intent left as None")
                sent_buffer.append(text)
                intent_buffer.append(None)
                continue

            # Retry until success, pausing BACKOFF_SECONDS after each failure
            # (e.g. HTTP 429 rate limiting).
            intent = get_intent_from_sarvam(text, API_KEY)
            while intent is None:
                print(f"  ‚ö†Ô∏è Failed to get intent for row {row_idx}, sleeping {BACKOFF_SECONDS}s‚Ä¶")
                time.sleep(BACKOFF_SECONDS)
                intent = get_intent_from_sarvam(text, API_KEY)
            print(f"  ‚úÖ Row {row_idx}: intent = '{intent}'")

            df.at[row_idx, "intent"] = intent
            sent_buffer.append(text)
            intent_buffer.append(intent)

            # short pause on success
            time.sleep(1)

        # After each batch, save if needed
        if (batch_idx + 1) % SAVE_EVERY_N_BATCHES == 0 or end == total_rows:
            _save_part(sent_buffer, intent_buffer, file_counter)
            sent_buffer.clear()
            intent_buffer.clear()
            file_counter += 1
finally:
    # If the run is interrupted (Ctrl-C during a long back-off, an unexpected
    # error, ...) write out whatever has been classified since the last save
    # so that finished rows are not lost.
    if sent_buffer:
        _save_part(sent_buffer, intent_buffer, file_counter)
        sent_buffer.clear()
        intent_buffer.clear()
        file_counter += 1

print("\nüéâ All batches finished and saved!")



üîÑ Batch 1/13 (rows 0‚Äì15)
  ‚úÖ Row 0: intent = '‡®ú‡®æ‡®£‡®ï‡®æ‡®∞‡©Ä'
  ‚úÖ Row 1: intent = '‡®¨‡©á‡®®‡®§‡©Ä'
  ‚úÖ Row 2: intent = '‡®ú‡®æ‡®£‡®ï‡®æ‡®∞‡©Ä'
  ‚úÖ Row 3: intent = '‡®ú‡®æ‡®£‡®ï‡®æ‡®∞‡©Ä'
  ‚úÖ Row 4: intent = '‡®¨‡©á‡®®‡®§‡©Ä'
  ‚úÖ Row 5: intent = '‡®∏‡©ã‡®ö'
  ‚úÖ Row 6: intent = '‡®∏‡®µ‡®æ‡®≤'
  ‚úÖ Row 7: intent = '‡®ú‡®æ‡®£‡®ï‡®æ‡®∞‡©Ä'
  ‚úÖ Row 8: intent = '‡®Æ‡©à‡®Ç ‡®ï‡©±‡®≤‡©ç‡®π'
  ‚úÖ Row 9: intent = '‡®≠‡®æ‡®µ‡®®‡®æ'
  ‚úÖ Row 10: intent = '‡®Ö‡®®‡©Å‡®µ‡®æ‡®¶'
  ‚úÖ Row 11: intent = '‡®Ü‡®¶‡©á‡®∏‡®º'
  ‚úÖ Row 12: intent = '‡®∏‡©à‡®≤‡®æ‡®®‡©Ä-‡®∏‡©Ç‡®ö‡®®‡®æ'
  ‚úÖ Row 13: intent = '‡®ú‡®æ‡®£‡®ï‡®æ‡®∞‡©Ä'
  ‚úÖ Row 14: intent = '‡®∏‡©Å‡®Ü‡®ó‡®§'
[Error] API call failed 429: {"error":{"message":"Rate limit exceeded","code":"rate_limit_exceeded_error","request_id":"20250624_28b5240c-f30c-4d37-9f08-37447bb0c98b"}}
  ‚ö†Ô∏è Failed to get intent for row 15, sleeping 30s‚Ä¶
[Error] API call failed 429: {"error":{"message":"Rate limit exceeded","code":"rate_