In [None]:
#code for training data (GPT)--> 2384 entries


import pandas as pd
import openai
import time
import math
import glob

# Configure OpenRouter with the older openai library (module-level settings).
openai.api_key = ""  # fill in locally before running
openai.api_base = "https://openrouter.ai/api/v1"

# Read Excel file
print("Reading Excel file...")
df = pd.read_excel('filtered_punjabi.xlsx')
# Fail with an explicit message when the expected column is absent, instead of
# the bare KeyError that indexing a missing column would raise.
if '001_pa' not in df.columns:
    raise ValueError("Column '001_pa' not found.")
# Empty cells are dropped so they are never sent to the model.
sentences = df['001_pa'].dropna().tolist()
print(f"Total sentences to process: {len(sentences)}")

batch_size = 16      # number of sentences sent per API request
file_idx = 1         # numeric suffix of the next partial output file
file_sentences = []  # sentences accumulated since the last partial save
file_intents = []    # intents aligned index-for-index with file_sentences

# Build prompt
def build_prompt(batch):
    """Assemble the Punjabi instruction prompt followed by the numbered sentences.

    Args:
        batch: Iterable of sentences. Each one is rendered on its own line as
            "<n>. <sentence>", with numbering starting at 1.

    Returns:
        The fixed instruction/example text immediately followed by the
        numbered sentences joined with newlines.
    """
    instruction_parts = (
        "ਤੁਸੀਂ ਇੱਕ ਇੰਟੈਂਟ ਕਲਾਸੀਫਿਕੇਸ਼ਨ ਸਹਾਇਕ ਹੋ। ਤੁਹਾਡਾ ਕੰਮ ਹਰ ਪੰਜਾਬੀ ਵਾਕ ਦਾ ਮੁੱਖ *ਉਦੇਸ਼* (intent) ਕੱਢਣਾ ਹੈ।\n\n",
        "ਉਦੇਸ਼ ਦਾ ਅਰਥ ਹੈ ਕਿ ਵਾਕ ਦੇ ਪਿੱਛੇ ਮਕਸਦ — ਜਿਵੇਂ ਕਿ ਅਭਿਵਾਦਨ, ਪ੍ਰਸ਼ਨ, ਬੇਨਤੀ, ਹੁਕਮ, ਜਾਣਕਾਰੀ, ਭਾਵਨਾ ਦਾ ਪ੍ਰਗਟਾਵਾ ਆਦਿ।\n\n",
        "ਕੁਝ ਉਦਾਹਰਣ:\n",
        "- ਸਤ ਸ੍ਰੀ ਅਕਾਲ! → ਅਭਿਵਾਦਨ\n",
        "- ਕੀ ਤੁਸੀਂ ਮੇਰੀ ਮਦਦ ਕਰ ਸਕਦੇ ਹੋ? → ਬੇਨਤੀ\n",
        "- ਮੈਂ ਦਫ਼ਤਰ ਜਾ ਰਿਹਾ ਹਾਂ। → ਜਾਣਕਾਰੀ\n",
        "- ਤੁਸੀਂ ਕਿਵੇਂ ਹੋ? → ਪ੍ਰਸ਼ਨ\n",
        "- ਤੁਸੀਂ ਕਿੰਨੇ ਸਾਲਾਂ ਦੇ ਹੋ? → ਪ੍ਰਸ਼ਨ\n",
        "- ਮੈਂ ਅੱਜ ਖੁਸ਼ ਹਾਂ। → ਭਾਵਨਾ ਦਾ ਪ੍ਰਗਟਾਵਾ\n\n",
        "ਜਵਾਬ ਦੇਣ ਵੇਲੇ:\n",
        "✅ ਕੇਵਲ ਉਦੇਸ਼ ਲਿਖੋ, ਪੰਜਾਬੀ ਵਿੱਚ, 1-4 ਸ਼ਬਦਾਂ ਵਿੱਚ\n",
        "✅ ਇੱਕੋ ਤਰਜ਼ ਦੇ ਸ਼ਬਦ ਵਰਤੋ ਸਮਾਨ ਵਾਕਾਂ ਲਈ\n",
        "✅ ਵਾਕ ਨੂੰ ਦੁਹਰਾਓ ਨਾ\n",
        "✅ ਸਿਰਫ਼ ਉਦੇਸ਼ ਲਾਈਨ ਵਾਈਜ਼ ਲਿਖੋ\n\n",
        "ਹੁਣ ਇਹਨਾਂ ਵਾਕਾਂ ਦੇ ਉਦੇਸ਼ ਲਿਖੋ:\n\n",
    )
    # One "<n>. <sentence>" entry per input, numbered from 1.
    sentence_lines = [
        f"{position}. {text}" for position, text in enumerate(batch, start=1)
    ]
    return "".join(instruction_parts) + "\n".join(sentence_lines)

# API call
def get_intents(prompt, batch_num):
    """Send one prompt to the model and return its reply split into intent lines.

    Args:
        prompt: Text sent as the user message.
        batch_num: Batch number, used only in the error message.

    Returns:
        A list with one stripped, non-empty string per line of the reply, or
        None if the API call (or reading the response) raised an exception.
    """
    try:
        response = openai.ChatCompletion.create(
            model="openai/gpt-3.5-turbo",  # OpenRouter format
            messages=[
                {"role": "system", "content": "You are a helpful assistant that extracts intents from Punjabi sentences."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=500,
            temperature=0.3
        )
        reply = response.choices[0].message.content
        # Drop blank/whitespace-only lines: a reply that separates answers with
        # empty lines would otherwise never match the batch length and would
        # force the caller into needless retries.
        return [line.strip() for line in reply.strip().splitlines() if line.strip()]
    except Exception as e:
        # Best-effort by design: the caller treats None as "retry".
        print(f"❌ Batch {batch_num}: Error during API call – {e}")
        return None

# Test API connection
def test_api_connection():
    """Probe the OpenRouter endpoint with a minimal chat request.

    Returns:
        True if the request completed without raising, False otherwise.
        A status line is printed in both cases.
    """
    reachable = False
    try:
        # The reply itself is irrelevant; only success/failure matters.
        openai.ChatCompletion.create(
            model="openai/gpt-3.5-turbo",
            messages=[{"role": "user", "content": "Hello"}],
            max_tokens=5,
        )
        print("✅ OpenRouter API connection successful!")
        reachable = True
    except Exception as err:
        print(f"❌ API connection test failed: {err}")
    return reachable

# Test first
if not test_api_connection():
    print("Please check your OpenRouter API key and try again.")
    # SystemExit works without the site-provided exit() helper and reports a
    # non-zero status so the failure is visible to the caller.
    raise SystemExit(1)

# Main batch loop
num_batches = math.ceil(len(sentences) / batch_size)
max_retries = 10     # cap so a permanently failing batch cannot hang the run
failed_batches = []  # 1-based numbers of the batches that were given up on


def _write_partial(idx, label=""):
    """Save the accumulated sentence/intent pairs to partial Excel file number idx."""
    df_out = pd.DataFrame({'Sentence': file_sentences, 'Intent': file_intents})
    df_out.drop_duplicates(subset='Sentence', inplace=True)
    out_name = f"prompt_gpt_intents_{idx}.xlsx"
    df_out.to_excel(out_name, index=False)
    print(f"📁 Saved {out_name} with {len(df_out)} {label}entries.")


for batch_num in range(num_batches):
    batch = sentences[batch_num * batch_size : (batch_num + 1) * batch_size]
    print(f"\n🔄 Processing batch {batch_num+1}/{num_batches} ({len(batch)} sentences)")

    prompt = build_prompt(batch)

    # Bounded retry loop: one initial attempt plus up to max_retries retries.
    results = None
    for retry_count in range(max_retries + 1):
        reply = get_intents(prompt, batch_num + 1)
        # A reply is usable only if it has exactly one line per sentence.
        if reply is not None and len(reply) == len(batch):
            results = reply
            print(f"✅ Batch {batch_num+1} succeeded after {retry_count} retries.")
            break
        if retry_count < max_retries:
            print(f"⏳ Batch {batch_num+1}: Retry {retry_count + 1}... Waiting 30 seconds...")
            time.sleep(30)

    if results is None:
        # Skip rather than abort so the remaining batches are still processed.
        failed_batches.append(batch_num + 1)
        print(f"❌ Batch {batch_num+1}: giving up after {max_retries} retries; batch skipped.")
    else:
        # Save results
        file_sentences.extend(batch)
        file_intents.extend(results)
        print(f"✅ Batch {batch_num+1} processed.")

    # Save every 3 batches (nothing to write if all of them were skipped)
    if (batch_num + 1) % 3 == 0 and file_sentences:
        _write_partial(file_idx)

        file_idx += 1
        file_sentences.clear()
        file_intents.clear()

    time.sleep(20)  # Rate limit

# Save remaining sentences
if file_sentences:
    _write_partial(file_idx, "remaining ")

if failed_batches:
    print(f"⚠️ Batches skipped after repeated failures: {failed_batches}")

# FINAL STEP — Combine all files
print("\n🔍 Combining all partial files into one full CSV...")


def _part_number(path):
    """Sort key: order partial files by their numeric suffix, others last by name."""
    suffix = path.rsplit(".", 1)[0].rsplit("_", 1)[-1]
    if suffix.isdecimal():
        return (0, int(suffix), path)
    return (1, 0, path)


# glob returns matches in arbitrary order; sorting makes the row order (and
# therefore which duplicate drop_duplicates keeps) reproducible.
xlsx_files = sorted(glob.glob("prompt_gpt_intents_*.xlsx"), key=_part_number)

parts = []
for f in xlsx_files:
    print(f"📄 Reading {f}...")
    parts.append(pd.read_excel(f))

if parts:
    # Concatenate once instead of growing a DataFrame inside the loop.
    combined_df = pd.concat(parts, ignore_index=True)
    # Remove duplicates again
    combined_df.drop_duplicates(subset='Sentence', inplace=True)
else:
    # No partial files: still emit a well-formed (header-only) CSV.
    combined_df = pd.DataFrame(columns=['Sentence', 'Intent'])

# Save combined CSV (utf-8-sig writes a BOM so spreadsheet tools detect UTF-8)
combined_df.to_csv("prompt_gpt_intents_full.csv", index=False, encoding='utf-8-sig')
print(f"\n✅ Combined {len(combined_df)} sentences saved to prompt_gpt_intents_full.csv!")

print("\n🎉 All processing completed!")


Reading Excel file...
Total sentences to process: 1383
✅ OpenRouter API connection successful!

🔄 Processing batch 1/87 (16 sentences)
✅ Batch 1 succeeded after 0 retries.
✅ Batch 1 processed.

🔄 Processing batch 2/87 (16 sentences)
✅ Batch 2 succeeded after 0 retries.
✅ Batch 2 processed.

🔄 Processing batch 3/87 (16 sentences)
⏳ Batch 3: Retry 1... Waiting 30 seconds...
⏳ Batch 3: Retry 2... Waiting 30 seconds...
✅ Batch 3 succeeded after 2 retries.
✅ Batch 3 processed.
📁 Saved prompt_gpt_intents_1.xlsx with 48 entries.

🔄 Processing batch 4/87 (16 sentences)
✅ Batch 4 succeeded after 0 retries.
✅ Batch 4 processed.

🔄 Processing batch 5/87 (16 sentences)
✅ Batch 5 succeeded after 0 retries.
✅ Batch 5 processed.

🔄 Processing batch 6/87 (16 sentences)
⏳ Batch 6: Retry 1... Waiting 30 seconds...
⏳ Batch 6: Retry 2... Waiting 30 seconds...
⏳ Batch 6: Retry 3... Waiting 30 seconds...
⏳ Batch 6: Retry 4... Waiting 30 seconds...
❌ Batch 6: Error during API call – Error communicating with 

In [None]:
#code for testing on unseen data


import pandas as pd
import openai
import time
import math
import glob
import os

# ──────────────────────────────────────────────
# CONFIGURATION
# ──────────────────────────────────────────────
# Read the key from the environment so it never has to be written into the
# notebook; falls back to the previous empty-string placeholder when unset.
API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
openai.api_key = API_KEY
openai.api_base = "https://openrouter.ai/api/v1"

INPUT_FILE = "unseen_data.xlsx"   # workbook read with pd.read_excel
COLUMN_NAME = "001_pa"            # column holding the sentences to classify
MODEL_NAME = "openai/gpt-3.5-turbo"

BATCH_SIZE = 16            # sentences per API request
SAVE_EVERY_N_BATCHES = 3   # write a partial Excel file after this many batches
SLEEP_BETWEEN_CALLS = 20   # seconds slept after each batch
RETRY_WAIT = 30            # seconds slept before retrying a failed batch

OUTPUT_PREFIX = "intents_output"       # partial files: <prefix>_part<N>.xlsx
FINAL_OUTPUT = "intents_combined.csv"  # combined, de-duplicated result

# ──────────────────────────────────────────────
# PROMPT
# ──────────────────────────────────────────────
# Instruction text (written in Punjabi) that build_prompt places in front of
# each batch's numbered sentences. The newline right after the opening quotes
# and the one before the closing quotes are part of the string, so they are
# sent to the model as well.
EXTRACTION_INSTRUCTIONS = """
ਤੁਸੀਂ ਇੱਕ ਇੰਟੈਂਟ ਕਲਾਸੀਫਿਕੇਸ਼ਨ ਸਹਾਇਕ ਹੋ। ਤੁਹਾਡਾ ਕੰਮ ਹਰ ਪੰਜਾਬੀ ਵਾਕ ਦਾ ਮੁੱਖ *ਉਦੇਸ਼* (intent) ਕੱਢਣਾ ਹੈ।

ਉਦੇਸ਼ ਦਾ ਅਰਥ ਹੈ ਕਿ ਵਾਕ ਦੇ ਪਿੱਛੇ ਮਕਸਦ — ਜਿਵੇਂ ਕਿ ਅਭਿਵਾਦਨ, ਪ੍ਰਸ਼ਨ, ਬੇਨਤੀ, ਹੁਕਮ, ਜਾਣਕਾਰੀ, ਭਾਵਨਾ ਦਾ ਪ੍ਰਗਟਾਵਾ ਆਦਿ।

ਉਦਾਹਰਣ:
- ਸਤ ਸ੍ਰੀ ਅਕਾਲ! → ਅਭਿਵਾਦਨ
- ਕੀ ਤੁਸੀਂ ਮੇਰੀ ਮਦਦ ਕਰ ਸਕਦੇ ਹੋ? → ਬੇਨਤੀ
- ਮੈਂ ਦਫ਼ਤਰ ਜਾ ਰਿਹਾ ਹਾਂ। → ਜਾਣਕਾਰੀ
- ਤੁਸੀਂ ਕਿਵੇਂ ਹੋ? → ਪ੍ਰਸ਼ਨ
- ਮੈਂ ਅੱਜ ਖੁਸ਼ ਹਾਂ। → ਭਾਵਨਾ ਦਾ ਪ੍ਰਗਟਾਵਾ

✅ ਜਵਾਬ ਸਿਰਫ਼ intent ਹੋਣਾ ਚਾਹੀਦਾ ਹੈ
✅ ਪੰਜਾਬੀ ਵਿੱਚ 1–4 ਸ਼ਬਦ
✅ Sentence ਨਾ ਦੁਹਰਾਓ, ਸਿਰਫ intent ਲਿਖੋ
✅ ਹਰ ਵਾਕ ਲਈ ਇੱਕ ਨਵੀਂ ਲਾਈਨ
"""
# ──────────────────────────────────────────────

def build_prompt(sent_batch):
    """Return the extraction instructions followed by the numbered sentences.

    Args:
        sent_batch: Iterable of sentences. Each one is rendered on its own
            line as "<n>. <sentence>", with numbering starting at 1.

    Returns:
        EXTRACTION_INSTRUCTIONS, a short Punjabi lead-in line, and the
        numbered sentences joined with newlines.
    """
    numbered_lines = []
    for position, sentence in enumerate(sent_batch, start=1):
        numbered_lines.append(f"{position}. {sentence}")
    lead_in = "\n\nਹੁਣ ਇਹਨਾਂ ਵਾਕਾਂ ਦੇ ਉਦੇਸ਼ ਲਿਖੋ:\n"
    return EXTRACTION_INSTRUCTIONS + lead_in + "\n".join(numbered_lines)

def call_openrouter(prompt, model_name):
    """Send one prompt to the given model and return its reply split into lines.

    Args:
        prompt: Text sent as the user message.
        model_name: Model identifier passed to the chat-completion call.

    Returns:
        A list with one stripped, non-empty string per line of the reply, or
        None if the API call (or reading the response) raised an exception.
    """
    try:
        response = openai.ChatCompletion.create(
            model=model_name,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that extracts intents from Punjabi sentences."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            max_tokens=600
        )
        reply = response.choices[0].message.content
        # Drop blank/whitespace-only lines: a reply that separates answers with
        # empty lines would otherwise never match the batch length and would
        # force the caller into needless retries.
        return [line.strip() for line in reply.strip().splitlines() if line.strip()]
    except Exception as e:
        # Best-effort by design: the caller treats None as "retry".
        print(f"❌ API Error: {e}")
        return None

def test_api():
    """Probe the API with a minimal chat request using MODEL_NAME.

    Returns:
        True if the request completed without raising, False otherwise.
        A status line is printed in both cases.
    """
    reachable = False
    try:
        # The reply itself is irrelevant; only success/failure matters.
        openai.ChatCompletion.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": "Hello"}],
            max_tokens=5,
        )
        print("✅ API connection test successful.")
        reachable = True
    except Exception as err:
        print(f"❌ API connection failed: {err}")
    return reachable

# ──────────────────────────────────────────────
# MAIN LOGIC
# ──────────────────────────────────────────────
# Validate the input before spending an API call on the connection test.
df = pd.read_excel(INPUT_FILE)
if COLUMN_NAME not in df.columns:
    raise ValueError(f"Column '{COLUMN_NAME}' not found.")

if not test_api():
    # SystemExit works without the site-provided exit() helper and reports a
    # non-zero status so the failure is visible to the caller.
    raise SystemExit(1)

# Empty cells are dropped so they are never sent to the model.
sentences = df[COLUMN_NAME].dropna().tolist()
total_batches = math.ceil(len(sentences) / BATCH_SIZE)

MAX_RETRIES = 10     # cap so a permanently failing batch cannot hang the run
failed_batches = []  # 1-based numbers of the batches that were given up on

file_idx = 1              # numeric suffix of the next partial output file
collected_sentences = []  # sentences accumulated since the last partial save
collected_intents = []    # intents aligned index-for-index with the sentences


def _save_part(idx, label="Saved"):
    """Write the accumulated sentence/intent pairs to partial Excel file number idx."""
    part_df = pd.DataFrame({"Sentence": collected_sentences, "Intent": collected_intents})
    part_df.drop_duplicates(subset="Sentence", inplace=True)
    out_file = f"{OUTPUT_PREFIX}_part{idx}.xlsx"
    part_df.to_excel(out_file, index=False)
    print(f"💾 {label}: {out_file} ({len(part_df)} entries)")


for batch_num in range(total_batches):
    start = batch_num * BATCH_SIZE
    end = start + BATCH_SIZE
    batch = sentences[start:end]
    print(f"\n🔁 Batch {batch_num + 1}/{total_batches} ({len(batch)} sentences)")

    prompt = build_prompt(batch)

    # Bounded retry loop: one initial attempt plus up to MAX_RETRIES retries.
    results = None
    for retry_count in range(MAX_RETRIES + 1):
        reply = call_openrouter(prompt, MODEL_NAME)
        # A reply is usable only if it has exactly one line per sentence.
        if reply and len(reply) == len(batch):
            results = reply
            print(f"✅ Batch {batch_num+1} success after {retry_count} retries.")
            break
        if retry_count < MAX_RETRIES:
            print(f"🔄 Retry {retry_count + 1} for batch {batch_num+1} after {RETRY_WAIT}s...")
            time.sleep(RETRY_WAIT)

    if results is None:
        # Skip rather than abort so the remaining batches are still processed.
        failed_batches.append(batch_num + 1)
        print(f"❌ Batch {batch_num+1}: giving up after {MAX_RETRIES} retries; batch skipped.")
    else:
        collected_sentences.extend(batch)
        collected_intents.extend(results)

    # Periodic checkpoint (nothing to write if every batch since the last one was skipped).
    if (batch_num + 1) % SAVE_EVERY_N_BATCHES == 0 and collected_sentences:
        _save_part(file_idx)

        collected_sentences.clear()
        collected_intents.clear()
        file_idx += 1

    time.sleep(SLEEP_BETWEEN_CALLS)

# Flush whatever is left after the last full checkpoint.
if collected_sentences:
    _save_part(file_idx, "Saved remaining")

if failed_batches:
    print(f"⚠️ Batches skipped after repeated failures: {failed_batches}")

# ──────────────────────────────────────────────
# COMBINE ALL OUTPUTS
# ──────────────────────────────────────────────
print("\n🧩 Combining all partial Excel files...")


def _part_index(path):
    """Sort key: order partial files by the number after 'part', others last by name."""
    digits = path.rsplit(".", 1)[0].rsplit("part", 1)[-1]
    if digits.isdecimal():
        return (0, int(digits), path)
    return (1, 0, path)


# glob returns matches in arbitrary order; sorting makes the row order (and
# therefore which duplicate drop_duplicates keeps) reproducible.
all_files = sorted(glob.glob(f"{OUTPUT_PREFIX}_part*.xlsx"), key=_part_index)

if all_files:
    combined_df = pd.concat([pd.read_excel(f) for f in all_files], ignore_index=True)
    combined_df.drop_duplicates(subset="Sentence", inplace=True)
else:
    # pd.concat raises on an empty list; emit a header-only CSV instead.
    combined_df = pd.DataFrame(columns=["Sentence", "Intent"])

# utf-8-sig writes a BOM so spreadsheet tools detect the encoding.
combined_df.to_csv(FINAL_OUTPUT, index=False, encoding="utf-8-sig")

print(f"\n✅ Final output saved to: {FINAL_OUTPUT} ({len(combined_df)} unique entries)")
print("🎉 All done!")

✅ API connection test successful.

🔁 Batch 1/13 (16 sentences)
✅ Batch 1 success after 0 retries.

🔁 Batch 2/13 (16 sentences)
✅ Batch 2 success after 0 retries.

🔁 Batch 3/13 (16 sentences)
✅ Batch 3 success after 0 retries.
💾 Saved: intents_output_part1.xlsx (48 entries)

🔁 Batch 4/13 (16 sentences)
✅ Batch 4 success after 0 retries.

🔁 Batch 5/13 (16 sentences)
🔄 Retry 1 for batch 5 after 30s...
✅ Batch 5 success after 1 retries.

🔁 Batch 6/13 (16 sentences)
✅ Batch 6 success after 0 retries.
💾 Saved: intents_output_part2.xlsx (48 entries)

🔁 Batch 7/13 (16 sentences)
✅ Batch 7 success after 0 retries.

🔁 Batch 8/13 (16 sentences)
✅ Batch 8 success after 0 retries.

🔁 Batch 9/13 (16 sentences)
✅ Batch 9 success after 0 retries.
💾 Saved: intents_output_part3.xlsx (48 entries)

🔁 Batch 10/13 (16 sentences)
✅ Batch 10 success after 0 retries.

🔁 Batch 11/13 (16 sentences)
✅ Batch 11 success after 0 retries.

🔁 Batch 12/13 (16 sentences)
🔄 Retry 1 for batch 12 after 30s...
🔄 Retry 2 for