In [1]:
# 📦 Required libraries
import pandas as pd
import google.generativeai as genai
from sklearn.metrics import accuracy_score
import time

# 🔑 Configure Gemini
GEMINI_API_KEY = "AIzaSyBR8dgMUD4440UnrmcNqwtmw3rktCzVKrY"
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel("gemini-1.5-flash")

# 📂 Load your idioms dataset
df = pd.read_excel("idioms.xlsx")
df = df.dropna(subset=["Sentences", "is_idiomatic"])  # clean data
df = df.sample(frac=1).reset_index(drop=True)     # shuffle for fairness


  from .autonotebook import tqdm as notebook_tqdm


KeyError: ['is_idiomatic']

In [None]:
predictions_no_cot = []

# 🔁 Iterate with retry & logs
for idx, sentence in enumerate(df["Sentences"]):
    prompt = f"""Sentence: "{sentence}"\nDoes this sentence contain an idiom? Reply with just 0 (no) or 1 (yes)."""
    success = False
    attempts = 0
    max_attempts = 5
    wait_time = 5

    print(f"\n🟡 [{idx + 1}/{len(df)}] Processing sentence:\n➡️ \"{sentence}\"")

    while not success and attempts < max_attempts:
        try:
            response = model.generate_content(prompt)
            reply = response.text.strip()
            print(f"📩 Gemini reply: {reply}")

            # Extract prediction
            if "1" in reply:
                prediction = 1
            elif "0" in reply:
                prediction = 0
            else:
                raise ValueError(f"Unexpected reply: {reply}")

            predictions_no_cot.append(prediction)
            print(f"✅ Prediction added: {prediction}")
            success = True
        except Exception as e:
            attempts += 1
            print(f"❌ Attempt {attempts} failed: {str(e)}")
            print(f"⏳ Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
            wait_time *= 2

    if not success:
        print(f"⚠️ Giving up after {max_attempts} attempts. Defaulting to 0.")
        predictions_no_cot.append(0)

    time.sleep(1)

# ✅ Evaluate Accuracy
accuracy_no_cot = accuracy_score(df["is_idiomatic"], predictions_no_cot)
print("\n🎯 FINAL EVALUATION")
print(f"🔹 Accuracy without Chain of Thought: {accuracy_no_cot:.2f}")


🟡 [1/62] Processing sentence:
➡️ "ਘਰ ਵਿਚ ਇਸ ਸਮੇਂ ਇਹ ਕਿਹੜਾ ਸਮਾਂ ਹੈ"
📩 Gemini reply: 0
✅ Prediction added: 0

🟡 [2/62] Processing sentence:
➡️ "ਇੱਕ ਕਰਮਚਾਰੀ ਦੀ ਕੀ ਸ਼ਿਕਾਇਤ ਕਰਨ, ਇੱਥੇ ਤਾਂ ਆਵਾ ਹੀ ਉਛਿਆ ਹੋਇਆ ਹੈ।"
📩 Gemini reply: 1
✅ Prediction added: 1

🟡 [3/62] Processing sentence:
➡️ "ਇਸ ਕਮਰੇ ਵਿਚ ਮੇਰੇ ਸੈਲਫੋਨ ਲੱਭੋ"
📩 Gemini reply: 0
✅ Prediction added: 0

🟡 [4/62] Processing sentence:
➡️ "ਸੀਤਾ ਦਾ ਆਚਰਨ ਇੰਨਾ ਬੇਦਾਗ ਹੈ ਕਿ ਕੋਈ ਉਸ ਵੱਲ ਉੰਗਲੀ ਨਹੀਂ ਕਰ ਸਕਦਾ।,"
📩 Gemini reply: 1
✅ Prediction added: 1

🟡 [5/62] Processing sentence:
➡️ " ਸਿਆਣੇ ਮਾਤਾ–ਪਿਤਾ ਆਪਣੇ ਸਥਾਨ ਬੱਚਿਆਂ ਨੂੰ ਇੰਝ ਅੱਖ ਨਾਲ ਵੇਖਦੇ ਹਨ।"
📩 Gemini reply: 1
✅ Prediction added: 1

🟡 [6/62] Processing sentence:
➡️ "ਡਰਕਾਮ ਤੋਂ ਆਏ 'ਤੇ ਸਾਲ ਸੁੱਕ ਗਿਆ।"
📩 Gemini reply: 0
✅ Prediction added: 0

🟡 [7/62] Processing sentence:
➡️ "ਰਾਜ! ਤੂੰ ਮੇਰੇ ਨਾਲ ਉਸਤਾਦੀਆਂ ਨਾ ਕਰ, ਮੈਂ ਤੇਰੀ ਅਸਲੀਅਤ ਜਾਣਦਾ ਹਾਂ।","
📩 Gemini reply: 0
✅ Prediction added: 0

🟡 [8/62] Processing sentence:
➡️ "ਕੀ ਤੁਸੀਂ ਜਾਣੋਗੇ ਕਿ ਮੇਰਾ ਫੋਨ ਕਿੱਥੇ ਲੱਭਣਾ ਹੈ"
📩 Gemini reply: 0
✅ Prediction added: 0

🟡 [9/62] P

In [5]:
predictions_cot = []

# 🔁 Iterate with retry & verbose logs
for idx, sentence in enumerate(df["Sentences"]):
    prompt = f"""
    Punjabi Sentence: "{sentence}"
    Step-by-step, think about the meaning and structure of the sentence.
    Determine if there is any metaphorical, cultural, or non-literal meaning involved.
    Conclude if the sentence contains an idiom or not.
    Reply in this format only:
    Reason: ...
    Prediction: 0 or 1
    """

    success = False
    attempts = 0
    max_attempts = 5
    wait_time = 5

    print(f"\n🧠 [{idx + 1}/{len(df)}] Processing (with CoT):\n➡️ \"{sentence}\"")

    while not success and attempts < max_attempts:
        try:
            response = model.generate_content(prompt)
            reply = response.text.strip()
            print(f"📩 Gemini reply:\n{reply}")

            # Extract prediction
            if "Prediction: 1" in reply:
                prediction = 1
            elif "Prediction: 0" in reply:
                prediction = 0
            else:
                raise ValueError(f"❗ Unexpected format: {reply}")

            predictions_cot.append(prediction)
            print(f"✅ Prediction added: {prediction}")
            success = True
        except Exception as e:
            attempts += 1
            print(f"❌ Attempt {attempts} failed: {str(e)}")
            print(f"⏳ Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
            wait_time *= 2

    if not success:
        print(f"⚠️ Giving up after {max_attempts} attempts. Defaulting to 0.")
        predictions_cot.append(0)

    time.sleep(1)  # Avoid hitting rate limits

# 🧠 Evaluate Accuracy
accuracy_cot = accuracy_score(df["is_idiomatic"], predictions_cot)
print("\n🎯 FINAL EVALUATION")
print(f"🧠 Accuracy with Chain of Thought: {accuracy_cot:.2f}")


🧠 [1/62] Processing (with CoT):
➡️ "ਘਰ ਵਿਚ ਇਸ ਸਮੇਂ ਇਹ ਕਿਹੜਾ ਸਮਾਂ ਹੈ"
📩 Gemini reply:
Reason:The sentence translates to "What time is it in the house?" in English.  There's no metaphorical, cultural, or non-literal meaning. It's a straightforward question about the time. The phrase "in the house" is slightly unusual in English, but grammatically correct in the context of possibly specifying a particular location where a clock is, but it doesn't introduce any idiomatic expression.

Prediction: 0
✅ Prediction added: 0

🧠 [2/62] Processing (with CoT):
➡️ "ਇੱਕ ਕਰਮਚਾਰੀ ਦੀ ਕੀ ਸ਼ਿਕਾਇਤ ਕਰਨ, ਇੱਥੇ ਤਾਂ ਆਵਾ ਹੀ ਉਛਿਆ ਹੋਇਆ ਹੈ।"
📩 Gemini reply:
Reason:The sentence translates roughly to "What's one employee's complaint?  There's already a lot of chaos/turmoil here."  The phrase "ਆਵਾ ਹੀ ਉਛਿਆ ਹੋਇਆ ਹੈ" (āvā hī uchhiā hoyā hai) describes a state of significant commotion or upheaval.  It doesn't have a direct, literal meaning of "chaos" but conveys a sense of disorder and unrest, suggesting that a single co

In [6]:
print("🎯 Final Accuracy Comparison:")
print(f"→ Without CoT: {accuracy_no_cot:.2f}")
print(f"→ With CoT   : {accuracy_cot:.2f}")


🎯 Final Accuracy Comparison:
→ Without CoT: 0.89
→ With CoT   : 0.68


In [2]:
import pandas as pd
import time
import google.generativeai as genai
from google.api_core.exceptions import ResourceExhausted

# ── CONFIG ───────────────────────────────────────────────────────────
GEMINI_API_KEY = "AIzaSyBR8dgMUD4440UnrmcNqwtmw3rktCzVKrY"
DATA_FILE      = "idioms.xlsx"           # must have a column "Sentences"
MODEL_NAME     = "gemini-2.0-flash"
OUTPUT_FILE    = "predictions_no_cot.xlsx"
MAX_RETRIES    = 5
# ─────────────────────────────────────────────────────────────────────

genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel(MODEL_NAME)

df = pd.read_excel(DATA_FILE).dropna(subset=["Sentences"]).reset_index(drop=True)
preds = []

print(f"🔍 Predicting intents (no-CoT) for {len(df)} sentences…")

for i, sent in enumerate(df["Sentences"], start=1):
    prompt = f'''
Punjabi Sentence: "{sent}"
What is the intent of this sentence? Reply with exactly one line.
'''
    retry, wait = 0, 5

    while True:
        try:
            resp = model.generate_content(prompt)
            label = resp.text.strip().splitlines()[0]
            preds.append(label)
            print(f"[{i}/{len(df)}] ✓ {label}")
            break

        except ResourceExhausted:
            retry += 1
            if retry > MAX_RETRIES:
                print(f"[{i}] ⚠️ Quota still exhausted after {MAX_RETRIES} retries. Defaulting to 'unknown'.")
                preds.append("unknown")
                break
            print(f"[{i}] ❌ Quota exhausted—retry {retry}/{MAX_RETRIES} in {wait}s…")
            time.sleep(wait)
            wait *= 2

        except Exception as e:
            retry += 1
            if retry > MAX_RETRIES:
                print(f"[{i}] ⚠️ Error after {MAX_RETRIES} retries: {e}. Defaulting to 'unknown'.")
                preds.append("unknown")
                break
            print(f"[{i}] ❌ Error: {e} — retry {retry}/{MAX_RETRIES} in {wait}s…")
            time.sleep(wait)
            wait *= 2

    time.sleep(0.8)

df["pred_intent_no_cot"] = preds
df.to_excel(OUTPUT_FILE, index=False)
print(f"\n✅ Saved predictions to {OUTPUT_FILE}")


🔍 Predicting intents (no-CoT) for 147 sentences…
[1/147] ✓ The sentence emphasizes Sita's impeccable character, implying she is beyond reproach.
[3/147] ✓ During the war days, traders made a lot of money off of people's needs.
[4/147] ✓ The sentence implies that Ram considers himself clever and tries to deceive others.
[5/147] ✓ The sentence describes a role reversal in society where women are working and men are taking care of the children.
[6/147] ✓ The sentence means that Dippo has so thoroughly manipulated or brainwashed his Pandit (priest) that he has abandoned his traditional religious practices or beliefs.
[7/147] ✓ The sentence expresses that rising inflation causes stress, and ultimately, time passes regardless.
[8/147] ✓ It implies someone acted impulsively or unwisely without thinking, abandoning all strategies or resources.
[9/147] ✓ The sentence implies someone pretends to be capable of great things for good people but is actually useless.
[10/147] ✓ The sentence describes

In [3]:
import pandas as pd
import time
import google.generativeai as genai
from google.api_core.exceptions import ResourceExhausted

# ── CONFIG ───────────────────────────────────────────────────────────
GEMINI_API_KEY = "AIzaSyBR8dgMUD4440UnrmcNqwtmw3rktCzVKrY"
DATA_FILE      = "idioms.xlsx"
MODEL_NAME     = "gemini-2.0-flash"
OUTPUT_FILE    = "predictions_cot.xlsx"
MAX_RETRIES    = 5
# ─────────────────────────────────────────────────────────────────────

genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel(MODEL_NAME)

df = pd.read_excel(DATA_FILE).dropna(subset=["Sentences"]).reset_index(drop=True)
preds = []

print(f"🧠 Predicting intents (with CoT) for {len(df)} sentences…")

for i, sent in enumerate(df["Sentences"], start=1):
    prompt = f'''
You are an expert in Punjabi language and culture, specifically adept at understanding and interpreting Punjabi idioms. Your task is to analyze a given Punjabi sentence, identify if it contains an idiom, and then determine the underlying user intent.

Here's an example of how to identify intent from a Punjabi idiomatic sentence, following a step-by-step reasoning process. Notice the format of the final intent:

**Example Punjabi Sentence:** 'ਰਤਾ! ਤੂੰ ਮੇਰੇ ਨਾਲ ਹਮਦਰਦੀ ਨਾ ਕਰ, ਮੈਂ ਤੇਰੀ ਅਸਲੀਅਤ ਜਾਣਦਾ ਹਾਂ।' 

**Step-by-step Reasoning:**
1.  **Identify Idiom:** The sentence contains the Punjabi idiom "ਹਮਦਰਦੀ ਨਾ ਕਰ" (hamdardee naa kar).
2.  **Literal Meaning:** "ਹਮਦਰਦੀ ਨਾ ਕਰ" literally translates to "don't sympathize."
3.  **Idiomatic Meaning:** The idiomatic meaning of "ਹਮਦਰਦੀ ਨਾ ਕਰ" in this context, especially when paired with "ਮੈਂ ਤੇਰੀ ਅਸਲੀਅਤ ਜਾਣਦਾ ਹਾਂ" (I know your reality), is a rejection of false sympathy, implying the speaker sees through manipulative or insincere gestures.
4.  **Contextual Interpretation:** The speaker is telling someone not to show sympathy, because the speaker already knows their true nature. This implies a rejection of insincere pity or a demand for authenticity, often stemming from a perceived manipulative intent. The user is conveying a strong dismissal of someone's pretense of sympathy, indicating they understand the person's true, likely negative, intentions.
5.  **Determine User Intent:** To act out of knowing someone's true intentions

Now, apply the same step-by-step reasoning to identify the intent of the following Punjabi sentence. Provide your response by clearly labeling each step's output. Ensure your final "Determine User Intent" is a single, clear intent label, preferably starting with "To" followed by a verb, aligning with the examples provided in your dataset.

**Punjabi Sentence:** "{sent}"

Let's begin.
'''
    retry, wait = 0, 5

    while True:
        try:
            resp = model.generate_content(prompt)
            reply = resp.text.strip().splitlines()
            # extract after "Prediction:"
            label = next(
                (line.split(":",1)[1].strip()
                 for line in reply
                 if line.lower().startswith("prediction:")),
                None
            ) or reply[-1].strip()

            preds.append(label)
            print(f"[{i}/{len(df)}] ✓ {label}")
            break

        except ResourceExhausted:
            retry += 1
            if retry > MAX_RETRIES:
                print(f"[{i}] ⚠️ Quota still exhausted after {MAX_RETRIES} retries. Defaulting to 'unknown'.")
                preds.append("unknown")
                break
            print(f"[{i}] ❌ Quota exhausted—retry {retry}/{MAX_RETRIES} in {wait}s…")
            time.sleep(wait)
            wait *= 2

        except Exception as e:
            retry += 1
            if retry > MAX_RETRIES:
                print(f"[{i}] ⚠️ Error after {MAX_RETRIES} retries: {e}. Defaulting to 'unknown'.")
                preds.append("unknown")
                break
            print(f"[{i}] ❌ Error: {e} — retry {retry}/{MAX_RETRIES} in {wait}s…")
            time.sleep(wait)
            wait *= 2

    time.sleep(0.8)

df["pred_intent_cot"] = preds
df.to_excel(OUTPUT_FILE, index=False)
print(f"\n✅ Saved CoT predictions to {OUTPUT_FILE}")


🧠 Predicting intents (with CoT) for 147 sentences…
[1/147] ✓ 5.  **Determine User Intent:** To emphasize the impeccable character of someone.
[2/147] ✓ 5.  **Determine User Intent:** To reject someone's condescending or manipulative behavior.
[3/147] ✓ 5.  **Determine User Intent:** To criticize war profiteering.
[4/147] ✓ 5.  **Determine User Intent:** To expose someone's deceptive nature.
[5/147] ✓ 5.  **Determine User Intent:** To express observation and perhaps mild disapproval/surprise regarding the reversal of traditional gender roles in society.
[6/147] ✓ 5. **Determine User Intent:** To convey successful, albeit malicious, manipulation of someone, leading to an absurd outcome.
[7/147] ✓ 5.  **Determine User Intent:** To convey acceptance of an uncontrollable situation while suggesting that worrying is pointless.
[8/147] ✓ 5.  **Determine User Intent:** To express disappointment/frustration about someone's self-defeating actions due to lack of foresight.
[9/147] ✓ 5.  **Determin

In [17]:
import pandas as pd
import time
import google.generativeai as genai
from google.api_core.exceptions import ResourceExhausted

# ── CONFIG ───────────────────────────────────────────────────────────
GEMINI_API_KEY = "AIzaSyBR8dgMUD4440UnrmcNqwtmw3rktCzVKrY"
DATA_FILE      = "idioms.xlsx"
MODEL_NAME     = "gemini-2.0-flash"
OUTPUT_FILE    = "predictions_cot_1.xlsx"
MAX_RETRIES    = 5
# ─────────────────────────────────────────────────────────────────────

genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel(MODEL_NAME)

df = pd.read_excel(DATA_FILE).dropna(subset=["Sentences"]).reset_index(drop=True)
preds = []

print(f"🧠 Predicting intents (with CoT) for {len(df)} sentences…")

for i, sent in enumerate(df["Sentences"], start=1):
    prompt = f'''
You are an expert in Punjabi language and culture, specifically adept at understanding and interpreting Punjabi idioms. Your task is to analyze a given Punjabi sentence, identify if it contains an idiom, and then determine the underlying user intent.

To achieve this, carefully follow these steps to reason through the problem:

1.  **Identify Idiom:** Read the Punjabi Sentence carefully: "{sent}". Does this sentence contain a common Punjabi idiom? If yes, clearly state the idiom. If no, state "No idiom found."
2.  **Literal Meaning (if idiom found):** If an idiom was identified in Step 1, provide the literal, word-for-word translation of the idiom from Punjabi to English. If no idiom was found, skip this step.
3.  **Idiomatic Meaning (if idiom found):** If an idiom was identified, explain its figurative or actual meaning in English, as it is commonly understood in Punjabi culture. If no idiom was found, skip this step.
4.  Based on the idiomatic meaning (or literal meaning if no idiom), interpret the overall message or action implied by the Punjabi sentence in its common usage. What is the user trying to convey or achieve?
5.  From the contextual interpretation, infer the most specific and accurate user intent for this sentence. Be concise and give answer in a single line, preferably starting with "To" followed by a verb (e.g., "To express sadness," "To warn someone," "To complain about prices").

Provide your response by clearly labeling each step's output.

Let's begin.
'''
    retry, wait = 0, 5

    while True:
        try:
            resp = model.generate_content(prompt)
            reply = resp.text.strip().splitlines()
            # extract after "Prediction:"
            label = next(
                (line.split(":",1)[1].strip()
                 for line in reply
                 if line.lower().startswith("prediction:")),
                None
            ) or reply[-1].strip()

            preds.append(label)
            print(f"[{i}/{len(df)}] ✓ {label}")
            break

        except ResourceExhausted:
            retry += 1
            if retry > MAX_RETRIES:
                print(f"[{i}] ⚠️ Quota still exhausted after {MAX_RETRIES} retries. Defaulting to 'unknown'.")
                preds.append("unknown")
                break
            print(f"[{i}] ❌ Quota exhausted—retry {retry}/{MAX_RETRIES} in {wait}s…")
            time.sleep(wait)
            wait *= 2

        except Exception as e:
            retry += 1
            if retry > MAX_RETRIES:
                print(f"[{i}] ⚠️ Error after {MAX_RETRIES} retries: {e}. Defaulting to 'unknown'.")
                preds.append("unknown")
                break
            print(f"[{i}] ❌ Error: {e} — retry {retry}/{MAX_RETRIES} in {wait}s…")
            time.sleep(wait)
            wait *= 2

    time.sleep(0.8)

df["pred_intent_cot"] = preds
df.to_excel(OUTPUT_FILE, index=False)
print(f"\n✅ Saved CoT predictions to {OUTPUT_FILE}")


🧠 Predicting intents (with CoT) for 36 sentences…
[1/36] ✓ 5. **User Intent:** To praise someone's impeccable character.
[2/36] ✓ 5. To warn someone against being deceitful and to assert knowledge of their true intentions.
[3/36] ✓ 5. **User Intent:** To criticize traders for profiteering during wartime.
[4/36] ✓ To criticize someone's deceptive behavior.
[5/36] ✓ 5.  **User Intent:** To express disapproval/surprise/observation regarding changing gender roles in society.
[6/36] ✓ To express disapproval/criticism of Dippo's manipulative influence on the Pandit, leading to negative consequences.
[7/36] ✓ 5. **User Intent:** To express resignation and acceptance of the temporary nature of a stressful situation caused by rising inflation.
[8/36] ✓ To criticize someone for giving up without trying.
[9/36] ✓ 5.  **User Intent:** To criticize someone for being boastful and incapable.
[10/36] ✓ 5. **User Intent:** To criticize someone's laziness and opportunistic behavior, only showing interes

In [31]:


import pandas as pd
import time
import google.generativeai as genai
from google.api_core.exceptions import ResourceExhausted

# ── CONFIG ───────────────────────────────────────────────────────────
GEMINI_API_KEY = "AIzaSyD6PP6kpVBypMKE1LoozacllpM1NS8Hb0U"
DATA_FILE      = "idioms.xlsx"
MODEL_NAME     = "gemini-2.0-flash"
OUTPUT_FILE    = "predictions_cot_4.xlsx"
MAX_RETRIES    = 5
# ─────────────────────────────────────────────────────────────────────

genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel(MODEL_NAME)

df = pd.read_excel(DATA_FILE).dropna(subset=["Sentences"]).reset_index(drop=True)
preds = []

print(f"🧠 Predicting intents (with CoT) for {len(df)} sentences…")

for i, sent in enumerate(df["Sentences"], start=1):
    prompt = f'''
You are an expert in Punjabi language and culture, specifically adept at understanding and interpreting Punjabi idioms. Your task is to analyze a given Punjabi sentence, first by deeply understanding any idiom present, and then by determining the underlying user intent.

Follow these two main phases:

---
**Phase 1: Knowledge Generation about the Idiom**

1.  **Identify Idiom:** Read the Punjabi Sentence carefully: "{sent}". Does this sentence contain a common Punjabi idiom? If yes, clearly state the idiom. If no, state "No idiom found. Proceed to Phase 2 with literal meaning."
2.  **Literal Meaning (if idiom found):** If an idiom was identified in Step 1, provide the literal, word-for-word translation of the idiom from Punjabi to English.
3.  **Comprehensive Idiom Explanation (if idiom found):** If an idiom was identified, provide a detailed explanation of its figurative or actual meaning, including any common contexts, cultural nuances, or implications as understood in Punjabi. Explain *why* it means what it means, if possible. If no idiom, state "N/A".

---
**Phase 2: Intent Extraction (using generated knowledge)**

Now, using the idiom explanation generated in Phase 1 (if applicable) as additional context, determine the intent of the original Punjabi sentence.

1.  **Contextual Interpretation:** Based on the detailed idiomatic meaning from Phase 1 (or literal meaning if no idiom), interpret the overall message or action implied by the Punjabi sentence in its common usage. How does the generated knowledge about the idiom contribute to understanding the sentence's message?
2.  **Determine User Intent:** From this contextual interpretation, infer the most specific and accurate user intent for this sentence. Be concise and give answer in a single line, preferably starting with "To" followed by a verb (e.g., "To express happiness," "To advise caution," "To state a fact").

Let's begin.
'''
    retry, wait = 0, 5

    while True:
        try:
            resp = model.generate_content(prompt)
            reply = resp.text.strip().splitlines()
            # extract after "Prediction:"
            label = next(
                (line.split(":",1)[1].strip()
                 for line in reply
                 if line.lower().startswith("prediction:")),
                None
            ) or reply[-1].strip()

            preds.append(label)
            print(f"[{i}/{len(df)}] ✓ {label}")
            break

        except ResourceExhausted:
            retry += 1
            if retry > MAX_RETRIES:
                print(f"[{i}] ⚠️ Quota still exhausted after {MAX_RETRIES} retries. Defaulting to 'unknown'.")
                preds.append("unknown")
                break
            print(f"[{i}] ❌ Quota exhausted—retry {retry}/{MAX_RETRIES} in {wait}s…")
            time.sleep(wait)
            wait *= 2

        except Exception as e:
            retry += 1
            if retry > MAX_RETRIES:
                print(f"[{i}] ⚠️ Error after {MAX_RETRIES} retries: {e}. Defaulting to 'unknown'.")
                preds.append("unknown")
                break
            print(f"[{i}] ❌ Error: {e} — retry {retry}/{MAX_RETRIES} in {wait}s…")
            time.sleep(wait)
            wait *= 2

    time.sleep(0.8)

df["pred_intent_cot"] = preds
df.to_excel(OUTPUT_FILE, index=False)
print(f"\n✅ Saved CoT predictions to {OUTPUT_FILE}")


🧠 Predicting intents (with CoT) for 36 sentences…
[1/36] ✓ 2.  **Determine User Intent:** To emphasize Seeta's impeccable character and innocence.
[2/36] ✓ 2.  **Determine User Intent:** To warn someone against deception and assert awareness of their true nature.
[3/36] ✓ 2.  **Determine User Intent:** To express disapproval of merchants who profited from the war.
[4/36] ✓ 2.  **Determine User Intent:** To describe someone as cunning and deceitful, who tries to fool everyone.
[5/36] ✓ 2.  **Determine User Intent:** To express a perceived reversal of traditional gender roles in contemporary society, with a possible undertone of surprise or concern.
[6/36] ✓ 2.  **Determine User Intent:** To express how severely someone was manipulated/misguided leading to foolish actions.
[7/36] ✓ 2.  **Determine User Intent:** To offer hope and perspective regarding the temporary nature of stress caused by inflation.
[8/36] ✓ 2.  **Determine User Intent:** To express resignation and abandonment of effo

In [36]:


import pandas as pd
import time
import google.generativeai as genai
from google.api_core.exceptions import ResourceExhausted

# ── CONFIG ───────────────────────────────────────────────────────────
GEMINI_API_KEY = "AIzaSyD6PP6kpVBypMKE1LoozacllpM1NS8Hb0U"
DATA_FILE      = "idioms.xlsx"
MODEL_NAME     = "gemini-2.0-flash"
OUTPUT_FILE    = "predictions_cot_4.xlsx"
MAX_RETRIES    = 5
# ─────────────────────────────────────────────────────────────────────

genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel(MODEL_NAME)

df = pd.read_excel(DATA_FILE).dropna(subset=["Sentences"]).reset_index(drop=True)
preds = []

print(f"🧠 Predicting intents (with CoT) for {len(df)} sentences…")

for i, sent in enumerate(df["Sentences"], start=1):
    prompt = f'''
You are an expert in Punjabi language and culture, specifically adept at understanding and interpreting Punjabi idioms. Your task is to analyze a given Punjabi sentence, identify if it contains an idiom, and then determine the underlying user intent.

Here are a few examples of how to identify intent from Punjabi idiomatic sentences, following a precise step-by-step reasoning process. Pay close attention to the format of the final intent:

---
**Example 1: Complaint/Lament**

**Punjabi Sentence:** 'ਅੱਜ ਕੱਲ੍ਹ ਤਾਂ ਹਰ ਚੀਜ਼ ਅੱਗ ਦੇ ਭਾਅ ਹੋ ਗਈ ਹੈ, ਗਰੀਬ ਲੋਕ ਕੀ ਕਰਨ?'

**Step-by-step Reasoning:**
1.  **Identify Idiom:** The sentence contains the Punjabi idiom "ਅੱਗ ਦੇ ਭਾਅ ਹੋ ਗਈ ਹੈ".
2.  **Literal Meaning:** "ਅੱਗ ਦੇ ਭਾਅ ਹੋ ਗਈ ਹੈ" literally translates to "has become the price of fire."
3.  **Idiomatic Meaning:** The idiomatic meaning of "ਅੱਗ ਦੇ ਭਾਅ ਹੋ ਗਈ ਹੈ" is "to become extremely expensive," "to be unaffordable," or "to have sky-high prices."
4.  **Contextual Interpretation:** Everything has become incredibly costly. The rhetorical question "what should poor people do?" emphasizes the dire impact of these high prices on the poor. The user is conveying a lament about economic hardship and helplessness.
5.  **Determine User Intent:** To complain about high prices and economic hardship

---
**Example 2: Statement of Common Occurrence**

**Punjabi Sentence:** 'ਸਕੂਲੀ ਦਿਨਾਂ ‘ਚ ਕਈ ਵਾਰੀ ਅੱਖਾਂ ਤੇਰ ਲੈਣਾ ਆਮ ਹੈ।'

**Step-by-step Reasoning:**
1.  **Identify Idiom:** The sentence contains the Punjabi idiom "ਅੱਖਾਂ ਤੇਰ ਲੈਣਾ".
2.  **Literal Meaning:** "ਅੱਖਾਂ ਤੇਰ ਲੈਣਾ" literally translates to "to turn one's eyes."
3.  **Idiomatic Meaning:** The idiomatic meaning of "ਅੱਖਾਂ ਤੇਰ ਲੈਣਾ" in this context is "to look away surreptitiously to copy," or "to cheat by glancing at someone else's work."
4.  **Contextual Interpretation:** The phrase "in school days, many times...is common" combined with the idiom about cheating implies that it's a common behavior in school to copy from others during tests. The user is making a general observation about this common practice.
5.  **Determine User Intent:** To describe a common occurrence/behavior

---
**Example 3: Call to Action/Urging**

**Punjabi Sentence:** 'ਅੱਜ ਸਮੇਂ ਲੋਕ ਇਸ ਗੱਲ ਦੀ ਹੈ ਕਿ ਆਪਣੀ ਮਤ-ਭੇਦ ਭੁਲਾ ਕੇ ਦੁਸ਼ਮਣ ਦੇ ਟਾਕਰੇ ਲਈ ਇੱਕ ਮੂੰਹ ਹੋ ਜਾਈਏ।'

**Step-by-step Reasoning:**
1.  **Identify Idiom:** The sentence contains the Punjabi idiom "ਇੱਕ ਮੂੰਹ ਹੋ ਜਾਈਏ".
2.  **Literal Meaning:** "ਇੱਕ ਮੂੰਹ ਹੋ ਜਾਈਏ" literally translates to "become one mouth."
3.  **Idiomatic Meaning:** The idiomatic meaning of "ਇੱਕ ਮੂੰਹ ਹੋ ਜਾਈਏ" is "to be united," "to speak with one voice," or "to act in unison."
4.  **Contextual Interpretation:** The sentence emphasizes the current need for people to forget their differences and become united to confront an enemy. The user is urging for solidarity and collective action against a common threat.
5.  **Determine User Intent:** To urge for unity against an adversary

---

Now, apply the same step-by-step reasoning to identify the intent of the following Punjabi sentence. Provide your response by clearly labeling each step's output. Ensure your final "Determine User Intent" is a single, clear intent label, preferably starting with "To" followed by a verb, aligning with the examples above.

**Punjabi Sentence:** "{sent}"

Let's begin.
'''
    retry, wait = 0, 5

    while True:
        try:
            resp = model.generate_content(prompt)
            reply = resp.text.strip().splitlines()
            # extract after "Prediction:"
            label = next(
                (line.split(":",1)[1].strip()
                 for line in reply
                 if line.lower().startswith("prediction:")),
                None
            ) or reply[-1].strip()

            preds.append(label)
            print(f"[{i}/{len(df)}] ✓ {label}")
            break

        except ResourceExhausted:
            retry += 1
            if retry > MAX_RETRIES:
                print(f"[{i}] ⚠️ Quota still exhausted after {MAX_RETRIES} retries. Defaulting to 'unknown'.")
                preds.append("unknown")
                break
            print(f"[{i}] ❌ Quota exhausted—retry {retry}/{MAX_RETRIES} in {wait}s…")
            time.sleep(wait)
            wait *= 2

        except Exception as e:
            retry += 1
            if retry > MAX_RETRIES:
                print(f"[{i}] ⚠️ Error after {MAX_RETRIES} retries: {e}. Defaulting to 'unknown'.")
                preds.append("unknown")
                break
            print(f"[{i}] ❌ Error: {e} — retry {retry}/{MAX_RETRIES} in {wait}s…")
            time.sleep(wait)
            wait *= 2

    time.sleep(0.8)

df["pred_intent_cot"] = preds
df.to_excel(OUTPUT_FILE, index=False)
print(f"\n✅ Saved CoT predictions to {OUTPUT_FILE}")


🧠 Predicting intents (with CoT) for 36 sentences…
[1/36] ✓ 5.  **Determine User Intent:** To highlight someone's impeccable character
[2/36] ✓ 5.  **Determine User Intent:** To warn against deception and assert awareness.
[3/36] ✓ 5.  **Determine User Intent:** To criticize profiteering during wartime.
[4/36] ✓ 5.  **Determine User Intent:** To describe someone's deceptive behaviour
[5/36] ✓ 5.  **Determine User Intent:** To comment on the reversal of traditional roles
[6/36] ✓ 5.  **Determine User Intent:** To illustrate the consequences of being misguided by someone.
[7/36] ✓ 5.  **Determine User Intent:** To express a resigned acceptance of difficult times, suggesting coping rather than excessive worry.
[8/36] ✓ 5.  **Determine User Intent:** To criticize someone's lack of foresight and their act of abandoning all efforts.
[9/36] ✓ 5.  **Determine User Intent:** To criticize someone's empty promises and incompetence
[10/36] ✓ 5.  **Determine User Intent:** To describe a futile attem

In [8]:


import pandas as pd
import time
import google.generativeai as genai
from google.api_core.exceptions import ResourceExhausted

# ── CONFIG ───────────────────────────────────────────────────────────
GEMINI_API_KEY = "AIzaSyD6PP6kpVBypMKE1LoozacllpM1NS8Hb0U"
DATA_FILE      = "idioms.xlsx"
MODEL_NAME     = "gemini-2.0-flash"
OUTPUT_FILE    = "predictions_cot_7.xlsx"
MAX_RETRIES    = 5
# ─────────────────────────────────────────────────────────────────────

genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel(MODEL_NAME)

df = pd.read_excel(DATA_FILE).dropna(subset=["Sentences"]).reset_index(drop=True)
preds = []

print(f"🧠 Predicting intents (with CoT) for {len(df)} sentences…")

for i, sent in enumerate(df["Sentences"], start=1):
    prompt = f'''
You are an expert in Punjabi language and culture, specifically adept at understanding and interpreting Punjabi idioms. Your task is to analyze a given Punjabi sentence, first by mastering the idiom's meaning, and then accurately determining the user's intent.

Follow these two distinct phases for each sentence:

---
**Example Analysis Process:**

**Punjabi Sentence:** 'ਅੱਜ ਕੱਲ੍ਹ ਤਾਂ ਹਰ ਚੀਜ਼ ਅੱਗ ਦੇ ਭਾਅ ਹੋ ਗਈ ਹੈ, ਗਰੀਬ ਲੋਕ ਕੀ ਕਰਨ?'

**Phase 1: Idiom Comprehension**
1.  **Identify Idiom:** "ਅੱਗ ਦੇ ਭਾਅ ਹੋ ਗਈ ਹੈ"
2.  **Literal Meaning:** "has become the price of fire."
3.  **Idiomatic Meaning & Cultural Context:** This idiom means prices have become extremely expensive or unaffordable. It implies a significant increase in cost that is frustrating and burdensome, especially for the poor.

**Phase 2: Intent Determination**
4.  **Contextual Interpretation:** The sentence uses the idiom to express that everything is now prohibitively expensive. The follow-up question highlights the plight of the poor, signaling deep concern and dissatisfaction with current economic conditions.
5.  **Determine User Intent:** To complain about high prices and economic hardship

---

Now, apply this two-phase analysis to the following Punjabi sentence. Provide your response by clearly labeling each step's output. Ensure your final "Determine User Intent" is a single, concise intent label, starting with "To" followed by a verb.

**Punjabi Sentence:** "{sent}"

Let's begin.
'''
    retry, wait = 0, 5

    while True:
        try:
            resp = model.generate_content(prompt)
            reply = resp.text.strip().splitlines()
            # extract after "Prediction:"
            label = next(
                (line.split(":",1)[1].strip()
                 for line in reply
                 if line.lower().startswith("prediction:")),
                None
            ) or reply[-1].strip()

            preds.append(label)
            print(f"[{i}/{len(df)}] ✓ {label}")
            break

        except ResourceExhausted:
            retry += 1
            if retry > MAX_RETRIES:
                print(f"[{i}] ⚠️ Quota still exhausted after {MAX_RETRIES} retries. Defaulting to 'unknown'.")
                preds.append("unknown")
                break
            print(f"[{i}] ❌ Quota exhausted—retry {retry}/{MAX_RETRIES} in {wait}s…")
            time.sleep(wait)
            wait *= 2

        except Exception as e:
            retry += 1
            if retry > MAX_RETRIES:
                print(f"[{i}] ⚠️ Error after {MAX_RETRIES} retries: {e}. Defaulting to 'unknown'.")
                preds.append("unknown")
                break
            print(f"[{i}] ❌ Error: {e} — retry {retry}/{MAX_RETRIES} in {wait}s…")
            time.sleep(wait)
            wait *= 2

    time.sleep(0.8)

df["pred_intent_cot"] = preds
df.to_excel(OUTPUT_FILE, index=False)
print(f"\n✅ Saved CoT predictions to {OUTPUT_FILE}")


🧠 Predicting intents (with CoT) for 36 sentences…
[1/36] ✓ 5.  **Determine User Intent:** To praise someone's flawless character.
[2/36] ✓ 5.  **Determine User Intent:** To warn Raj to stop being deceptive.
[3/36] ✓ 5.  **Determine User Intent:** To express disapproval of war profiteering.
[4/36] ✓ 5.  **Determine User Intent:** To criticize Ram's deceitful behavior
[5/36] ✓ 5.  **Determine User Intent:** To comment on changing gender roles in society.
[6/36] ✓ 5.  **Determine User Intent:** To express severe disapproval and criticize Dippo for misleading the Pandit and causing major failures.
[7/36] ✓ 5.  **Determine User Intent:** To express resignation and helplessness in the face of rising inflation.
[8/36] ✓ 5.  **Determine User Intent:** To criticize someone for acting foolishly and losing opportunities.
[9/36] ✓ 5.  **Determine User Intent:** To criticize someone's boastful but ultimately useless nature.
[10/36] ✓ 5.  **Determine User Intent:** To express futility and frustratio

In [3]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
from scipy.stats import ttest_rel

# ── CONFIG ────────────────────────────────────────────────────────────────
GROUND_TRUTH_FILE    = "idioms.xlsx"      # must have columns: Sentences, real_meanings
NO_COT_FILE          = "predictions_no_cot.xlsx" 
COT_FILE             = "predictions_cot.xlsx"
OUTPUT_DEVIATIONS    = "deviations.xlsx"
SBERT_MODEL_NAME     = "all-mpnet-base-v2"  # or any SBERT model
# ──────────────────────────────────────────────────────────────────────────

# 1️⃣ Load and clean data
df_truth = pd.read_excel(GROUND_TRUTH_FILE).dropna(subset=["Sentences", "real_meanings"])
df_no    = pd.read_excel(NO_COT_FILE).dropna(subset=["Sentences", "pred_intent_no_cot"])
df_co    = pd.read_excel(COT_FILE).dropna(subset=["Sentences", "pred_intent_cot"])

# 2️⃣ Merge data on Sentences
df = (
    df_truth
    .merge(df_no[["Sentences", "pred_intent_no_cot"]], on="Sentences")
    .merge(df_co[["Sentences", "pred_intent_cot"]], on="Sentences")
)

# 3️⃣ Load SBERT model
print("🔄 Loading SBERT model...")
model = SentenceTransformer(SBERT_MODEL_NAME)

# 4️⃣ Create embeddings
print("🧠 Generating embeddings...")
real_texts = df["real_meanings"].tolist()
no_texts   = df["pred_intent_no_cot"].tolist()
co_texts   = df["pred_intent_cot"].tolist()

emb_real = model.encode(real_texts, convert_to_numpy=True, normalize_embeddings=True)
emb_no   = model.encode(no_texts,   convert_to_numpy=True, normalize_embeddings=True)
emb_co   = model.encode(co_texts,   convert_to_numpy=True, normalize_embeddings=True)

# 5️⃣ Compute cosine distances
dist_no = [cosine(r, n) for r, n in zip(emb_real, emb_no)]
dist_co = [cosine(r, c) for r, c in zip(emb_real, emb_co)]

# 6️⃣ Store distances in dataframe
df["dist_no_cot"] = dist_no
df["dist_cot"]    = dist_co

# 7️⃣ Compute and display means
mean_no = np.mean(dist_no)
mean_co = np.mean(dist_co)

print(f"\n🎯 Mean cosine distance (no-CoT): {mean_no:.4f}")
print(f"🎯 Mean cosine distance ( CoT ): {mean_co:.4f}")

# 8️⃣ Save detailed deviations
df.to_excel(OUTPUT_DEVIATIONS, index=False)
print(f"✅ Saved detailed deviations to '{OUTPUT_DEVIATIONS}'")

# 9️⃣ Additional metrics
std_no = np.std(dist_no)
std_co = np.std(dist_co)

median_no = np.median(dist_no)
median_co = np.median(dist_co)

t_stat, p_value = ttest_rel(dist_no, dist_co)

print("\n📊 Additional Metrics:")
print(f"Standard Deviation (no-CoT): {std_no:.4f}")
print(f"Standard Deviation ( CoT ): {std_co:.4f}")
print(f"Median Cosine Distance (no-CoT): {median_no:.4f}")
print(f"Median Cosine Distance ( CoT ): {median_co:.4f}")
print(f"\n📈 Paired t-test: t = {t_stat:.4f}, p = {p_value:.4f}")

if p_value < 0.05:
    print("✅ The difference is statistically significant (p < 0.05).")
else:
    print("⚠️ No statistically significant difference (p ≥ 0.05).")


🔄 Loading SBERT model...
🧠 Generating embeddings...

🎯 Mean cosine distance (no-CoT): 0.6518
🎯 Mean cosine distance ( CoT ): 0.5651
✅ Saved detailed deviations to 'deviations.xlsx'

📊 Additional Metrics:
Standard Deviation (no-CoT): 0.1404
Standard Deviation ( CoT ): 0.1449
Median Cosine Distance (no-CoT): 0.6593
Median Cosine Distance ( CoT ): 0.5828

📈 Paired t-test: t = 7.5063, p = 0.0000
✅ The difference is statistically significant (p < 0.05).


In [6]:
# 🔍 🔟 Find sentences where CoT performs worse (higher distance)
worse_with_cot = df[df["dist_cot"] > df["dist_no_cot"]]

# Print or save those examples
print(f"\n🔍 Sentences where CoT performed worse than no-CoT: {len(worse_with_cot)}")
print(worse_with_cot[["Sentences", "real_meanings", "pred_intent_no_cot", "pred_intent_cot", "dist_no_cot", "dist_cot"]])

# Optionally save them to a separate Excel file
worse_with_cot.to_excel("cot_worse_cases.xlsx", index=False)
print("📁 Saved worse CoT cases to 'cot_worse_cases.xlsx'")



🔍 Sentences where CoT performed worse than no-CoT: 42
                                             Sentences  \
0    ਸੀਤਾ ਦਾ ਆਚਰਨ ਇੰਨਾ ਬੇਦਾਗ ਹੈ ਕਿ ਕੋਈ ਉਸ ਵੱਲ ਉੰਗਲੀ...   
7    ਸੋਚ ਦੀ ਅਕਲ ਤੇ ਪਤਾ ਨਹੀਂ ਲੱਗਿਆ—ਉਸ ਨੇ ਸਾਰੇ ਫੰਦੇ ਛ...   
18   ਅੱਜ ਕੱਲ੍ਹ ਤਾਂ ਹਰ ਚੀਜ਼ ਅੱਗ ਦੇ ਭਾਅ ਹੋ ਗਈ ਹੈ, ਗਰੀ...   
19    ਇਮਤਿਹਾਨ ਤੋਂ ਦੋ ਦਿਨ ਪਹਿਲਾਂ ਉਹ ਪੜ੍ਹਾਈ ਕਰਨ ਬੈਠਾ,...   
20   ਮਈ–ਜੂਨ ਦੇ ਮਹੀਨਿਆਂ ਵਿੱਚ ਬਾਹਰ ਨਿਕਲੋ ਤਾਂ ਅੰਗ ਵਡੂਹ...   
21   ਇੱਕ ਕਰਮਚਾਰੀ ਦੀ ਕੀ ਸ਼ਿਕਾਇਤ ਕਰਨ, ਇੱਥੇ ਤਾਂ ਆਵਾ ਹੀ...   
26   ਕੁਝ ਸਾਲ ਪਹਿਲਾਂ ਚੀਨ ਤੇ ਅਮਰੀਕਾ ਦਾ ਆਪਸ ਵਿੱਚ ਇੱਕ ਕ...   
27   ਦੋ ਹਾਥੇ ਨਿਕਲ ਜਾਣ ਨਾਲ ਡੋਲੇ ਰਾਮ ਦੀ ਸਾਰੇ ਮੁਹੱਲੇ ਵ...   
32       "ਛੋਟੇ ਬਾਲ ਅਪਣੀਆਂ ਗੱਲਾਂ ਨਾਲ ਮਾਪਿਆਂ ਦਾ ਸਿਰ ਖ...   
37   ਗੁਰਨਾਮ ਬਹੁਤ ਚਲਾਕ ਹੈ।ਉਸੇ ਨਾਲ ਕੋਈ ਉਸਤਾਦੀ ਨਹੀਂ ਕਰ...   
38     ਸਬੂਤ ਤੋਂ ਬਿਨਾਂ ਕਿਸੇ ਵੱਲ ਉੱਗਲੀ ਨਹੀਂ ਕਰਨੀ ਚਾਹੀਦੀ।   
39     ਗੁਰਪ੍ਰੀਤੇ ਅਤੇ ਹਰਪ੍ਰੀਤ ਵਿੱਚ ਉੱਨੀ-ਇੱਕੀ ਦਾ ਫਰਕ ਹੈ।   
44    ਸਾਂਨੂੰ ਕਿਸੇ ਨੂੰ ਵੀ ਉੱਚਾ-ਨੀਵਾਂ ਨਹੀਂ ਬੋਲਣਾ ਚਾਹਿਦਾ।   
46   ਅੰਗਰੇਜ਼ਾਂ ਨੇ ਭਾਰਤ ਦੇ ਵਾਸੀਆਂ ਨੂੰ ਪੂਰੇ ਦੋ ਸੋ ਸਾਲਾ...   
49      ਚੀਨ ਹਰ ਸਮੇਂ ਭਾਰਤ ਨੂੰ ਅੱਖਾਂ ਦਿਖਾਉਂਦਾ ਰਹਿੰਦਾ ਹੈ।   
51   ਕਿਸੇ ਦੇ ਅਬਾ 

In [4]:
from sklearn.metrics import f1_score
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

import nltk
nltk.download('punkt')  # Ensure tokenizers are available

# Tokenize
df["real_tokens"] = df["real_meanings"].apply(word_tokenize)
df["no_tokens"]   = df["pred_intent_no_cot"].apply(word_tokenize)
df["co_tokens"]   = df["pred_intent_cot"].apply(word_tokenize)

# Define F1 calculation between two token lists
def token_f1(reference, prediction):
    common = set(reference) & set(prediction)
    if not reference or not prediction:
        return 0.0
    precision = len(common) / len(prediction)
    recall = len(common) / len(reference)
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

# Apply
df["f1_no_cot"] = [token_f1(r, p) for r, p in zip(df["real_tokens"], df["no_tokens"])]
df["f1_cot"]    = [token_f1(r, p) for r, p in zip(df["real_tokens"], df["co_tokens"])]

# Mean F1
mean_f1_no = np.mean(df["f1_no_cot"])
mean_f1_co = np.mean(df["f1_cot"])

print(f"🔍 Mean F1 Score (no-CoT): {mean_f1_no:.4f}")
print(f"🔍 Mean F1 Score ( CoT ): {mean_f1_co:.4f}")


🔍 Mean F1 Score (no-CoT): 0.0979
🔍 Mean F1 Score ( CoT ): 0.0845


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
smooth_fn = SmoothingFunction().method1

# BLEU scores (unigram + bigram recommended for short text)
df["bleu_no_cot"] = [
    sentence_bleu([ref], pred, weights=(0.5, 0.5), smoothing_function=smooth_fn)
    for ref, pred in zip(df["real_tokens"], df["no_tokens"])
]

df["bleu_cot"] = [
    sentence_bleu([ref], pred, weights=(0.5, 0.5), smoothing_function=smooth_fn)
    for ref, pred in zip(df["real_tokens"], df["co_tokens"])
]

# Mean BLEU
mean_bleu_no = np.mean(df["bleu_no_cot"])
mean_bleu_co = np.mean(df["bleu_cot"])

print(f"📘 Mean BLEU Score (no-CoT): {mean_bleu_no:.4f}")
print(f"📘 Mean BLEU Score ( CoT ): {mean_bleu_co:.4f}")


📘 Mean BLEU Score (no-CoT): 0.0266
📘 Mean BLEU Score ( CoT ): 0.0214
