In [None]:
!pip install -q transformers accelerate

In [None]:
from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import torch

# Checkpoint used for the first round of experiments.
model_id = "microsoft/phi-2"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# The "text-generation" pipeline below needs a causal language model, so the
# checkpoint is loaded through AutoModelForCausalLM.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    # float16 when a CUDA device is available, float32 otherwise.
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
)

# NOTE(review): transformers only applies `temperature` when sampling is
# enabled (do_sample=True); as written decoding is presumably greedy and the
# value is ignored with a warning - confirm which behaviour is intended.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,
    temperature=0.3,
)



In [None]:
# Evaluation questions for the prompting comparison. The answer key in
# `ground_truth` is matched to these by position, so keep both in the same order.
questions = [
    "If Alice is older than Bob, and Bob is older than Charlie, who is the youngest?",
    "A train travels 60 km/h for 3 hours. How far does it go?",
    "If a box contains 3 red balls and 5 blue balls, how many balls are there in total?",
    "Tom has twice as many apples as Jerry. Jerry has 3 apples. How many apples does Tom have?",
    "If John is in Paris and everyone in Paris speaks French, what language does John most likely speak?"
]


Define prompt templates:

In [None]:
# Few-shot Chain of Thought Prompt: two worked examples whose answers spell out
# the reasoning before giving the result.
# NOTE(review): the second example is essentially the first entry of
# `questions`, so that question is not an unseen test item - confirm intended.
few_shot_cot = """Q: If there are 2 pens and each costs $3, how much in total?
A: Each pen costs $3. There are 2 pens. So 2 × 3 = $6. The answer is 6.

Q: Alice is older than Bob. Bob is older than Charlie. Who is the youngest?
A: Alice > Bob > Charlie. So Charlie is the youngest."""

# Few-shot No-CoT Prompt: the same two examples, answered with the bare result
# only (no reasoning shown).
few_shot_nocot = """Q: If there are 2 pens and each costs $3, how much in total?
A: 6

Q: Alice is older than Bob. Bob is older than Charlie. Who is the youngest?
A: Charlie"""


In [None]:
import pandas as pd

# Ask every question with each of the three prompting strategies and collect
# the model's answers, one row per question.
results = []

for q in questions:
  # Every prompt ends with the "A:" cue so the model continues with an answer.
  # The keys double as DataFrame column names and match what the evaluation
  # cell reads ("Few-shot CoT", "Zero-shot CoT", "Few-shot No-CoT").
  prompts = {
      "Few-shot CoT": few_shot_cot + f"\nQ: {q}\nA:",
      "Zero-shot CoT": f"Q: {q} Let's think step by step.\nA:",
      "Few-shot No-CoT": few_shot_nocot + f"\nQ: {q}\nA:",
  }

  row = {"Question": q}
  for strategy, prompt in prompts.items():
    # return_full_text=False returns only the continuation, without the prompt.
    completion = pipe(prompt, return_full_text=False)[0]["generated_text"]
    # The model may keep going and invent further "Q: ... A: ..." pairs;
    # everything from the next question onwards is not part of this answer.
    row[strategy] = completion.split("\nQ:")[0].strip()

  results.append(row)

# Build the table once, after all questions have been answered.
df = pd.DataFrame(results)

In [None]:
from IPython.display import display
# Lift pandas' column-width limit so the generated text is not truncated.
# This changes the option for the rest of the session, not just this cell.
pd.set_option('display.max_colwidth', None)
display(df)


In [None]:
# Expected final answer for each entry of `questions`, in the same order; the
# evaluation loop looks these up by the row index of `df`.
ground_truth = [
    "Charlie",     # youngest
    "180",         # 60 × 3
    "8",           # 3 red + 5 blue
    "6",           # 3 × 2
    "French"       # inference
]

import re

def extract_final_answer(text):
    """Pick the final answer token out of a generated answer.

    Commas are removed first (so "1,000" reads as "1000"). The result is the
    last token that is either a capitalised word (one uppercase ASCII letter
    followed by lowercase letters) or a number with an optional decimal part.
    If the text contains no such token, the comma-free text is returned with
    surrounding whitespace stripped.
    """
    without_commas = text.replace(",", "")
    token_pattern = re.compile(r"\b([A-Z][a-z]+|\d+(?:\.\d+)?)\b")

    last_token = None
    for found in token_pattern.finditer(without_commas):
        last_token = found.group(1)

    if last_token is None:
        return without_commas.strip()
    return last_token

# Per-strategy tally of answers whose extracted final token equals the key
# (compared case-insensitively).
correct_cot = correct_zscot = correct_nocot = 0

for i, row in df.iterrows():
    # The row index doubles as the position in the answer key.
    expected = ground_truth[i].strip().lower()

    predicted_cot = extract_final_answer(row["Few-shot CoT"]).lower()
    predicted_zscot = extract_final_answer(row["Zero-shot CoT"]).lower()
    predicted_nocot = extract_final_answer(row["Few-shot No-CoT"]).lower()

    # int(True) == 1, so each counter goes up by one on a match.
    correct_cot += int(predicted_cot == expected)
    correct_zscot += int(predicted_zscot == expected)
    correct_nocot += int(predicted_nocot == expected)

# Report hits out of the number of evaluated rows, plus the percentage.
total = len(df)
print(f"\n Evaluation on {total} questions:\n")
print(f"Few-shot CoT Accuracy       : {correct_cot}/{total} ({correct_cot/total:.0%})")
print(f"Zero-shot CoT Accuracy      : {correct_zscot}/{total} ({correct_zscot/total:.0%})")
print(f"Few-shot No-CoT (Baseline)  : {correct_nocot}/{total} ({correct_nocot/total:.0%})")




Adding more examples

In [None]:
# Expanded few-shot Chain of Thought prompt: ten worked examples, replacing the
# two-example `few_shot_cot` defined earlier in the notebook.
# NOTE(review): some examples duplicate evaluation items (the train question is
# verbatim in `questions`), which may inflate few-shot accuracy - confirm intended.
few_shot_cot = """Q: If there are 2 pens and each costs $3, how much in total?
A: Each pen costs $3. There are 2 pens. So 2 × 3 = $6. The answer is 6.

Q: Alice is older than Bob. Bob is older than Charlie. Who is the youngest?
A: Alice > Bob > Charlie. So Charlie is the youngest.

Q: A train travels 60 km/h for 3 hours. How far does it go?
A: The train moves 60 km each hour. 60 × 3 = 180. The answer is 180.

Q: A box has 4 red balls and 5 green balls. How many total balls are there?
A: 4 red + 5 green = 9 balls. The answer is 9.

Q: Sarah has 7 candies. She eats 2. How many are left?
A: 7 − 2 = 5. The answer is 5.

Q: A chair costs $15. You buy 2. How much do you spend?
A: 2 × $15 = $30. The answer is 30.

Q: Mike is taller than Tom. Tom is taller than Jim. Who is the shortest?
A: Mike > Tom > Jim. So Jim is the shortest. The answer is Jim.

Q: There are 3 rows of desks. Each row has 5 desks. How many desks total?
A: 3 × 5 = 15. The answer is 15.

Q: If a pie has 8 slices and you eat 3, how many are left?
A: 8 − 3 = 5. The answer is 5.

Q: John has 4 apples. His friend gives him 3 more. How many apples total?
A: 4 + 3 = 7. The answer is 7."""


# Expanded few-shot No-CoT prompt: the same ten questions as the expanded CoT
# prompt, each answered with the bare result only (the baseline condition).
few_shot_nocot = """Q: If there are 2 pens and each costs $3, how much in total?
A: 6

Q: Alice is older than Bob. Bob is older than Charlie. Who is the youngest?
A: Charlie

Q: A train travels 60 km/h for 3 hours. How far does it go?
A: 180

Q: A box has 4 red balls and 5 green balls. How many total balls are there?
A: 9

Q: Sarah has 7 candies. She eats 2. How many are left?
A: 5

Q: A chair costs $15. You buy 2. How much do you spend?
A: 30

Q: Mike is taller than Tom. Tom is taller than Jim. Who is the shortest?
A: Jim

Q: There are 3 rows of desks. Each row has 5 desks. How many desks total?
A: 15

Q: If a pie has 8 slices and you eat 3, how many are left?
A: 5

Q: John has 4 apples. His friend gives him 3 more. How many apples total?
A: 7"""


In [None]:
import pandas as pd

# Re-run the comparison with the expanded few-shot prompts: one row per
# question, one column per prompting strategy.
results = []

for q in questions:
    # Every prompt ends with the "A:" cue so the model continues with an answer.
    # The keys double as DataFrame column names.
    prompts = {
        "Few-shot CoT": few_shot_cot + f"\nQ: {q}\nA:",
        "Zero-shot CoT": f"Q: {q} Let's think step by step.\nA:",
        "Few-shot No-CoT": few_shot_nocot + f"\nQ: {q}\nA:",
    }

    row = {"Question": q}
    for strategy, prompt in prompts.items():
        # return_full_text=False returns only the continuation, without the prompt.
        completion = pipe(prompt, return_full_text=False)[0]["generated_text"]
        # The model may keep going and invent further "Q: ... A: ..." pairs;
        # everything from the next question onwards is not part of this answer.
        row[strategy] = completion.split("\nQ:")[0].strip()

    results.append(row)

df = pd.DataFrame(results)


In [None]:
from IPython.display import display
# Lift pandas' column-width limit so the generated text is not truncated.
# This changes the option for the rest of the session, not just this cell.
pd.set_option('display.max_colwidth', None)
display(df)

In [None]:
#  Install required libraries
!pip install -q transformers accelerate

#  Load OpenChat 3.5 Model
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch, re, pandas as pd

# Checkpoint for the second experiment; `tokenizer`, `model` and `pipe` are
# rebound to it from here on.
model_id = "openchat/openchat-3.5-1210"

# float16 when a CUDA device is available, float32 otherwise.
dtype = torch.float16 if torch.cuda.is_available() else torch.float32

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=dtype)

# Generation settings applied to every call made through `pipe`.
generation_settings = {"max_new_tokens": 128, "temperature": 0.3}
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, **generation_settings)

# Ten mixed logical and arithmetic evaluation questions. `ground_truth` below is
# matched to these by position, so keep both lists in the same order.
questions = [
    "If Alice is older than Bob, and Bob is older than Charlie, who is the youngest?",
    "A train travels 60 km/h for 3 hours. How far does it go?",
    "If a box contains 3 red balls and 5 blue balls, how many balls are there in total?",
    "Tom has twice as many apples as Jerry. Jerry has 3 apples. How many apples does Tom have?",
    "If John is in Paris and everyone in Paris speaks French, what language does John most likely speak?",
    "If a car has 4 wheels, how many wheels do 6 cars have?",
    "Sarah has 3 pencils. She buys 4 more. How many pencils does she have now?",
    "Bob is taller than Sam. Sam is taller than Mike. Who is the shortest?",
    "There are 5 rows of chairs. Each row has 6 chairs. How many chairs are there in total?",
    "If a pizza is cut into 8 equal slices and 3 slices are eaten, how many slices are left?"
]

# Expected final answer for each question above, as strings; the scoring loop
# compares them case-insensitively against the extracted answer.
ground_truth = ["Charlie", "180", "8", "6", "French", "24", "7", "Mike", "30", "5"]

# Few-shot Chain of Thought prompt: ten worked examples, one Q/A pair per two
# lines with no blank line between pairs.
# NOTE(review): some examples duplicate evaluation items (the train question is
# verbatim in `questions`), which may inflate few-shot accuracy - confirm intended.
few_shot_cot = """Q: If there are 2 pens and each costs $3, how much in total?
A: Each pen costs $3. There are 2 pens. So 2 × 3 = $6. The answer is 6.
Q: Alice is older than Bob. Bob is older than Charlie. Who is the youngest?
A: Alice > Bob > Charlie. So Charlie is the youngest.
Q: A train travels 60 km/h for 3 hours. How far does it go?
A: The train moves 60 km each hour. 60 × 3 = 180. The answer is 180.
Q: A box has 4 red balls and 5 green balls. How many total balls are there?
A: 4 red + 5 green = 9 balls. The answer is 9.
Q: Sarah has 7 candies. She eats 2. How many are left?
A: 7 − 2 = 5. The answer is 5.
Q: A chair costs $15. You buy 2. How much do you spend?
A: 2 × $15 = $30. The answer is 30.
Q: Mike is taller than Tom. Tom is taller than Jim. Who is the shortest?
A: Mike > Tom > Jim. So Jim is the shortest. The answer is Jim.
Q: There are 3 rows of desks. Each row has 5 desks. How many desks total?
A: 3 × 5 = 15. The answer is 15.
Q: If a pie has 8 slices and you eat 3, how many are left?
A: 8 − 3 = 5. The answer is 5.
Q: John has 4 apples. His friend gives him 3 more. How many apples total?
A: 4 + 3 = 7. The answer is 7."""

# Few-shot No-CoT prompt: the same ten questions as the CoT prompt, each
# answered with the bare result only (the baseline condition).
few_shot_nocot = """Q: If there are 2 pens and each costs $3, how much in total?
A: 6
Q: Alice is older than Bob. Bob is older than Charlie. Who is the youngest?
A: Charlie
Q: A train travels 60 km/h for 3 hours. How far does it go?
A: 180
Q: A box has 4 red balls and 5 green balls. How many total balls are there?
A: 9
Q: Sarah has 7 candies. She eats 2. How many are left?
A: 5
Q: A chair costs $15. You buy 2. How much do you spend?
A: 30
Q: Mike is taller than Tom. Tom is taller than Jim. Who is the shortest?
A: Jim
Q: There are 3 rows of desks. Each row has 5 desks. How many desks total?
A: 15
Q: If a pie has 8 slices and you eat 3, how many are left?
A: 5
Q: John has 4 apples. His friend gives him 3 more. How many apples total?
A: 7"""

# Inference + evaluation: rows are accumulated here, one dict per question.
results = []

def extract_final_answer(text):
    """Return the last answer-like token in ``text``.

    Commas are dropped first, then the text is scanned for capitalised words
    (an uppercase ASCII letter followed by lowercase letters) and numbers with
    an optional decimal part. The last one found is returned; when there is
    none, the comma-free text is returned with surrounding whitespace removed.
    """
    cleaned = text.replace(",", "")
    candidates = [
        hit.group(1)
        for hit in re.finditer(r"\b([A-Z][a-z]+|\d+(?:\.\d+)?)\b", cleaned)
    ]
    if not candidates:
        return cleaned.strip()
    return candidates[-1]

# For each question: query the model with all three prompting strategies,
# store the raw answers, and mark each as correct when its extracted final
# token equals the ground truth (case-insensitive).
for i, q in enumerate(questions):
    gt = ground_truth[i].strip().lower()

    # Every prompt ends with the "A:" cue so the model continues with an answer.
    prompts = {
        "Few-shot CoT": few_shot_cot + f"\nQ: {q}\nA:",
        "Zero-shot CoT": f"Q: {q} Let's think step by step.\nA:",
        "Few-shot No-CoT": few_shot_nocot + f"\nQ: {q}\nA:",
    }

    outputs = {}
    for strategy, prompt in prompts.items():
        # return_full_text=False returns only the continuation, without the prompt.
        completion = pipe(prompt, return_full_text=False)[0]["generated_text"]
        # The model may keep going and invent further "Q: ... A: ..." pairs;
        # everything from the next question onwards is not part of this answer.
        outputs[strategy] = completion.split("\nQ:")[0].strip()

    results.append({
        "Question": q,
        "Ground Truth": ground_truth[i],
        **outputs,
        "Correct CoT": extract_final_answer(outputs["Few-shot CoT"]).lower() == gt,
        "Correct ZS-CoT": extract_final_answer(outputs["Zero-shot CoT"]).lower() == gt,
        "Correct No-CoT": extract_final_answer(outputs["Few-shot No-CoT"]).lower() == gt
    })

# Show the full results table; the column-width limit is lifted so the
# generated text is not truncated.
df = pd.DataFrame(results)
pd.set_option('display.max_colwidth', None)
display(df)

# Accuracy summary: correct answers per strategy out of the number of
# evaluated questions (taken from the table rather than hard-coded).
total = len(df)
print("\n Accuracy Summary:")
print(f"Few-shot CoT       : {df['Correct CoT'].sum()}/{total}")
print(f"Zero-shot CoT      : {df['Correct ZS-CoT'].sum()}/{total}")
print(f"Few-shot No-CoT    : {df['Correct No-CoT'].sum()}/{total}")
