# Zero Shot v/s Few Shot performance analysis

In [1]:
import os

os.environ['HF_HOME'] = 'E:/Models/hf_cache'
os.environ['HUGGINGFACE_HUB_CACHE'] = 'E:/Models/hf_cache'

In [2]:
import pandas as pd

aqua_train_path = 'E:/Data/AQuA Dataset/train.json'

df = pd.read_json(aqua_train_path, lines = True)
df = df.head(5000)

In [3]:
df.head()

Unnamed: 0,question,options,rationale,correct
0,"Two friends plan to walk along a 43-km trail, ...","[A)21, B)21.5, C)22, D)22.5, E)23]","If Q complete x kilometers, then P completes 1...",E
1,"In the coordinate plane, points (x, 1) and (5,...","[A)4 and 1, B)1 and 5, C)5 and 1, D)3 and 5, E...",Line k passes through the origin and has slope...,C
2,"For all numbers p and q, the operation @ is de...","[A)II, B)I and II, C)I and III, D)II and III, ...",p@q = p^2 - pq=p(p-q).... so p@q will be zero ...,B
3,Carl is facing very difficult financial times ...,"[A)$1600, B)$2000, C)$2150, D)$2500, E)$12000]","Usually, you are given the annual rate of inte...",A
4,The speed at which a man can row a boat in sti...,"[A)18 seconds, B)27 seconds, C)26 seconds, D)1...",Speed of the boat downstream = 25 +11\n= 36 km...,E


In [4]:
eg = df['options'][0]
print(eg)

['A)21', 'B)21.5', 'C)22', 'D)22.5', 'E)23']


In [5]:
eg = ' '.join(eg)
print(eg)

A)21 B)21.5 C)22 D)22.5 E)23


In [19]:
from sklearn.metrics import accuracy_score
def solve_rate(true_labels, predictions):
    """Calculates accuracy, correctly handling None as incorrect."""
    if not true_labels:
        return 0.0
    correct_count = sum(1 for p, t in zip(predictions, true_labels) if p == t)
    return correct_count / len(true_labels)

In [7]:
# Zero-shot prompt template
def zero_shot_prompt(question, options):
    options_str = ' '.join(options)
    return f"""Find the correct option from the choices that is the answer to the question.

Question: '''{question}'''

Choices:
'''{options_str}'''

Provide your final answer in the format:
Answer: The answer is \"(option letter).\"
"""

In [8]:
# Few-shot prompt template (with 4 examples)
def few_shot_prompt(question, options):
    options_str = " ".join(options)
    return [
        {
            "role": "user",
            "content": f"""Find the correct option from the choices that is the answer to the question.

Question: John found that the average of 15 numbers is 40. If 10 is added to each number then the mean of the numbers is?\
Answer Choices: (a) 50 (b) 45 (c) 65 (d) 78 (e) 64
Answer: The answer is (a).

Question: If a / b = 3/4 and 8a + 5b = 22, then find the value of a.
Answer Choices: (a) 1/2 (b) 3/2 (c) 5/2 (d) 4/2 (e) 7/2
Answer: The answer is (b).

Question: A person is traveling at 20 km/hr and reached his destiny in 2.5 hr then find the distance?
Answer Choices: (a) 53 km (b) 55 km (c) 52 km (d) 60 km (e) 50 km
Answer: The answer is (e).

Question: '''{question}'''
Answer Choices: '''{options_str}'''
"""
        }
    ]

In [9]:
import re
def extract_answer(text):
    patterns = [
        r"[Tt]he answer is:?\s*\(([a-eA-E])\)",  # Standard format
        r"[Tt]he correct answer is:?\s*\(([a-eA-E])\)", # alternative
        r"[Aa]nswer:?\s*\(([a-eA-E])\)",          # Alternative format
        r"\(([a-eA-E])\)[.,;]?\s*$",              # Answer at end of text
        r"[Tt]herefore,?\s*(?:the answer is)?\s*\(([a-eA-E])\)",  # With "therefore"
        r"[Tt]he correct answer is option\s*([a-eA-E])",  # "The correct answer is option B"
        r"[Tt]he correct answer is \s*([a-eA-E])",        # "The correct answer is B."
        r"[Tt]he answer is \s*([a-eA-E])",                # "The answer is B."
        r"[Tt]he answer is option\s*([a-eA-E])",          # "The answer is option B"
        r"option\s+([a-eA-E])\s*(?:is correct|\.|$)",     # "option B is correct" or "option B."
        r"[Aa]nswer:\s*option\s*([a-eA-E])",              # "Answer: option B"
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group(1).upper()
    
    # Last resort: find any isolated letter in parentheses near the end
    matches = re.findall(r"\(([a-eA-E])\)", text)
    if matches:
        return matches[-1].upper()  # Return the last match
    
    return None

In [10]:
# Generate model predictions for StableLM-3B-4eit
def get_model_response(model, tokenizer, prompt, device):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.1,
        top_p=0.9
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [11]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name = "stabilityai/stablelm-3b-4e1t" 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using model: {model_name} on device: {device}")

tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=nf4_config,
    device_map="auto"
)
print(f"Model '{model_name}' loaded successfully.")

Using model: stabilityai/stablelm-3b-4e1t on device: cuda
Model 'stabilityai/stablelm-3b-4e1t' loaded successfully.


In [None]:
predictions = []
true_labels = []

for i, (index, row) in enumerate(df.iterrows()):
        question = row['question']
        options = row['options']
        correct_label = row['correct']
        
        prompt = zero_shot_prompt(question, options)
        full_response = get_model_response(model, tokenizer, prompt, device)
        predicted_answer = extract_answer(full_response)
        
        print(f"{full_response}\n")
        print(f"({i+1}/{len(df)}) Predicted: {predicted_answer} | Correct: {correct_label}")
        print("-" * 50 + "\n\n")
        
        predictions.append(predicted_answer)
        true_labels.append(correct_label)
    

print(f"\nEvaluation complete.")

In [15]:
predictions

['D',
 'B',
 'B',
 'D',
 None,
 None,
 None,
 'D',
 'D',
 'C',
 'B',
 None,
 None,
 'B',
 None,
 'C',
 'B',
 'C',
 'D',
 'B',
 None,
 None,
 None,
 None,
 'E',
 'B',
 None,
 None,
 None,
 None,
 'D',
 'B',
 'D',
 'B',
 None,
 'B',
 None,
 'B',
 None,
 None,
 'D',
 None,
 'D',
 None,
 None,
 None,
 None,
 'D',
 None,
 'D',
 None,
 'C',
 None,
 None,
 'E',
 None,
 None,
 'B',
 None,
 'D',
 None,
 None,
 'B',
 None,
 'A',
 None,
 None,
 None,
 None,
 'B',
 None,
 None,
 'B',
 'E',
 None,
 None,
 'B',
 None,
 'B',
 None,
 None,
 None,
 'D',
 None,
 'C',
 'D',
 'B',
 'D',
 'D',
 None,
 'B',
 None,
 'B',
 None,
 'B',
 None,
 'D',
 'D',
 None,
 None,
 None,
 'D',
 None,
 'B',
 None,
 None,
 'D',
 None,
 None,
 'C',
 None,
 'D',
 'B',
 None,
 'B',
 None,
 None,
 None,
 None,
 'B',
 'B',
 None,
 None,
 None,
 None,
 'D',
 None,
 'D',
 'B',
 'D',
 'B',
 'D',
 'B',
 'D',
 None,
 None,
 None,
 None,
 None,
 'D',
 'D',
 'E',
 'C',
 'D',
 'D',
 None,
 'D',
 None,
 None,
 'D',
 None,
 'C',
 'B',
 Non

In [16]:
for i in range(len(predictions)):
    if predictions[i] == None:
        predictions[i] = 'X'

In [17]:
predictions

['D',
 'B',
 'B',
 'D',
 'X',
 'X',
 'X',
 'D',
 'D',
 'C',
 'B',
 'X',
 'X',
 'B',
 'X',
 'C',
 'B',
 'C',
 'D',
 'B',
 'X',
 'X',
 'X',
 'X',
 'E',
 'B',
 'X',
 'X',
 'X',
 'X',
 'D',
 'B',
 'D',
 'B',
 'X',
 'B',
 'X',
 'B',
 'X',
 'X',
 'D',
 'X',
 'D',
 'X',
 'X',
 'X',
 'X',
 'D',
 'X',
 'D',
 'X',
 'C',
 'X',
 'X',
 'E',
 'X',
 'X',
 'B',
 'X',
 'D',
 'X',
 'X',
 'B',
 'X',
 'A',
 'X',
 'X',
 'X',
 'X',
 'B',
 'X',
 'X',
 'B',
 'E',
 'X',
 'X',
 'B',
 'X',
 'B',
 'X',
 'X',
 'X',
 'D',
 'X',
 'C',
 'D',
 'B',
 'D',
 'D',
 'X',
 'B',
 'X',
 'B',
 'X',
 'B',
 'X',
 'D',
 'D',
 'X',
 'X',
 'X',
 'D',
 'X',
 'B',
 'X',
 'X',
 'D',
 'X',
 'X',
 'C',
 'X',
 'D',
 'B',
 'X',
 'B',
 'X',
 'X',
 'X',
 'X',
 'B',
 'B',
 'X',
 'X',
 'X',
 'X',
 'D',
 'X',
 'D',
 'B',
 'D',
 'B',
 'D',
 'B',
 'D',
 'X',
 'X',
 'X',
 'X',
 'X',
 'D',
 'D',
 'E',
 'C',
 'D',
 'D',
 'X',
 'D',
 'X',
 'X',
 'D',
 'X',
 'C',
 'B',
 'X',
 'X',
 'X',
 'B',
 'B',
 'X',
 'B',
 'D',
 'X',
 'X',
 'D',
 'D',
 'X',
 'A'

In [20]:
# --- Calculate and Display Final Accuracy ---

if true_labels:
    accuracy = solve_rate(true_labels, predictions)
    print(f"\nFinal Accuracy on {len(true_labels)} samples: {accuracy * 100:.2f}%")
else:
    print("\nNo samples were processed to calculate accuracy.")


Final Accuracy on 5000 samples: 9.16%
