In [None]:
# 0 — INSTALLS
!pip install pandas tqdm requests

import copy
import json
import os
import random
import re
from collections import defaultdict

import pandas as pd
import requests
from tqdm import tqdm




In [None]:
# 1 — LOAD DATA (Yelp Dataset)
# Read the Yelp export, keep only the review text and its star rating, and
# discard any row where either of the two is missing.
df = (
    pd.read_csv('/content/yelp.csv')
    .loc[:, ["text", "stars"]]
    .dropna()
)
df.head()




Unnamed: 0,text,stars
0,My wife took me here on my birthday for breakf...,5
1,I have no idea why some people give bad review...,5
2,love the gyro plate. Rice is so good and I als...,4
3,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",5
4,General Manager Scott Petello is a good egg!!!...,5


In [None]:
# 2 — SAMPLE ~200 REVIEWS
# Fixed seed so the same 200 reviews are drawn on every run; the index is
# renumbered from 0 so positions line up with the result frames built later.
sample = (
    df.sample(n=200, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# 3 — OPENROUTER API CALL FUNCTION
# The key is read from the environment so it never ends up in the notebook or
# in version control. Set it before starting the kernel, e.g.
#   export OPENROUTER_API_KEY=sk-or-...
# SECURITY: a literal key used to be committed on this line. Treat that key as
# compromised and revoke it in the OpenRouter dashboard.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")


def call_llm_openrouter(prompt, debug=False, timeout=60):
    """Send a single-turn prompt to OpenRouter and return the model's reply text.

    Parameters
    ----------
    prompt : str
        Full prompt, sent as one ``user`` message.
    debug : bool, default False
        When True, print the HTTP status and raw response body for each call.
        Off by default because one dump per review floods the cell output.
    timeout : float, default 60
        Seconds passed to ``requests.post``; without it a stalled connection
        would block the notebook indefinitely.

    Returns
    -------
    str
        The assistant message content, or a string starting with
        ``"DEBUG-ERROR:"`` when the request fails or the response does not
        contain a text reply. Callers can therefore always treat the result
        as text.

    Raises
    ------
    RuntimeError
        If ``OPENROUTER_API_KEY`` is empty (fail fast instead of issuing
        hundreds of unauthenticated requests).
    """
    if not OPENROUTER_API_KEY:
        raise RuntimeError(
            "OPENROUTER_API_KEY is not set; export it in the environment "
            "before running this notebook."
        )

    url = "https://openrouter.ai/api/v1/chat/completions"

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://github.com/Prasadbhat23",
        "X-Title": "Yelp-Rating-Notebook"
    }

    payload = {
        "model": "meta-llama/llama-3.1-8b-instruct",
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.0
    }

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as exc:
        # A single network failure should not abort a long evaluation loop;
        # report it through the same sentinel convention as parse failures.
        if debug:
            print("\n===== DEBUG RESPONSE =====")
            print("Request failed:", exc)
        return "DEBUG-ERROR: Request failed"

    if debug:
        print("\n===== DEBUG RESPONSE =====")
        print("Status:", response.status_code)
        print(response.text)

    try:
        content = response.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError):
        # ValueError: body is not JSON; the others: JSON without the expected
        # choices[0].message.content path (e.g. an API error payload).
        return "DEBUG-ERROR: Could not parse"

    # A null/non-text content would break downstream text processing.
    if not isinstance(content, str):
        return "DEBUG-ERROR: Could not parse"
    return content


In [None]:
# 4 — SAFE JSON EXTRACTION
import json
import re

def extract_json(raw_response):
    """Return the first JSON object embedded in ``raw_response``, or ``{}``.

    The text is scanned for ``{`` characters and a JSON decode is attempted at
    each one in turn. The first position that decodes successfully wins and
    anything after the object is ignored, so chatter around the JSON, a stray
    ``}`` later in the text, or a second JSON object does not break parsing.

    Parameters
    ----------
    raw_response : str
        Text returned by the model. Any non-string value (e.g. ``None``)
        yields ``{}`` instead of raising.

    Returns
    -------
    dict
        The decoded object, or an empty dict when no JSON object is found.
    """
    if not isinstance(raw_response, str):
        return {}

    decoder = json.JSONDecoder()
    start = raw_response.find("{")
    while start != -1:
        try:
            # raw_decode parses one JSON value beginning at `start` and
            # tolerates trailing text; starting on "{" means the value is
            # always an object (dict).
            parsed, _end = decoder.raw_decode(raw_response, start)
        except json.JSONDecodeError:
            start = raw_response.find("{", start + 1)
            continue
        return parsed
    return {}



In [None]:
# 5 — PROMPTS
# All three templates are filled with str.format(review_text=...), so
# {review_text} is the only placeholder. The doubled braces {{ }} are format
# escapes that produce literal { } in the text sent to the model.

# Zero-shot: bare instruction, asks only for "predicted_stars".
PROMPT_ZERO_SHOT = """
You are a helpful assistant. Read the review below and assign a rating 1–5.
Return ONLY:

{{
  "predicted_stars": <number>
}}

Review:
"{review_text}"
"""

# Few-shot: same task with two worked examples (a 5-star and a 2-star review);
# additionally asks for an "explanation" field.
PROMPT_FEW_SHOT = """
You classify reviews into 1–5 stars.

Return ONLY JSON:
{{
  "predicted_stars": <number>,
  "explanation": "short reason"
}}

Examples:
Review: "Amazing food!"
Response: {{"predicted_stars": 5, "explanation": "very positive"}}

Review: "Food was cold and tasteless."
Response: {{"predicted_stars": 2, "explanation": "negative food quality"}}

Now classify:
"{review_text}"
"""

# Rubric: spells out what each star level means instead of giving examples;
# also asks for an "explanation" field.
PROMPT_RUBRIC = """
Rate the review using this rubric:
1: very negative
2: negative
3: mixed
4: positive
5: very positive

Return ONLY JSON:
{{
  "predicted_stars": <number>,
  "explanation": "short reason"
}}

Review:
"{review_text}"
"""


In [None]:
# 6 — RUN ONE PROMPT ON ENTIRE SAMPLE
def _coerce_stars(value):
    """Normalise a model-supplied rating to an int in 1..5, or None if unusable."""
    if isinstance(value, bool):  # bool is an int subclass; True is not a rating
        return None
    if isinstance(value, str):
        value = value.strip()
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        return None
    if not number.is_integer():  # also rejects NaN and infinities
        return None
    stars = int(number)
    return stars if 1 <= stars <= 5 else None


def run_prompt_evaluation(prompt_template, sample_df):
    """Run one prompt template over every review and collect the predictions.

    Parameters
    ----------
    prompt_template : str
        Template containing a ``{review_text}`` placeholder; filled with
        ``str.format`` once per review.
    sample_df : pandas.DataFrame
        Must provide a ``"text"`` column (the review) and a ``"stars"`` column
        (the true rating).

    Returns
    -------
    list[dict]
        One record per review with keys ``text``, ``true``, ``predicted``,
        ``explanation``, ``raw_response`` and ``json_valid`` (1 when a
        non-empty JSON object was extracted from the reply, else 0).

        ``predicted`` is normalised to an ``int`` in 1..5. Models sometimes
        answer with ``"5"`` or ``5.0``; stored raw, such answers would be
        scored as wrong when compared with the integer label. Anything that is
        not a whole number in range becomes ``None``; the unmodified reply
        stays available in ``raw_response``.
    """
    results = []
    for text, true_star in zip(sample_df["text"], sample_df["stars"]):
        prompt = prompt_template.format(review_text=text)
        raw_output = call_llm_openrouter(prompt)
        parsed = extract_json(raw_output)
        results.append({
            "text": text,
            "true": true_star,
            "predicted": _coerce_stars(parsed.get("predicted_stars", None)),
            "explanation": parsed.get("explanation", "No explanation returned"),
            "raw_response": raw_output,
            "json_valid": 1 if parsed else 0
        })
    return results



In [None]:
# 7 — RUN ALL THREE PROMPTS
# Aliases for the prompt strings defined earlier; the evaluation and
# reliability cells refer to the templates by these *_TEMPLATE names.
PROMPT_ZERO_SHOT_TEMPLATE = PROMPT_ZERO_SHOT
PROMPT_FEW_SHOT_TEMPLATE = PROMPT_FEW_SHOT
PROMPT_RUBRIC_TEMPLATE = PROMPT_RUBRIC


In [None]:
# Evaluate every prompt variant on the full sample. run_prompt_evaluation makes
# one API call per row, so with the 200-row sample this cell issues
# 3 x 200 = 600 requests. Each result list becomes its own DataFrame as soon as
# its run finishes.
df_zero = pd.DataFrame(run_prompt_evaluation(PROMPT_ZERO_SHOT_TEMPLATE, sample))
df_few = pd.DataFrame(run_prompt_evaluation(PROMPT_FEW_SHOT_TEMPLATE, sample))
df_rubric = pd.DataFrame(run_prompt_evaluation(PROMPT_RUBRIC_TEMPLATE, sample))



[1;30;43mStreaming output truncated to the last 5000 lines.[0m


===== DEBUG RESPONSE =====
Status: 200

         
{"id":"gen-1765001272-AxqT4qg10aGvhvHsBnte","provider":"Novita","model":"meta-llama/llama-3.1-8b-instruct","object":"chat.completion","created":1765001272,"choices":[{"logprobs":null,"finish_reason":"stop","native_finish_reason":"stop","index":0,"message":{"role":"assistant","content":"{\n  \"predicted_stars\": 5\n}","refusal":null,"reasoning":null}}],"usage":{"prompt_tokens":137,"completion_tokens":12,"total_tokens":149,"cost":0.00000334,"is_byok":false,"prompt_tokens_details":{"cached_tokens":0,"audio_tokens":0,"video_tokens":0},"cost_details":{"upstream_inference_cost":null,"upstream_inference_prompt_cost":0.00000274,"upstream_inference_completions_cost":6e-7},"completion_tokens_details":{"reasoning_tokens":0,"image_tokens":0}}}


===== DEBUG RESPONSE =====
Status: 200
{"id":"gen-1765001273-9vfoqwACEdV6psfP2QeR","provider":"Groq","model":"meta-llama/llama-3.1-8b-instr

In [None]:
# 8 — EVALUATION FUNCTION
def evaluate(df):
    """Compute exact-match accuracy and the JSON validity rate for one run.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``"true"`` (label), ``"predicted"`` (model rating,
        possibly missing or non-numeric) and ``"json_valid"`` (0/1 flag).

    Returns
    -------
    tuple
        ``(accuracy, json_validity)``: the fraction of rows whose prediction
        equals the label, and the mean of ``json_valid``. Both are NaN for an
        empty frame.
    """
    # Predictions come straight from model output and may be numeric strings
    # such as "5"; coerce them so they compare equal to the integer label.
    # Anything non-numeric becomes NaN, which never equals a label and is
    # therefore scored as a miss.
    predicted = pd.to_numeric(df["predicted"], errors="coerce")
    accuracy = (df["true"] == predicted).mean()
    json_validity = df["json_valid"].mean()
    return accuracy, json_validity

# Score the three result frames in order (zero-shot, few-shot, rubric); each
# evaluate() call yields an (accuracy, json_validity) pair.
(zero_acc, zero_json), (few_acc, few_json), (rubric_acc, rubric_json) = (
    evaluate(results) for results in (df_zero, df_few, df_rubric)
)


# 9 — RELIABILITY & CONSISTENCY TEST

In [None]:
def reliability_test(prompt_template, sample_df, runs=3):
    """
    Evaluate one prompt template on the same reviews several times in a row.

    Every pass calls run_prompt_evaluation on `sample_df` and wraps its
    records in a DataFrame. The frames are returned as a list in pass order,
    so the list has `runs` entries.
    """
    frames_per_pass = []
    for r in range(runs):
        # Announce the pass before the (slow) evaluation starts.
        print(f"\n🔁 Running reliability pass {r+1}/{runs} ...")
        frames_per_pass.append(
            pd.DataFrame(run_prompt_evaluation(prompt_template, sample_df))
        )
    return frames_per_pass

def consistency_score(all_runs):
    """
    Measures how often the model predicts the SAME star rating
    for the SAME review across all runs.

    Parameters
    ----------
    all_runs : list[pandas.DataFrame]
        One frame per run, each with a ``"predicted"`` column and rows in the
        same review order. The row count is taken from the first frame.

    Returns
    -------
    float
        Fraction of reviews whose prediction is identical in every run.
        NaN when there are no runs or no reviews (the ratio is undefined).

    Notes
    -----
    Missing predictions (``None`` or NaN) are treated as one and the same
    value. pandas stores a missing entry as ``None`` in an object column but
    as NaN in a numeric one, and NaN never equals NaN; without the
    normalisation the score would depend on the column dtype.
    """
    if not all_runs:
        return float("nan")

    sample_size = len(all_runs[0])
    if sample_size == 0:
        return float("nan")

    def _normalise(value):
        # Collapse None and float NaN (value != value) onto a single marker.
        if value is None or (isinstance(value, float) and value != value):
            return None
        return value

    # Pull each run's predictions out once instead of a per-row .iloc lookup.
    predictions_per_run = [run["predicted"].tolist() for run in all_runs]

    consistent = 0
    for i in range(sample_size):
        distinct = {_normalise(preds[i]) for preds in predictions_per_run}
        if len(distinct) == 1:   # every run agreed on this review
            consistent += 1

    return consistent / sample_size




In [None]:
# Reliability runs repeat every request, so use only the first 10 sampled
# reviews: 3 prompts x 3 passes x 10 reviews = 90 API calls.
sample_small = sample.head(10)
zero_runs = reliability_test(PROMPT_ZERO_SHOT_TEMPLATE, sample_small, runs=3)
few_runs = reliability_test(PROMPT_FEW_SHOT_TEMPLATE, sample_small, runs=3)
rubric_runs = reliability_test(PROMPT_RUBRIC_TEMPLATE, sample_small, runs=3)



🔁 Running reliability pass 1/3 ...

===== DEBUG RESPONSE =====
Status: 200

         
{"id":"gen-1765004578-6WaGkJcRf9I7liGsHSsp","provider":"DeepInfra","model":"meta-llama/llama-3.1-8b-instruct","object":"chat.completion","created":1765004578,"choices":[{"logprobs":null,"finish_reason":"stop","native_finish_reason":"stop","index":0,"message":{"role":"assistant","content":"{\n  \"predicted_stars\": 4\n}","refusal":null,"reasoning":null}}],"usage":{"prompt_tokens":163,"completion_tokens":11,"total_tokens":174,"cost":0.00000544,"is_byok":false,"prompt_tokens_details":{"cached_tokens":0,"audio_tokens":0,"video_tokens":0},"cost_details":{"upstream_inference_cost":null,"upstream_inference_prompt_cost":0.00000489,"upstream_inference_completions_cost":5.5e-7},"completion_tokens_details":{"reasoning_tokens":0,"image_tokens":0}}}

===== DEBUG RESPONSE =====
Status: 200
{"id":"gen-1765004579-CsWBYbxhPy0SFT0FwJQQ","provider":"Nebius","model":"meta-llama/llama-3.1-8b-instruct","object":"chat.comp

In [None]:
# Calculate consistency scores: for each prompt, the fraction of reviews that
# received the same predicted rating in every reliability pass.
zero_consistency = consistency_score(zero_runs)
few_consistency = consistency_score(few_runs)
rubric_consistency = consistency_score(rubric_runs)

# A value of 1.0 means no review changed its prediction between passes.
print("Zero-Shot Consistency:", zero_consistency)
print("Few-Shot Consistency:", few_consistency)
print("Rubric Consistency:", rubric_consistency)



Zero-Shot Consistency: 1.0
Few-Shot Consistency: 1.0
Rubric Consistency: 1.0


In [None]:
# 10 — COMPARISON TABLE
# One row per prompt variant, built row-wise: (label, accuracy, JSON validity).
comparison_rows = [
    ("Zero-Shot", zero_acc, zero_json),
    ("Few-Shot", few_acc, few_json),
    ("Rubric", rubric_acc, rubric_json),
]
comparison = pd.DataFrame(
    comparison_rows,
    columns=["Prompt Type", "Accuracy", "JSON Validity"],
)
print(comparison)

  Prompt Type  Accuracy  JSON Validity
0   Zero-Shot     0.660            1.0
1    Few-Shot     0.610            1.0
2      Rubric     0.605            1.0


In [None]:
# 11 — SAVE RESULTS
# Persist the per-review results of each prompt variant and the summary table.
# Files are written to the current working directory; index=False leaves the
# DataFrame row index out of the CSV.
df_zero.to_csv("zero_shot_results.csv", index=False)
df_few.to_csv("few_shot_results.csv", index=False)
df_rubric.to_csv("rubric_results.csv", index=False)
comparison.to_csv("prompt_comparison.csv", index=False)