In [None]:

# TASK 1 - Yelp Review Rating Prediction using LLM (OpenAI GPT-4o-Mini)


# imports
import os
import json
import re
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
from openai import OpenAI


# 1) Read the API key from the environment and build the OpenAI client


load_dotenv()   # pulls OPENAI_API_KEY from .env into the process environment
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

SAMPLE_SIZE = 200   # reviews scored per prompt; use 50 while testing to save cost
DATA_PATH = "yelp.csv"


In [2]:
# 2) Load dataset and take a sample

# Read from the configured path so DATA_PATH is the single place to change it.
df = pd.read_csv(DATA_PATH)

# keep only needed columns and drop rows missing either of them
df = df[["text", "stars"]].dropna()

# Sample for cost control. The size is capped at the number of available rows
# because DataFrame.sample raises when asked for more rows than exist
# (sampling is without replacement). The fixed seed keeps the sample stable.
df_sample = (
    df.sample(min(SAMPLE_SIZE, len(df)), random_state=42)
    .reset_index(drop=True)
)

print("Total rows in dataset:", len(df))
print("Running model on:", len(df_sample))
df_sample.head()


Total rows in dataset: 10000
Running model on: 200


Unnamed: 0,text,stars
0,We got here around midnight last Friday... the...,4
1,Brought a friend from Louisiana here. She say...,5
2,"Every friday, my dad and I eat here. We order ...",3
3,"My husband and I were really, really disappoin...",1
4,Love this place! Was in phoenix 3 weeks for w...,5


In [3]:
# 3) Extract JSON from model response

def extract_json_from_text(text):
    """Return the first JSON object embedded in a model response.

    The text is scanned left to right; at every ``{`` a JSON decode is
    attempted and the first object that decodes successfully is returned.
    Anything before or after that object (prose, Markdown code fences, a
    second object, stray braces) is ignored, so fenced or chatty replies
    are handled without special cases.

    Args:
        text: Raw model output. ``None`` and non-string values are accepted.

    Returns:
        The decoded ``dict``, or ``None`` when ``text`` is not a string or
        contains no decodable JSON object.
    """
    if not isinstance(text, str):
        return None

    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode stops at the end of the object, so trailing text
            # after the closing brace does not cause a failure.
            obj, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            # Not a valid object at this brace; try the next candidate.
            start = text.find("{", start + 1)
            continue
        return obj

    return None


In [4]:
# 4) LLM Call Wrapper (OPENAI GPT-4o-Mini)

def call_llm(prompt: str, model: str = "gpt-4o-mini", temperature: float = 0.0) -> str:
    """Send a single-turn user prompt to the chat completions endpoint.

    Args:
        prompt: Text sent as the only user message.
        model: Model name passed to the API. Defaults to the model this
            notebook evaluates.
        temperature: Sampling temperature passed to the API. Defaults to 0.0.

    Returns:
        The message content of the first returned choice, exactly as the
        client library provides it.

    Raises:
        Whatever the client raises for the request; nothing is caught here.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )
    return response.choices[0].message.content


In [5]:
# 5) Prompt Version-1 (Simple Classification)

def build_prompt_v1(review_text):
    """Build the zero-shot prompt: task statement, JSON format, then the review.

    The review is placed at the end of the prompt, wrapped in triple double
    quotes.
    """
    instructions = (
        "\n"
        "You are given a Yelp restaurant review.\n"
        "\n"
        "Return a rating (1-5) in JSON only.\n"
        "\n"
        "Format:\n"
        "{\n"
        '  "predicted_stars": <integer>,\n'
        '  "explanation": "<short reason>"\n'
        "}\n"
        "\n"
        "Review:\n"
    )
    return f'{instructions}"""{review_text}"""\n'


In [6]:
# 6) Prompt Version-2 (Few-shot examples)

# Worked examples (1, 3 and 5 stars) shown to the model before the review to rate.
FEW_SHOTS = (
    "\n"
    "Example:\n"
    'Review: "Terrible food"\n'
    'Output: {"predicted_stars": 1, "explanation": "Very negative"}\n'
    "\n"
    "Example:\n"
    'Review: "The food was okay"\n'
    'Output: {"predicted_stars": 3, "explanation": "Neutral / mixed"}\n'
    "\n"
    "Example:\n"
    'Review: "Amazing staff and tasty food"\n'
    'Output: {"predicted_stars": 5, "explanation": "Very positive"}\n'
)


def build_prompt_v2(review_text):
    """Build the few-shot prompt: instruction, FEW_SHOTS examples, then the review.

    The review is wrapped in triple double quotes and followed by a trailing
    "Output:" cue.
    """
    preamble = "\nClassify Yelp review into 1-5 stars.\nReturn only JSON.\n\n"
    request = f'\n\nNow classify this review:\n"""{review_text}"""\n\nOutput:\n'
    return f"{preamble}{FEW_SHOTS}{request}"


In [7]:
# 7) Prompt Version-3 (reason internally, output only the final JSON)

def build_prompt_v3(review_text):
    """Build the prompt that asks for internal analysis but JSON-only output.

    The review is placed at the end of the prompt, wrapped in triple double
    quotes.
    """
    instructions = (
        "\n"
        "Analyze the review internally, then output ONLY final JSON:\n"
        "{\n"
        '  "predicted_stars": <int 1-5>,\n'
        '  "explanation": "<very short>"\n'
        "}\n"
        "\n"
        "Review:\n"
    )
    return f'{instructions}"""{review_text}"""\n'


In [8]:
# 8) Running evaluation and calculating accuracy

def _coerce_stars(value):
    """Convert a model-supplied rating to ``int``; return ``None`` if that fails."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # None / unsupported type, non-numeric text or NaN, infinity.
        return None


def evaluate_prompt(df_input, prompt_builder, max_samples=200):
    """Run one prompt variant over the first rows of ``df_input`` and score it.

    Args:
        df_input: DataFrame with a ``text`` column (review) and a ``stars``
            column (true rating).
        prompt_builder: Callable that turns a review text into a prompt.
        max_samples: Upper bound on the number of rows evaluated.

    Returns:
        ``(df_out, metrics)``. ``df_out`` has one row per evaluated review
        with the columns ``true_stars``, ``predicted_stars``, ``json_valid``,
        ``explanation`` and ``raw_output``. ``metrics`` is a dict with:

        * ``samples``: number of rows evaluated.
        * ``json_valid_rate``: share of rows whose response yielded a
          non-empty JSON object.
        * ``accuracy``: exact-match rate computed ONLY over rows with valid
          JSON and an integer prediction. Rows that failed to parse are
          excluded from the denominator, not counted as wrong.

        Both rates are 0.0 when no rows were evaluated.
    """
    columns = ["true_stars", "predicted_stars", "json_valid", "explanation", "raw_output"]
    records = []
    n = min(max_samples, len(df_input))

    for i in tqdm(range(n)):
        row = df_input.iloc[i]
        prompt = prompt_builder(row["text"])

        try:
            raw = call_llm(prompt)
        except Exception as e:
            # Keep the error text for debugging, but never run it through the
            # JSON extractor: an error message is not a model answer.
            raw = str(e)
            parsed = None
        else:
            parsed = extract_json_from_text(raw)

        # An empty object carries no prediction, so it counts as invalid.
        if isinstance(parsed, dict) and parsed:
            json_ok = True
            pred = _coerce_stars(parsed.get("predicted_stars", None))
            explanation = parsed.get("explanation", None)
        else:
            json_ok = False
            pred = None
            explanation = None

        records.append({
            "true_stars": int(row["stars"]),
            "predicted_stars": pred,
            "json_valid": json_ok,
            "explanation": explanation,
            "raw_output": raw
        })

    # Explicit columns keep the frame well-formed even when no row was evaluated.
    df_out = pd.DataFrame(records, columns=columns)

    if df_out.empty:
        json_valid_rate = 0.0
        acc = 0.0
    else:
        json_valid_rate = df_out["json_valid"].mean()
        valid = df_out[df_out["json_valid"] & df_out["predicted_stars"].notna()]
        if len(valid) > 0:
            acc = (valid["true_stars"] == valid["predicted_stars"]).mean()
        else:
            acc = 0.0

    metrics = {
        "samples": len(df_out),
        "json_valid_rate": json_valid_rate,
        "accuracy": acc
    }

    return df_out, metrics


In [9]:
# Score the same sample with each prompt variant. The calls stay as separate
# statements so results already obtained survive a failure in a later run.
results_v1, metrics_v1 = evaluate_prompt(
    df_input=df_sample, prompt_builder=build_prompt_v1, max_samples=SAMPLE_SIZE
)
results_v2, metrics_v2 = evaluate_prompt(
    df_input=df_sample, prompt_builder=build_prompt_v2, max_samples=SAMPLE_SIZE
)
results_v3, metrics_v3 = evaluate_prompt(
    df_input=df_sample, prompt_builder=build_prompt_v3, max_samples=SAMPLE_SIZE
)

print(f"V1: {metrics_v1}")
print(f"V2: {metrics_v2}")
print(f"V3: {metrics_v3}")


100%|██████████| 200/200 [05:20<00:00,  1.60s/it]
100%|██████████| 200/200 [03:58<00:00,  1.19s/it]
100%|██████████| 200/200 [04:35<00:00,  1.38s/it]

V1: {'samples': 200, 'json_valid_rate': np.float64(1.0), 'accuracy': np.float64(0.685)}
V2: {'samples': 200, 'json_valid_rate': np.float64(1.0), 'accuracy': np.float64(0.66)}
V3: {'samples': 200, 'json_valid_rate': np.float64(1.0), 'accuracy': np.float64(0.665)}





In [10]:
# Collect the per-prompt metrics into one table, one row per prompt variant.
comparison_df = pd.DataFrame([
    {"prompt": "V1_Simple", **metrics_v1},
    {"prompt": "V2_FewShot", **metrics_v2},
    {"prompt": "V3_Chain",  **metrics_v3}
])

# Persist the summary and the per-review results of each variant.
comparison_df.to_csv("task1_prompt_comparison.csv", index=False)
results_v1.to_csv("task1_results_v1.csv", index=False)
results_v2.to_csv("task1_results_v2.csv", index=False)
results_v3.to_csv("task1_results_v3.csv", index=False)

# A bare expression is only rendered when it is the last statement of the
# cell, so the table is displayed here rather than before the exports.
comparison_df
