In [3]:
!pip install groq

Collecting groq
  Downloading groq-1.0.0-py3-none-any.whl.metadata (16 kB)
Downloading groq-1.0.0-py3-none-any.whl (138 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 138.3/138.3 kB 4.1 MB/s eta 0:00:00
Installing collected packages: groq
Successfully installed groq-1.0.0


In [27]:
import pandas as pd

# Keep only the review text and its star label, dropping rows missing either.
df = pd.read_csv("yelp.csv").loc[:, ["text", "stars"]].dropna()

# Seeded 10-review sample; every prompt variant is evaluated on these same rows.
sample_df = df.sample(n=10, random_state=30)
sample_df.head()


Unnamed: 0,text,stars
8793,"Been there many times with many friends, not f...",4
1122,"I have not been bowling in 24 years,so you can...",5
1283,Consistency is an issue with the Chipotle chai...,2
9318,This is my first time using Groupon. It's one ...,2
7765,I absolutely love this sub shop! Its the only ...,5


In [28]:
# Prompt 1 - direct classification: asks for a 1-5 star rating with no rubric.
# This is a str.format template: {review} is the only placeholder, and the
# doubled braces {{ }} render as literal JSON braces once it is formatted.
p1 = """
Classify the Yelp review into a star rating from 1 to 5.

Return ONLY valid JSON:
{{
  "predicted_stars": <1-5>,
  "explanation": "<brief reason>"
}}

Review:
"{review}"
"""


In [29]:
# Prompt 2 - criteria-based analysis: adds explicit, conservative rating rules
# (when 4/5 stars are allowed, and a 3-star default for mixed reviews).
# This is a str.format template: {review} is the only placeholder, and the
# doubled braces {{ }} render as literal JSON braces once it is formatted.
p2 = """
You are rating a Yelp review strictly based on the customer's overall satisfaction.

Rules:
- If the review mentions serious complaints, service issues, or disappointment, do NOT give 4 or 5 stars.
- If both positives and negatives are present, default to 3 stars unless praise clearly dominates.
- Use 5 stars ONLY if the review shows strong enthusiasm with no complaints.
- Use 4 stars ONLY if mostly positive with very minor issues.
- Be conservative: avoid inflating ratings.

Return ONLY valid JSON:
{{
  "predicted_stars": <1-5>,
  "explanation": "<short justification>"
}}

Review:
"{review}"
"""


In [30]:
# Prompt 3 - criteria-based analysis, variant of p2: the only difference is the
# mixed-review rule, which here lets either dominant praise OR dominant
# negatives override the 3-star default (p2 mentions praise only).
# This is a str.format template: {review} is the only placeholder, and the
# doubled braces {{ }} render as literal JSON braces once it is formatted.
p3 = """
You are rating a Yelp review strictly based on the customer's overall satisfaction.

Rules:
- If the review mentions serious complaints, service issues, or disappointment, do NOT give 4 or 5 stars.
- If both positives and negatives are present, default to 3 stars unless praise or negative clearly dominates.
- Use 5 stars ONLY if the review shows strong enthusiasm with no complaints.
- Use 4 stars ONLY if mostly positive with very minor issues.
- Be conservative: avoid inflating ratings.

Return ONLY valid JSON:
{{
  "predicted_stars": <1-5>,
  "explanation": "<short justification>"
}}

Review:
"{review}"
"""


In [31]:
from groq import Groq
import os
from google.colab import userdata
# Model identifier passed on every chat-completion request.
MODEL_NAME = "openai/gpt-oss-120b"

# API keys are created at https://console.groq.com/keys.
# The key is read from Colab's userdata store under the name 'GROQ_API', so it
# is never written into the notebook itself.
client = Groq(api_key=userdata.get('GROQ_API'))


In [32]:
def call_llm(prompt):
    """Send one user prompt to the chat model and return its reply text.

    The request uses the notebook-level ``client`` and ``MODEL_NAME`` with
    temperature 0, a 600-token completion cap and JSON-object response mode.

    Args:
        prompt: Fully formatted prompt text, sent as a single user message.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or an empty string when the response carries no content.
    """
    completion = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "user", "content": prompt}
        ],
        temperature=0,
        max_tokens=600,
        response_format={"type": "json_object"},
    )
    # The message content is optional in the response; calling .strip() on
    # None would raise AttributeError and abort the whole experiment. An empty
    # string instead fails JSON parsing downstream and is counted as invalid.
    content = completion.choices[0].message.content
    return (content or "").strip()


In [33]:
import json

def run_experiment(prompt_template, data=None):
    """Run one prompt template over a set of reviews and collect predictions.

    Each review is substituted into ``prompt_template`` (via ``str.format``
    with the ``review`` field), sent through ``call_llm``, and the reply is
    parsed as JSON.

    Args:
        prompt_template: Format string containing a ``{review}`` placeholder.
        data: DataFrame with ``text`` and ``stars`` columns. Defaults to the
            notebook-level ``sample_df`` so existing calls are unchanged.

    Returns:
        DataFrame with one row per review and the columns ``actual`` (the
        labelled stars), ``predicted`` (the model's rating as an int, or
        None) and ``valid_json``. ``valid_json`` is False when the reply is
        not parseable JSON, lacks ``predicted_stars``, or that value cannot
        be converted to an integer.
    """
    if data is None:
        data = sample_df

    results = []
    for review, actual in zip(data["text"], data["stars"]):
        response = call_llm(prompt_template.format(review=review))

        try:
            parsed = json.loads(response)
            # Coerce so a rating returned as the string "4" still compares
            # equal to the integer label during evaluation.
            predicted = int(parsed["predicted_stars"])
            valid = True
        except (ValueError, KeyError, TypeError):
            # ValueError covers json.JSONDecodeError and non-numeric strings;
            # KeyError a missing field; TypeError a non-object reply or a
            # null rating. Anything else (e.g. KeyboardInterrupt) propagates.
            predicted = None
            valid = False

        results.append({
            "actual": actual,
            "predicted": predicted,
            "valid_json": valid
        })

    return pd.DataFrame(results)


In [34]:
# Evaluate the three prompt variants in order on the same review sample.
res_v1, res_v2, res_v3 = (run_experiment(template) for template in (p1, p2, p3))


In [26]:
def evaluate(df):
    """Summarise one experiment's results frame as two fractions.

    Args:
        df: Frame with ``actual``, ``predicted`` and ``valid_json`` columns.

    Returns:
        Dict with ``Accuracy`` (share of rows whose prediction equals the
        actual value) and ``JSON_Validity`` (mean of ``valid_json``).
    """
    exact_match = df["actual"].eq(df["predicted"])
    metrics = {}
    metrics["Accuracy"] = exact_match.mean()
    metrics["JSON_Validity"] = df["valid_json"].mean()
    return metrics

# One row per prompt variant: its label followed by the evaluate() metrics.
labelled_results = (("1", res_v1), ("2", res_v2), ("3", res_v3))
summary = pd.DataFrame(
    [{"Prompt": label, **evaluate(result)} for label, result in labelled_results]
)

summary

Unnamed: 0,Prompt,Accuracy,JSON_Validity
0,1,0.8,1.0
1,2,0.6,1.0
2,3,0.6,1.0
