In [1]:
# Install necessary libraries
!pip install -q langchain-groq langchain pandas scikit-learn

import os
import json
import pandas as pd
import time
from sklearn.metrics import accuracy_score, classification_report
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser

# --- CONFIGURATION ---
# Paste your Groq Key here (or use the one from your .env file)

# Initialize the LLM
llm = ChatGroq(
    temperature=0,
    model_name="llama-3.3-70b-versatile", # Changed from llama3-70b-8192 to a currently supported model
    api_key=os.environ["GROQ_API_KEY"]
)

print("Setup Complete! LLM is ready.")


[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Setup Complete! LLM is ready.


In [None]:
# --- OPTION A: GENERATE DUMMY DATA (For Instant Testing) ---
# data = [
#     {"text": "The food was absolutely wonderful, from preparation to presentation, very pleasing.", "stars": 5},
#     {"text": "Service was slow and the waiter was rude. I will never come back.", "stars": 1},
#     {"text": "It was okay. Not great, not terrible. Just average food.", "stars": 3},
#     {"text": "I loved the ambiance but the steak was overcooked.", "stars": 3},
#     {"text": "Best pizza in town! Highly recommended.", "stars": 5},
#     {"text": "Disgusting hygiene. Found a hair in my soup.", "stars": 1},
#     {"text": "Pretty good experience, but a bit pricey for the portion size.", "stars": 4},
#     {"text": "Terrible. Do not waste your money here.", "stars": 1},
#     {"text": "A hidden gem! The pasta is to die for.", "stars": 5},
#     {"text": "Mediocre at best. I've had better frozen dinners.", "stars": 2}
# ]
# df = pd.DataFrame(data)

# --- OPTION B: LOAD REAL YELP DATA 
df = pd.read_csv("../data/yelp.csv")

# Keep only required columns
df = df[["text", "stars"]]

# Sample 200 reviews as required
df = df.sample(200, random_state=42).reset_index(drop=True)

print(f"Loaded {len(df)} Yelp reviews")
display(df.head())

Loaded 200 Yelp reviews


Unnamed: 0,text,stars
0,We got here around midnight last Friday... the...,4
1,Brought a friend from Louisiana here. She say...,5
2,"Every friday, my dad and I eat here. We order ...",3
3,"My husband and I were really, really disappoin...",1
4,Love this place! Was in phoenix 3 weeks for w...,5


In [4]:
# Define the Output Parser to ensure valid JSON
parser = JsonOutputParser()

# --- PROMPT 1: ZERO-SHOT (Direct) ---
prompt_1 = ChatPromptTemplate.from_messages([
    ("system", "You are a sentiment analysis AI. Return valid JSON only."),
    ("user", """
    Classify the sentiment of this Yelp review into a star rating (1-5).
    Review: "{text}"

    Output Format:
    {{
        "predicted_stars": <int>,
        "explanation": "<string>"
    }}
    """)
])

# --- PROMPT 2: FEW-SHOT (With Examples) ---
prompt_2 = ChatPromptTemplate.from_messages([
    ("system", "You are a sentiment analysis AI. Learn from the examples and classify the new review."),
    ("user", """
    Examples:
    Review: "Worst place ever!" -> {{"predicted_stars": 1, "explanation": "Extreme negative sentiment."}}
    Review: "It was okay, nothing special." -> {{"predicted_stars": 3, "explanation": "Neutral sentiment."}}
    Review: "Absolutely loved it!" -> {{"predicted_stars": 5, "explanation": "Strong positive sentiment."}}

    Task:
    Review: "{text}"

    Return JSON only.
    """)
])

# --- PROMPT 3: CRITERIA-BASED (Chain of Thought Lite) ---
prompt_3 = ChatPromptTemplate.from_messages([
    ("system", "You are an expert critic. Analyze the review based on specific criteria."),
    ("user", """
    Review: "{text}"

    Analyze the review internally based on sentiment polarity, intensity, and specificity.
    Do not reveal step-by-step reasoning.

    Based on this analysis, assign a rating from 1 to 5.

    Output JSON:
    {{
        "predicted_stars": <int>,
        "explanation": "<Brief reasoning based on analysis>"
    }}
    """)
])

# Dictionary to store our chains
strategies = {
    "Zero-Shot": prompt_1 | llm | parser,
    "Few-Shot": prompt_2 | llm | parser,
    "Criteria-Based": prompt_3 | llm | parser
}

print("Strategies defined.")

Strategies defined.


In [6]:
results = []
summary_results = []

print("Starting evaluation... this might take a minute.")

for strategy_name, chain in strategies.items():
    print(f"Running strategy: {strategy_name}...")

    correct_predictions = 0
    valid_json_count = 0

    for index, row in df.iterrows():
        try:
            # Invoke LLM
            output = chain.invoke({"text": row['text']})

            time.sleep(0.25)

            # Check JSON Validity
            if "predicted_stars" in output and "explanation" in output:
                valid_json_count += 1

            # Check Accuracy
            predicted = int(output['predicted_stars'])
            actual = int(row['stars'])

            if predicted == actual:
                correct_predictions += 1

            # Store detail for review
            results.append({
                "Strategy": strategy_name,
                "Review": row['text'][:50] + "...",
                "Actual": actual,
                "Predicted": predicted,
                "Explanation": output['explanation']
            })

        except Exception as e:
            print(f"Error in {strategy_name} at index {index}: {e}")

    # Calculate Metrics for this strategy
    accuracy = (correct_predictions / len(df)) * 100
    json_validity_rate = (valid_json_count / len(df)) * 100
    print(
    f"--> {strategy_name} Accuracy: {accuracy:.2f}% | "
    f"JSON Validity: {json_validity_rate:.2f}%"
    )
    summary_results.append({
    "Strategy": strategy_name,
    "Accuracy (%)": round(accuracy, 2),
    "JSON Validity (%)": round(json_validity_rate, 2)
    })

# Create Comparison Table
results_df = pd.DataFrame(results)
print("\n--- FINAL COMPARISON ---")
display(results_df.groupby("Strategy").apply(lambda x: (x['Actual'] == x['Predicted']).mean() * 100).reset_index(name="Accuracy %"))
print("\n--- STRATEGY COMPARISON (REQUIRED METRICS) ---")
summary_df = pd.DataFrame(summary_results)
display(summary_df)


# Show detailed samples
display(results_df.head())

Starting evaluation... this might take a minute.
Running strategy: Zero-Shot...
Error in Zero-Shot at index 199: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.3-70b-versatile` in organization `org_01kcgtwyf0fxqb9k22t4tycx4n` service tier `on_demand` on tokens per day (TPD): Limit 100000, Used 99748, Requested 410. Please try again in 2m16.512s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
--> Zero-Shot Accuracy: 58.00% | JSON Validity: 99.50%
Running strategy: Few-Shot...
Error in Few-Shot at index 1: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.3-70b-versatile` in organization `org_01kcgtwyf0fxqb9k22t4tycx4n` service tier `on_demand` on tokens per day (TPD): Limit 100000, Used 100000, Requested 182. Please try again in 2m37.248s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type

  display(results_df.groupby("Strategy").apply(lambda x: (x['Actual'] == x['Predicted']).mean() * 100).reset_index(name="Accuracy %"))


Unnamed: 0,Strategy,Accuracy %
0,Few-Shot,100.0
1,Zero-Shot,58.291457



--- STRATEGY COMPARISON (REQUIRED METRICS) ---


Unnamed: 0,Strategy,Accuracy (%),JSON Validity (%)
0,Zero-Shot,58.0,99.5
1,Few-Shot,0.5,0.5
2,Criteria-Based,0.0,0.0


Unnamed: 0,Strategy,Review,Actual,Predicted,Explanation
0,Zero-Shot,We got here around midnight last Friday... the...,4,4,The reviewer had a positive experience with th...
1,Zero-Shot,Brought a friend from Louisiana here. She say...,5,5,"The reviewer's friend, who is from Louisiana, ..."
2,Zero-Shot,"Every friday, my dad and I eat here. We order ...",3,4,The reviewer mentions that they are regular cu...
3,Zero-Shot,"My husband and I were really, really disappoin...",1,1,The reviewer expresses extreme disappointment ...
4,Zero-Shot,Love this place! Was in phoenix 3 weeks for w...,5,5,"The reviewer uses extremely positive language,..."
