In [1]:
!pip install google-generativeai pandas




[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import getpass
import os

import google.generativeai as genai
import pandas as pd

# Read the Gemini API key from the GEMINI_API_KEY environment variable, falling
# back to an interactive prompt, so the secret is never stored in the notebook.
# Any key that was previously saved in this file should be treated as leaked
# and rotated.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY") or getpass.getpass("Gemini API key: ")

genai.configure(api_key=GEMINI_API_KEY)

print("Gemini API Ready!")


Gemini API Ready!


In [3]:
# Load the full dataset from the local CSV file.
df = pd.read_csv("yelp.csv")

# Draw a reproducible 200-row sample (fixed random_state) and replace every
# missing value with the placeholder string "Unknown".
df_200 = df.sample(n=200, random_state=42).fillna("Unknown")

# Last expression of the cell, so the sample's (rows, columns) shape is displayed.
df_200.shape

(200, 10)

In [4]:
# Prompt 1: direct instruction prompt with no worked examples. It asks for a
# 1-5 star prediction plus a one-sentence explanation, returned as a bare JSON object.
# The template is filled with str.format(review_text=...), which is why the
# literal JSON braces are doubled ({{ and }}).
PROMPT_1 = """
You are an expert Yelp rating auditor.
Read the review below and return ONLY valid JSON in EXACTLY this format:

{{
  "predicted_stars": <integer 1-5>,
  "explanation": "<one short sentence>"
}}

Rules:
- predicted_stars must be an integer between 1 and 5.
- explanation: 6–20 words max, mention the main cue (e.g., "positive language", "poor service").
- Do NOT output anything other than the JSON object (no code blocks, no commentary).
- Do not hallucinate facts not present in the text.

Review:
"{review_text}"
"""


In [5]:
MODEL_NAME = "models/gemini-2.5-flash"
model = genai.GenerativeModel(MODEL_NAME)


def get_response(prompt):
    """Send ``prompt`` to the shared Gemini ``model`` and return the reply text.

    The prompt is passed to ``generate_content`` wrapped in a one-element list.
    """
    reply = model.generate_content([prompt])  # LIST input
    return reply.text


In [6]:
def run_prompt_1(review_text):
    """Fill the PROMPT_1 template with ``review_text`` and return the raw model reply."""
    return get_response(PROMPT_1.format(review_text=review_text))


In [7]:
# Smoke-test Prompt 1 on five reviews drawn from the 200-row sample.
sample_df = df_200.sample(5, random_state=42).copy()
sample_df["prompt1_output"] = [run_prompt_1(review) for review in sample_df["text"]]

# Widen the column display so the reviews and raw replies are not cut short.
pd.set_option('display.max_colwidth', 500)
sample_df[["text", "stars", "prompt1_output"]]


Unnamed: 0,text,stars,prompt1_output
3787,Went here while on vacation in Phoneix based on seeing it on Man vs Food. Could not have been more impressed. Food was amazing. Far above expectations. The torta was huge and delicious. Would highly recommend it to anyone looking for a fantastic authentic meal at a good price!,5,"```json\n{\n ""predicted_stars"": 5,\n ""explanation"": ""The reviewer used overwhelmingly positive language and strongly recommended the fantastic food.""\n}\n```"
6033,"So I was pretty excited about this burger joint opening since it's President Obama's favorite... \nThe burger was good. Not great, not bland, just okay. My biggest hang up with this restaurant are the prices! Skip the beverage... it's very over priced. I was also not impressed with the fries- regular or cajun. They were pretty average as well. Stick with the burger and skip the rest is my suggestion to you.",2,"{\n ""predicted_stars"": 2,\n ""explanation"": ""The review expresses strong dissatisfaction with overpriced items and average food quality, despite an okay burger.""\n}"
321,"My wife and I live around the corner, hadn't eaten here in a few months. We got food for take out, Mongolian beef,kung po chicken,pad Thai noodles. Mongolian beef, there were more white onions then scallions and it was very bland. Kung po chicken,lots of white meat chicken the whole dish only had one peanut and the taste of this dish was non existent. Very very Blah... Pad Thai noodles,the dish was dry and came out of the container in one giant clump. It tasted like plain noodles, i was gla...",1,"```json\n{\n ""predicted_stars"": 1,\n ""explanation"": ""The review expresses extreme disappointment due to bland, dry, and unappetizing food quality across multiple dishes.""\n}\n```"
5674,"I don't often review places on Yelp because of the time commitment and because usually I am just looking for a good bite to eat. This time, though I feel I should if not to balance the ridiculous number of 4 and 5 stars bouncing around for this place.\nThe food was decent, and the atmosphere OK but seriously, how can this place cost as much as it does ($60+ for two people eating light)?? And how are people fawning all over it? I don't like to complain about a good meal, and indeed this meal ...",2,"```json\n{\n ""predicted_stars"": 2,\n ""explanation"": ""Despite good service, the high cost, poor value, and pretentious atmosphere led to a negative experience.""\n}\n```"
9149,The Harkins Camelview 5 gives Arizonans the unique opportunity to see movies that are normally reserved for screens in NY and CA. Without this theater there would be very little chance to see independent or foreign films in Phoenix. I give a lot of credit to Dan Harkins for keeping this theater open. He certainly can't make much money at this location. It's financially impractical to keep such a small theater operating on such prime real estate. This just shows Dan Harkin's commitment to s...,5,"{\n ""predicted_stars"": 5,\n ""explanation"": ""The review expresses strong gratitude for the theater's unique film selection and owner's dedication.""\n}"


In [8]:
import time

# Fixed 50-review evaluation set (seeded) that each prompt variant is run on.
eval_df = df_200.sample(n=50, random_state=123)
eval_df = eval_df.reset_index(drop=True)
eval_df.shape


(50, 10)

In [9]:

# Run Prompt 1 over the evaluation set; a failed call is recorded as None.
total_rows = len(eval_df)
p1_outputs = []

for row_no, review in enumerate(eval_df["text"], start=1):
    print(f"Running Prompt 1 for row {row_no}/{total_rows}...")
    try:
        p1_outputs.append(run_prompt_1(review))
    except Exception as err:
        print("Error on row", row_no - 1, ":", err)
        p1_outputs.append(None)

    # Sleep ~7 seconds to stay under 10 requests per minute
    time.sleep(7)

eval_df["p1_output"] = p1_outputs



Running Prompt 1 for row 1/50...
Running Prompt 1 for row 2/50...
Running Prompt 1 for row 3/50...
Running Prompt 1 for row 4/50...
Running Prompt 1 for row 5/50...
Running Prompt 1 for row 6/50...
Running Prompt 1 for row 7/50...
Running Prompt 1 for row 8/50...
Running Prompt 1 for row 9/50...
Running Prompt 1 for row 10/50...
Running Prompt 1 for row 11/50...
Running Prompt 1 for row 12/50...
Running Prompt 1 for row 13/50...
Running Prompt 1 for row 14/50...
Running Prompt 1 for row 15/50...
Running Prompt 1 for row 16/50...
Running Prompt 1 for row 17/50...
Running Prompt 1 for row 18/50...
Running Prompt 1 for row 19/50...
Running Prompt 1 for row 20/50...
Running Prompt 1 for row 21/50...
Running Prompt 1 for row 22/50...
Running Prompt 1 for row 23/50...
Running Prompt 1 for row 24/50...
Running Prompt 1 for row 25/50...
Running Prompt 1 for row 26/50...
Running Prompt 1 for row 27/50...
Running Prompt 1 for row 28/50...
Running Prompt 1 for row 29/50...
Running Prompt 1 for ro

In [12]:
import json

def parse_json_output(output):
    """Parse a raw model reply into a JSON object.

    Markdown code fences (```json ... ```) are removed before parsing.

    Args:
        output: Raw reply text, or None when the API call failed.

    Returns:
        ``(parsed, True)`` when the reply is a valid JSON object (a dict);
        ``(None, False)`` when the output is missing or not a string, is
        malformed JSON, or is valid JSON that is not an object.
    """
    # Failed API calls are stored as None; anything that is not text cannot be parsed.
    if not isinstance(output, str):
        return None, False

    # Sometimes model adds code fences like ```json
    cleaned = output.replace("```json", "").replace("```", "").strip()
    try:
        parsed = json.loads(cleaned)
    except (ValueError, RecursionError):  # json.JSONDecodeError is a ValueError
        return None, False

    # Callers read fields with parsed.get(...), so only a JSON object counts as valid.
    if not isinstance(parsed, dict):
        return None, False
    return parsed, True


In [13]:
parsed_results = []

# Parse each raw Prompt 1 reply once; every entry is a (payload, is_valid) pair.
p1_parsed = [parse_json_output(raw) for raw in eval_df["p1_output"]]
validity_flags = [ok for _payload, ok in p1_parsed]
predicted_stars = [
    payload.get("predicted_stars", None) if ok else None
    for payload, ok in p1_parsed
]

eval_df["p1_valid_json"] = validity_flags
eval_df["p1_predicted_stars"] = predicted_stars

# Accuracy = share of rows whose predicted stars equal the true stars;
# JSON validity = share of replies that parsed successfully.
p1_matches = eval_df["stars"] == eval_df["p1_predicted_stars"]
correct = p1_matches.sum()
accuracy_p1 = correct / len(eval_df)
json_validity_p1 = eval_df["p1_valid_json"].mean()
accuracy_p1, json_validity_p1


(0.56, 1.0)

In [14]:
# Prompt 2: few-shot prompt. Five worked examples (one for each star value 1-5)
# precede the review to classify; the requested JSON format matches Prompt 1.
# The template is filled with str.format(review_text=...), which is why the
# literal JSON braces are doubled ({{ and }}).
PROMPT_2 = """
You are an expert Yelp rating auditor.
Study the complex examples below and then rate the new review.
Return ONLY valid JSON in EXACTLY this format:

{{
  "predicted_stars": <integer 1-5>,
  "explanation": "<one short sentence>"
}}

Examples:

Example 1:
Review: "The appetizers were delicious but the main course arrived cold. The staff apologized, but the long wait ruined the experience."
Stars: 2

Example 2:
Review: "Great location and cozy atmosphere. The coffee was above average, but the pastries were dry. Still, I’d visit again for the vibe."
Stars: 4

Example 3:
Review: "Absolute disaster. The server ignored us for 30 minutes, the food was bland, and they messed up our bill twice."
Stars: 1

Example 4:
Review: "Food tasted fine, portion sizes were okay, service was neither slow nor fast. Very average overall, nothing stood out."
Stars: 3

Example 5:
Review: "Fantastic! Fresh ingredients, fast service, fair prices, and the staff went out of their way to make us comfortable."
Stars: 5

Instructions:
- Use the examples as a guide to map sentiment to stars.
- predicted_stars must be an integer from 1 to 5.
- explanation should be 6–20 words and mention the main reason.
- Do NOT output anything other than the JSON object.
- Do NOT include code blocks.

Now classify this review:

"{review_text}"
"""


In [15]:
def run_prompt_2(review_text):
    """Fill the PROMPT_2 template with ``review_text`` and return the raw model reply."""
    # Goes through the same get_response helper as Prompt 1.
    return get_response(PROMPT_2.format(review_text=review_text))


In [16]:
# Single-review check of Prompt 2; the raw reply is shown as the cell output.
p2_demo_review = "The food was good but service was slow."
run_prompt_2(p2_demo_review)


'```json\n{\n  "predicted_stars": 3,\n  "explanation": "The good food is a positive, but the slow service significantly detracts from the experience."\n}\n```'

In [17]:
import time

# Run Prompt 2 over the evaluation set; a failed call is recorded as None.
total_rows = len(eval_df)
p2_outputs = []

for row_no, review in enumerate(eval_df["text"], start=1):
    print(f"Running Prompt 2 for row {row_no}/{total_rows}...")
    try:
        p2_outputs.append(run_prompt_2(review))
    except Exception as err:
        print("Error on row", row_no - 1, ":", err)
        p2_outputs.append(None)

    # respect free-tier rate limit
    time.sleep(7)

eval_df["p2_output"] = p2_outputs


Running Prompt 2 for row 1/50...
Running Prompt 2 for row 2/50...
Running Prompt 2 for row 3/50...
Running Prompt 2 for row 4/50...
Running Prompt 2 for row 5/50...
Running Prompt 2 for row 6/50...
Running Prompt 2 for row 7/50...
Running Prompt 2 for row 8/50...
Running Prompt 2 for row 9/50...
Running Prompt 2 for row 10/50...
Running Prompt 2 for row 11/50...
Running Prompt 2 for row 12/50...
Running Prompt 2 for row 13/50...
Running Prompt 2 for row 14/50...
Running Prompt 2 for row 15/50...
Running Prompt 2 for row 16/50...
Running Prompt 2 for row 17/50...
Running Prompt 2 for row 18/50...
Running Prompt 2 for row 19/50...
Running Prompt 2 for row 20/50...
Running Prompt 2 for row 21/50...
Running Prompt 2 for row 22/50...
Running Prompt 2 for row 23/50...
Running Prompt 2 for row 24/50...
Running Prompt 2 for row 25/50...
Running Prompt 2 for row 26/50...
Running Prompt 2 for row 27/50...
Running Prompt 2 for row 28/50...
Running Prompt 2 for row 29/50...
Running Prompt 2 for ro

In [18]:
# Parse each raw Prompt 2 reply once; every entry is a (payload, is_valid) pair.
p2_parsed = [parse_json_output(raw) for raw in eval_df["p2_output"]]
p2_valid_flags = [ok for _payload, ok in p2_parsed]
p2_pred_stars = [
    payload.get("predicted_stars", None) if ok else None
    for payload, ok in p2_parsed
]

eval_df["p2_valid_json"] = p2_valid_flags
eval_df["p2_predicted_stars"] = p2_pred_stars

# Accuracy = share of rows whose predicted stars equal the true stars;
# JSON validity = share of replies that parsed successfully.
p2_matches = eval_df["stars"] == eval_df["p2_predicted_stars"]
correct_p2 = p2_matches.sum()
accuracy_p2 = correct_p2 / len(eval_df)
json_validity_p2 = eval_df["p2_valid_json"].mean()

accuracy_p2, json_validity_p2


(0.52, 0.96)

In [19]:
# Prompt 3: rubric prompt. It spells out what each star level (1-5) means and
# asks for the same JSON format as the earlier prompts.
# The template is filled with str.format(review_text=...), which is why the
# literal JSON braces are doubled ({{ and }}).
PROMPT_3 = """
You are a Yelp Rating Evaluator. Use the rubric below and return ONLY valid JSON in EXACTLY this format:

{{
  "predicted_stars": <integer 1-5>,
  "explanation": "<one short sentence>"
}}

Rubric (how to choose the rating):
- 1 star  = Strong negative sentiment, serious complaints, very bad experience.
- 2 stars = Mostly negative, clear problems, but not completely horrible.
- 3 stars = Mixed or neutral; some positives and some negatives, or very weak sentiment.
- 4 stars = Mostly positive with minor issues.
- 5 stars = Strongly positive, enthusiastic, clear praise or recommendation.

Rules:
- Base your decision ONLY on the review text.
- If the sentiment is mixed, choose the rating that matches the dominant tone.
- predicted_stars must be an integer from 1 to 5.
- explanation (6–20 words) must mention the main reason (e.g., good food, bad service, slow staff).
- Do NOT output anything other than the JSON object.
- Do NOT include code blocks.

Now classify this review:

"{review_text}"
"""


In [20]:
def run_prompt_3(review_text):
    """Fill the PROMPT_3 template with ``review_text`` and return the raw model reply."""
    return get_response(PROMPT_3.format(review_text=review_text))


In [21]:
# Single-review check of Prompt 3; the raw reply is printed.
p3_demo_reply = run_prompt_3("The food was good but the service was slow.")
print(p3_demo_reply)


{
  "predicted_stars": 3,
  "explanation": "The good food was balanced by the negative experience of slow service."
}


In [22]:
import time

# Run Prompt 3 over the evaluation set; a failed call is recorded as None.
total_rows = len(eval_df)
p3_outputs = []

for row_no, review in enumerate(eval_df["text"], start=1):
    print(f"Running Prompt 3 for row {row_no}/{total_rows}...")
    try:
        p3_outputs.append(run_prompt_3(review))
    except Exception as err:
        print("Error on row", row_no - 1, ":", err)
        p3_outputs.append(None)

    # respect free-tier rate limit
    time.sleep(7)

eval_df["p3_output"] = p3_outputs


Running Prompt 3 for row 1/50...
Running Prompt 3 for row 2/50...
Running Prompt 3 for row 3/50...
Running Prompt 3 for row 4/50...
Running Prompt 3 for row 5/50...
Running Prompt 3 for row 6/50...
Running Prompt 3 for row 7/50...
Running Prompt 3 for row 8/50...
Running Prompt 3 for row 9/50...
Running Prompt 3 for row 10/50...
Running Prompt 3 for row 11/50...
Running Prompt 3 for row 12/50...
Running Prompt 3 for row 13/50...
Running Prompt 3 for row 14/50...
Running Prompt 3 for row 15/50...
Running Prompt 3 for row 16/50...
Running Prompt 3 for row 17/50...
Running Prompt 3 for row 18/50...
Running Prompt 3 for row 19/50...
Running Prompt 3 for row 20/50...
Running Prompt 3 for row 21/50...
Running Prompt 3 for row 22/50...
Running Prompt 3 for row 23/50...
Running Prompt 3 for row 24/50...
Running Prompt 3 for row 25/50...
Running Prompt 3 for row 26/50...
Running Prompt 3 for row 27/50...
Running Prompt 3 for row 28/50...
Running Prompt 3 for row 29/50...
Running Prompt 3 for ro

In [23]:
# Parse each raw Prompt 3 reply once; every entry is a (payload, is_valid) pair.
p3_parsed = [parse_json_output(raw) for raw in eval_df["p3_output"]]
p3_valid_flags = [ok for _payload, ok in p3_parsed]
p3_pred_stars = [
    payload.get("predicted_stars", None) if ok else None
    for payload, ok in p3_parsed
]

eval_df["p3_valid_json"] = p3_valid_flags
eval_df["p3_predicted_stars"] = p3_pred_stars

# Accuracy = share of rows whose predicted stars equal the true stars;
# JSON validity = share of replies that parsed successfully.
p3_matches = eval_df["stars"] == eval_df["p3_predicted_stars"]
correct_p3 = p3_matches.sum()
accuracy_p3 = correct_p3 / len(eval_df)
json_validity_p3 = eval_df["p3_valid_json"].mean()

accuracy_p3, json_validity_p3


(0.58, 1.0)

In [30]:
# Prompt 4: process prompt. It tells the model to work through sentiment,
# strength and cues internally, map them to stars with fixed rules, and output
# only the final JSON without revealing its reasoning.
# The template is filled with str.format(review_text=...), which is why the
# literal JSON braces are doubled ({{ and }}).
PROMPT_4 = """
You are a Yelp review rating classifier. Your job is to decide the correct rating (1–5 stars) STRICTLY based on the text.

Return ONLY valid JSON in EXACTLY this format:

{{
  "predicted_stars": <integer 1–5>,
  "explanation": "<short reason>"
}}

Follow this process:

1. Internally identify:
   - The sentiment (positive, mixed, negative)
   - Strength of emotion (weak / moderate / strong)
   - Any explicit praise or complaints
   - Any indicators of service, food, price, ambience

2. Internally decide the correct rating using these rules:
   - Strong negative emotion → 1 star
   - Mostly negative → 2 stars
   - Mixed or neutral → 3 stars
   - Mostly positive → 4 stars
   - Strongly positive → 5 stars

3. IMPORTANT: Do NOT reveal your reasoning. Only output the final JSON.

Rules:
- predicted_stars must be an integer 1–5.
- explanation must be 6–18 words summarizing the main reason.
- No code blocks, no markdown, no extra text.

Review:
"{review_text}"
"""


In [31]:
def run_prompt_4(review_text):
    """Fill the PROMPT_4 template with ``review_text`` and return the raw model reply."""
    return get_response(PROMPT_4.format(review_text=review_text))


In [35]:
# Single-review check of Prompt 4; the raw reply is printed.
p4_demo_review = "Definitely come for Happy hour! Prices are amazing, sake bombers for $3...Great atmosphere and wait staff was incredibly nice and right on to all of our needs, didn't have to ask for a thing They were always spot on...Place gets crowded in the evening especially if you plan on sitting outside. I only wish there were one in Apollo Beach or Brandon!"
print(run_prompt_4(p4_demo_review))


{"predicted_stars": 5, "explanation": "Amazing prices, great atmosphere, and incredibly attentive wait staff made for an excellent experience."}


In [33]:
import time

# Run Prompt 4 over the evaluation set; a failed call is recorded as None.
total_rows = len(eval_df)
p4_outputs = []

for row_no, review in enumerate(eval_df["text"], start=1):
    print(f"Running Prompt 4 for row {row_no}/{total_rows}...")
    try:
        p4_outputs.append(run_prompt_4(review))
    except Exception as err:
        print("Error on row", row_no - 1, ":", err)
        p4_outputs.append(None)

    # respect free-tier rate limit
    time.sleep(7)

eval_df["p4_output"] = p4_outputs



Running Prompt 4 for row 1/50...
Running Prompt 4 for row 2/50...
Running Prompt 4 for row 3/50...
Running Prompt 4 for row 4/50...
Running Prompt 4 for row 5/50...
Running Prompt 4 for row 6/50...
Running Prompt 4 for row 7/50...
Running Prompt 4 for row 8/50...
Running Prompt 4 for row 9/50...
Running Prompt 4 for row 10/50...
Running Prompt 4 for row 11/50...
Running Prompt 4 for row 12/50...
Running Prompt 4 for row 13/50...
Running Prompt 4 for row 14/50...
Running Prompt 4 for row 15/50...
Running Prompt 4 for row 16/50...
Running Prompt 4 for row 17/50...
Running Prompt 4 for row 18/50...
Running Prompt 4 for row 19/50...
Running Prompt 4 for row 20/50...
Running Prompt 4 for row 21/50...
Running Prompt 4 for row 22/50...
Running Prompt 4 for row 23/50...
Running Prompt 4 for row 24/50...
Running Prompt 4 for row 25/50...
Running Prompt 4 for row 26/50...
Running Prompt 4 for row 27/50...
Running Prompt 4 for row 28/50...
Running Prompt 4 for row 29/50...
Running Prompt 4 for ro

In [34]:
# Parse each raw Prompt 4 reply once; every entry is a (payload, is_valid) pair.
p4_parsed = [parse_json_output(raw) for raw in eval_df["p4_output"]]
p4_valid_flags = [ok for _payload, ok in p4_parsed]
p4_pred_stars = [
    payload.get("predicted_stars", None) if ok else None
    for payload, ok in p4_parsed
]

eval_df["p4_valid_json"] = p4_valid_flags
eval_df["p4_predicted_stars"] = p4_pred_stars

# Accuracy = share of rows whose predicted stars equal the true stars;
# JSON validity = share of replies that parsed successfully.
p4_matches = eval_df["stars"] == eval_df["p4_predicted_stars"]
correct_p4 = p4_matches.sum()
accuracy_p4 = correct_p4 / len(eval_df)
json_validity_p4 = eval_df["p4_valid_json"].mean()

accuracy_p4, json_validity_p4


(0.58, 1.0)

In [36]:
# Collect the per-prompt metrics into one dict for side-by-side comparison.
prompt_names = ["Prompt 1", "Prompt 2", "Prompt 3", "Prompt 4"]
accuracy_scores = [accuracy_p1, accuracy_p2, accuracy_p3, accuracy_p4]
validity_scores = [json_validity_p1, json_validity_p2, json_validity_p3, json_validity_p4]

summary = {"Prompt": prompt_names, "Accuracy": accuracy_scores, "JSON_validity": validity_scores}
summary


{'Prompt': ['Prompt 1', 'Prompt 2', 'Prompt 3', 'Prompt 4'],
 'Accuracy': [0.56, 0.52, 0.58, 0.58],
 'JSON_validity': [1.0, 0.96, 1.0, 1.0]}