In [1]:
pip install openai


Active code page: 1252
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\satya\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [2]:
import pandas as pd
import json
import re
from tqdm import tqdm


In [3]:
from openai import OpenAI
import os

# Model identifier sent with every chat-completion request.
MODEL_NAME = "mistralai/mistral-7b-instruct"

# Point the OpenAI SDK client at the OpenRouter endpoint. The API key is read
# from the OPENROUTER_API_KEY environment variable rather than being hardcoded.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ.get("OPENROUTER_API_KEY"),
)


CELL 3 — Load Dataset

In [4]:
# Load the processed review CSV and keep only the two columns used below.
# Built as a single non-mutating chain: the previous version took a column
# subset and then modified it in place (dropna(inplace=True) + column
# assignment), which is the pattern that triggers SettingWithCopyWarning and
# makes the cell unsafe to re-run.
# NOTE(review): the path is machine-specific; consider a configurable DATA_DIR.
df = (
    pd.read_csv(
        r"C:\Users\satya\Downloads\fynd-ai-task1-rating-prediction\task\data\processed\yelp_processed.csv"
    )
    .loc[:, ["review_text", "stars"]]
    .dropna()                  # drop rows missing either the text or the label
    .astype({"stars": int})    # labels are compared against integer predictions
)

df.head()


Unnamed: 0,review_text,stars
0,Meh.\n\nPlus for cool decor. Minus for being i...,3
1,"Very consistent, thin crust pizza, made with f...",4
2,While we were in Mesa AZ for a holiday we foun...,5
3,Exactly what a hole in the wall Pho place shou...,5
4,I used to work near here and we got take out a...,4


CELL 4 — Prompt Loader

In [5]:
def load_prompt(prompt_file, review_text):
    """Read a prompt template and insert the review into it.

    Args:
        prompt_file: Path of the UTF-8 template file to open.
        review_text: Text substituted for every ``{{review_text}}`` marker.

    Returns:
        The template contents with all markers replaced.
    """
    placeholder = "{{review_text}}"
    with open(prompt_file, mode="r", encoding="utf-8") as handle:
        return handle.read().replace(placeholder, review_text)


CELL 5 — JSON Response Parser

In [12]:
import json
import re

def safe_parse_json(text):
    """Parse a model reply into a prediction record.

    Markdown code fences (three backticks, optionally followed by ``json``)
    are stripped before parsing. The reply counts as valid only when it
    decodes to a JSON object containing both ``predicted_stars`` and
    ``explanation`` and the star value converts to ``int``.

    Args:
        text: Raw reply text, or ``None`` when the model returned no content.

    Returns:
        A dict with keys ``predicted_stars``, ``explanation`` and
        ``valid_json``. On any failure (including ``text`` being ``None``)
        the first two are ``None`` and ``valid_json`` is ``False``. A dict is
        always returned so callers can subscript the result unconditionally.
    """
    if text is not None:
        try:
            # Remove markdown/code block wrappers if present.
            cleaned = re.sub(r"```json|```", "", text).strip()
            data = json.loads(cleaned)

            if (
                isinstance(data, dict)
                and "predicted_stars" in data
                and "explanation" in data
            ):
                return {
                    "predicted_stars": int(data["predicted_stars"]),
                    "explanation": data["explanation"],
                    "valid_json": True,
                }
        except Exception:
            # Deliberate best-effort: malformed replies are counted as
            # invalid rather than aborting the evaluation run.
            pass

    # Fallback record, built fresh on each call so callers may mutate it.
    return {
        "predicted_stars": None,
        "explanation": None,
        "valid_json": False,
    }


CELL 6 — Single-Review Prediction (model call)

In [14]:
def predict_single_review(review_text, prompt_file):
    """Send one review to the model and return the parsed prediction.

    The prompt is built with ``load_prompt``, sent through the module-level
    ``client`` using ``MODEL_NAME`` at temperature 0, and the reply text is
    handed to ``safe_parse_json``.

    Args:
        review_text: The review to rate.
        prompt_file: Prompt template file name passed to ``load_prompt``.

    Returns:
        A dict with keys ``predicted_stars``, ``explanation`` and
        ``valid_json``. If the response carries no choices or no content,
        the fallback record (``None``, ``None``, ``False``) is returned
        instead of ``None``, so callers can always subscript the result.

    Note:
        Errors raised by the API call itself are not caught here.
    """
    prompt = load_prompt(prompt_file, review_text)

    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {
                "role": "system",
                "content": "You are a strict JSON-only response generator."
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        temperature=0
    )

    # Guard against a response with no choices before indexing into it.
    choices = response.choices
    output_text = choices[0].message.content if choices else None

    parsed = safe_parse_json(output_text)
    if parsed is None:
        # The parser may hand back None for missing content; normalise it
        # to the same fallback record used for unparseable replies.
        return {
            "predicted_stars": None,
            "explanation": None,
            "valid_json": False
        }
    return parsed


In [16]:
# Smoke test: run the structured prompt on the first review in the dataset.
test_review = df["review_text"].iloc[0]
result = predict_single_review(test_review, "prompt_v3_structured.txt")

# FINAL OUTPUT AS PER ASSIGNMENT FORMAT
# Only the two assignment fields are kept; the valid_json flag is dropped.
final_output = {
    field: result[field]
    for field in ("predicted_stars", "explanation")
}

print(final_output)


{'predicted_stars': 3, 'explanation': 'The review is mixed, with positive notes on decor and some menu items, but overall lukewarm satisfaction.'}


CELL 7 — Run ALL 3 Prompts

In [38]:
# Folder holding the prompt template files; load_prompt joins the requested
# file name onto this directory.
# NOTE(review): absolute, machine-specific path; it will not resolve on another machine.
PROMPT_DIR = r"C:\Users\satya\Downloads\fynd-ai-task1-rating-prediction\task\prompts"


In [19]:
import os

def load_prompt(prompt_file, review_text):
    """Build the prompt for one review from a template stored in PROMPT_DIR.

    Args:
        prompt_file: Template file name, resolved relative to ``PROMPT_DIR``.
        review_text: Text substituted for every ``{{review_text}}`` marker.

    Returns:
        The template contents with all markers replaced by ``review_text``.
    """
    with open(os.path.join(PROMPT_DIR, prompt_file), "r", encoding="utf-8") as source:
        prompt_template = source.read()

    filled_prompt = prompt_template.replace("{{review_text}}", review_text)
    return filled_prompt


In [20]:
import json
import re

def safe_parse_json(text):
    """Convert a raw model reply into a prediction record.

    Code-fence markers (three backticks, optionally followed by ``json``) are
    removed, the remainder is decoded as JSON, and the reply is accepted only
    if it is an object with both ``predicted_stars`` and ``explanation`` whose
    star value converts to ``int``.

    Args:
        text: Reply text from the model, or ``None`` if there was none.

    Returns:
        Always a dict with ``predicted_stars``, ``explanation`` and
        ``valid_json``. Invalid, unparseable or missing replies yield
        ``None`` / ``None`` / ``False``.
    """
    invalid = {
        "predicted_stars": None,
        "explanation": None,
        "valid_json": False
    }

    # Return the fallback record (not bare None) so callers that subscript
    # the result do not fail on an empty reply.
    if text is None:
        return invalid

    try:
        data = json.loads(re.sub(r"```json|```", "", text).strip())
        # Require a JSON object: `in` on a decoded string would be a
        # substring test, and on a list an element test.
        if isinstance(data, dict) and "predicted_stars" in data and "explanation" in data:
            return {
                "predicted_stars": int(data["predicted_stars"]),
                "explanation": data["explanation"],
                "valid_json": True
            }
    except Exception:
        # Best-effort parsing; catching Exception (not a bare except) keeps
        # KeyboardInterrupt / SystemExit able to stop a long run.
        pass

    return invalid


In [21]:
from tqdm import tqdm
import pandas as pd

def run_prompt(prompt_file, sample_size=10):
    """Evaluate one prompt template on a fixed random sample of reviews.

    Rows are drawn from the module-level ``df`` with ``random_state=42``, so
    repeated calls on the same ``df`` with the same ``sample_size`` score the
    same reviews.

    Args:
        prompt_file: Prompt template file name forwarded to
            ``predict_single_review``.
        sample_size: Number of reviews to score. Values larger than the
            dataset are clamped to ``len(df)`` instead of raising.

    Returns:
        A DataFrame with columns ``actual_stars``, ``predicted_stars``,
        ``explanation`` and ``valid_json``, one row per sampled review. The
        columns are present even when no rows were sampled.
    """
    columns = ["actual_stars", "predicted_stars", "explanation", "valid_json"]

    # DataFrame.sample raises when asked for more rows than exist
    # (sampling without replacement), so cap the request.
    sample_df = df.sample(min(sample_size, len(df)), random_state=42)

    results = []
    for _, row in tqdm(sample_df.iterrows(), total=len(sample_df)):
        # A missing prediction is recorded as an invalid row rather than
        # crashing the whole run on a subscript of None.
        pred = predict_single_review(row["review_text"], prompt_file) or {}

        results.append({
            "actual_stars": row["stars"],
            "predicted_stars": pred.get("predicted_stars"),
            "explanation": pred.get("explanation"),
            "valid_json": bool(pred.get("valid_json", False))
        })

    return pd.DataFrame(results, columns=columns)


In [22]:
# Score each prompt variant on a 10-review sample; keys are the display names
# used in the comparison table.
results = {
    label: run_prompt(template_name, sample_size=10)
    for label, template_name in {
        "Zero-Shot": "prompt_v1_zero_shot.txt",
        "Few-Shot": "prompt_v2_few_shot.txt",
        "Structured": "prompt_v3_structured.txt",
    }.items()
}


100%|██████████| 10/10 [00:19<00:00,  1.94s/it]
100%|██████████| 10/10 [00:11<00:00,  1.11s/it]
100%|██████████| 10/10 [00:15<00:00,  1.57s/it]


In [23]:
def evaluate_results(df):
    """Compute exact-match accuracy and JSON validity rate for one run.

    Args:
        df: Result frame with columns ``actual_stars``, ``predicted_stars``
            and ``valid_json``.

    Returns:
        A tuple ``(accuracy, json_validity)``. Accuracy is the share of rows
        whose prediction equals the actual rating; rows with a missing
        prediction count as wrong. JSON validity is the mean of
        ``valid_json``. Both are 0 for an empty frame.
    """
    total = len(df)
    if total == 0:
        # The mean of an empty column is NaN; report 0 for both metrics so
        # the two values stay consistent.
        return 0, 0

    correct = (df["actual_stars"] == df["predicted_stars"]).sum()
    accuracy = correct / total

    json_validity = df["valid_json"].mean()  # True = 1, False = 0

    return accuracy, json_validity


In [24]:
# One metrics row per prompt variant, rounded to three decimals for display.
summary = []

for prompt_name in results:
    df_result = results[prompt_name]
    acc, json_rate = evaluate_results(df_result)
    summary += [
        {
            "Prompt Type": prompt_name,
            "Accuracy": round(acc, 3),
            "JSON Validity Rate": round(json_rate, 3),
        }
    ]

comparison_df = pd.DataFrame(summary)
comparison_df


Unnamed: 0,Prompt Type,Accuracy,JSON Validity Rate
0,Zero-Shot,0.4,0.4
1,Few-Shot,0.5,0.5
2,Structured,0.9,1.0


In [37]:
# Save the comparison table to the current working directory; index=False
# leaves the DataFrame row index out of the CSV.
comparison_df.to_csv("task1_prompt_comparison.csv", index=False)


In [43]:
import os

# Sanity check: confirm the prompt folder resolves and list the files in it.
PROMPT_DIR = r"C:\Users\satya\Downloads\fynd-ai-task1-rating-prediction\task\prompts"

for label, probe in (
    ("Prompt dir exists:", os.path.exists),
    ("Files:", os.listdir),
):
    print(label, probe(PROMPT_DIR))


Prompt dir exists: True
Files: ['prompt_v1_zero_shot.txt', 'prompt_v2_few_shot.txt', 'prompt_v3_structured.txt']


In [44]:
# Re-run every prompt variant, keyed by version tag; these keys are the ones
# used when the outputs are saved to JSON.
results = {
    tag: run_prompt(template_name, sample_size=10)
    for tag, template_name in (
        ("v1", "prompt_v1_zero_shot.txt"),
        ("v2", "prompt_v2_few_shot.txt"),
        ("v3", "prompt_v3_structured.txt"),
    )
}


100%|██████████| 10/10 [00:23<00:00,  2.36s/it]
100%|██████████| 10/10 [00:12<00:00,  1.26s/it]
100%|██████████| 10/10 [00:18<00:00,  1.84s/it]


In [47]:
import math
import json
import os

# Folder (relative to the working directory) where save_clean_json writes its files.
OUTPUT_DIR = "task/outputs"
# exist_ok=True makes re-running this cell safe when the folder already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def save_clean_json(df, filename):
    """Write the usable predictions from a result frame to a JSON file.

    A row is kept when ``predicted_stars`` is not missing and its explanation
    is not the literal ``"Fallback applied."``. Kept rows are written as a
    list of ``{"predicted_stars": int, "explanation": ...}`` objects to
    ``OUTPUT_DIR/filename``, and a one-line summary is printed.

    Args:
        df: Result frame with ``predicted_stars`` and ``explanation`` columns.
        filename: Name of the JSON file to create inside ``OUTPUT_DIR``.
    """
    clean_rows = [
        {
            "predicted_stars": int(row["predicted_stars"]),
            "explanation": row["explanation"],
        }
        for _, row in df.iterrows()
        # Skip NaN or fallback rows.
        # NOTE(review): no code visible in this notebook emits the
        # "Fallback applied." sentinel (fallback explanations are None);
        # kept for compatibility with older frames. TODO confirm and remove.
        if pd.notna(row["predicted_stars"])
        and row["explanation"] != "Fallback applied."
    ]

    # Create the folder here as well, so the function does not depend on the
    # setup cell having run in the current working directory.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    with open(os.path.join(OUTPUT_DIR, filename), "w", encoding="utf-8") as f:
        json.dump(clean_rows, f, indent=2)

    print(f"Saved {len(clean_rows)} clean records to {filename}")


In [49]:
# Write one JSON file of clean predictions per prompt version.
for version, output_name in (
    ("v1", "prompt_v1_outputs.json"),
    ("v2", "prompt_v2_outputs.json"),
    ("v3", "prompt_v3_outputs.json"),
):
    save_clean_json(results[version], output_name)


Saved 6 clean records to prompt_v1_outputs.json
Saved 5 clean records to prompt_v2_outputs.json
Saved 10 clean records to prompt_v3_outputs.json
