In [9]:
# =================================================================================
# Yelp Reviews - Prompting Experiments (Colab single cell)
# - Loads Kaggle dataset (omkarsabnis/yelp-reviews-dataset) or accepts user-uploaded CSV
# - Samples SAMPLE_SIZE rows (~200 recommended)
# - Implements 3 prompting strategies (Direct, Few-Shot, Chain-of-Thought)
# - Runs each strategy RUNS_PER_PROMPT times (3 recommended) for reliability measurement
# - Computes: Accuracy, MAE, JSON Validity Rate, Reliability / Consistency
# - Saves results and prints a comparison table + short discussion
# =================================================================================

# ---------------------------
# Standard library imports
# ---------------------------
import os
import sys
import json
import re
import time
import statistics
import tempfile
from collections import Counter, defaultdict
from typing import List, Dict, Tuple

# ---------------------------
# Third-party imports
# ---------------------------
# (Installs are quiet; Colab will keep them if already installed)
try:
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.metrics import accuracy_score, confusion_matrix
    from tqdm.notebook import tqdm
except Exception:
    !pip install -q pandas numpy matplotlib seaborn scikit-learn tqdm
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.metrics import accuracy_score, confusion_matrix
    from tqdm.notebook import tqdm

sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (10, 5)

# ---------------------------
# Optional: Gemini (Google) config
# If you want to use Google Gemini, set GEMINI_API_KEY environment or provide when prompted.
# If you prefer another LLM, adapt the `call_model` function below.
# ---------------------------
USE_GEMINI = True
try:
    import google.generativeai as genai
except Exception:
    # If google generative isn't installed, install it but continue gracefully.
    !pip install -q google-generativeai
    import google.generativeai as genai

# ---------------------------
# Settings (tuneable)
# ---------------------------
SAMPLE_SIZE = 20          # number of reviews evaluated; the header recommends ~200, currently set to 20
RUNS_PER_PROMPT = 1        # independent runs per prompt; with a single run the agreement metrics are trivially 100%
DELAY_BETWEEN_CALLS = 0.15  # seconds slept after every review to pace LLM calls
MAX_RETRIES = 3            # attempts per API call inside call_model(); also reused as the parse-retry count in run_experiments()
MODEL_NAME = "gemini-2.5-flash-lite"  # Gemini model id passed to genai.GenerativeModel (change if needed)

# ---------------------------
# Helper: secure API input for Gemini
# ---------------------------
def configure_gemini():
    """
    Point google.generativeai at an API key.

    The key comes from the GEMINI_API_KEY environment variable when it is set
    and non-empty; otherwise the user is asked for it (hidden prompt first,
    plain prompt if the hidden one fails).

    Raises:
        ValueError: if no key was obtained from either source.
    """
    def _ask_for_key():
        # Prefer a hidden prompt; fall back to a visible one if that is unavailable.
        try:
            from getpass import getpass
            return getpass("Paste your Google Gemini API Key (input hidden): ")
        except Exception:
            return input("Paste your Google Gemini API Key: ")

    secret = os.environ.get("GEMINI_API_KEY") or _ask_for_key()
    if secret:
        genai.configure(api_key=secret)
        print(f"Gemini configured for model: {MODEL_NAME}")
        return
    raise ValueError("Gemini API key was not provided.")

# ---------------------------
# Helper: model call abstraction
# ---------------------------
def call_model(prompt: str) -> str:
    """
    Send a prompt to the configured model and return its raw text reply.

    With USE_GEMINI enabled the prompt goes to Gemini with a JSON response
    MIME type requested. This strongly encourages, but cannot by itself
    guarantee, output matching the schema described in the prompt, so callers
    should still parse defensively.

    With USE_GEMINI disabled a keyword heuristic produces a canned JSON reply
    (debugging aid only).

    Args:
        prompt: Fully formatted prompt text.

    Returns:
        The model's reply as a stripped string (expected to be JSON).

    Raises:
        RuntimeError: if every one of the MAX_RETRIES attempts failed; the last
            underlying exception is attached as ``__cause__``.
    """
    if not USE_GEMINI:
        # Safe offline fallback (for debugging only).
        text = prompt.lower()
        # Only inspect the review itself. The templates place it after a
        # "Review:" line; scanning the whole prompt would match the few-shot
        # examples ("Best", "amazing", "Worst") and skew every answer.
        _, marker, review_part = text.partition("\nreview:\n")
        if marker:
            text = review_part
        if any(word in text for word in ("best", "love", "amazing")):
            return json.dumps({"predicted_stars": 5, "explanation": "Positive sentiment detected."})
        if any(word in text for word in ("worst", "horrible", "disgusting", "terrible")):
            return json.dumps({"predicted_stars": 1, "explanation": "Strong negative sentiment detected."})
        return json.dumps({"predicted_stars": 3, "explanation": "Neutral sentiment detected."})

    model = genai.GenerativeModel(MODEL_NAME)

    last_error = None
    for attempt in range(MAX_RETRIES):
        try:
            response = model.generate_content(
                prompt,
                generation_config={
                    "temperature": 0.2,
                    # Ask the API for a JSON-typed response body.
                    "response_mime_type": "application/json",
                },
            )

            if hasattr(response, "text") and response.text:
                return response.text.strip()

            # Fallback if SDK returns structured candidates
            if hasattr(response, "candidates") and response.candidates:
                return response.candidates[0].content.parts[0].text.strip()

            return str(response)

        except Exception as exc:
            # Remember why this attempt failed so the final error is diagnosable,
            # then back off linearly before the next attempt.
            last_error = exc
            time.sleep(0.5 * (attempt + 1))

    raise RuntimeError("LLM call failed after retries.") from last_error

# ---------------------------
# JSON cleaning & parsing utilities
# ---------------------------
def extract_json_object(text: str) -> Dict:
    """
    Extract a JSON object from the model's text output.

    Steps:
      1. Strip markdown code fences.
      2. Take the span from the first '{' to the last '}' (or the whole text
         if there is none).
      3. Parse it as-is. Only if that fails, retry with single quotes turned
         into double quotes. Doing the replacement first would corrupt valid
         JSON whose strings contain apostrophes (e.g. "It's great").
      4. As a last resort, salvage just the ``predicted_stars`` digit by regex.

    Args:
        text: Raw model output.

    Returns:
        The parsed JSON value (a dict for well-formed replies), or a minimal
        dict with ``predicted_stars`` and a fixed ``explanation`` when only the
        regex salvage succeeded.

    Raises:
        ValueError: if nothing usable could be recovered.
    """
    # Remove common markdown fences
    cleaned = re.sub(r"```(?:json)?", "", text, flags=re.IGNORECASE).strip()
    # Attempt to locate first {...} block
    match = re.search(r"\{.*\}", cleaned, flags=re.DOTALL)
    candidate = match.group(0) if match else cleaned

    # Some models use single quotes; this variant is tried second so that
    # apostrophes inside valid JSON strings are left untouched.
    normalised = candidate.replace("'", '"')

    for attempt in (candidate, normalised):
        try:
            return json.loads(attempt)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue

    # Try to salvage minimal predicted_stars using regex
    m = re.search(r'[""]?predicted_stars[""]?\s*[:=]\s*(\d)', normalised)
    if m:
        return {"predicted_stars": int(m.group(1)), "explanation": "Recovered via regex due to malformed JSON."}
    raise ValueError("Unable to parse JSON from model output. Raw output truncated: " + cleaned[:200])

# ---------------------------
# Prompt definitions (3 approaches) with STRICT JSON enforcement
# ---------------------------

# Approach A: Direct instruction.
# The template is filled with str.format(review=...), so literal braces in the
# schema are written as {{ and }} and {review} is the only placeholder.
PROMPT_DIRECT = """
You are a strict JSON-only API.

Task:
Given a Yelp review, predict the star rating from 1 to 5.

Rules:
- Output ONLY valid JSON.
- Do NOT add markdown.
- Do NOT add comments.
- Do NOT add extra text.
- The output MUST match this schema exactly:

{{
  "predicted_stars": 1-5 integer,
  "explanation": "one short sentence"
}}

Review:
"{review}"

Return JSON only.
"""

# One-line description printed before the Direct experiment runs.
PROMPT_DIRECT_EXPL = "Direct instruction with hard JSON schema enforcement to eliminate formatting errors."

# Approach B: Few-shot.
# Three worked examples (1, 3 and 5 stars) precede the review to classify.
# Braces in the example outputs are doubled for str.format, as above.
PROMPT_FEWSHOT = """
You are a strict JSON-only classification API.

Here are examples:

Input: "Worst food ever, cold and tasteless."
Output:
{{"predicted_stars": 1, "explanation": "Extremely negative dining experience."}}

Input: "It was okay, edible but boring."
Output:
{{"predicted_stars": 3, "explanation": "Average experience with no strong positives or negatives."}}

Input: "Absolutely amazing! Best night ever."
Output:
{{"predicted_stars": 5, "explanation": "Outstanding experience with very positive sentiment."}}

Now classify the review below.

Rules:
- Output ONLY valid JSON.
- Follow the exact key names.
- No markdown.
- No extra text.

Review:
"{review}"
"""

# One-line description printed before the FewShot experiment runs.
PROMPT_FEWSHOT_EXPL = "Few-shot prompt with explicit JSON-only constraint and structured examples."

# Approach C: Chain-of-Thought.
# Asks for a "reasoning" field alongside "predicted_stars".
# NOTE(review): the "Task" sentence ends with "return:" but nothing is listed
# before the "Rules" section; the text is left untouched here because it is
# sent to the model verbatim.
PROMPT_COT = """
You are a strict JSON-only reasoning API.

Task:
Analyze the review internally and return:

Rules:
- Output ONLY valid JSON.
- Keys MUST be exactly:
  - "reasoning"
  - "predicted_stars"
- No markdown.
- No extra text.

Schema:
{{
  "reasoning": "brief step-by-step sentiment reasoning",
  "predicted_stars": 1-5 integer
}}

Review:
"{review}"
"""

# One-line description printed before the ChainOfThought experiment runs.
PROMPT_COT_EXPL = "Chain-of-thought with forced JSON schema to preserve reasoning and machine readability."

# (approach name, prompt template, description) triples, iterated in this order
# by run_experiments(); the name also appears in the per-approach output filename.
PROMPT_VERSIONS = [
    ("Direct", PROMPT_DIRECT, PROMPT_DIRECT_EXPL),
    ("FewShot", PROMPT_FEWSHOT, PROMPT_FEWSHOT_EXPL),
    ("ChainOfThought", PROMPT_COT, PROMPT_COT_EXPL),
]

# ---------------------------
# Load dataset: Try Kaggle API download first; otherwise ask user to upload CSV
# Dataset: omkarsabnis/yelp-reviews-dataset (Kaggle)
# ---------------------------
def load_kaggle_dataset(sample_size: int = SAMPLE_SIZE) -> pd.DataFrame:
    """
    Load the Yelp reviews CSV and return it shuffled.

    Sources are tried in this order:
      1. A file named 'yelp.csv' in the working directory.
      2. A Kaggle CLI download of omkarsabnis/yelp-reviews-dataset, attempted
         only when Kaggle credentials are present (~/.kaggle/kaggle.json or
         the KAGGLE_USERNAME / KAGGLE_KEY environment variables).
      3. An interactive upload through google.colab.files (Colab only).

    Args:
        sample_size: Not used by the loader itself; accepted so existing
            callers keep working. Down-sampling is done by the caller.

    Returns:
        The full dataset with rows lacking 'text' or 'stars' dropped, shuffled
        with random_state=42 and re-indexed from 0.

    Raises:
        FileNotFoundError: if none of the three sources produced a dataset.
        KeyError: if a local 'yelp.csv' lacks the 'text' or 'stars' column.
    """
    # Function-scope imports: the file's import section does not provide these.
    import shutil
    import subprocess

    kaggle_dataset_ref = "omkarsabnis/yelp-reviews-dataset"  # dataset identifier

    def _prepare(csv_path: str) -> pd.DataFrame:
        # Read, drop incomplete rows, shuffle deterministically.
        frame = pd.read_csv(csv_path).dropna(subset=["text", "stars"])
        return frame.sample(frac=1, random_state=42).reset_index(drop=True)

    # 1) Local file already present in the working directory
    if os.path.isfile("yelp.csv"):
        print("Found local 'yelp.csv' -- using that.")
        return _prepare("yelp.csv")

    # 2) Kaggle CLI (requires kaggle.json or env vars)
    kaggle_token_exists = os.path.exists(os.path.expanduser("~/.kaggle/kaggle.json")) or \
                          ("KAGGLE_USERNAME" in os.environ and "KAGGLE_KEY" in os.environ)

    if kaggle_token_exists:
        # A shell magic never raises when the command is missing, so the CLI's
        # presence is checked explicitly and it is installed only when absent.
        if shutil.which("kaggle") is None:
            subprocess.run(
                [sys.executable, "-m", "pip", "install", "-q", "kaggle"],
                check=False,
            )

        tmpdir = tempfile.mkdtemp()
        try:
            print("Attempting to download dataset from Kaggle into:", tmpdir)
            # check=True turns a failed download into an exception handled below.
            subprocess.run(
                ["kaggle", "datasets", "download", "-d", kaggle_dataset_ref,
                 "-p", tmpdir, "--unzip", "-q"],
                check=True,
            )
            # Collect every CSV inside tmpdir
            csv_files = [
                os.path.join(root, fname)
                for root, _, fnames in os.walk(tmpdir)
                for fname in fnames
                if fname.lower().endswith(".csv")
            ]
            if not csv_files:
                raise FileNotFoundError("No CSV found in downloaded Kaggle dataset.")
            # Prefer the first CSV that has both 'text' and 'stars' columns
            chosen = None
            for path in csv_files:
                try:
                    peek = pd.read_csv(path, nrows=5)
                except Exception:
                    # Unreadable file: skip it and keep looking.
                    continue
                if {'text', 'stars'}.issubset(peek.columns):
                    chosen = path
                    break
            if chosen is None:
                chosen = csv_files[0]
            df = _prepare(chosen)
            print(f"Loaded dataset CSV: {chosen}")
            return df
        except Exception as e:
            # Best effort: report and fall through to the upload fallback.
            print("Kaggle download failed:", str(e))
        finally:
            # The data is already in memory (or the attempt failed); do not
            # leave the temporary download directory behind.
            shutil.rmtree(tmpdir, ignore_errors=True)

    # 3) Fallback: ask the user to upload a file manually (Colab)
    print("Please upload 'yelp.csv' (the Kaggle CSV) into the Colab session or place it in working dir.")
    try:
        from google.colab import files
    except ImportError:
        files = None  # not running inside Colab

    if files is not None:
        try:
            uploaded = files.upload()
            # pick first CSV
            for name in uploaded:
                if name.lower().endswith(".csv"):
                    return _prepare(name)
        except Exception as e:
            # Report instead of silently swallowing, then raise the summary error below.
            print("Upload fallback failed:", str(e))

    # As last fallback, raise informative error
    raise FileNotFoundError(
        "Dataset not found. Please either:\n"
        "1) Place 'yelp.csv' in the working directory (must have 'text' and 'stars' columns), OR\n"
        "2) Provide Kaggle credentials (kaggle.json) and allow the script to download omkarsabnis/yelp-reviews-dataset, OR\n"
        "3) Upload a CSV via Colab file upload."
    )

# ---------------------------
# Evaluation utilities
# ---------------------------
def evaluate_predictions(y_true: List[int], y_pred: List[int]) -> Tuple[float, float]:
    """
    Score predicted star ratings against the true ones.

    Returns a pair: (accuracy, mean absolute error).
    """
    accuracy = accuracy_score(y_true, y_pred)
    abs_errors = np.abs(np.asarray(y_true) - np.asarray(y_pred))
    return accuracy, float(abs_errors.mean())

def compute_json_validity_rate(parsed_results: List[Tuple[bool, dict]]) -> float:
    """
    Fraction of entries whose first element (the validity flag) is truthy.

    parsed_results holds (is_valid_json, parsed_dict_or_none) pairs.
    An empty list yields 0.0 because the denominator is floored at 1.
    """
    total = len(parsed_results)
    valid_flags = [flag for flag, _ in parsed_results if flag]
    return len(valid_flags) / max(1, total)

def reliability_score(preds_per_run: List[List[int]]) -> float:
    """
    Fraction of samples on which every run produced the same prediction.

    Args:
        preds_per_run: One list of predictions per run; position ``k`` in each
            list refers to the same sample. The sample count is taken from the
            first run.

    Returns:
        A value in [0, 1]. When there are no runs or no samples there is
        nothing to disagree about, so 1.0 is returned (the same convention
        pairwise_agreement uses when it has no pairs to compare).
    """
    if not preds_per_run or not preds_per_run[0]:
        return 1.0

    n = len(preds_per_run[0])
    # A sample counts as agreed when the set of its per-run predictions has one element.
    agree_count = sum(
        1 for idx in range(n)
        if len({run[idx] for run in preds_per_run}) == 1
    )
    return agree_count / n

def pairwise_agreement(preds_per_run: List[List[int]]) -> float:
    """
    Average pairwise percent agreement between runs.

    For every unordered pair of runs, compute the fraction of samples on which
    the two runs gave the same prediction, then average over all pairs. This
    is simple percent agreement, not a Dice coefficient.

    Args:
        preds_per_run: One list of predictions per run; position ``k`` in each
            list refers to the same sample. The sample count is taken from the
            first run.

    Returns:
        A value in [0, 1]; 1.0 when there are fewer than two runs or no samples.
    """
    if len(preds_per_run) < 2 or not preds_per_run[0]:
        return 1.0

    from itertools import combinations

    n = len(preds_per_run[0])
    pair_scores = [
        sum(1 for k in range(n) if first[k] == second[k]) / n
        for first, second in combinations(preds_per_run, 2)
    ]
    return sum(pair_scores) / len(pair_scores)

# ---------------------------
# Main experiment orchestration
# ---------------------------
def _stratified_sample(df: pd.DataFrame, sample_size: int, seed: int = 42) -> pd.DataFrame:
    """Return at most ``sample_size`` rows, balanced across star ratings where possible."""
    if len(df) <= sample_size:
        return df.copy()

    df = df.reset_index(drop=True)  # unique index so picked rows can be excluded below
    per_class = max(1, sample_size // df["stars"].nunique())
    picked = pd.concat([
        group.sample(n=min(len(group), per_class), random_state=seed)
        for _, group in df.groupby("stars")
    ])

    if len(picked) > sample_size:
        # More classes than requested rows: trim.
        picked = picked.sample(n=sample_size, random_state=seed)
    elif len(picked) < sample_size:
        # The per-class quota rounded down or a class was too small: top up
        # from the rows not yet chosen so the requested size is still met.
        remaining = df.drop(index=picked.index)
        top_up = remaining.sample(n=sample_size - len(picked), random_state=seed)
        picked = pd.concat([picked, top_up])

    return picked.sample(frac=1, random_state=seed).reset_index(drop=True)


def _coerce_star_rating(parsed, raw_output: str) -> int:
    """Turn a parsed model reply into an integer rating in 1..5 (3 when undeterminable)."""
    if isinstance(parsed, dict):
        for key in ("predicted_stars", "stars"):
            if key in parsed:
                try:
                    return max(1, min(5, int(parsed[key])))
                except (TypeError, ValueError, OverflowError):
                    return 3
    # No usable key: look for a phrase such as "4 stars" in the raw text.
    m = re.search(r'(\d)\s*(?:stars|star)\b', raw_output.lower())
    return max(1, min(5, int(m.group(1)))) if m else 3


def run_experiments(sample_size: int = SAMPLE_SIZE):
    """
    Run every prompt in PROMPT_VERSIONS over a sample of reviews and compare them.

    For each prompt the sample is classified RUNS_PER_PROMPT times. Per prompt
    the function computes accuracy and MAE of the consensus (mode) prediction,
    the average JSON validity rate, exact and pairwise agreement across runs,
    and the mean per-sample standard deviation. It writes
    'predictions_<Approach>.csv' for each prompt and prints a comparison table
    plus a short discussion.

    Args:
        sample_size: Maximum number of reviews to evaluate.

    Returns:
        A DataFrame with one row of raw (unformatted) metrics per approach.

    Raises:
        ValueError: if ``sample_size`` is not positive or the dataset has no
            usable rows.
    """
    if sample_size <= 0:
        raise ValueError("sample_size must be a positive integer.")

    # 1) Load dataset
    df = load_kaggle_dataset(sample_size)
    # Ensure text and stars exist
    df = df.dropna(subset=["text", "stars"]).reset_index(drop=True)
    if df.empty:
        raise ValueError("Dataset contains no rows with both 'text' and 'stars'.")
    # Limit to requested sample size (stratified by star rating)
    df = _stratified_sample(df, sample_size)

    texts = df["text"].astype(str).tolist()
    true_stars = df["stars"].astype(int).tolist()

    # 2) Prepare storage for metrics
    comparison_rows = []

    # 3) For each prompt version
    for name, template, explanation in PROMPT_VERSIONS:
        print("\n" + "=" * 80)
        print(f"Running experiment: {name}")
        print("Prompt intention:", explanation)
        # show the prompt skeleton once (truncated to 800 characters)
        print("Prompt skeleton (example):")
        print(template.format(review=texts[0])[:800])
        print("-" * 80)

        # Run the prompt RUNS_PER_PROMPT independent times
        all_parsed_ok = []  # list of lists for each run: [(is_valid, parsed), ...]
        all_predictions = []  # list of lists for each run: [preds...]

        for run_idx in range(RUNS_PER_PROMPT):
            print(f"Run {run_idx+1}/{RUNS_PER_PROMPT} for {name}")
            run_parsed = []
            run_preds = []
            for review in tqdm(texts, desc=f"{name}-run{run_idx+1}", leave=False):
                prompt_text = template.format(review=review)

                # Call the model and parse its reply; a failure in either step
                # triggers a fresh call because model output varies between calls.
                raw_output = None
                parsed = None
                ok = False
                for attempt in range(MAX_RETRIES):
                    try:
                        raw_output = call_model(prompt_text)
                        parsed = extract_json_object(raw_output)
                        ok = True
                        break
                    except Exception:
                        # Recorded as an invalid reply below if all attempts fail.
                        time.sleep(0.5 + attempt * 0.5)

                if ok:
                    run_parsed.append((True, parsed))
                    run_preds.append(_coerce_star_rating(parsed, raw_output))
                else:
                    # mark as invalid and default to neutral 3
                    run_parsed.append((False, None))
                    run_preds.append(3)

                time.sleep(DELAY_BETWEEN_CALLS)

            all_parsed_ok.append(run_parsed)
            all_predictions.append(run_preds)

        # 4) Aggregate metrics
        # JSON validity rate is averaged across runs
        json_rates = [compute_json_validity_rate(run) for run in all_parsed_ok]
        avg_json_validity = sum(json_rates) / len(json_rates)

        # Accuracy and MAE use the consensus prediction per sample (mode across runs)
        n_samples = len(texts)
        consensus_preds = []
        per_sample_std = []
        for votes in zip(*all_predictions):
            consensus_preds.append(Counter(votes).most_common(1)[0][0])
            per_sample_std.append(statistics.pstdev(votes))

        acc, mae = evaluate_predictions(true_stars, consensus_preds)

        # Reliability measures
        exact_agreement_fraction = reliability_score(all_predictions)
        avg_pairwise_agree = pairwise_agreement(all_predictions)
        avg_std = sum(per_sample_std) / len(per_sample_std)

        # Count invalid JSON replies across all runs
        total_invalid = sum(1 for run in all_parsed_ok for ok, _ in run if not ok)
        total_calls = RUNS_PER_PROMPT * n_samples

        comparison_rows.append({
            "Approach": name,
            "Accuracy": acc,
            "MAE": mae,
            "JSON_Validity_Rate": avg_json_validity,
            "Exact_Agreement": exact_agreement_fraction,
            "Pairwise_Agreement": avg_pairwise_agree,
            "Avg_Pred_StdDev": avg_std,
            "Invalid_JSONs": total_invalid,
            "Total_Calls": total_calls
        })

        # Save detailed outputs to CSV for inspection
        out_df = pd.DataFrame({
            "text": texts,
            "true_stars": true_stars,
            "consensus_pred": consensus_preds,
            "pred_stddev": per_sample_std
        })
        out_filename = f"predictions_{name}.csv"
        out_df.to_csv(out_filename, index=False)
        print(f"Saved detailed predictions to {out_filename}")

    # 5) Summary table
    summary_df = pd.DataFrame(comparison_rows)
    # Formatted copy for display only; summary_df keeps the raw numbers
    display_df = summary_df.copy()
    for column in ("Accuracy", "JSON_Validity_Rate", "Exact_Agreement", "Pairwise_Agreement"):
        display_df[column] = display_df[column].map(lambda x: f"{x:.2%}")
    for column in ("MAE", "Avg_Pred_StdDev"):
        display_df[column] = display_df[column].map(lambda x: f"{x:.3f}")

    print("\n\nFINAL COMPARISON TABLE")
    print(display_df.to_string(index=False))

    # 6) Short automated discussion (human-editable)
    print("\n\nBRIEF DISCUSSION / INTERPRETATION")
    for row in comparison_rows:
        print("-" * 80)
        print(f"Approach: {row['Approach']}")
        print(f"  Accuracy: {row['Accuracy']:.3%}, MAE: {row['MAE']:.3f}")
        print(f"  JSON Validity (avg): {row['JSON_Validity_Rate']:.2%}")
        print(f"  Exact agreement across {RUNS_PER_PROMPT} runs: {row['Exact_Agreement']:.2%}")
        print(f"  Pairwise agreement (avg): {row['Pairwise_Agreement']:.2%}")
        print("  Notes:")
        if row['Approach'] == "Direct":
            print("    - Baseline prompt. Fast and concise but may produce more JSON formatting errors if model attempts to add commentary.")
        elif row['Approach'] == "FewShot":
            print("    - Anchors numeric scale via examples; often improves calibration and reduces borderline mistakes.")
        else:
            print("    - Chain-of-Thought encourages explicit reasoning, which can improve correctness but may increase verbosity and parsing issues.")

    print("\nConcluding remarks:")
    print(" - Compare Accuracy and JSON validity: a higher accuracy with low JSON validity may be less usable (extra parsing required).")
    print(" - Reliability (agreement across runs) indicates how deterministic the chosen prompt is under the model; higher is better for production.")
    print(" - If JSON validity is low for a chosen prompt, consider enforcing stricter formatting constraints or post-processing heuristics.")

    # Return the summary table for programmatic use
    return summary_df

# ---------------------------
# Run full pipeline (configure Gemini if needed)
# ---------------------------
# Configure the Gemini client only when the live model is enabled; with
# USE_GEMINI = False, call_model() answers from its offline keyword heuristic.
if USE_GEMINI:
    configure_gemini()

# Runs the whole experiment as soon as the cell executes and keeps the
# raw metrics table in the module-level name `summary`.
summary = run_experiments(SAMPLE_SIZE)

# Save summary to CSV (written to the current working directory)
summary.to_csv("comparison_summary.csv", index=False)
print("\nSaved summary to 'comparison_summary.csv'.")

# End of script


Paste your Google Gemini API Key (input hidden): ··········
Gemini configured for model: gemini-2.5-flash-lite
Found local 'yelp.csv' -- using that.

Running experiment: Direct
Prompt intention: Direct instruction with hard JSON schema enforcement to eliminate formatting errors.
Prompt skeleton (example):

You are a strict JSON-only API.

Task:
Given a Yelp review, predict the star rating from 1 to 5.

Rules:
- Output ONLY valid JSON.
- Do NOT add markdown.
- Do NOT add comments.
- Do NOT add extra text.
- The output MUST match this schema exactly:

{
  "predicted_stars": 1-5 integer,
  "explanation": "one short sentence"
}

Review:
"There was hair in the food. Not just 1 but 2 in the same Bhel Puri dish.

The whole experience was pretty bad. There is no indication of how you order your food and when you finally figure it out the food does not taste good. We checked sodas on the order sheet and when no sodas showed up we asked if we were supposed to go and find our own sodas from the f

Direct-run1:   0%|          | 0/20 [00:00<?, ?it/s]



KeyboardInterrupt: 