In [3]:
import os
import pandas as pd
from sklearn.metrics import accuracy_score
from pydantic import BaseModel, ValidationError, Field, validator
from google import genai
from google.genai import types
from google.genai.errors import APIError
from tqdm import tqdm
from typing import List, Dict, Any, Tuple

# ============================================================================
# --- 1. SETUP AND CONFIGURATION ---
# ============================================================================

# 1.1. Set up Gemini API Key (NOTE: Placeholder key used for structure)
os.environ["GEMINI_API_KEY"] = "AIzaSyDEuzSUl2PUYHjPcEB5qJQpMZEuy4FNvvI" # Placeholder Key
MODEL_NAME = "gemini-2.5-flash" 

try:
    client = genai.Client()
    print("Gemini Client Initialized.")
except Exception as e:
    print(f"Error initializing Gemini client. Make sure the API key is correct. Error: {e}")
    client = None

# 1.2. Define the REQUIRED Structured Output Schema using Pydantic
class RatingPrediction(BaseModel):
    """Schema for the LLM's structured output."""
    predicted_stars: int = Field(..., description="The predicted star rating from 1 to 5.")
    explanation: str = Field(..., description="A brief explanation for the predicted rating.")
    
    @validator('predicted_stars')
    def check_star_range(cls, v):
        if not (1 <= v <= 5):
            raise ValueError(f'predicted_stars must be between 1 and 5, received {v}')
        return v

# --- 2. DATA LOADING AND SAMPLING ---

# CRITICAL: Initialize df_sampled globally as an empty DataFrame to prevent NameError
df_sampled = pd.DataFrame()

DATA_PATH = r"C:\Users\saksh\Downloads\yelp.csv\yelp.csv" 
SAMPLE_SIZE = 10 

try:
    df_full = pd.read_csv(DATA_PATH)
    
    # Filter, sample, and rename columns for evaluation clarity
    df_sampled = (
        df_full[['text', 'stars']]
        .dropna()
        .sample(SAMPLE_SIZE, random_state=42)
        .reset_index(drop=True)
    )
    
    df_sampled = df_sampled.rename(columns={'stars': 'actual_stars'})

    print(f"\nData loaded and sampled to {len(df_sampled)} rows.")

except FileNotFoundError:
    print(f"\nError: Dataset file not found at {DATA_PATH}. Please check the path and file name.")
    # df_sampled remains empty DataFrame, preventing NameError later
except Exception as e:
    print(f"\nError during data loading/sampling: {e}")
    # df_sampled remains empty DataFrame
# --- Data Loading End ---


# --- 3. PROMPT APPROACH DEFINITIONS (3 VERSIONS REQUIRED) ---

## A. Approach 1: Zero-Shot (Baseline)
PROMPT_1_ZERO_SHOT = """
You are an expert review classifier. Your task is to analyze the following Yelp review and classify it into a star rating from 1 (worst) to 5 (best).
You MUST return your response as a valid JSON object matching the provided schema. Do not include any text outside of the JSON object.
REVIEW: "{review_text}"
"""

## B. Approach 2: Few-Shot with CoT (Chain-of-Thought)
FEW_SHOT_EXAMPLE = """
EXAMPLE REVIEW: "The waiter was rude and spilled coffee on my laptop. The food was mediocre and took an hour to arrive. I will never return here."
EXAMPLE RATING: 1
EXAMPLE EXPLANATION: The review contains multiple strong negative indicators such as 'rude', 'spilled coffee', 'mediocre food', and 'took an hour', all pointing to a terrible experience.
"""
PROMPT_2_FEW_SHOT_COT = f"""
You are an expert review classifier. Your task is to analyze the following Yelp review and classify it into a star rating from 1 (worst) to 5 (best).
Here is an example to guide your classification:
---
{FEW_SHOT_EXAMPLE}
---
Now, classify the following review. First, briefly mention the key sentiment drivers (e.g., positive keywords, negative experience) in a thought process.
You MUST return your final response as a valid JSON object matching the provided schema.
REVIEW: "{{review_text}}"
"""


## C. Approach 3: Role-Play and Format-Focused
PROMPT_3_ROLE_FOCUSED = """
SYSTEM INSTRUCTION: You are a **highly reliable and meticulous Sentiment Analysis Bot** dedicated to classifying customer reviews with perfect JSON adherence.
Your sole output must be a JSON object, and you must verify that the 'predicted_stars' is an integer between 1 and 5, and the 'explanation' is a brief reasoning.
Review to Analyze:
---
{{review_text}}
---
"""

PROMPT_MAP = {
    "Zero-Shot (Baseline)": PROMPT_1_ZERO_SHOT,
    "Few-Shot + CoT": PROMPT_2_FEW_SHOT_COT,
    "Role-Play + Format Focus": PROMPT_3_ROLE_FOCUSED,
}

# --- 4. CORE FUNCTION: LLM CLASSIFICATION AND EVALUATION ---

def classify_reviews_with_llm(df: pd.DataFrame, prompt_template: str, prompt_name: str) -> Tuple[Dict[str, Any], pd.DataFrame]:
    """
    Processes the sampled dataframe using a given prompt and evaluates the results.
    """
    if client is None:
        raise RuntimeError("Gemini Client failed to initialize. Cannot run API calls.")
        
    # Initialize columns for LLM results
    df['llm_output_raw'] = None
    df['predicted_stars'] = None
    df['explanation'] = None
    
    valid_json_count = 0
    
    # Configure the request to use the Pydantic schema for structured output
    config = types.GenerateContentConfig(
        response_mime_type="application/json",
        response_schema=RatingPrediction,
    )

    # 4.1. Iterate through the sampled reviews
    for index, row in tqdm(df.iterrows(), total=len(df), desc=f"Running {prompt_name}"):
        review_text = row['text']
        
        # 4.2. Format the prompt and send the request
        prompt = prompt_template.format(review_text=review_text)
        
        try:
            response = client.models.generate_content(
                model=MODEL_NAME,
                contents=[prompt],
                config=config,
            )
            
            json_response = response.text
            df.at[index, 'llm_output_raw'] = json_response

            # 4.3. Parse and Validate the JSON output
            try:
                # Use Pydantic's parse_raw to validate the JSON against the schema
                parsed_data = RatingPrediction.parse_raw(json_response)
                
                df.at[index, 'predicted_stars'] = parsed_data.predicted_stars
                df.at[index, 'explanation'] = parsed_data.explanation
                valid_json_count += 1

            except (json.JSONDecodeError, ValidationError) as e:
                # JSON invalidity or schema violation
                df.at[index, 'llm_output_raw'] = f"JSON/Validation Error: {json_response} | {e}"
                pass # Values remain None

        except APIError as e:
            # Handle API errors 
            df.at[index, 'llm_output_raw'] = f"API Error: {e}"
        
    # 4.4. Calculate Metrics
    
    df_results = df.dropna(subset=['predicted_stars']).copy()
    if not df_results.empty:
        accuracy = accuracy_score(df_results['actual_stars'].astype(int), df_results['predicted_stars'].astype(int))
    else:
        accuracy = 0.0

    json_validity_rate = valid_json_count / len(df)
    non_empty_explanation_count = df_results['explanation'].apply(lambda x: bool(x) and str(x).strip() != '').sum()
    explanation_rate = non_empty_explanation_count / len(df)
    
    metrics = {
        "Approach": prompt_name,
        "Accuracy": accuracy,
        "JSON Validity Rate": json_validity_rate,
        "Reliability (Non-Empty Explanation Rate)": explanation_rate
    }
    
    return metrics, df.copy()

# --- 5. EXECUTE ALL APPROACHES AND COMPARE ---

def main():
    results_list = []
    all_results = {}

    global df_sampled 

    if df_sampled.empty:
        print("\nCannot run evaluation: Sampled DataFrame is empty. Check your DATA_PATH.")
        return

    for name, template in PROMPT_MAP.items():
        print(f"\n--- Running {name} ---")
        
        # Make a deep copy of the sampled data for each independent run
        df_to_process = df_sampled[['text', 'actual_stars']].copy()
        
        metrics, df_output = classify_reviews_with_llm(df_to_process, template, name)
        
        results_list.append(metrics)
        all_results[name] = df_output

    # Create the final comparison table
    df_comparison = pd.DataFrame(results_list).set_index('Approach')
    print("\n" + "="*50)
    print("           TASK 1: PROMPT COMPARISON RESULTS")
    print("="*50)
    print(df_comparison.to_markdown())

    # --- 6. REQUIRED REPORTING OUTPUTS ---

    best_approach = df_comparison['Accuracy'].idxmax()
    print(f"\n\n--- Report Content Examples (Best Approach: {best_approach}) ---")
    print("\nExample 5 Predictions from the Best Approach:")
    
    report_df = all_results[best_approach][['text', 'actual_stars', 'predicted_stars', 'explanation']].head()
    print(report_df.to_markdown(index=False))

    return df_comparison

if __name__ == "__main__":
    main()

Gemini Client Initialized.


C:\Users\saksh\AppData\Local\Temp\ipykernel_21076\1402381211.py:32: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  @validator('predicted_stars')



Data loaded and sampled to 10 rows.

--- Running Zero-Shot (Baseline) ---


Running Zero-Shot (Baseline):   0%|          | 0/10 [00:00<?, ?it/s]C:\Users\saksh\AppData\Local\Temp\ipykernel_21076\1402381211.py:155: PydanticDeprecatedSince20: The `parse_raw` method is deprecated; if your data is JSON use `model_validate_json`, otherwise load the data then use `model_validate` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  parsed_data = RatingPrediction.parse_raw(json_response)
Running Zero-Shot (Baseline): 100%|██████████| 10/10 [00:30<00:00,  3.04s/it]



--- Running Few-Shot + CoT ---


Running Few-Shot + CoT:   0%|          | 0/10 [00:00<?, ?it/s]C:\Users\saksh\AppData\Local\Temp\ipykernel_21076\1402381211.py:155: PydanticDeprecatedSince20: The `parse_raw` method is deprecated; if your data is JSON use `model_validate_json`, otherwise load the data then use `model_validate` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  parsed_data = RatingPrediction.parse_raw(json_response)
Running Few-Shot + CoT: 100%|██████████| 10/10 [00:07<00:00,  1.28it/s]



--- Running Role-Play + Format Focus ---


Running Role-Play + Format Focus: 100%|██████████| 10/10 [00:05<00:00,  1.76it/s]


           TASK 1: PROMPT COMPARISON RESULTS
| Approach                 |   Accuracy |   JSON Validity Rate |   Reliability (Non-Empty Explanation Rate) |
|:-------------------------|-----------:|---------------------:|-------------------------------------------:|
| Zero-Shot (Baseline)     |        0.6 |                  1   |                                        1   |
| Few-Shot + CoT           |        0   |                  0.1 |                                        0.1 |
| Role-Play + Format Focus |        0   |                  0   |                                        0   |


--- Report Content Examples (Best Approach: Zero-Shot (Baseline)) ---

Example 5 Predictions from the Best Approach:
| text                                                                                                                                                                                                                                                                                       




In [21]:
import os
import pandas as pd
from sklearn.metrics import accuracy_score
from datetime import datetime, timedelta
from pydantic import BaseModel, ValidationError, Field, validator
from google import genai
from google.genai import types
from google.genai.errors import APIError
from tqdm import tqdm
from typing import List, Dict, Any, Tuple
from pathlib import Path
import json # Ensure json is imported

# ============================================================================
# --- 1. SETUP AND CONFIGURATION ---
# ============================================================================

# 1.1. Set up Gemini API Key (NOTE: Placeholder key used for structure)
os.environ["GEMINI_API_KEY"] = "AIzaSyDEuzSUl2PUYHjPcEB5qJQpMZEuy4FNvvI" # Placeholder Key
MODEL_NAME = "gemini-2.5-flash" 

try:
    client = genai.Client()
    print("Gemini Client Initialized.")
except Exception as e:
    print(f"Error initializing Gemini client. Make sure the API key is correct. Error: {e}")
    client = None

# 1.2. Define the REQUIRED Structured Output Schema using Pydantic
class RatingPrediction(BaseModel):
    """Schema for the LLM's structured output."""
    predicted_stars: int = Field(..., description="The predicted star rating from 1 to 5.")
    explanation: str = Field(..., description="A brief explanation for the assigned rating.")
    
    @validator('predicted_stars')
    def check_star_range(cls, v):
        if not (1 <= v <= 5):
            raise ValueError(f'predicted_stars must be between 1 and 5, received {v}')
        return v

# --- 2. DATA LOADING AND SAMPLING ---

# CRITICAL: Initialize df_sampled globally as an empty DataFrame to prevent NameError
df_sampled = pd.DataFrame()

DATA_PATH = r"C:\Users\saksh\Downloads\yelp.csv\yelp.csv" 
SAMPLE_SIZE = 5 

try:
    df_full = pd.read_csv(DATA_PATH)
    
    # Filter, sample, and rename columns for evaluation clarity
    df_sampled = (
        df_full[['text', 'stars']]
        .dropna()
        .sample(SAMPLE_SIZE, random_state=42)
        .reset_index(drop=True)
    )
    
    df_sampled = df_sampled.rename(columns={'stars': 'actual_stars'})

    print(f"\nData loaded and sampled to {len(df_sampled)} rows.")

except FileNotFoundError:
    print(f"\nError: Dataset file not found at {DATA_PATH}. Please check the path and file name.")
except Exception as e:
    print(f"\nError during data loading/sampling: {e}")


# --- 3. PROMPT APPROACH DEFINITIONS (3 VERSIONS REQUIRED) ---
## A. Approach 1: Zero-Shot (Baseline)
PROMPT_1_ZERO_SHOT = """
You are an expert review classifier. Your task is to analyze the following Yelp review and classify it into a star rating from 1 (worst) to 5 (best).
You MUST return your response as a valid JSON object matching the provided schema. Do not include any text outside of the JSON object.
REVIEW: "{review_text}"
"""

## B. Approach 2: Few-Shot with CoT (Chain-of-Thought)
FEW_SHOT_EXAMPLE = """
EXAMPLE REVIEW: "The waiter was rude and spilled coffee on my laptop. The food was mediocre and took an hour to arrive. I will never return here."
EXAMPLE RATING: 1
EXAMPLE EXPLANATION: The review contains multiple strong negative indicators such as 'rude', 'spilled coffee', 'mediocre food', and 'took an hour', all pointing to a terrible experience.
"""
PROMPT_2_FEW_SHOT_COT = f"""
You are an expert review classifier. Your task is to analyze the following Yelp review and classify it into a star rating from 1 (worst) to 5 (best).
Here is an example to guide your classification:
---
{FEW_SHOT_EXAMPLE}
---
Now, classify the following review. First, briefly mention the key sentiment drivers (e.g., positive keywords, negative experience) in a thought process.
You MUST return your final response as a valid JSON object matching the provided schema.
REVIEW: "{{review_text}}"
"""


## C. Approach 3: Role-Play and Format-Focused
PROMPT_3_ROLE_FOCUSED = """
SYSTEM INSTRUCTION: You are a **highly reliable and meticulous Sentiment Analysis Bot** dedicated to classifying customer reviews with perfect JSON adherence.
Your sole output must be a JSON object, and you must verify that the 'predicted_stars' is an integer between 1 and 5, and the 'explanation' is a brief reasoning.
Review to Analyze:
---
{{review_text}}
---
"""

PROMPT_MAP = {
    "Zero-Shot (Baseline)": PROMPT_1_ZERO_SHOT,
    "Few-Shot + CoT": PROMPT_2_FEW_SHOT_COT,
    "Role-Play + Format Focus": PROMPT_3_ROLE_FOCUSED,
}
# --- 4. CORE FUNCTION: LLM CLASSIFICATION AND EVALUATION ---

def classify_reviews_with_llm(df: pd.DataFrame, prompt_template: str, prompt_name: str) -> Tuple[Dict[str, Any], pd.DataFrame]:
    """
    Processes the sampled dataframe using a given prompt and evaluates the results.
    """
    if client is None:
        raise RuntimeError("Gemini Client failed to initialize. Cannot run API calls.")
        
    # Initialize columns for LLM results
    df['llm_output_raw'] = None
    df['predicted_stars'] = None
    df['explanation'] = None
    
    valid_json_count = 0
    
    # Configure the request to use the Pydantic schema for structured output
    config = types.GenerateContentConfig(
        response_mime_type="application/json",
        response_schema=RatingPrediction,
    )

    # 4.1. Iterate through the sampled reviews
    for index, row in tqdm(df.iterrows(), total=len(df), desc=f"Running {prompt_name}"):
        review_text = row['text']
        
        # 4.2. Format the prompt and send the request
        prompt = prompt_template.format(review_text=review_text)
        
        try:
            response = client.models.generate_content(
                model=MODEL_NAME,
                contents=[prompt],
                config=config,
            )
            
            json_response = response.text
            df.at[index, 'llm_output_raw'] = json_response

            # 4.3. Parse and Validate the JSON output
            try:
                # Use Pydantic's parse_raw to validate the JSON against the schema
                parsed_data = RatingPrediction.parse_raw(json_response)
                
                df.at[index, 'predicted_stars'] = parsed_data.predicted_stars
                df.at[index, 'explanation'] = parsed_data.explanation
                valid_json_count += 1

            except (json.JSONDecodeError, ValidationError) as e:
                # JSON invalidity or schema violation
                df.at[index, 'llm_output_raw'] = f"JSON/Validation Error: {json_response} | {e}"
                pass # Values remain None

        except APIError as e:
            # Handle API errors 
            df.at[index, 'llm_output_raw'] = f"API Error: {e}"
        
    # 4.4. Calculate Metrics
    
    df_results = df.dropna(subset=['predicted_stars']).copy()
    if not df_results.empty:
        accuracy = accuracy_score(df_results['actual_stars'].astype(int), df_results['predicted_stars'].astype(int))
    else:
        accuracy = 0.0

    json_validity_rate = valid_json_count / len(df)
    non_empty_explanation_count = df_results['explanation'].apply(lambda x: bool(x) and str(x).strip() != '').sum()
    explanation_rate = non_empty_explanation_count / len(df)
    
    metrics = {
        "Approach": prompt_name,
        "Accuracy": accuracy,
        "JSON Validity Rate": json_validity_rate,
        "Reliability (Non-Empty Explanation Rate)": explanation_rate
    }
    
    return metrics, df.copy()

# --- 5. EXECUTE ALL APPROACHES AND COMPARE ---

def main():
    results_list = []
    all_results = {}

    # 5.1. Define Output Path
    OUTPUT_DIR = Path(r"C:\Users\saksh\Downloads\yelp.csv")
    OUTPUT_DIR.mkdir(exist_ok=True)
    
    # Load sampled data (df_sampled must be loaded successfully in Section 2)
    global df_sampled 

    if df_sampled.empty:
        print("\nCannot run evaluation: Sampled DataFrame is empty. Check your DATA_PATH.")
        return

    for name, template in PROMPT_MAP.items():
        print(f"\n--- Running {name} ---")
        
        # Make a deep copy of the sampled data for each independent run
        df_to_process = df_sampled[['text', 'actual_stars']].copy()
        
        metrics, df_output = classify_reviews_with_llm(df_to_process, template, name)
        
        results_list.append(metrics)
        all_results[name] = df_output

    # Create the final comparison table
    df_comparison = pd.DataFrame(results_list).set_index('Approach')
    
    
    # --- 6. SAVE AND PRINT OUTPUTS ---
    
    best_approach = df_comparison['Accuracy'].idxmax()
    
    # Combine all individual results dataframes for a single CSV export
    combined_df = pd.concat([df.assign(Approach=name) for name, df in all_results.items()])
    
    # 6.1. Prepare the structure for JSON export
    final_output_structure = {
        "metadata": {
            "model_name": MODEL_NAME,
            "sample_size": SAMPLE_SIZE,
            "date": datetime.now().isoformat()
        },
        "comparison_metrics": df_comparison.reset_index().to_dict(orient='records'),
        "best_approach": best_approach,
        # Convert the entire combined DataFrame to a list of dictionaries for JSON export
        "full_results_data": combined_df.to_dict(orient='records')
    }
    
    # 6.2. Save the final JSON analysis summary
    json_file_path = OUTPUT_DIR / "final_analysis_summary.json"
    with open(json_file_path, "w", encoding="utf-8") as f:
        json.dump(final_output_structure, f, indent=2, default=str)


    # 6.3. Save the Markdown report and CSV (as in previous versions)
    markdown_content = []
    markdown_content.append("# Gemini Prompt Comparison Analysis\n")
    markdown_content.append("## 1. Metric Comparison\n")
    markdown_content.append(df_comparison.to_markdown())
    markdown_content.append("\n\n---\n")

    markdown_content.append(f"## 2. Detailed Results for Best Approach: {best_approach}\n")
    markdown_content.append("\nExample 5 Predictions from the Best Approach:\n")
    report_df = all_results[best_approach][['text', 'actual_stars', 'predicted_stars', 'explanation']].head()
    markdown_content.append(report_df.to_markdown(index=False))

    report_file_path = OUTPUT_DIR / "analysis_report.md"
    with open(report_file_path, "w", encoding="utf-8") as f:
        f.writelines(markdown_content)
        
    full_df_path = OUTPUT_DIR / "full_predictions_and_metrics.csv"
    combined_df.to_csv(full_df_path, index=False)
    
    
    # 6.4. Print confirmation to console
    print("\n" + "="*50)
    print("           TASK 1: PROMPT COMPARISON RESULTS")
    print("="*50)
    print(df_comparison.to_markdown())
    print("\n--- RESULTS SAVED ---")
    print(f"✓ JSON Analysis Summary: {json_file_path.resolve()}")
    print(f"✓ Comparison Report (Markdown): {report_file_path.resolve()}")
    print(f"✓ Full Data (CSV): {full_df_path.resolve()}")
    print("---------------------\n")

    return df_comparison

if __name__ == "__main__":
    main()

Gemini Client Initialized.


C:\Users\saksh\AppData\Local\Temp\ipykernel_21076\264293846.py:35: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  @validator('predicted_stars')



Data loaded and sampled to 5 rows.

--- Running Zero-Shot (Baseline) ---


Running Zero-Shot (Baseline):   0%|          | 0/5 [00:00<?, ?it/s]C:\Users\saksh\AppData\Local\Temp\ipykernel_21076\264293846.py:153: PydanticDeprecatedSince20: The `parse_raw` method is deprecated; if your data is JSON use `model_validate_json`, otherwise load the data then use `model_validate` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  parsed_data = RatingPrediction.parse_raw(json_response)
Running Zero-Shot (Baseline): 100%|██████████| 5/5 [00:16<00:00,  3.29s/it]



--- Running Few-Shot + CoT ---


Running Few-Shot + CoT:   0%|          | 0/5 [00:00<?, ?it/s]C:\Users\saksh\AppData\Local\Temp\ipykernel_21076\264293846.py:153: PydanticDeprecatedSince20: The `parse_raw` method is deprecated; if your data is JSON use `model_validate_json`, otherwise load the data then use `model_validate` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  parsed_data = RatingPrediction.parse_raw(json_response)
Running Few-Shot + CoT: 100%|██████████| 5/5 [00:04<00:00,  1.07it/s]



--- Running Role-Play + Format Focus ---


Running Role-Play + Format Focus: 100%|██████████| 5/5 [00:02<00:00,  2.42it/s]


           TASK 1: PROMPT COMPARISON RESULTS
| Approach                 |   Accuracy |   JSON Validity Rate |   Reliability (Non-Empty Explanation Rate) |
|:-------------------------|-----------:|---------------------:|-------------------------------------------:|
| Zero-Shot (Baseline)     |        0.8 |                  1   |                                        1   |
| Few-Shot + CoT           |        0   |                  0.2 |                                        0.2 |
| Role-Play + Format Focus |        0   |                  0   |                                        0   |

--- RESULTS SAVED ---
✓ JSON Analysis Summary: C:\Users\saksh\Downloads\yelp.csv\final_analysis_summary.json
✓ Comparison Report (Markdown): C:\Users\saksh\Downloads\yelp.csv\analysis_report.md
✓ Full Data (CSV): C:\Users\saksh\Downloads\yelp.csv\full_predictions_and_metrics.csv
---------------------




