## 1. Setup and Imports

In [4]:
# Import required libraries
import pandas as pd
import numpy as np
import json
import os
from dotenv import load_dotenv
import google.generativeai as genai
import time
from typing import Dict, List, Tuple
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Load environment variables from a local .env file, if one is present
load_dotenv()

# Configure Gemini API.
# Fail fast when the key is missing: get_prediction() further down turns API
# errors into a default 3-star prediction, so running without a key would
# produce a complete-looking but meaningless evaluation instead of an error.
GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')
if not GEMINI_API_KEY:
    raise RuntimeError(
        "GEMINI_API_KEY is not set. Define it in the environment or in a .env "
        "file before running this notebook."
    )
genai.configure(api_key=GEMINI_API_KEY)

# Initialize the model shared by every prompting approach in this notebook
model = genai.GenerativeModel('gemini-2.5-flash')

print("✓ Libraries imported successfully")
print("✓ Gemini API configured")

✓ Libraries imported successfully
✓ Gemini API configured


## 2. Load and Prepare Dataset

In [5]:
# Load the Yelp dataset
df = pd.read_csv('yelp.csv')

print(f"Total reviews in dataset: {len(df)}")
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that appends a stray "None" line).
df.info()
print("\nFirst few rows:")
df.head()

Total reviews in dataset: 10000

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   business_id  10000 non-null  object
 1   date         10000 non-null  object
 2   review_id    10000 non-null  object
 3   stars        10000 non-null  int64 
 4   text         10000 non-null  object
 5   type         10000 non-null  object
 6   user_id      10000 non-null  object
 7   cool         10000 non-null  int64 
 8   useful       10000 non-null  int64 
 9   funny        10000 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 781.4+ KB
None

First few rows:


Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [6]:
# Tabulate how many reviews carry each star rating, ordered by rating
star_counts = df['stars'].value_counts().sort_index()
print("Star Rating Distribution:")
print(star_counts)

# Draw the same distribution as a histogram
axis_labels = {'stars': 'Star Rating', 'count': 'Number of Reviews'}
fig = px.histogram(
    df,
    x='stars',
    title='Distribution of Star Ratings',
    labels=axis_labels,
)
fig.update_layout(bargap=0.1)
fig.show()

Star Rating Distribution:
stars
1     749
2     927
3    1461
4    3526
5    3337
Name: count, dtype: int64


In [7]:
# Sample 200 reviews for evaluation, balanced across star ratings: the same
# number of reviews is drawn for every rating, so the sample deliberately does
# NOT mirror the skewed rating distribution of the full dataset.
np.random.seed(42)
sample_size = 200
per_rating = sample_size // 5  # five star levels

# group_keys=False keeps the original row labels on the sampled rows, which the
# top-up step below needs in order to know which reviews were already chosen.
sampled = df.groupby('stars', group_keys=False).apply(
    lambda x: x.sample(min(len(x), per_rating), random_state=42)
)

# If some rating had fewer than per_rating reviews, top up to sample_size
# with reviews that have not been selected yet.
if len(sampled) < sample_size:
    remaining = sample_size - len(sampled)
    unused = df.loc[~df.index.isin(sampled.index)]
    additional = unused.sample(min(remaining, len(unused)), random_state=42)
    sampled = pd.concat([sampled, additional])

# Renumber only once selection is finished; resetting the index earlier would
# discard the labels used above to exclude already-selected rows.
df_sample = sampled.reset_index(drop=True)

print(f"Sample size: {len(df_sample)}")
print("\nSample distribution:")
print(df_sample['stars'].value_counts().sort_index())

Sample size: 200

Sample distribution:
stars
1    40
2    40
3    40
4    40
5    40
Name: count, dtype: int64






## 3. Prompting Approaches

### Approach 1: Zero-Shot Prompting

**Design Decision:** Direct classification without any examples. This tests the model's inherent understanding of sentiment and rating scales.

**Rationale:** Simple and fast, relies on the model's pre-trained knowledge.

In [8]:
def zero_shot_prompt(review_text: str) -> str:
    """Build the zero-shot prompt for one review.

    The prompt contains the task description, the review text wrapped in
    double quotes, and the JSON response format to follow; no examples are
    included.

    Args:
        review_text: The review to classify; inserted into the prompt verbatim.

    Returns:
        The complete prompt string.
    """
    prompt_lines = [
        "You are a review rating classifier. Analyze the following Yelp review and predict the star rating (1-5 stars).",
        "",
        f'Review: "{review_text}"',
        "",
        "Return your response as a JSON object with this exact format:",
        "{",
        '  "predicted_stars": <number between 1-5>,',
        '  "explanation": "<brief reasoning for the assigned rating>"',
        "}",
        "",
        "Only return the JSON object, nothing else.",
    ]
    return "\n".join(prompt_lines)

# Preview the zero-shot prompt built from the first review in the sample
test_review = df_sample['text'].iloc[0]
print("Test Prompt (Zero-Shot):")
print(zero_shot_prompt(test_review))

Test Prompt (Zero-Shot):
You are a review rating classifier. Analyze the following Yelp review and predict the star rating (1-5 stars).

Review: "An upscale mexican restaurant in the area seemed like the best thing to have happened to me. I planned my visit for a quieter evening, and reviewed the menu again and again. 

However, upon our arrival I thought twice immediately. The parking was iffy, and the place didn't seem to stand up to the signage. Perhaps the art was not to my liking, honestly not a piece stands out in my memory. 


We were seated immediately. 

Our waiter seemed a little off, not to mention the fact that he was insanely snobby and seemed to be moonlighting in the kitchen, where he disappeared for 10-15 minutes at a time. He did not fill our soft drinks once without being first flagged down (those of you who have experienced the close quarters will understand how ridiculous this is). 

Halfway through waiting for our food (which took 45 minutes, easily), the lamp abov

### Approach 2: Few-Shot Prompting

**Design Decision:** Provide 5 examples (one for each star rating) to guide the model.

**Rationale:** Examples help the model understand the task better and provide context for different rating levels.

In [9]:
def few_shot_prompt(review_text: str) -> str:
    """Build the few-shot prompt for one review.

    The prompt lists five worked examples (one for each star rating from 1 to
    5), each paired with the JSON output expected for it, and then presents
    the review to classify.

    Args:
        review_text: The review to classify; inserted into the prompt verbatim.

    Returns:
        The complete prompt string.
    """
    # (example review, star rating, example explanation)
    worked_examples = [
        (
            "Absolutely terrible experience. Food was cold, service was rude, and the place was dirty. Never coming back.",
            1,
            "Extremely negative review mentioning multiple severe issues.",
        ),
        (
            "Not impressed. The food was mediocre and overpriced. Server seemed disinterested.",
            2,
            "Predominantly negative with multiple complaints but not extremely hostile.",
        ),
        (
            "It was okay. Nothing special but nothing terrible either. Average food, average service.",
            3,
            "Neutral review indicating average experience across the board.",
        ),
        (
            "Really enjoyed our meal! Good food, friendly staff, and nice atmosphere. Would come again.",
            4,
            "Positive review with multiple compliments and intent to return.",
        ),
        (
            "Outstanding! Best meal I've had in years. Incredible service, amazing flavors, perfect ambiance. Absolutely phenomenal!",
            5,
            "Extremely positive with superlatives and enthusiasm throughout.",
        ),
    ]

    prompt_lines = [
        "You are a review rating classifier. Based on the examples below, predict the star rating (1-5) for the given review.",
        "",
        "EXAMPLES:",
        "",
    ]
    for example_text, stars, reasoning in worked_examples:
        prompt_lines.append(f'Review: "{example_text}"')
        prompt_lines.append(f'Output: {{"predicted_stars": {stars}, "explanation": "{reasoning}"}}')
        prompt_lines.append("")  # blank line after every example

    prompt_lines.extend([
        "NOW CLASSIFY THIS REVIEW:",
        f'Review: "{review_text}"',
        "",
        "Return only the JSON object with predicted_stars and explanation.",
    ])
    return "\n".join(prompt_lines)

# Print the prompt generated for a short placeholder review
few_shot_demo = few_shot_prompt("Great place!")
print("Test Prompt (Few-Shot) - truncated for display")
print(few_shot_demo)

Test Prompt (Few-Shot) - truncated for display
You are a review rating classifier. Based on the examples below, predict the star rating (1-5) for the given review.

EXAMPLES:

Review: "Absolutely terrible experience. Food was cold, service was rude, and the place was dirty. Never coming back."
Output: {"predicted_stars": 1, "explanation": "Extremely negative review mentioning multiple severe issues."}

Review: "Not impressed. The food was mediocre and overpriced. Server seemed disinterested."
Output: {"predicted_stars": 2, "explanation": "Predominantly negative with multiple complaints but not extremely hostile."}

Review: "It was okay. Nothing special but nothing terrible either. Average food, average service."
Output: {"predicted_stars": 3, "explanation": "Neutral review indicating average experience across the board."}

Review: "Really enjoyed our meal! Good food, friendly staff, and nice atmosphere. Would come again."
Output: {"predicted_stars": 4, "explanation": "Positive review w

### Approach 3: Chain-of-Thought Prompting

**Design Decision:** Ask the model to reason step-by-step before making a prediction.

**Rationale:** Breaking down the reasoning process can improve accuracy by forcing the model to consider multiple aspects of the review.

In [10]:
def chain_of_thought_prompt(review_text: str) -> str:
    """Build the chain-of-thought prompt for one review.

    The prompt shows the review, lists six numbered analysis steps for the
    model to work through, and then gives the JSON response format.

    Args:
        review_text: The review to classify; inserted into the prompt verbatim.

    Returns:
        The complete prompt string.
    """
    reasoning_steps = [
        "Identify the overall sentiment (positive, negative, neutral)",
        "Note specific positive aspects mentioned",
        "Note specific negative aspects mentioned",
        "Consider the intensity of language used",
        "Determine if there's intent to return/recommend",
        "Based on these factors, predict the star rating",
    ]

    prompt_lines = [
        "You are a review rating classifier. Analyze the following review step-by-step to predict its star rating (1-5).",
        "",
        f'Review: "{review_text}"',
        "",
        "Think through this systematically:",
    ]
    for number, step in enumerate(reasoning_steps, start=1):
        prompt_lines.append(f"{number}. {step}")

    prompt_lines.extend([
        "",
        "Return your response as a JSON object:",
        "{",
        '  "predicted_stars": <number between 1-5>,',
        '  "explanation": "<concise reasoning covering the key factors that led to this rating>"',
        "}",
        "",
        "Only return the JSON object.",
    ])
    return "\n".join(prompt_lines)

# Print the prompt generated for a short placeholder review
chain_of_thought_demo = chain_of_thought_prompt("Great place!")
print("Test Prompt (Chain-of-Thought):")
print(chain_of_thought_demo)

Test Prompt (Chain-of-Thought):
You are a review rating classifier. Analyze the following review step-by-step to predict its star rating (1-5).

Review: "Great place!"

Think through this systematically:
1. Identify the overall sentiment (positive, negative, neutral)
2. Note specific positive aspects mentioned
3. Note specific negative aspects mentioned
4. Consider the intensity of language used
5. Determine if there's intent to return/recommend
6. Based on these factors, predict the star rating

Return your response as a JSON object:
{
  "predicted_stars": <number between 1-5>,
  "explanation": "<concise reasoning covering the key factors that led to this rating>"
}

Only return the JSON object.


## 4. Helper Functions for Prediction and Evaluation

In [11]:
def _extract_json_text(response_text: str) -> str:
    """Return the part of a model reply that should hold the JSON object."""
    text = response_text.strip()

    # Remove markdown code fences if present
    if '```json' in text:
        text = text.split('```json')[1].split('```')[0].strip()
    elif '```' in text:
        text = text.split('```')[1].split('```')[0].strip()

    # Drop anything before the first "{" or after the last "}" (e.g. reasoning
    # text around the object, or a language tag left over from a fence).
    start, end = text.find('{'), text.rfind('}')
    if start != -1 and end > start:
        text = text[start:end + 1]
    return text


def _coerce_stars(value):
    """Return value as an int in the range 1-5, or None if it is not one."""
    try:
        stars = int(value)
    except (TypeError, ValueError):
        return None
    return stars if 1 <= stars <= 5 else None


def get_prediction(prompt: str, max_retries: int = 3, client=None, retry_delay: float = 1.0) -> Dict:
    """Get a star-rating prediction from the model, with retry logic.

    Args:
        prompt: Full prompt text to send.
        max_retries: Maximum number of attempts.
        client: Object exposing ``generate_content(prompt)`` whose result has a
            ``.text`` attribute. Defaults to the notebook-level ``model``.
        retry_delay: Base wait in seconds between attempts; the wait doubles
            after each failed attempt.

    Returns:
        A dict that always has ``predicted_stars`` (an int from 1 to 5),
        ``explanation`` and ``json_valid``. When the reply parses to an object
        with a usable rating and an explanation, the parsed object is returned
        with ``json_valid`` set to True. Otherwise ``json_valid`` is False and
        ``predicted_stars`` falls back to 3 unless the reply held a usable
        rating. An ``error`` key is added when the final attempt raised.
    """
    if client is None:
        client = model  # notebook-level model object

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            response = client.generate_content(prompt)
            result = json.loads(_extract_json_text(response.text))

            # Valid JSON that is not an object (list, number, ...) cannot carry
            # the expected keys.
            if not isinstance(result, dict):
                return {
                    'predicted_stars': 3,
                    'explanation': 'Invalid format',
                    'json_valid': False
                }

            stars = _coerce_stars(result.get('predicted_stars'))
            if stars is not None and 'explanation' in result:
                result['predicted_stars'] = stars
                result['json_valid'] = True
                return result

            # Structure check failed. Never pass through a raw or out-of-range
            # rating: downstream metrics do arithmetic on this column.
            return {
                'predicted_stars': stars if stars is not None else 3,
                'explanation': result.get('explanation', 'Invalid format'),
                'json_valid': False
            }

        except json.JSONDecodeError:
            if is_last_attempt:
                return {
                    'predicted_stars': 3,
                    'explanation': 'JSON parsing failed',
                    'json_valid': False,
                    'error': 'Invalid JSON'
                }
        except Exception as e:
            # Best effort by design: an API failure is reported in the returned
            # record instead of aborting the whole evaluation loop.
            if is_last_attempt:
                return {
                    'predicted_stars': 3,
                    'explanation': f'Error: {str(e)}',
                    'json_valid': False,
                    'error': str(e)
                }

        # Only reached after a failed, non-final attempt: back off, then retry
        time.sleep(retry_delay * (2 ** attempt))

    # Only reached when max_retries <= 0
    return {
        'predicted_stars': 3,
        'explanation': 'Max retries exceeded',
        'json_valid': False
    }

print("✓ Helper functions defined")

✓ Helper functions defined


In [12]:
def evaluate_approach(df_sample: pd.DataFrame, prompt_function, approach_name: str,
                      delay: float = 0.5, predict_fn=None) -> pd.DataFrame:
    """Evaluate a prompting approach on the sample dataset.

    Args:
        df_sample: Reviews to evaluate; the ``review_id``, ``text`` and
            ``stars`` columns are read from each row.
        prompt_function: Callable that turns a review text into a prompt.
        approach_name: Label used in the progress messages.
        delay: Seconds to sleep after each review (rate limiting).
        predict_fn: Callable that takes a prompt and returns a dict with
            ``predicted_stars``, ``explanation`` and ``json_valid`` (and
            optionally ``error``). Defaults to ``get_prediction``.

    Returns:
        A DataFrame with one row per review and the columns ``review_id``,
        ``actual_stars``, ``predicted_stars``, ``explanation``, ``json_valid``
        and ``error``.
    """
    if predict_fn is None:
        predict_fn = get_prediction

    print(f"\nEvaluating {approach_name}...")
    print("=" * 50)

    total = len(df_sample)
    results = []

    # Count progress by position rather than by index label, so the counter is
    # correct whatever index df_sample carries.
    for position, (_, row) in enumerate(df_sample.iterrows(), start=1):
        # Generate prompt and get prediction
        prompt = prompt_function(row['text'])
        prediction = predict_fn(prompt)

        results.append({
            'review_id': row['review_id'],
            'actual_stars': row['stars'],
            'predicted_stars': prediction['predicted_stars'],
            'explanation': prediction['explanation'],
            'json_valid': prediction['json_valid'],
            'error': prediction.get('error', None)
        })

        # Progress indicator
        if position % 20 == 0:
            print(f"Processed {position}/{total} reviews...")

        # Rate limiting (Gemini free tier)
        time.sleep(delay)

    results_df = pd.DataFrame(results)
    print(f"\n✓ {approach_name} evaluation complete!")

    return results_df

print("✓ Evaluation function defined")

✓ Evaluation function defined


## 5. Run Evaluations

**Note:** This section will take approximately 15-20 minutes to complete for 200 reviews across 3 approaches.

In [13]:
# Approach 1 (Zero-Shot): run the evaluation and tag every row with its approach
zero_shot_csv = 'results_zero_shot.csv'
results_zero_shot = evaluate_approach(
    df_sample, zero_shot_prompt, "Zero-Shot Prompting"
).assign(approach='Zero-Shot')

# Persist the intermediate results
results_zero_shot.to_csv(zero_shot_csv, index=False)
print(f"\nZero-Shot results saved to {zero_shot_csv}")


Evaluating Zero-Shot Prompting...
Processed 20/200 reviews...
Processed 40/200 reviews...
Processed 60/200 reviews...
Processed 80/200 reviews...
Processed 100/200 reviews...
Processed 120/200 reviews...
Processed 140/200 reviews...
Processed 160/200 reviews...
Processed 180/200 reviews...
Processed 200/200 reviews...

✓ Zero-Shot Prompting evaluation complete!

Zero-Shot results saved to results_zero_shot.csv


In [14]:
# Approach 2 (Few-Shot): run the evaluation and tag every row with its approach
few_shot_csv = 'results_few_shot.csv'
results_few_shot = evaluate_approach(
    df_sample, few_shot_prompt, "Few-Shot Prompting"
).assign(approach='Few-Shot')

# Persist the intermediate results
results_few_shot.to_csv(few_shot_csv, index=False)
print(f"\nFew-Shot results saved to {few_shot_csv}")


Evaluating Few-Shot Prompting...
Processed 20/200 reviews...
Processed 40/200 reviews...
Processed 60/200 reviews...
Processed 80/200 reviews...
Processed 100/200 reviews...
Processed 120/200 reviews...
Processed 140/200 reviews...
Processed 160/200 reviews...
Processed 180/200 reviews...
Processed 200/200 reviews...

✓ Few-Shot Prompting evaluation complete!

Few-Shot results saved to results_few_shot.csv


In [15]:
# Approach 3 (Chain-of-Thought): run the evaluation and tag every row with its approach
chain_of_thought_csv = 'results_chain_of_thought.csv'
results_cot = evaluate_approach(
    df_sample, chain_of_thought_prompt, "Chain-of-Thought Prompting"
).assign(approach='Chain-of-Thought')

# Persist the intermediate results
results_cot.to_csv(chain_of_thought_csv, index=False)
print(f"\nChain-of-Thought results saved to {chain_of_thought_csv}")


Evaluating Chain-of-Thought Prompting...
Processed 20/200 reviews...
Processed 40/200 reviews...
Processed 60/200 reviews...
Processed 80/200 reviews...
Processed 100/200 reviews...
Processed 120/200 reviews...
Processed 140/200 reviews...
Processed 160/200 reviews...
Processed 180/200 reviews...
Processed 200/200 reviews...

✓ Chain-of-Thought Prompting evaluation complete!

Chain-of-Thought results saved to results_chain_of_thought.csv


## 6. Calculate Metrics

In [16]:
def calculate_metrics(results_df: pd.DataFrame, approach_name: str) -> Dict:
    """Summarize how well one approach's predictions match the actual ratings.

    Args:
        results_df: Frame with ``actual_stars``, ``predicted_stars`` and
            ``json_valid`` columns.
        approach_name: Label stored under the ``Approach`` key.

    Returns:
        A dict holding the approach label, the exact and within-one-star
        accuracy and the JSON validity rate (percentages rounded to 2
        decimals), and the mean absolute error and RMSE (rounded to 3
        decimals).
    """
    actual = results_df['actual_stars']
    predicted = results_df['predicted_stars']

    # Every error metric derives from the per-review difference
    difference = actual - predicted
    absolute_error = difference.abs()

    exact_pct = actual.eq(predicted).mean() * 100
    within_one_pct = absolute_error.le(1).mean() * 100
    valid_json_pct = results_df['json_valid'].mean() * 100
    mean_abs_error = absolute_error.mean()
    root_mean_sq_error = np.sqrt((difference ** 2).mean())

    summary = {'Approach': approach_name}
    summary['Exact Accuracy (%)'] = round(exact_pct, 2)
    summary['Within-1 Accuracy (%)'] = round(within_one_pct, 2)
    summary['JSON Validity (%)'] = round(valid_json_pct, 2)
    summary['Mean Absolute Error'] = round(mean_abs_error, 3)
    summary['RMSE'] = round(root_mean_sq_error, 3)
    return summary

# Score each approach with the same metric function
metrics_zero_shot, metrics_few_shot, metrics_cot = (
    calculate_metrics(approach_results, approach_label)
    for approach_results, approach_label in (
        (results_zero_shot, 'Zero-Shot'),
        (results_few_shot, 'Few-Shot'),
        (results_cot, 'Chain-of-Thought'),
    )
)

# One row per approach, one column per metric
metrics_comparison = pd.DataFrame([metrics_zero_shot, metrics_few_shot, metrics_cot])

rule = "=" * 80
print("\n" + rule)
print("METRICS COMPARISON")
print(rule)
print(metrics_comparison.to_string(index=False))
print(rule)


METRICS COMPARISON
        Approach  Exact Accuracy (%)  Within-1 Accuracy (%)  JSON Validity (%)  Mean Absolute Error  RMSE
       Zero-Shot                52.0                   87.0               87.5                 0.62 0.964
        Few-Shot                46.0                   79.0               38.0                 0.75 1.082
Chain-of-Thought                39.5                   79.5               29.0                 0.82 1.131


## 7. Visualizations

In [17]:
# Grouped bar chart: one group per approach, one bar per percentage metric
fig = go.Figure()

approach_axis = metrics_comparison['Approach']
metrics_to_plot = ['Exact Accuracy (%)', 'Within-1 Accuracy (%)', 'JSON Validity (%)']

for metric in metrics_to_plot:
    metric_values = metrics_comparison[metric]
    bar = go.Bar(
        name=metric,
        x=approach_axis,
        y=metric_values,
        text=metric_values,  # value label on each bar
        textposition='auto',
    )
    fig.add_trace(bar)

layout_options = dict(
    title='Performance Comparison Across Prompting Approaches',
    xaxis_title='Approach',
    yaxis_title='Percentage (%)',
    barmode='group',
    height=500,
)
fig.update_layout(**layout_options)

fig.show()

In [18]:
# Confusion matrices for each approach
from sklearn.metrics import confusion_matrix
import plotly.figure_factory as ff

def plot_confusion_matrix(results_df, approach_name):
    """Display an annotated heatmap of actual versus predicted star ratings.

    Args:
        results_df: Frame with ``actual_stars`` and ``predicted_stars`` columns.
        approach_name: Label appended to the figure title.
    """
    star_levels = [1, 2, 3, 4, 5]
    tick_labels = [f'{level}★' for level in star_levels]

    # Actual ratings are passed first and predicted ratings second, limited to
    # the five star levels.
    counts = confusion_matrix(
        results_df['actual_stars'],
        results_df['predicted_stars'],
        labels=star_levels,
    )

    heatmap = ff.create_annotated_heatmap(
        z=counts,
        x=list(tick_labels),
        y=list(tick_labels),
        colorscale='Blues',
        showscale=True,
    )
    heatmap.update_layout(
        title=f'Confusion Matrix - {approach_name}',
        xaxis_title='Predicted Stars',
        yaxis_title='Actual Stars',
        height=500,
    )
    heatmap.show()

# One confusion matrix per approach
plot_confusion_matrix(results_df=results_zero_shot, approach_name='Zero-Shot')
plot_confusion_matrix(results_df=results_few_shot, approach_name='Few-Shot')
plot_confusion_matrix(results_df=results_cot, approach_name='Chain-of-Thought')

In [19]:
# Error distribution analysis: one histogram panel per approach
panels = [
    (results_zero_shot, 'Zero-Shot'),
    (results_few_shot, 'Few-Shot'),
    (results_cot, 'Chain-of-Thought'),
]
fig = make_subplots(
    rows=1,
    cols=3,
    subplot_titles=tuple(panel_name for _, panel_name in panels),
)

for idx, (results, name) in enumerate(panels, start=1):
    # Signed error: positive means the model predicted more stars than given
    errors = results['predicted_stars'] - results['actual_stars']
    histogram = go.Histogram(x=errors, name=name, showlegend=True)
    fig.add_trace(histogram, row=1, col=idx)

fig.update_layout(
    title_text='Prediction Error Distribution (Predicted - Actual)',
    height=400,
    showlegend=False,
)

# Only the middle panel gets an x-axis title and only the first a y-axis title
fig.update_xaxes(title_text='Error (stars)', row=1, col=2)
fig.update_yaxes(title_text='Frequency', row=1, col=1)

fig.show()

## 8. Analysis and Discussion

In [20]:
# Stack the per-approach results into one frame and save it
all_results = pd.concat(
    [results_zero_shot, results_few_shot, results_cot],
    ignore_index=True,
)
all_results.to_csv('all_results_combined.csv', index=False)

section_rule = "=" * 80
print("\n" + section_rule)
print("ANALYSIS SUMMARY")
print(section_rule)

# The best approach is the row with the highest exact accuracy
exact_accuracy = metrics_comparison['Exact Accuracy (%)']
print("\n1. BEST PERFORMING APPROACH:")
best_approach = metrics_comparison.loc[exact_accuracy.idxmax(), 'Approach']
best_accuracy = exact_accuracy.max()
print(f"   → {best_approach} with {best_accuracy}% exact accuracy")

print("\n2. JSON VALIDITY:")
for _, row in metrics_comparison.iterrows():
    validity = row['JSON Validity (%)']
    print(f"   → {row['Approach']}: {validity}%")

print("\n3. ERROR TOLERANCE:")
print("   Within-1 star accuracy (how often predictions are within 1 star of actual):")
for _, row in metrics_comparison.iterrows():
    within_one = row['Within-1 Accuracy (%)']
    print(f"   → {row['Approach']}: {within_one}%")

print("\n4. CONSISTENCY (Lower is better):")
for _, row in metrics_comparison.iterrows():
    mae_value, rmse_value = row['Mean Absolute Error'], row['RMSE']
    print(f"   → {row['Approach']}: MAE = {mae_value}, RMSE = {rmse_value}")

print("\n" + section_rule)


ANALYSIS SUMMARY

1. BEST PERFORMING APPROACH:
   → Zero-Shot with 52.0% exact accuracy

2. JSON VALIDITY:
   → Zero-Shot: 87.5%
   → Few-Shot: 38.0%
   → Chain-of-Thought: 29.0%

3. ERROR TOLERANCE:
   Within-1 star accuracy (how often predictions are within 1 star of actual):
   → Zero-Shot: 87.0%
   → Few-Shot: 79.0%
   → Chain-of-Thought: 79.5%

4. CONSISTENCY (Lower is better):
   → Zero-Shot: MAE = 0.62, RMSE = 0.964
   → Few-Shot: MAE = 0.75, RMSE = 1.082
   → Chain-of-Thought: MAE = 0.82, RMSE = 1.131



## 9. Sample Predictions Analysis

In [21]:
# Print a few few-shot predictions that matched the actual rating and a few that did not
heading_rule = "=" * 80
row_rule = "-" * 80
print("\n" + heading_rule)
print("SAMPLE PREDICTIONS")
print(heading_rule)

# Attach each review's text to its few-shot result
review_texts = df_sample[['review_id', 'text']]
results_with_text = results_few_shot.merge(review_texts, on='review_id')
is_match = results_with_text['actual_stars'] == results_with_text['predicted_stars']

# First three rows where the prediction equals the actual rating
correct = results_with_text[is_match].head(3)
print("\n✓ CORRECT PREDICTIONS (Few-Shot):")
for idx, row in correct.iterrows():
    snippet = row['text'][:150]
    print(f"\nReview: {snippet}...")
    print(f"Actual: {row['actual_stars']}★ | Predicted: {row['predicted_stars']}★")
    print(f"Explanation: {row['explanation']}")
    print(row_rule)

# First three rows where the prediction differs from the actual rating
incorrect = results_with_text[~is_match].head(3)
print("\n✗ INCORRECT PREDICTIONS (Few-Shot):")
for idx, row in incorrect.iterrows():
    snippet = row['text'][:150]
    star_gap = abs(row['actual_stars'] - row['predicted_stars'])
    print(f"\nReview: {snippet}...")
    print(f"Actual: {row['actual_stars']}★ | Predicted: {row['predicted_stars']}★ | Error: {star_gap}")
    print(f"Explanation: {row['explanation']}")
    print(row_rule)


SAMPLE PREDICTIONS

✓ CORRECT PREDICTIONS (Few-Shot):

Review: An upscale mexican restaurant in the area seemed like the best thing to have happened to me. I planned my visit for a quieter evening, and reviewed th...
Actual: 1★ | Predicted: 1★
Explanation: This review is overwhelmingly negative, detailing numerous severe issues from arrival to departure. It criticizes parking, ambiance, an 'insanely snobby' and absent waiter, extremely long wait times, a lamp going out mid-meal, and finally, unappetizing food that was poor value for $90. The review concludes with a strong recommendation *not* to visit the restaurant, indicating a terrible experience.
--------------------------------------------------------------------------------

Review: So i thought with all the controversy surrounding this place, I would support the first amendment with my wallet. Unfortunately our experience here wa...
Actual: 1★ | Predicted: 1★
Explanation: The review details a 'subpar' and 'HORRENDOUS' service e

## 10. Key Findings and Recommendations

### Design Evolution:

1. **Zero-Shot**: Started with simple, direct prompting. This is the baseline: the model receives only the task description, with no examples.
2. **Few-Shot**: Added five examples (one per star rating) to guide the model, with the aim of improving its understanding of rating nuances.
3. **Chain-of-Thought**: Added explicit step-by-step instructions to encourage more deliberate classification.

### Expected Observations:

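- **Accuracy**: Few-shot and CoT typically outperform zero-shot due to better context. This did not hold in the run above: Zero-Shot reached 52.0% exact accuracy, versus 46.0% for Few-Shot and 39.5% for Chain-of-Thought.
- **JSON Validity**: Should be high (>95%) across all approaches with proper prompting. The measured rates were far lower (87.5%, 38.0%, and 29.0%). Responses that fail to parse are scored as a default 3-star prediction, so the Few-Shot and Chain-of-Thought accuracy figures are heavily influenced by failed responses and should be re-measured once the cause of the failures is diagnosed.
- **Consistency**: CoT may show slightly higher variance due to longer reasoning chains. In this run it had the highest error of the three approaches (MAE 0.82, RMSE 1.131).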
- **Trade-offs**: Zero-shot is fastest; Few-shot balances accuracy and speed; CoT is most thorough

### Recommendations:

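- For production: Use Few-Shot approach (good balance). Caveat: this reflects the expected behaviour; in the run above Zero-Shot scored better on every metric, so re-validate before adopting Few-Shot.
- For critical applications: Use Chain-of-Thought (better reasoning). Caveat: in the run above Chain-of-Thought had the lowest exact accuracy and JSON validity, so confirm its benefit once the invalid responses are resolved.
- For high-volume: Use Zero-Shot (faster, lower cost)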

### Future Improvements:

1. Fine-tune examples in few-shot based on error analysis
2. Add domain-specific keywords to prompts
3. Implement ensemble voting across approaches
4. Add confidence scores to predictions

## 11. Export Results for Report

In [22]:
# Write the metrics table to disk
metrics_comparison.to_csv('metrics_comparison.csv', index=False)

# Collect the headline numbers of the run into a small summary report
summary = dict(
    total_reviews_evaluated=len(df_sample),
    approaches_tested=3,
    best_approach=best_approach,
    best_accuracy=best_accuracy,
    avg_json_validity=metrics_comparison['JSON Validity (%)'].mean(),
)

with open('task1_summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

print("\n✓ All results exported successfully!")
print("\nGenerated files:")
for generated_file in (
    'results_zero_shot.csv',
    'results_few_shot.csv',
    'results_chain_of_thought.csv',
    'all_results_combined.csv',
    'metrics_comparison.csv',
    'task1_summary.json',
):
    print(f"  - {generated_file}")


✓ All results exported successfully!

Generated files:
  - results_zero_shot.csv
  - results_few_shot.csv
  - results_chain_of_thought.csv
  - all_results_combined.csv
  - metrics_comparison.csv
  - task1_summary.json
