In [1]:
# Import necessary libraries
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from groq import Groq
from dotenv import load_dotenv
from collections import Counter # Import Counter for debugging
import time

In [3]:
# Phase 1: Load Data
# Read the review dataset from the CSV file next to the notebook.
df = pd.read_csv('IMDb Dataset.csv')

# Hold out a fixed 100-row evaluation sample; everything else is used for
# training the baseline. The fixed random_state makes the split repeatable.
train_df, test_df = train_test_split(df, random_state=42, test_size=100)

# Keep the held-out review texts and their gold labels as plain Python lists
# so they can be iterated over and compared position by position later on.
test_reviews, true_labels = (
    test_df[column].tolist() for column in ('review', 'sentiment')
)

# Phase 2: Baseline Model (on the larger training set)
# Targets for the baseline are the sentiment labels of the training rows.
y_train = train_df['sentiment']

# TF-IDF features with English stop words removed; the vocabulary is learned
# from the training reviews only.
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(train_df['review'])

# Multinomial Naive Bayes classifier fitted on the TF-IDF matrix
# (fit() returns the estimator itself, so the call can be chained).
model = MultinomialNB().fit(X_train, y_train)

# Project the held-out reviews with the already-fitted vectorizer, then predict.
X_test = vectorizer.transform(test_reviews)
baseline_predictions = model.predict(X_test)

print("### Baseline Model Performance ###")
print(classification_report(y_true=true_labels, y_pred=baseline_predictions))

# Phase 3: LLM Evaluation

# The two labels present in the ground truth. Any reply that names neither of
# them, and any failed API call, is recorded with the sentinel 'error'.
SENTIMENT_LABELS = ('negative', 'positive')


def parse_sentiment(raw_response):
    """Map a raw LLM reply to 'positive', 'negative', or 'error'.

    The reply is stripped and lower-cased, then searched for both label words.
    When both appear, the one mentioned FIRST wins, so a reply such as
    "negative, not positive" is not silently scored as positive (which is what
    a fixed "check 'positive' first" test would do).

    Args:
        raw_response: Text returned by the model; None is treated as empty.

    Returns:
        'positive' or 'negative' if a label word is found, otherwise 'error'.
    """
    text = (raw_response or '').strip().lower()
    # (position, label) for every label word that occurs in the reply.
    hits = [(text.find(label), label) for label in SENTIMENT_LABELS]
    hits = [(position, label) for position, label in hits if position != -1]
    return min(hits)[1] if hits else 'error'


# Initialize the Groq client; load_dotenv() makes variables from a local .env
# file visible to os.getenv().
load_dotenv()
client = Groq(api_key=os.getenv('GROQ_API_KEY'))

llm_predictions = []

print("Starting Groq LLM evaluation...")
for i, review in enumerate(test_reviews):
    # Craft your prompt
    prompt = f"""
    Classify the following movie review sentiment as only 'positive' or 'negative'. Do not write anything else.

    Review: \"{review}\"
    Sentiment:
    """

    try:
        # Use the Groq client; temperature 0 and a tiny token budget keep the
        # reply short.
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model="llama-3.1-8b-instant",
            temperature=0.0,
            max_tokens=10
        )
        # Normalize the reply before extracting the label.
        raw_response = (response.choices[0].message.content or '').strip().lower()
        prediction = parse_sentiment(raw_response)
        if prediction == 'error':
            # Neither label was found: surface the odd reply for inspection.
            print(f"Unexpected response for review {i}: '{raw_response}'. Marking as error.")
    except Exception as e:
        # Deliberate best-effort: one failed request must not abort the whole
        # run, so the failure is logged and the review is marked as 'error'.
        print(f"Error with review {i}: {e}")
        prediction = 'error'

    # Exactly one prediction per review, on success and on failure alike, so
    # the list stays aligned with true_labels.
    llm_predictions.append(prediction)

    # Print a progress update every 10 reviews (outside the try block so the
    # update is not skipped when that particular request failed).
    if (i + 1) % 10 == 0:
        print(f"Processed {i + 1}/{len(test_reviews)} reviews...")

    time.sleep(0.5)  # Short pause between requests

print("Groq LLM evaluation complete!\n")

# Predictions are compared to the labels by position; fail loudly if the two
# lists ever drift apart.
assert len(llm_predictions) == len(true_labels), "predictions and labels are misaligned"

# --- DEBUGGING: Check what we actually got ---
print("=== DEBUGGING OUTPUT ===")
print("First 10 True Labels:", true_labels[:10])
print("First 10 LLM Predictions:", llm_predictions[:10])

print("\nUnique True Labels:", set(true_labels))
print("Unique LLM Predictions:", set(llm_predictions))

prediction_counts = Counter(llm_predictions)
print("\nPrediction Counts:", prediction_counts)
print("=======================\n")

# --- Now calculate the final performance ---
print("### LLM Performance (using Groq & Llama) ###")
# Restrict the report to the real classes: the 'error' sentinel is not a
# sentiment, and letting it in adds a zero-support row plus undefined-metric
# warnings. Reviews marked 'error' still count as misses for their true class.
print(classification_report(
    true_labels,
    llm_predictions,
    labels=list(SENTIMENT_LABELS),
    zero_division=0,
))



### Baseline Model Performance ###
              precision    recall  f1-score   support

    negative       0.64      1.00      0.78         9
    positive       1.00      0.76      0.86        21

    accuracy                           0.83        30
   macro avg       0.82      0.88      0.82        30
weighted avg       0.89      0.83      0.84        30

Starting Groq LLM evaluation...
Processed 10/30 reviews...
Processed 20/30 reviews...
Processed 30/30 reviews...
Groq LLM evaluation complete!

=== DEBUGGING OUTPUT ===
First 10 True Labels: ['positive', 'positive', 'negative', 'positive', 'negative', 'positive', 'positive', 'positive', 'negative', 'negative']
First 10 LLM Predictions: ['positive', 'positive', 'negative', 'positive', 'negative', 'positive', 'positive', 'positive', 'negative', 'negative']

Unique True Labels: {'positive', 'negative'}
Unique LLM Predictions: {'positive', 'negative'}

Prediction Counts: Counter({'positive': 18, 'negative': 12})

### LLM Performance (

In [4]:
# === ERROR ANALYSIS ===

print("\n" + "="*50)
print("ERROR ANALYSIS")
print("="*50)

# One row per evaluated review: gold label, LLM prediction, and the review text.
error_analysis_df = pd.DataFrame({
    'True_Label': true_labels,
    'LLM_Prediction': llm_predictions,
    'Review': test_reviews
})

# Flag rows where the prediction matches the gold label.
error_analysis_df['Correct'] = error_analysis_df['True_Label'] == error_analysis_df['LLM_Prediction']

# Keep only the mistakes (boolean negation instead of comparing to False).
mistakes_df = error_analysis_df[~error_analysis_df['Correct']]

# Print a summary; guard the percentage against an empty evaluation set.
num_errors = len(mistakes_df)
total_reviews = len(error_analysis_df)
error_rate = num_errors / total_reviews if total_reviews else 0.0
print(f"Number of errors: {num_errors} out of {total_reviews} reviews ({error_rate:.2%})")

if total_reviews == 0:
    print("No reviews were evaluated, so there is nothing to analyze.")
elif num_errors > 0:
    print("\nLet's look at some of the errors:\n")
    # Display up to the first 10 errors for inspection.
    for _, row in mistakes_df.head(10).iterrows():
        print(f"True: {row['True_Label']} | Predicted: {row['LLM_Prediction']}")
        print(f"Review: {row['Review'][:200]}...") # Show first 200 chars to avoid huge output
        print("-" * 80)

    # Count error types.
    print("\nError Type Breakdown:")
    actual = mistakes_df['True_Label']
    predicted = mistakes_df['LLM_Prediction']
    # Model predicted 'positive' when the review was actually 'negative'.
    false_positives = int(((actual == 'negative') & (predicted == 'positive')).sum())
    # Model predicted 'negative' when the review was actually 'positive'.
    false_negatives = int(((actual == 'positive') & (predicted == 'negative')).sum())
    # Every remaining mistake has a prediction that is neither label, i.e. the
    # 'error' sentinel recorded for unparseable replies and failed API calls.
    # Reporting it keeps the breakdown consistent with the total above.
    invalid_predictions = num_errors - false_positives - false_negatives

    print(f"False Positives (model said POSITIVE but was NEGATIVE): {false_positives}")
    print(f"False Negatives (model said NEGATIVE but was POSITIVE): {false_negatives}")
    if invalid_predictions:
        print(f"Invalid predictions (unparseable reply or API failure): {invalid_predictions}")
else:
    print("Perfect score! No errors to analyze.")


ERROR ANALYSIS
Number of errors: 3 out of 30 reviews (10.00%)

Let's look at some of the errors:

True: positive | Predicted: negative
Review: This movie starts off somewhat slowly and gets running towards the end. Not that that is bad, it was done to illustrate character trait degression of the main character. Consequently, if you are not i...
--------------------------------------------------------------------------------
True: positive | Predicted: negative
Review: The production quality, cast, premise, authentic New England (Waterbury, CT?) locale and lush John Williams score should have resulted in a 3-4 star collectors item. Unfortunately, all we got was a pa...
--------------------------------------------------------------------------------
True: positive | Predicted: negative
Review: I've never really been sure whether I liked this documentary or not. It was shown on Channel 4 before a cut down version of Revelations, and is on the Revelations video tape before the uncut show.