#### Evaluating with Advanced Prompting

In [1]:
import torch
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification
)
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


#### 1. Load Models --------------------------------------------------


In [5]:
# 1. Load Models --------------------------------------------------
base_model_name = "distilbert-base-uncased"
your_finetuned_model = "Ofge/finetuned_distilbert"  # Replace with your actual fine-tuned model path


def _load_tokenizer_and_classifier(checkpoint):
    """Return the (tokenizer, sequence-classification model) pair for ``checkpoint``."""
    tokenizer_for_checkpoint = AutoTokenizer.from_pretrained(checkpoint)
    classifier_for_checkpoint = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    return tokenizer_for_checkpoint, classifier_for_checkpoint


# Base checkpoint first, then the fine-tuned one (tokenizer before model each time).
base_tokenizer, base_model = _load_tokenizer_and_classifier(base_model_name)
finetuned_tokenizer, finetuned_model = _load_tokenizer_and_classifier(your_finetuned_model)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### 2. Define Prompt Templates ---------------------------------------

In [6]:

# Both templates are assembled line by line; the empty first and last entries
# give the leading and trailing newline of the prompt text.

# Zero-shot prompt: the review is inserted at {review} with no examples.
ZERO_SHOT_TEMPLATE = "\n".join([
    "",
    "Analyze the sentiment of this movie review:",
    "Review: {review}",
    "Sentiment: [POSITIVE/NEGATIVE]",
    "",
])

# Few-shot prompt: four labelled examples, then the review as item 5.
# The last prompt line ends with a single space after the arrow.
FEW_SHOT_TEMPLATE = "\n".join([
    "",
    "Classify these movie review examples:",
    '1. "This film was amazing!" → POSITIVE',
    '2. "I hated every minute." → NEGATIVE',
    '3. "The acting was terrible." → NEGATIVE',
    '4. "A masterpiece of cinema." → POSITIVE',
    '5. "{review}" → ',
    "",
])

#### 3. Evaluation Function with Prompting ----------------------------


In [7]:
def evaluate_with_prompts(model, tokenizer, test_texts, test_labels, prompt_type="zero"):
    """Classify prompt-wrapped reviews with ``model`` and summarize the metrics.

    Each text is inserted into ZERO_SHOT_TEMPLATE when ``prompt_type == "zero"``
    and into FEW_SHOT_TEMPLATE for any other value. The prompt is tokenized
    (truncated to 512 tokens) and the predicted class is the argmax over the
    model's logits.

    Args:
        model: torch module whose forward pass returns an object with a
            ``logits`` attribute.
        tokenizer: Callable returning a mapping of tensors for ``model``.
        test_texts: Iterable of review strings.
        test_labels: True labels, aligned one-to-one with ``test_texts``.
        prompt_type: "zero" for the zero-shot template; anything else selects
            the few-shot template.

    Returns:
        dict with keys 'accuracy', 'f1', 'precision' and 'recall'; the last
        three are the weighted averages from ``classification_report``.

    Notes:
        A text that raises during prompting or inference is reported, counted,
        and scored as class 0 so that predictions stay aligned with
        ``test_labels``. A summary line is printed when that happened, because
        those fallbacks distort the metrics.
    """
    # Run inputs on the device that holds the weights. Without this, a model
    # on GPU fails on every sample and silently falls into the class-0 fallback.
    first_param = next(iter(model.parameters()), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Force inference mode (no dropout) and restore the caller's mode afterwards.
    was_training = model.training
    model.eval()

    predictions = []
    failed = 0
    try:
        for text in test_texts:
            try:
                if prompt_type == "zero":
                    prompted_text = ZERO_SHOT_TEMPLATE.format(review=text)
                else:
                    # Only the first 500 characters of the review go into the few-shot prompt.
                    prompted_text = FEW_SHOT_TEMPLATE.format(review=text[:500])

                inputs = tokenizer(prompted_text, return_tensors="pt", truncation=True, max_length=512)
                inputs = {key: value.to(device) for key, value in inputs.items()}
                with torch.no_grad():
                    outputs = model(**inputs)
                # Reduce over the class axis explicitly rather than the flattened tensor.
                pred = outputs.logits.argmax(dim=-1).item()

                predictions.append(pred)
            except Exception as e:
                failed += 1
                print(f"Error processing text: {e}")
                predictions.append(0)
    finally:
        model.train(was_training)

    if failed:
        print(f"Warning: {failed} of {len(predictions)} texts failed and were scored as class 0")

    # Generate evaluation report. zero_division=0 yields the same numbers as the
    # default but without one UndefinedMetricWarning per never-predicted class.
    report = classification_report(test_labels, predictions, output_dict=True, zero_division=0)
    return {
        'accuracy': report['accuracy'],
        'f1': report['weighted avg']['f1-score'],
        'precision': report['weighted avg']['precision'],
        'recall': report['weighted avg']['recall']
    }


#### 4. Load Test Data -----------------------------------------------


In [None]:
# Read the IMDB reviews, keep a fixed random subset, and encode the
# sentiment column as integers (positive -> 1, negative -> 0).
test_df = (
    pd.read_csv("../data/IMDB Dataset.csv")
    .sample(200, random_state=42)  # Use subset for speed
    .assign(sentiment=lambda frame: frame['sentiment'].map({'positive': 1, 'negative': 0}))
)


#### 5. Run All Comparisons ------------------------------------------


In [9]:
# Evaluate every (model, prompt style) combination. Keys and evaluation order
# are: base_zero, base_few, ft_zero, ft_few.
results = {
    f"{variant}_{shot}": evaluate_with_prompts(
        variant_model, variant_tokenizer,
        test_df['review'], test_df['sentiment'], shot,
    )
    for variant, variant_model, variant_tokenizer in (
        ("base", base_model, base_tokenizer),           # Base Model Evaluations
        ("ft", finetuned_model, finetuned_tokenizer),   # Fine-Tuned Model Evaluations
    )
    for shot in ("zero", "few")
}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### 6. Comparative Analysis ----------------------------------------


In [10]:
metrics = ['accuracy', 'f1', 'precision', 'recall']

# One column per evaluated configuration, one row per metric (in `metrics` order).
comparison = pd.DataFrame(
    {
        column_label: [results[result_key][m] for m in metrics]
        for column_label, result_key in (
            ('Base Zero-shot', 'base_zero'),
            ('Base Few-shot', 'base_few'),
            ('FT Zero-shot', 'ft_zero'),
            ('FT Few-shot', 'ft_few'),
        )
    },
    index=['Accuracy', 'F1-Score', 'Precision', 'Recall'],
)

print("=== Comprehensive Performance Comparison ===")
print(comparison.round(3))


=== Comprehensive Performance Comparison ===
           Base Zero-shot  Base Few-shot  FT Zero-shot  FT Few-shot
Accuracy            0.520          0.525         0.525        0.525
F1-Score            0.359          0.361         0.361        0.361
Precision           0.274          0.276         0.276        0.276
Recall              0.520          0.525         0.525        0.525


#### 7. Example Predictions ------------------------------------------


In [11]:
def simple_predict(model, tokenizer, text, prompt_type):
    """Return "POS" or "NEG" for one review wrapped in the chosen prompt.

    Args:
        model: Model whose forward pass returns an object with ``logits``.
        tokenizer: Callable tokenizer matching ``model``.
        text: Review text to classify.
        prompt_type: "zero" selects ZERO_SHOT_TEMPLATE; anything else selects
            FEW_SHOT_TEMPLATE.

    Returns:
        "POS" when the argmax over the logits is class index 1, otherwise "NEG".
    """
    if prompt_type == "zero":
        prompted = ZERO_SHOT_TEMPLATE.format(review=text)
    else:
        prompted = FEW_SHOT_TEMPLATE.format(review=text)
    inputs = tokenizer(prompted, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
    # Reduce over the class axis explicitly rather than the flattened tensor.
    return "POS" if logits.argmax(dim=-1).item() == 1 else "NEG"


sample_texts = [
    "The cinematography was stunning but the plot was weak",
    "Absolutely the worst movie I've seen this year",
    "A perfect blend of humor and heart"
]

print("\n=== Prediction Examples ===")
for text in sample_texts:
    print(f"\nReview: {text[:100]}...")

    # The helper is defined once above instead of being re-created per review.
    print(f"Base Zero: {simple_predict(base_model, base_tokenizer, text, 'zero')}")
    print(f"Base Few: {simple_predict(base_model, base_tokenizer, text, 'few')}")
    print(f"FT Zero: {simple_predict(finetuned_model, finetuned_tokenizer, text, 'zero')}")
    print(f"FT Few: {simple_predict(finetuned_model, finetuned_tokenizer, text, 'few')}")



=== Prediction Examples ===

Review: The cinematography was stunning but the plot was weak...
Base Zero: POS
Base Few: POS
FT Zero: POS
FT Few: POS

Review: Absolutely the worst movie I've seen this year...
Base Zero: POS
Base Few: POS
FT Zero: POS
FT Few: POS

Review: A perfect blend of humor and heart...
Base Zero: POS
Base Few: POS
FT Zero: POS
FT Few: POS
