### Evaluating Loading fine-tuned model and the base model from Hugging Face, then perform a comparative analysis

import libraries

In [5]:
import torch
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TextClassificationPipeline
)


from sklearn.metrics import classification_report
import pandas as pd
import numpy as np

#### 1. Load Models --------------------------------------------------


In [None]:
base_model_name = "distilbert-base-uncased"
finetuned_model = "Ofge/finetuned_distilbert"  


#### Base model


In [None]:
base_tokenizer = AutoTokenizer.from_pretrained(base_model_name)
base_model = AutoModelForSequenceClassification.from_pretrained(base_model_name)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Your fine-tuned model


In [None]:
finetuned_tokenizer = AutoTokenizer.from_pretrained(your_finetuned_model)
finetuned_model = AutoModelForSequenceClassification.from_pretrained(your_finetuned_model)


#### 2. Create Pipelines with safety defaults


In [None]:

base_pipe = pipeline(
    "text-classification",
    model=base_model_name,
    device=device,
    truncation=True,  # Add these defaults
    max_length=512
)

finetuned_pipe = pipeline(
    "text-classification",
    model=your_finetuned_model,
    device=device,
    truncation=True,
    max_length=512
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu
Device set to use cpu


#### 3. Evaluation Function ------------------------------------------


In [None]:
def evaluate(pipeline, test_texts, test_labels):
    predictions = pipeline(test_texts.tolist())
    pred_labels = [1 if pred['label'] == 'POSITIVE' else 0 for pred in predictions]
    return classification_report(test_labels, pred_labels, output_dict=True)

#### 4. Load Test Data -----------------------------------------------


In [None]:
test_df = pd.read_csv("../data/IMDB Dataset.csv")
test_df = test_df.sample(10, random_state=42)  # Smaller subset for demo
test_df['sentiment'] = test_df['sentiment'].map({'positive':1, 'negative':0})


#### 5. Run Comparisons ----------------------------------------------


In [None]:
base_results = evaluate(base_pipe, test_df['review'], test_df['sentiment'])
finetuned_results = evaluate(finetuned_pipe, test_df['review'], test_df['sentiment'])


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### 6. Comparative Analysis ----------------------------------------


In [None]:
comparison = pd.DataFrame({
    'Base Model': {
        'Accuracy': base_results['accuracy'],
        'F1-Score': base_results['weighted avg']['f1-score'],
        'Precision': base_results['weighted avg']['precision'],
        'Recall': base_results['weighted avg']['recall']
    },
    'Fine-tuned Model': {
        'Accuracy': finetuned_results['accuracy'],
        'F1-Score': finetuned_results['weighted avg']['f1-score'],
        'Precision': finetuned_results['weighted avg']['precision'],
        'Recall': finetuned_results['weighted avg']['recall']
    }
}).T

print("=== Performance Comparison ===")
print(comparison)


=== Performance Comparison ===
                  Accuracy  F1-Score  Precision  Recall
Base Model             0.4  0.228571       0.16     0.4
Fine-tuned Model       0.4  0.228571       0.16     0.4


#### 7. Example Predictions ------------------------------------------


In [None]:
sample_texts = [
    "This movie was absolutely wonderful!",
    "The plot was terrible and boring.",
    "It was okay, nothing special."
]

print("\n=== Sample Predictions ===")
for text in sample_texts:
    base_pred = base_pipe(text)[0]
    finetuned_pred = finetuned_pipe(text)[0]
    print(f"\nText: {text}")
    print(f"Base: {base_pred['label']} ({base_pred['score']:.2f})")
    print(f"Fine-tuned: {finetuned_pred['label']} ({finetuned_pred['score']:.2f})")



=== Sample Predictions ===

Text: This movie was absolutely wonderful!
Base: LABEL_0 (0.57)
Fine-tuned: LABEL_1 (0.52)

Text: The plot was terrible and boring.
Base: LABEL_0 (0.55)
Fine-tuned: LABEL_1 (0.53)

Text: It was okay, nothing special.
Base: LABEL_0 (0.57)
Fine-tuned: LABEL_1 (0.52)


#### 8. Advanced Analysis -------------------------------------------


In [None]:
def compare_predictions(texts, true_labels):
    results = []
    for text, label in zip(texts, true_labels):
        base_pred = base_pipe(text)[0]
        ft_pred = finetuned_pipe(text)[0]
        results.append({
            'text': text[:50] + "...",
            'true': 'POSITIVE' if label == 1 else 'NEGATIVE',
            'base_pred': base_pred['label'],
            'base_score': base_pred['score'],
            'ft_pred': ft_pred['label'],
            'ft_score': ft_pred['score'],
            'agreement': base_pred['label'] == ft_pred['label']
        })
    return pd.DataFrame(results)

disagreements = compare_predictions(test_df['review'], test_df['sentiment'])
print("\n=== Prediction Disagreements ===")
print(disagreements[~disagreements['agreement']].head())


=== Prediction Disagreements ===
                                                text      true base_pred  \
0  I really liked this Summerslam due to the look...  POSITIVE   LABEL_0   
1  Not many television shows appeal to quite as m...  POSITIVE   LABEL_0   
2  The film quickly gets to a major chase scene w...  NEGATIVE   LABEL_0   
3  Jane Austen would definitely approve of this o...  POSITIVE   LABEL_0   
4  Expectations were somewhat high for me when I ...  NEGATIVE   LABEL_0   

   base_score  ft_pred  ft_score  agreement  
0    0.542781  LABEL_1  0.550927      False  
1    0.535357  LABEL_1  0.517670      False  
2    0.536125  LABEL_1  0.531626      False  
3    0.544320  LABEL_1  0.527887      False  
4    0.526478  LABEL_1  0.537122      False  
