# Sentiment Classification tasks with DistilBERT (HF, transformers) over WER-binned sample sentences

DistilBERT is a smaller, faster, cheaper version of BERT that retains 97% of its language understanding capabilities.

**Important:** Unlike the generative models (Llama, Vicuna, Mistral), DistilBERT is an **encoder-only model** that doesn't generate text. Instead, it classifies input directly.

- Base Model: [distilbert/distilbert-base-uncased](https://huggingface.co/distilbert/distilbert-base-uncased)
- Sentiment Model: [distilbert-base-uncased-finetuned-sst-2-english](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)
- Parameters: 67M (much smaller than 13B-22B generative models)
- License: Apache 2.0
- Output: Direct classification labels (POSITIVE/NEGATIVE) with confidence scores


### Setup


In [None]:
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
from typing import List, Dict
import logging
import random
from datetime import datetime
import re
logging.basicConfig(level=logging.INFO)


###### Model Configuration

DistilBERT is very lightweight (67M parameters) and runs efficiently on CPU or GPU.
No quantization needed - the model is small enough to run on most hardware.


In [None]:
# Checkpoint choice: the SST-2 fine-tuned DistilBERT is used because it already
# carries a sentiment classification head; the plain distilbert-base-uncased
# checkpoint would first have to be fine-tuned.
model_id = "distilbert-base-uncased-finetuned-sst-2-english"

# Prefer the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using:", device)

# Report which GPU is in use and how much memory it has (GPU runs only).
if device == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    _gpu_total_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"Total GPU Memory: {_gpu_total_gib:.2f} GB")

# Tokenizer and sequence-classification model, with the model placed on `device`.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id).to(device)

# Classification pipeline ("sentiment-analysis", not text-generation).
# The pipeline's `device` argument takes a GPU index, or -1 for CPU.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=0 if device == "cuda" else -1,
)

print(
    "✓ DistilBERT sentiment model loaded successfully",
    "  Model size: ~67M parameters",
    "  Output labels: POSITIVE, NEGATIVE",
    sep="\n",
)


###### Test Pipeline (Optional - Run this to verify setup)


In [None]:
# Smoke test: push three hand-written sentences through the pipeline and print
# the predicted label and confidence for each one.
print("Testing DistilBERT sentiment pipeline...")

test_sentences = [
    "I love this product, it's amazing!",
    "This is terrible, I hate it.",
    "The weather is okay today.",
]

for sentence in test_sentences:
    # The pipeline returns a list of predictions; a single input yields one entry.
    result = sentiment_pipeline(sentence)[0]
    _preview = sentence[:40]
    _label, _score = result['label'], result['score']
    print(f"  '{_preview}...' -> {_label} ({_score:.4f})")

print("\n✓ Test successful! DistilBERT is working correctly.")


### Data Loading and Processing


###### Load Data from XLSX


In [None]:
def load_data_from_xlsx(xlsx_path: str, sheet_index: int = 1) -> pd.DataFrame:
    """
    Load one worksheet of an XLSX workbook into a DataFrame.

    By default the second sheet (index 1) is read. The workbook is opened once
    and its handle is closed before returning, including when validation or
    parsing raises.

    Expected columns:
    - Machine Transcription (ground truth)
    - Human Transcription (hypothesis)
    - Sentiment Label
    - WER
    - CER
    - List of AAE features
    - List of transcription errors
    - Interview

    NOTE(review): the column list above is not validated here; it is only
    what the rest of the notebook expects to find.

    Args:
        xlsx_path: Path to the XLSX file
        sheet_index: Zero-based position of the sheet to load

    Returns:
        DataFrame holding the rows of the requested sheet

    Raises:
        ValueError: If the workbook has no sheet at ``sheet_index``
    """
    # Open the workbook a single time and parse from that handle; the context
    # manager releases the file even on the error path.
    with pd.ExcelFile(xlsx_path) as xl_file:
        sheet_names = xl_file.sheet_names
        if len(sheet_names) <= sheet_index:
            raise ValueError(f"Sheet index {sheet_index} not available. Available sheets: {sheet_names}")

        df = xl_file.parse(sheet_name=sheet_index)

    logging.info(f"Loaded {len(df)} rows from sheet '{sheet_names[sheet_index]}'")
    return df


In [None]:
def split_data_into_sets(df: pd.DataFrame, test_ratio: float = 0.5, random_seed: int = None) -> tuple:
    """
    Shuffle the dataframe and split it into a test set and a few-shot example set.

    The first ``floor(len(df) * test_ratio)`` shuffled rows form the test set and
    the remaining rows form the few-shot set. With the default ratio of 0.5 the
    two sets have equal sizes (the few-shot set gets the extra row when the
    number of rows is odd).

    Note: For DistilBERT, few-shot examples are not used (model is pre-trained for classification).
    This function is kept for API consistency with other model notebooks.

    Args:
        df: Rows to split; the input dataframe is not modified
        test_ratio: Fraction of rows placed in the test set, between 0 and 1
        random_seed: Seed for the shuffle; None gives a non-reproducible shuffle

    Returns:
        (test_df, few_shot_df), both re-indexed copies of the shuffled rows

    Raises:
        ValueError: If ``test_ratio`` is outside the range [0, 1]
    """
    if not 0.0 <= test_ratio <= 1.0:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio}")

    if random_seed is not None:
        # Kept for backward compatibility: this seeds the stdlib RNG as a
        # global side effect. The shuffle below is driven by `random_state`.
        random.seed(random_seed)

    # Shuffle the dataframe
    df_shuffled = df.sample(frac=1, random_state=random_seed).reset_index(drop=True)

    # Honour test_ratio. The small epsilon guards against float products that
    # land just below a whole number (e.g. 0.29 * 100), and does not change the
    # result for the default ratio of 0.5.
    split_idx = int(len(df_shuffled) * test_ratio + 1e-9)
    test_df = df_shuffled.iloc[:split_idx].copy()
    few_shot_df = df_shuffled.iloc[split_idx:].copy()

    logging.info(f"Split data: {len(test_df)} test samples, {len(few_shot_df)} few-shot examples")
    return test_df, few_shot_df


### Classify with Model


###### Classify with DistilBERT

**Key Difference:** DistilBERT doesn't use prompts or generate text. It directly classifies the input and returns:
- `label`: POSITIVE or NEGATIVE
- `score`: confidence score (0-1)

Note: The SST-2 fine-tuned model only outputs POSITIVE/NEGATIVE (binary classification).
Neutral is therefore approximated: any prediction whose confidence score falls below a threshold (0.6 by default) is mapped to Neutral.


In [None]:
def map_distilbert_to_sentiment(label: str, score: float, neutral_threshold: float = 0.6) -> str:
    """
    Translate a binary DistilBERT prediction into the three-class scheme.

    A prediction whose confidence is below ``neutral_threshold`` is reported as
    Neutral. Otherwise the label "POSITIVE" becomes Positive and any other
    label becomes Negative.

    Args:
        label: DistilBERT output label (POSITIVE or NEGATIVE)
        score: Confidence score (0-1)
        neutral_threshold: Below this confidence, classify as NEUTRAL

    Returns:
        Mapped sentiment label: Positive, Negative, or Neutral
    """
    is_confident = not score < neutral_threshold
    if is_confident:
        return "Positive" if label == "POSITIVE" else "Negative"
    return "Neutral"


In [None]:
def classify_with_distilbert(sentence: str) -> tuple:
    """
    Classify sentiment for a single sentence using DistilBERT.

    The sentence is passed straight to the module-level ``sentiment_pipeline``
    (no prompt, no text generation). Tokenizer truncation is enabled so that an
    input longer than the model's maximum sequence length is classified on its
    leading tokens instead of failing.

    Args:
        sentence: Text to classify

    Returns:
        (raw_output, sentiment_label, reason)
        - raw_output: String of the form "[Label: <label>, Score: <score>]"
        - sentiment_label: Mapped sentiment (Positive/Negative/Neutral)
        - reason: Explanation based on confidence score
    """
    # Run classification. `truncation=True` is forwarded to the tokenizer by
    # the text-classification pipeline.
    with torch.no_grad():
        result = sentiment_pipeline(sentence, truncation=True)[0]

    label = result['label']
    score = result['score']

    # Map to three-class sentiment
    sentiment_label = map_distilbert_to_sentiment(label, score)

    # Generate a reason based on the classification
    if sentiment_label == "Neutral":
        reason = f"Low confidence ({score:.2%}) suggests ambiguous sentiment"
    else:
        reason = f"Classified as {label} with {score:.2%} confidence"

    # Format raw output as string for consistency with other models
    raw_output = f"[Label: {label}, Score: {score:.4f}]"

    return raw_output, sentiment_label, reason


### Main Workflow


###### Process All Samples

Note: This notebook does not use few-shot examples with DistilBERT: the model is a fine-tuned classifier that takes no prompt.
Every sample is therefore processed the same way - the model classifies the input text directly, based on its SST-2 training.


In [None]:
def process_all_samples(xlsx_path: str):
    """
    Classify every row of the workbook's second sheet, one sentence at a time.

    For each row the human-transcription cell is passed to
    ``classify_with_distilbert`` and the outcome is printed. A row whose
    classification raises is kept in the results with the label "ERROR" and
    the exception text as its reason, so one bad row does not stop the run.
    The results are written as CSV under
    ``data/model_outputs/distilbert_base_uncased`` with a date-stamped name.

    The transcription columns are located by name (a header containing
    "machine"/"human" together with "transcription", case-insensitive); when a
    name is not found the first/second column is used instead.

    Args:
        xlsx_path: Path to the XLSX file

    Returns:
        DataFrame with one row per sample and the columns ground_truth,
        hypothesis, sentiment_label and reason
    """
    # Load data
    df = load_data_from_xlsx(xlsx_path, sheet_index=1)
    total = len(df)

    logging.info(f"Processing all {total} samples with DistilBERT")

    # Locate the transcription columns once: every row shares the dataframe's
    # columns, so repeating the search per row is wasted work. The search is
    # skipped for an empty sheet, which previously never reached it either.
    machine_col = None
    human_col = None
    if total:
        for col in df.columns:
            # str() keeps non-string headers (numbers, dates) from raising.
            col_name = str(col).lower()
            if 'machine' in col_name and 'transcription' in col_name:
                machine_col = col
            elif 'human' in col_name and 'transcription' in col_name:
                human_col = col

        if machine_col is None or human_col is None:
            # Fallback to first two columns
            cols = list(df.columns)
            machine_col = cols[0] if machine_col is None else machine_col
            human_col = cols[1] if human_col is None else human_col

    # Prepare results list
    results = []

    # Process each sample
    for idx, (_, row) in enumerate(df.iterrows(), 1):
        print(f"\n{'='*80}")
        print(f"Processing sentence {idx}/{total}")
        print(f"{'='*80}")

        # NOTE(review): the machine transcription is treated as ground truth and
        # the human transcription is what gets classified, matching the
        # load_data_from_xlsx docstring - confirm this is not inverted.
        ground_truth = row.get(machine_col, "")
        hypothesis = row.get(human_col, "")

        print(f"\nGround Truth: {ground_truth}")
        print(f"Hypothesis: {hypothesis}")

        # Classify (no prompt needed for DistilBERT)
        try:
            raw_output, sentiment_label, reason = classify_with_distilbert(hypothesis)

            print(f"\nModel Output: {raw_output}")
            print(f"Parsed Sentiment: {sentiment_label}")
            print(f"Reason: {reason}")

        except Exception as e:
            # Deliberate best-effort: record the failure and keep going.
            print(f"\nERROR: {e}")
            sentiment_label = "ERROR"
            reason = str(e)

        # Store results
        results.append({
            'ground_truth': ground_truth,
            'hypothesis': hypothesis,
            'sentiment_label': sentiment_label,
            'reason': reason
        })

    # Create results dataframe
    results_df = pd.DataFrame(results)

    # Save to CSV
    today = datetime.now().strftime("%Y%m%d")
    output_filename = f"distilbert_base_uncased_all_samples_{today}.csv"

    # Create output directory if it doesn't exist
    output_dir = r"data/model_outputs/distilbert_base_uncased"
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, output_filename)
    results_df.to_csv(output_path, index=False)
    print(f"\n{'='*80}")
    print(f"Results saved to: {output_path}")
    print(f"Total samples processed: {len(results_df)}")
    print(f"{'='*80}")

    return results_df


In [None]:
def process_sentiment_analysis(xlsx_path: str, few_shot: bool = False, random_seed: int = 42):
    """
    Classify the test half of the workbook's second sheet, one sentence at a time.

    The sheet is shuffled and split with ``split_data_into_sets``; only the
    test part is classified and the few-shot part is discarded. A row whose
    classification raises is kept in the results with the label "ERROR" and
    the exception text as its reason. The results are written as CSV under
    ``data/model_outputs/distilbert_base_uncased`` with a date-stamped name.

    The transcription columns are located by name (a header containing
    "machine"/"human" together with "transcription", case-insensitive); when a
    name is not found the first/second column is used instead.

    Note: The few_shot parameter is kept for API consistency with other model notebooks,
    but DistilBERT doesn't actually use few-shot examples - it's a pre-trained classifier.

    Args:
        xlsx_path: Path to the XLSX file
        few_shot: Ignored for DistilBERT (kept for API consistency)
        random_seed: Random seed for data splitting

    Returns:
        DataFrame with one row per test sample and the columns ground_truth,
        hypothesis, sentiment_label and reason
    """
    # Load data
    df = load_data_from_xlsx(xlsx_path, sheet_index=1)

    # Split data (for consistency, even though DistilBERT doesn't use few-shot)
    test_df, _ = split_data_into_sets(df, test_ratio=0.5, random_seed=random_seed)
    total = len(test_df)

    logging.info(f"Processing {total} samples with DistilBERT (split mode)")

    # Locate the transcription columns once: every row shares the dataframe's
    # columns, so repeating the search per row is wasted work. The search is
    # skipped for an empty test set, which previously never reached it either.
    machine_col = None
    human_col = None
    if total:
        for col in test_df.columns:
            # str() keeps non-string headers (numbers, dates) from raising.
            col_name = str(col).lower()
            if 'machine' in col_name and 'transcription' in col_name:
                machine_col = col
            elif 'human' in col_name and 'transcription' in col_name:
                human_col = col

        if machine_col is None or human_col is None:
            # Fallback to first two columns
            cols = list(test_df.columns)
            machine_col = cols[0] if machine_col is None else machine_col
            human_col = cols[1] if human_col is None else human_col

    # Prepare results list
    results = []

    # Process each test sentence
    for idx, (_, row) in enumerate(test_df.iterrows(), 1):
        print(f"\n{'='*80}")
        print(f"Processing sentence {idx}/{total}")
        print(f"{'='*80}")

        # NOTE(review): the machine transcription is treated as ground truth and
        # the human transcription is what gets classified, matching the
        # load_data_from_xlsx docstring - confirm this is not inverted.
        ground_truth = row.get(machine_col, "")
        hypothesis = row.get(human_col, "")

        print(f"\nGround Truth: {ground_truth}")
        print(f"Hypothesis: {hypothesis}")

        # Classify
        try:
            raw_output, sentiment_label, reason = classify_with_distilbert(hypothesis)

            print(f"\nModel Output: {raw_output}")
            print(f"Parsed Sentiment: {sentiment_label}")
            print(f"Reason: {reason}")

        except Exception as e:
            # Deliberate best-effort: record the failure and keep going.
            print(f"\nERROR: {e}")
            sentiment_label = "ERROR"
            reason = str(e)

        # Store results
        results.append({
            'ground_truth': ground_truth,
            'hypothesis': hypothesis,
            'sentiment_label': sentiment_label,
            'reason': reason
        })

    # Create results dataframe
    results_df = pd.DataFrame(results)

    # Save to CSV
    today = datetime.now().strftime("%Y%m%d")
    output_filename = f"distilbert_base_uncased_split_{today}.csv"

    # Create output directory if it doesn't exist
    output_dir = r"data/model_outputs/distilbert_base_uncased"
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, output_filename)
    results_df.to_csv(output_path, index=False)
    print(f"\n{'='*80}")
    print(f"Results saved to: {output_path}")
    print(f"{'='*80}")

    return results_df


### Batch Processing (Faster)

DistilBERT supports efficient batch processing, which is much faster than processing one sentence at a time.


In [None]:
def process_all_samples_batch(xlsx_path: str, batch_size: int = 32):
    """
    Classify every row of the workbook's second sheet using batched inference.

    The human-transcription cells are sent to the module-level
    ``sentiment_pipeline`` in chunks of ``batch_size``, and the same value is
    passed as the pipeline's own ``batch_size`` so each chunk is run through
    the model as a batch. Tokenizer truncation is enabled for over-length text.

    Cells that are not strings (for example blank cells, which pandas loads as
    NaN) are not sent to the model; they are reported with the label "ERROR",
    in line with the per-sentence processing functions, instead of aborting
    the whole run.

    The results are written as CSV under
    ``data/model_outputs/distilbert_base_uncased`` with a date-stamped name.

    Args:
        xlsx_path: Path to the XLSX file
        batch_size: Number of sentences to process at once (must be >= 1)

    Returns:
        DataFrame with one row per sample and the columns ground_truth,
        hypothesis, sentiment_label and reason

    Raises:
        ValueError: If ``batch_size`` is smaller than 1
    """
    # A non-positive step would either raise deep inside range() or silently
    # process nothing, so reject it up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Load data
    df = load_data_from_xlsx(xlsx_path, sheet_index=1)

    logging.info(f"Processing all {len(df)} samples with DistilBERT (batch mode, batch_size={batch_size})")

    # Locate the transcription columns from the header, which also works when
    # the sheet has no rows.
    machine_col = None
    human_col = None

    for col in df.columns:
        # str() keeps non-string headers (numbers, dates) from raising.
        col_name = str(col).lower()
        if 'machine' in col_name and 'transcription' in col_name:
            machine_col = col
        elif 'human' in col_name and 'transcription' in col_name:
            human_col = col

    if machine_col is None or human_col is None:
        # Fallback to first two columns
        cols = list(df.columns)
        machine_col = cols[0] if machine_col is None else machine_col
        human_col = cols[1] if human_col is None else human_col

    # Extract sentences
    # NOTE(review): the machine transcription is treated as ground truth and
    # the human transcription is what gets classified, matching the
    # load_data_from_xlsx docstring - confirm this is not inverted.
    ground_truths = df[machine_col].tolist()
    hypotheses = df[human_col].tolist()

    # Only real strings can be tokenized; remember where each one came from so
    # predictions can be matched back to their rows.
    valid_positions = [pos for pos, hyp in enumerate(hypotheses) if isinstance(hyp, str)]
    valid_texts = [hypotheses[pos] for pos in valid_positions]
    skipped = len(hypotheses) - len(valid_texts)
    if skipped:
        logging.warning(f"Skipping {skipped} non-text hypotheses (reported as ERROR)")

    # Process in batches
    print(f"Processing {len(hypotheses)} sentences in batches of {batch_size}...")

    predictions = {}
    for start in range(0, len(valid_texts), batch_size):
        stop = start + batch_size
        batch = valid_texts[start:stop]
        # Without an explicit batch_size the pipeline runs list inputs one at a
        # time; passing it makes the model see the whole chunk at once.
        batch_results = sentiment_pipeline(batch, batch_size=batch_size, truncation=True)
        predictions.update(zip(valid_positions[start:stop], batch_results))
        print(f"  Processed {min(stop, len(valid_texts))}/{len(valid_texts)} sentences")

    # Build results dataframe
    results = []
    for pos, (gt, hyp) in enumerate(zip(ground_truths, hypotheses)):
        res = predictions.get(pos)

        if res is None:
            sentiment_label = "ERROR"
            reason = f"Hypothesis is not text (got {type(hyp).__name__}); classification skipped"
        else:
            sentiment_label = map_distilbert_to_sentiment(res['label'], res['score'])

            if sentiment_label == "Neutral":
                reason = f"Low confidence ({res['score']:.2%}) suggests ambiguous sentiment"
            else:
                reason = f"Classified as {res['label']} with {res['score']:.2%} confidence"

        results.append({
            'ground_truth': gt,
            'hypothesis': hyp,
            'sentiment_label': sentiment_label,
            'reason': reason
        })

    results_df = pd.DataFrame(results)

    # Save to CSV
    today = datetime.now().strftime("%Y%m%d")
    output_filename = f"distilbert_base_uncased_batch_all_samples_{today}.csv"

    output_dir = r"data/model_outputs/distilbert_base_uncased"
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, output_filename)
    results_df.to_csv(output_path, index=False)

    print(f"\n{'='*80}")
    print(f"Results saved to: {output_path}")
    print(f"Total samples processed: {len(results_df)}")
    print(f"{'='*80}")

    return results_df


### Usage


In [None]:
# Path to the XLSX workbook to analyse; the processing functions below read
# its second sheet (index 1).
# NOTE: this is a machine-specific absolute Windows path - change it before
# running the notebook anywhere else.
xlsx_file_path = r"C:\Users\pryce\OneDrive\Desktop\Lost in Transcription\Text Inputs\Samples.xlsx"  # Update this path

# Seed passed to the split-mode run so that its shuffle is reproducible
RANDOM_SEED = 33


###### Option 1: Process with Split (50% of data)


In [None]:
# Split mode: shuffle the sheet with RANDOM_SEED and classify the test half only
split_results = process_sentiment_analysis(xlsx_path=xlsx_file_path, random_seed=RANDOM_SEED)


###### Option 2: Process All Samples (One by One)


In [None]:
# Classify every sample individually, printing the details of each one
all_results = process_all_samples(xlsx_path=xlsx_file_path)


###### Option 3: Batch Processing (Recommended - Fastest)


In [None]:
# Classify every sample in batches of 32 sentences (the fastest of the three options)
batch_results = process_all_samples_batch(xlsx_path=xlsx_file_path, batch_size=32)
