# Sentiment Classification tasks with RoBERTa Large MNLI (HF, transformers) over WER-binned sample sentences

RoBERTa Large MNLI is RoBERTa fine-tuned on the Multi-Genre Natural Language Inference (MNLI) corpus - ideal for zero-shot classification.

**Key Features:**
- Model: [FacebookAI/roberta-large-mnli](https://huggingface.co/FacebookAI/roberta-large-mnli)
- Parameters: ~355M (RoBERTa Large)
- Training: Fine-tuned on MNLI (433k sentence pairs)
- MNLI accuracy: 90.2% (MNLI is one of the GLUE benchmark tasks)
- License: MIT
- Approach: Zero-shot classification via Natural Language Inference

**How it works:** Uses NLI to determine if text "entails" a sentiment label, enabling classification without task-specific fine-tuning.


### Setup


In [None]:
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
from typing import List, Dict
import logging
import random
from datetime import datetime
import re
logging.basicConfig(level=logging.INFO)


###### Model Configuration

RoBERTa Large MNLI is specifically trained for Natural Language Inference.
It achieves 90.2% accuracy on the MNLI benchmark and is excellent for zero-shot classification.


In [None]:
# Hugging Face checkpoint: RoBERTa Large with an MNLI classification head
model_id = "FacebookAI/roberta-large-mnli"

# Ask torch about CUDA once and derive every device spelling from that answer
cuda_available = torch.cuda.is_available()
device = "cuda" if cuda_available else "cpu"
print("Using:", device)

# Report which GPU is in use and how much memory it has (GPU runs only)
if cuda_available:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    total_gpu_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"Total GPU Memory: {total_gpu_bytes / 1024**3:.2f} GB")

# Build the zero-shot classification pipeline; device 0 is the first GPU, -1 is CPU
pipeline_device = 0 if cuda_available else -1
zero_shot_pipeline = pipeline("zero-shot-classification", model=model_id, device=pipeline_device)

# Candidate labels handed to the pipeline on every classification call
SENTIMENT_LABELS = ["positive", "negative", "neutral"]

# Summary banner, one item per line
print(
    "✓ RoBERTa Large MNLI loaded successfully",
    f"  Model: {model_id}",
    "  Parameters: ~355M",
    "  MNLI Accuracy: 90.2%",
    f"  Candidate labels: {SENTIMENT_LABELS}",
    sep="\n",
)


###### Test Pipeline (Optional - Run this to verify setup)


In [None]:
# Smoke test: push three hand-written sentences through the loaded pipeline
print("Testing RoBERTa MNLI zero-shot classification pipeline...")

test_sentences = [
    "I love this product, it's amazing!",
    "This is terrible, I hate it.",
    "The weather is okay today.",
]

# The first label/score pair of each result is reported as the prediction
for sentence in test_sentences:
    outcome = zero_shot_pipeline(sentence, SENTIMENT_LABELS)
    best_label, best_score = outcome['labels'][0], outcome['scores'][0]
    preview = sentence[:40]
    print(f"  '{preview}...' -> {best_label} ({best_score:.4f})")

print("\n✓ Test successful! RoBERTa MNLI is working correctly.")


### Data Loading and Processing


###### Load Data from XLSX


In [None]:
def load_data_from_xlsx(xlsx_path: str, sheet_index: int = 1) -> pd.DataFrame:
    """
    Load one worksheet of an XLSX workbook into a DataFrame.

    The sheet is selected by position; the default (index 1) is the second
    sheet. The workbook is opened once, and the file handle is closed before
    returning, even if an error is raised.

    Expected columns (as listed by the notebook author; not validated here):
    - Machine Transcription (ground truth)
    - Human Transcription (hypothesis)
    - Sentiment Label
    - WER
    - CER
    - List of AAE features
    - List of transcription errors
    - Interview

    NOTE(review): in WER terminology the human transcript is usually the
    reference and the machine transcript the hypothesis; the mapping above
    is the one this notebook uses throughout - confirm it is intended.

    Args:
        xlsx_path: Path to the XLSX file
        sheet_index: Position of the sheet to read (negative values count
            from the last sheet)

    Returns:
        DataFrame with the contents of the selected sheet

    Raises:
        ValueError: If the workbook has no sheet at sheet_index
    """
    # The context manager guarantees the workbook handle is released.
    with pd.ExcelFile(xlsx_path) as workbook:
        sheet_names = workbook.sheet_names
        if not -len(sheet_names) <= sheet_index < len(sheet_names):
            raise ValueError(f"Sheet index {sheet_index} not available. Available sheets: {sheet_names}")

        # Parse from the already-open workbook instead of re-reading the file.
        df = workbook.parse(sheet_name=sheet_index)

    logging.info(f"Loaded {len(df)} rows from sheet '{sheet_names[sheet_index]}'")
    return df


In [None]:
def split_data_into_sets(df: pd.DataFrame, test_ratio: float = 0.5, random_seed: int = None) -> tuple:
    """
    Shuffle a dataframe and split it into test and few-shot example sets.

    The first int(len(df) * test_ratio) shuffled rows form the test set and
    the remaining rows form the few-shot set. With the default ratio of 0.5
    the halves are equal in size, or the few-shot set is one row larger when
    the row count is odd.

    Note: For zero-shot classification, few-shot examples are not used.
    This function is kept for API consistency with other model notebooks.

    Args:
        df: Rows to split; the input dataframe is not modified
        test_ratio: Fraction of rows assigned to the test set, from 0 to 1
        random_seed: Seed for the shuffle; None gives a different shuffle
            on every call

    Returns:
        (test_df, few_shot_df), both re-indexed copies of the shuffled rows

    Raises:
        ValueError: If test_ratio is outside the range 0 to 1
    """
    if not 0.0 <= test_ratio <= 1.0:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio}")

    if random_seed is not None:
        # Also seeds the stdlib RNG, as the original did; the shuffle below
        # only depends on random_state.
        random.seed(random_seed)

    # Shuffle the dataframe
    df_shuffled = df.sample(frac=1, random_state=random_seed).reset_index(drop=True)

    # Honour the requested ratio (previously the split was always in half)
    split_idx = int(len(df_shuffled) * test_ratio)
    test_df = df_shuffled.iloc[:split_idx].copy()
    few_shot_df = df_shuffled.iloc[split_idx:].copy()

    logging.info(f"Split data: {len(test_df)} test samples, {len(few_shot_df)} few-shot examples")
    return test_df, few_shot_df


### Classify with Model


###### Zero-Shot Classification with RoBERTa MNLI

**How NLI-based Zero-Shot Works:**
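1. For each candidate label, build a hypothesis sentence from a template. The pipeline calls in this notebook do not override the template, so the Hugging Face default "This example is [label]." is used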
2. Use NLI to score if the text *entails* the hypothesis
3. Return the label with highest entailment score

RoBERTa MNLI achieves 90.2% on MNLI, making it one of the best models for this task.


In [None]:
def classify_with_roberta(sentence: str) -> tuple:
    """
    Run the zero-shot pipeline on one sentence and summarise the outcome.

    The sentence is scored against SENTIMENT_LABELS by the module-level
    zero_shot_pipeline. The first label of the returned ranking is taken
    as the prediction.

    Returns:
        (raw_output, sentiment_label, reason)
        - raw_output: Bracketed one-line summary of the top label and all scores
        - sentiment_label: First ranked label, capitalised (e.g. "Positive")
        - reason: The per-label scores formatted as percentages
    """
    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        prediction = zero_shot_pipeline(sentence, SENTIMENT_LABELS)

    ranked_labels = prediction['labels']
    ranked_scores = prediction['scores']

    sentiment_label = ranked_labels[0].capitalize()
    confidence = ranked_scores[0]

    # "label: NN.NN%" for every candidate, in the order the pipeline returned them
    formatted_pairs = []
    for name, score in zip(ranked_labels, ranked_scores):
        formatted_pairs.append(f"{name}: {score:.2%}")
    score_details = ", ".join(formatted_pairs)

    reason = f"RoBERTa MNLI scores: {score_details}"

    # Same layout the other model notebooks use for their raw output
    raw_output = f"[Top: {sentiment_label} ({confidence:.4f}), All: {score_details}]"

    return raw_output, sentiment_label, reason


### Main Workflow


###### Process All Samples

RoBERTa MNLI performs zero-shot classification - no training data or few-shot examples needed.


In [None]:
def process_all_samples(xlsx_path: str):
    """
    Process ALL samples for sentiment analysis using RoBERTa MNLI.

    Loads sheet index 1 of the workbook, classifies the human transcription
    of every row with classify_with_roberta, prints progress to stdout and
    writes the collected results to a dated CSV under
    data/model_outputs/roberta_large_mnli.

    A row whose classification raises does not stop the run: it is recorded
    with sentiment_label "ERROR" and the exception text as the reason.

    Args:
        xlsx_path: Path to the XLSX file

    Returns:
        DataFrame with one row per sample and the columns ground_truth,
        hypothesis, sentiment_label and reason (no columns when the sheet
        has no rows)
    """
    # Load data
    df = load_data_from_xlsx(xlsx_path, sheet_index=1)
    total = len(df)

    logging.info(f"Processing all {total} samples with RoBERTa MNLI")

    # The column lookup depends only on the header, so resolve it once
    # instead of repeating it for every row. It is skipped for an empty
    # sheet so that case still produces an empty result without error.
    machine_col = None
    human_col = None
    if total:
        for col in df.columns:
            col_name = str(col).lower()  # str() tolerates non-string headers
            if 'machine' in col_name and 'transcription' in col_name:
                machine_col = col
            elif 'human' in col_name and 'transcription' in col_name:
                human_col = col

        if machine_col is None or human_col is None:
            # Fallback to first two columns
            cols = list(df.columns)
            machine_col = cols[0] if machine_col is None else machine_col
            human_col = cols[1] if human_col is None else human_col

    # Prepare results list
    results = []

    # Process each sample
    for idx, (_, row) in enumerate(df.iterrows(), 1):
        print(f"\n{'='*80}")
        print(f"Processing sentence {idx}/{total}")
        print(f"{'='*80}")

        ground_truth = row.get(machine_col, "")
        hypothesis = row.get(human_col, "")

        print(f"\nGround Truth: {ground_truth}")
        print(f"Hypothesis: {hypothesis}")

        # Classify the human transcription only; the machine transcription
        # is carried through to the output unchanged.
        try:
            raw_output, sentiment_label, reason = classify_with_roberta(hypothesis)

            print(f"\nModel Output: {raw_output}")
            print(f"Parsed Sentiment: {sentiment_label}")
            print(f"Reason: {reason}")

        except Exception as e:
            # Deliberate best-effort: one bad row must not abort the run
            print(f"\nERROR: {e}")
            sentiment_label = "ERROR"
            reason = str(e)

        # Store results
        results.append({
            'ground_truth': ground_truth,
            'hypothesis': hypothesis,
            'sentiment_label': sentiment_label,
            'reason': reason
        })

    # Create results dataframe
    results_df = pd.DataFrame(results)

    # Save to CSV; the name carries today's date, so a second run on the
    # same day overwrites the first
    today = datetime.now().strftime("%Y%m%d")
    output_filename = f"roberta_large_mnli_all_samples_{today}.csv"

    # Create output directory if it doesn't exist
    output_dir = r"data/model_outputs/roberta_large_mnli"
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, output_filename)
    results_df.to_csv(output_path, index=False)
    print(f"\n{'='*80}")
    print(f"Results saved to: {output_path}")
    print(f"Total samples processed: {len(results_df)}")
    print(f"{'='*80}")

    return results_df


In [None]:
def process_sentiment_analysis(xlsx_path: str, few_shot: bool = False, random_seed: int = 42):
    """
    Process sentiment analysis on the test half of a shuffled split.

    Loads sheet index 1 of the workbook, splits it with split_data_into_sets
    (test_ratio=0.5), classifies the human transcription of every test row
    with classify_with_roberta, prints progress to stdout and writes the
    results to a dated CSV under data/model_outputs/roberta_large_mnli.

    A row whose classification raises does not stop the run: it is recorded
    with sentiment_label "ERROR" and the exception text as the reason.

    Note: The few_shot parameter is kept for API consistency with other model notebooks,
    but zero-shot classification doesn't use few-shot examples.

    Args:
        xlsx_path: Path to the XLSX file
        few_shot: Ignored for zero-shot classification (kept for API consistency)
        random_seed: Random seed for data splitting

    Returns:
        DataFrame with one row per test sample and the columns ground_truth,
        hypothesis, sentiment_label and reason (no columns when the test set
        is empty)
    """
    # Load data
    df = load_data_from_xlsx(xlsx_path, sheet_index=1)

    # Split data (for consistency, even though zero-shot doesn't use few-shot)
    test_df, _ = split_data_into_sets(df, test_ratio=0.5, random_seed=random_seed)
    total = len(test_df)

    logging.info(f"Processing {total} samples with RoBERTa MNLI (split mode)")

    # The column lookup depends only on the header, so resolve it once
    # instead of repeating it for every row. It is skipped for an empty
    # test set so that case still produces an empty result without error.
    machine_col = None
    human_col = None
    if total:
        for col in test_df.columns:
            col_name = str(col).lower()  # str() tolerates non-string headers
            if 'machine' in col_name and 'transcription' in col_name:
                machine_col = col
            elif 'human' in col_name and 'transcription' in col_name:
                human_col = col

        if machine_col is None or human_col is None:
            # Fallback to first two columns
            cols = list(test_df.columns)
            machine_col = cols[0] if machine_col is None else machine_col
            human_col = cols[1] if human_col is None else human_col

    # Prepare results list
    results = []

    # Process each test sentence
    for idx, (_, row) in enumerate(test_df.iterrows(), 1):
        print(f"\n{'='*80}")
        print(f"Processing sentence {idx}/{total}")
        print(f"{'='*80}")

        ground_truth = row.get(machine_col, "")
        hypothesis = row.get(human_col, "")

        print(f"\nGround Truth: {ground_truth}")
        print(f"Hypothesis: {hypothesis}")

        # Classify the human transcription only; the machine transcription
        # is carried through to the output unchanged.
        try:
            raw_output, sentiment_label, reason = classify_with_roberta(hypothesis)

            print(f"\nModel Output: {raw_output}")
            print(f"Parsed Sentiment: {sentiment_label}")
            print(f"Reason: {reason}")

        except Exception as e:
            # Deliberate best-effort: one bad row must not abort the run
            print(f"\nERROR: {e}")
            sentiment_label = "ERROR"
            reason = str(e)

        # Store results
        results.append({
            'ground_truth': ground_truth,
            'hypothesis': hypothesis,
            'sentiment_label': sentiment_label,
            'reason': reason
        })

    # Create results dataframe
    results_df = pd.DataFrame(results)

    # Save to CSV; the name carries today's date, so a second run on the
    # same day overwrites the first
    today = datetime.now().strftime("%Y%m%d")
    output_filename = f"roberta_large_mnli_split_{today}.csv"

    # Create output directory if it doesn't exist
    output_dir = r"data/model_outputs/roberta_large_mnli"
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, output_filename)
    results_df.to_csv(output_path, index=False)
    print(f"\n{'='*80}")
    print(f"Results saved to: {output_path}")
    print(f"{'='*80}")

    return results_df


### Batch Processing (Faster)

RoBERTa MNLI supports batch processing for improved throughput.


In [None]:
def process_all_samples_batch(xlsx_path: str, batch_size: int = 8):
    """
    Process ALL samples, handing sentences to the pipeline in chunks.

    Loads sheet index 1 of the workbook, sends the human transcriptions to
    zero_shot_pipeline in lists of up to batch_size sentences, and writes
    the results to a dated CSV under data/model_outputs/roberta_large_mnli.

    Cells that are not text (for example empty cells read as NaN) are not
    sent to the model; they are recorded with sentiment_label "ERROR", as
    the one-by-one functions do, instead of aborting the whole run.

    Note: Zero-shot classification requires NLI inference for each label,
    so batch size should be smaller than direct classification models.

    Args:
        xlsx_path: Path to the XLSX file
        batch_size: Number of sentences to process at once (must be >= 1)

    Returns:
        DataFrame with one row per sample and the columns ground_truth,
        hypothesis, sentiment_label and reason (no columns when the sheet
        has no rows)

    Raises:
        ValueError: If batch_size is smaller than 1
    """
    # A zero step crashes range() and a negative one silently yields no
    # batches at all, so reject both up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Load data
    df = load_data_from_xlsx(xlsx_path, sheet_index=1)

    logging.info(f"Processing all {len(df)} samples with RoBERTa MNLI (batch mode, batch_size={batch_size})")

    ground_truths = []
    hypotheses = []
    if len(df):
        # Resolve the columns from the header rather than from the first
        # row, so no row has to be indexed.
        machine_col = None
        human_col = None

        for col in df.columns:
            col_name = str(col).lower()  # str() tolerates non-string headers
            if 'machine' in col_name and 'transcription' in col_name:
                machine_col = col
            elif 'human' in col_name and 'transcription' in col_name:
                human_col = col

        if machine_col is None or human_col is None:
            # Fallback to first two columns
            cols = list(df.columns)
            machine_col = cols[0] if machine_col is None else machine_col
            human_col = cols[1] if human_col is None else human_col

        # Extract sentences
        ground_truths = df[machine_col].tolist()
        hypotheses = df[human_col].tolist()

    # Only strings go to the model; remember each one's original position
    # so results can be put back in row order.
    classifiable = [(pos, text) for pos, text in enumerate(hypotheses) if isinstance(text, str)]
    skipped = len(hypotheses) - len(classifiable)

    # Process in batches
    print(f"Processing {len(hypotheses)} sentences in batches of {batch_size}...")
    if skipped:
        print(f"  Skipping {skipped} non-text hypotheses (recorded as ERROR)")

    # NOTE(review): each chunk is passed as a list, but the pipeline's own
    # batch_size argument is left at its default, so the model may still run
    # the pairs one at a time - confirm whether batch_size should also be
    # forwarded to the pipeline call.
    all_results = [None] * len(hypotheses)
    for start in range(0, len(classifiable), batch_size):
        chunk = classifiable[start:start + batch_size]
        # Zero-shot pipeline can handle lists
        batch_results = zero_shot_pipeline([text for _, text in chunk], SENTIMENT_LABELS)
        # Handle single item vs list return
        if not isinstance(batch_results, list):
            batch_results = [batch_results]
        for (pos, _), res in zip(chunk, batch_results):
            all_results[pos] = res
        print(f"  Processed {min(start + batch_size, len(classifiable))}/{len(classifiable)} sentences")

    # Build results dataframe
    results = []
    for gt, hyp, res in zip(ground_truths, hypotheses, all_results):
        if res is None:
            # Row was never classified because its hypothesis is not text
            results.append({
                'ground_truth': gt,
                'hypothesis': hyp,
                'sentiment_label': "ERROR",
                'reason': f"Hypothesis is not text: {hyp!r}"
            })
            continue

        labels = res['labels']
        scores = res['scores']
        top_label = labels[0].capitalize()

        score_details = ", ".join([f"{label}: {score:.2%}" for label, score in zip(labels, scores)])
        reason = f"RoBERTa MNLI scores: {score_details}"

        results.append({
            'ground_truth': gt,
            'hypothesis': hyp,
            'sentiment_label': top_label,
            'reason': reason
        })

    results_df = pd.DataFrame(results)

    # Save to CSV; the name carries today's date, so a second run on the
    # same day overwrites the first
    today = datetime.now().strftime("%Y%m%d")
    output_filename = f"roberta_large_mnli_batch_all_samples_{today}.csv"

    output_dir = r"data/model_outputs/roberta_large_mnli"
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, output_filename)
    results_df.to_csv(output_path, index=False)

    print(f"\n{'='*80}")
    print(f"Results saved to: {output_path}")
    print(f"Total samples processed: {len(results_df)}")
    print(f"{'='*80}")

    return results_df


### Usage


In [None]:
# Seed passed to the split-mode run so its shuffle is reproducible
RANDOM_SEED = 33

# Location of the input workbook on this machine
xlsx_file_path = r"C:\Users\pryce\OneDrive\Desktop\Lost in Transcription\Text Inputs\Samples.xlsx"  # Update this path


###### Option 1: Process with Split (50% of data)


In [None]:
# Split mode: classify only the test half of the seeded shuffle
split_results = process_sentiment_analysis(xlsx_path=xlsx_file_path, random_seed=RANDOM_SEED)


###### Option 2: Process All Samples (One by One)


In [None]:
# Full run: classify every row one at a time, printing each result
all_results = process_all_samples(xlsx_path=xlsx_file_path)


###### Option 3: Batch Processing (Faster)


In [None]:
# Full run in batch mode: every row, sent to the pipeline in chunks
# (8 sentences per chunk is the recommended size for zero-shot classification)
batch_results = process_all_samples_batch(xlsx_path=xlsx_file_path, batch_size=8)
