# Sentiment Classification tasks with Vicuna-13B-v1.3 (HF, transformers) over WER-binned sample sentences

Vicuna is a chat assistant trained by fine-tuning LLaMA on user-shared conversations collected from ShareGPT.
- Model: [lmsys/vicuna-13b-v1.3](https://huggingface.co/lmsys/vicuna-13b-v1.3)
- License: Non-commercial license
- Base model: LLaMA


### Setup


In [None]:
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from typing import List, Dict
import logging
import random
from datetime import datetime
import re
logging.basicConfig(level=logging.INFO)


###### Memory Management Setup


In [None]:
# Set environment variable to help with memory fragmentation.
# NOTE(review): `torch` was already imported by the setup cell; presumably the CUDA
# allocator only reads this setting on first CUDA use — confirm it takes effect here.
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Clear any existing CUDA cache (only attempted when a CUDA device is available)
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("CUDA cache cleared")


###### Model Configuration


In [None]:
# Hugging Face model identifier used for both the tokenizer and the model weights
model_id = "lmsys/vicuna-13b-v1.3"
# NOTE(review): hf_token is read from the environment but is not passed to any call in this cell.
hf_token = os.environ.get("HF_TOKEN")

# `device` is only used for the informational print below; the pipeline gets its own device argument
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using:", device)

# NOTE(review): trust_remote_code=True allows code from the model repository to run locally;
# confirm it is actually required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False, trust_remote_code=True)

# Set pad_token for Vicuna models (they don't have one by default)
# Reusing EOS as the padding token lets the generation calls pass a valid pad_token_id
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Half precision when CUDA is available, full precision otherwise
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # device_map="auto" is left commented out, so no device map is requested at load time
    #device_map = "auto",
    trust_remote_code=True,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    # TODO: consider quantization settings based on hardware
)

# Initialize the generation pipeline
# device_map is not used (it is commented out above), so the pipeline is told explicitly
# where to run: device 0 when CUDA is available, otherwise -1 (CPU)
gen_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device=0 if torch.cuda.is_available() else -1
)


###### Test Pipeline (Optional - Run this to verify setup)


In [None]:
# Quick test to verify the pipeline works
# This should complete in a few seconds, not minutes
# The prompt here is a bare USER/ASSISTANT exchange without the system preamble
# that the classification prompt templates use.
print("Testing Vicuna pipeline with a simple prompt...")
test_prompt = "USER: Hello, how are you?\nASSISTANT:"
test_output = gen_pipeline(
    test_prompt,
    max_new_tokens=20,  # keep the smoke test short
    do_sample=False,  # no sampling, so the output is repeatable
    return_full_text=False,  # only the continuation, not the prompt, is returned
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)
# The pipeline returns a list of dicts; the generated text of the first entry is shown
print(f"Test successful! Output: {test_output[0]['generated_text']}")
print("\nIf this test hangs or takes more than 30 seconds, there's a configuration issue.")


###### Prompt Structure

Vicuna uses a conversation format with USER/ASSISTANT turns.


In [None]:
# Zero-shot prompt using Vicuna's conversation format
# The only placeholder is {sentence}; it is filled with str.format() by the callers.
# The requested answer shape "[Sentiment: <label>, Reason: <explanation>]" is what
# parse_sentiment_output() looks for first.
# NOTE(review): the text says "three parts" but enumerates four items; left unchanged
# because altering the prompt would change model behaviour.
ZERO_SHOT_PROMPT = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.

USER: You are an assistant that classifies the sentiment of user utterances. You must respond with three parts:
1) A single label: `Positive`, `Negative`, or `Neutral`
2) A short explanation (1–2 sentences) of why you chose that label
3) Format your response as follows: [Sentiment: <label>, Reason: <explanation>]
4) (Optionally) any caveats or uncertainty if applicable

Please classify the sentiment of this utterance: "{sentence}"
ASSISTANT:"""


In [None]:
# Zero-shot prompt for continuous sentiment (integer scale)
# Same layout as ZERO_SHOT_PROMPT, but asks for an integer from -10 to 10 instead of a label.
# The only placeholder is {sentence}.
# NOTE(review): no function visible in this part of the notebook references this constant.
# NOTE(review): the text says "three parts" but enumerates four items; left unchanged
# because altering the prompt would change model behaviour.
ZERO_SHOT_PROMPT_CONTINUOUS = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.

USER: You are an assistant that classifies the sentiment of user utterances. You must respond with three parts:
1) An integer value for sentiment between -10 and 10, with -10 being the most negative and 10 being the most positive, and a score of 0 being neutral
2) A short explanation (1–2 sentences) of why you chose that label
3) Format your response as follows: [Sentiment: <label>, Reason: <explanation>]
4) (Optionally) any caveats or uncertainty if applicable

Please classify the sentiment of this utterance: "{sentence}"
ASSISTANT:"""


In [None]:
# Format for few-shot examples - will be populated dynamically
# Wraps the already-formatted example text in "### EXAMPLES ###" markers.
# The only placeholder is {examples}; the result is then inserted into FEW_SHOT_PROMPT.
# The template starts and ends with a newline, so the wrapped block is set off by blank lines.
FEW_SHOT_EXAMPLES_TEMPLATE = """
### EXAMPLES ###
{examples}
### END EXAMPLES ###
"""


In [None]:
# Few-shot prompt template using Vicuna's conversation format
# Placeholders: {examples} (the wrapped example block) and {sentence} (the utterance to classify).
# Both are filled with str.format() by the callers.
# NOTE(review): the text says "three parts" but enumerates four items; left unchanged
# because altering the prompt would change model behaviour.
FEW_SHOT_PROMPT = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.

USER: You are an assistant that classifies the sentiment of user utterances. You must respond with three parts:
1) A single label: `Positive`, `Negative`, or `Neutral`
2) A short explanation (1–2 sentences) of why you chose that label
3) Format your response as follows: [Sentiment: <label>, Reason: <explanation>]
4) (Optionally) any caveats or uncertainty if applicable

Here are some examples of how to classify sentiment:
{examples}

Now, please classify the sentiment of this utterance: "{sentence}"
ASSISTANT:"""


In [None]:
# Few-shot prompt for continuous sentiment (integer scale)
# Same layout as FEW_SHOT_PROMPT, but asks for an integer from -10 to 10 instead of a label.
# Placeholders: {examples} and {sentence}.
# NOTE(review): no function visible in this part of the notebook references this constant.
# NOTE(review): the text says "three parts" but enumerates four items; left unchanged
# because altering the prompt would change model behaviour.
FEW_SHOT_PROMPT_CONTINUOUS = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.

USER: You are an assistant that classifies the sentiment of user utterances. You must respond with three parts:
1) An integer value for sentiment between -10 and 10, with -10 being the most negative and 10 being the most positive, and a score of 0 being neutral
2) A short explanation (1–2 sentences) of why you chose that label
3) Format your response as follows: [Sentiment: <label>, Reason: <explanation>]
4) (Optionally) any caveats or uncertainty if applicable

Here are some examples of how to classify sentiment:
{examples}

Now, please classify the sentiment of this utterance: "{sentence}"
ASSISTANT:"""


### Data Loading and Processing


###### Load Data from XLSX


In [None]:
def load_data_from_xlsx(xlsx_path: str, sheet_index: int = 1) -> pd.DataFrame:
    """
    Load one worksheet of an XLSX workbook into a DataFrame.

    Args:
        xlsx_path: Path to the XLSX file.
        sheet_index: Zero-based position of the sheet to read. The default
            (1) selects the second sheet.

    Returns:
        DataFrame holding the rows of the selected sheet.

    Raises:
        ValueError: If the workbook has no sheet at ``sheet_index``.

    Expected columns:
    - Machine Transcription (ground truth)
    - Human Transcription (hypothesis)
    - Sentiment Label
    - WER
    - CER
    - List of AAE features
    - List of transcription errors
    - Interview
    """
    # Open the workbook once and let the context manager close the file handle,
    # instead of opening it for the sheet names and again for the data.
    with pd.ExcelFile(xlsx_path) as xl_file:
        sheet_names = xl_file.sheet_names
        if len(sheet_names) <= sheet_index:
            raise ValueError(f"Sheet index {sheet_index} not available. Available sheets: {sheet_names}")

        df = xl_file.parse(sheet_name=sheet_index)

    logging.info(f"Loaded {len(df)} rows from sheet '{sheet_names[sheet_index]}'")
    return df


In [None]:
def split_data_into_sets(df: pd.DataFrame, test_ratio: float = 0.5, random_seed: int = None) -> tuple:
    """
    Shuffle a dataframe and split it into test and few-shot example sets.

    Args:
        df: Rows to split.
        test_ratio: Fraction of rows (0 to 1) placed in the test set. The row
            count is rounded down, so the default 0.5 gives an even split with
            any odd row going to the few-shot set.
        random_seed: Seed for the shuffle. When given, it also seeds Python's
            ``random`` module.

    Returns:
        (test_df, few_shot_df), both with a fresh 0-based index.

    Raises:
        ValueError: If ``test_ratio`` is outside the range 0 to 1.
    """
    if not 0.0 <= test_ratio <= 1.0:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio}")

    if random_seed is not None:
        random.seed(random_seed)

    # Shuffle the dataframe
    df_shuffled = df.sample(frac=1, random_state=random_seed).reset_index(drop=True)

    # Honour test_ratio instead of always cutting the data in half
    split_idx = int(len(df_shuffled) * test_ratio)
    test_df = df_shuffled.iloc[:split_idx].copy()
    few_shot_df = df_shuffled.iloc[split_idx:].copy()

    logging.info(f"Split data: {len(test_df)} test samples, {len(few_shot_df)} few-shot examples")
    return test_df, few_shot_df


### Classify with Model


###### Classify with Vicuna


In [None]:
def format_few_shot_examples(few_shot_df: pd.DataFrame, num_examples: int = 5, random_state=None) -> str:
    """
    Format randomly selected rows of few_shot_df as numbered prompt examples.

    The utterance text comes from the Machine Transcription column and the
    label from the sentiment column. Columns are matched case-insensitively by
    name; if no machine-transcription column is found and there are at least
    two columns, the first column is used. Missing values are shown as "N/A".

    Args:
        few_shot_df: Pool of rows to draw examples from.
        num_examples: Number of examples to draw; capped at the pool size.
        random_state: Passed to ``DataFrame.sample``. The default (None) keeps
            the selection unseeded; pass an int for a repeatable selection.

    Returns:
        The examples joined with newlines, one numbered entry per example.
    """
    # Never ask for more rows than the pool contains
    num_examples = min(num_examples, len(few_shot_df))

    # Column labels are the same for every row, so resolve them once up front.
    columns = list(few_shot_df.columns)
    machine_col = None
    sentiment_col = None
    for col in columns:
        # str() keeps non-string labels (e.g. integer headers) from crashing the match
        name = str(col).lower()
        if 'machine' in name and 'transcription' in name:
            machine_col = col
        elif 'human' in name and 'transcription' in name:
            # Human transcriptions are not used for examples; this branch only
            # stops such a column from being picked up as the sentiment column.
            continue
        elif 'sentiment' in name:
            sentiment_col = col

    # Positional fallback: treat the first column as the machine transcription
    if machine_col is None and len(columns) >= 2:
        machine_col = columns[0]

    # Randomly select examples
    selected = few_shot_df.sample(n=num_examples, random_state=random_state)

    examples_text = []
    for idx, (_, row) in enumerate(selected.iterrows(), 1):
        machine_text = row.get(machine_col, "N/A") if machine_col is not None else "N/A"
        sentiment = row.get(sentiment_col, "N/A") if sentiment_col is not None else "N/A"

        # Format as example for the prompt
        example = f"{idx}. Utterance: \"{machine_text}\"\n   Classification: [Sentiment: {sentiment}, Reason: This is an example sentence.]"
        examples_text.append(example)

    return "\n".join(examples_text)


In [None]:
def parse_sentiment_output(output_text: str) -> tuple:
    """
    Parse sentiment label and reason from model output.

    The bracketed form ``[Sentiment: <label>, Reason: <explanation>]`` is tried
    first (case-insensitive). If it is absent, the first occurrence of
    "positive", "negative" or "neutral" in the text decides the label and
    whatever follows that word is used as the reason.

    Args:
        output_text: Raw text generated by the model.

    Returns:
        (sentiment_label, reason). When nothing can be parsed the result is
        (None, output_text), so the raw text is still available to the caller.
    """
    # Non-text input cannot be searched; report it as unparsed
    if not isinstance(output_text, str):
        return None, output_text

    # Try to extract from [Sentiment: <label>, Reason: <explanation>] format
    pattern = r'\[Sentiment:\s*([^,]+),\s*Reason:\s*([^\]]+)\]'
    match = re.search(pattern, output_text, re.IGNORECASE)

    if match:
        sentiment = match.group(1).strip()
        reason = match.group(2).strip()
        return sentiment, reason

    # Fallback: take the keyword that appears FIRST in the text, so that
    # "Negative. Not positive at all." is not labelled Positive merely because
    # "positive" is checked first. Searching the original string (rather than a
    # lowercased copy) keeps the match offsets valid for slicing out the reason.
    keyword_match = re.search(r'positive|negative|neutral', output_text, re.IGNORECASE | re.ASCII)
    if keyword_match:
        label = keyword_match.group(0).capitalize()
        reason = output_text[keyword_match.end():].strip()
        if not reason:
            reason = "No reason provided"
        return label, reason

    return None, output_text


In [None]:
def classify_with_vicuna(sentence: str, few_shot_examples: str = None, few_shot: bool = False) -> tuple:
    """
    Classify sentiment for a single sentence using Vicuna.

    Args:
        sentence: Utterance to classify; inserted into the prompt template.
        few_shot_examples: Pre-formatted example text. Only used when
            ``few_shot`` is True and the text is non-empty.
        few_shot: Selects the few-shot prompt instead of the zero-shot one.

    Returns:
        (raw_output, sentiment_label, reason), where raw_output is the stripped
        model continuation and the other two come from parse_sentiment_output.
    """
    if few_shot and few_shot_examples:
        examples_section = FEW_SHOT_EXAMPLES_TEMPLATE.format(examples=few_shot_examples)
        prompt = FEW_SHOT_PROMPT.format(examples=examples_section, sentence=sentence)
    else:
        # Also reached when few_shot is True but no examples were supplied
        prompt = ZERO_SHOT_PROMPT.format(sentence=sentence)

    # Clear CUDA cache before inference to prevent memory issues
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Use torch.no_grad() for inference to save memory
    with torch.no_grad():
        # Greedy decoding (do_sample=False). No temperature is passed: it only
        # applies to sampling, and supplying 0.0 here just triggers a
        # generation-config warning without affecting the output.
        outputs = gen_pipeline(
            prompt,
            max_new_tokens=100,
            do_sample=False,
            return_full_text=False,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Clear cache after inference to free memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # outputs is a list of dicts, we take the first
    raw_output = outputs[0]["generated_text"].strip()
    sentiment_label, reason = parse_sentiment_output(raw_output)

    return raw_output, sentiment_label, reason


### Main Workflow


###### Zero-shot (All Samples - No Splitting)


In [None]:
def process_all_samples_zero_shot(xlsx_path: str):
    """
    Process ALL samples for zero-shot sentiment analysis without splitting.
    This uses every row in the dataset as a test sample.

    For each row the Human Transcription text ("hypothesis") is classified and
    the Machine Transcription text ("ground truth") is recorded alongside it.
    Columns are matched case-insensitively by name; when a match is missing the
    first column is used as machine transcription and the second as human
    transcription. A failure while classifying one row is recorded as an
    "ERROR" result and processing continues with the next row.

    The results are written to
    data/model_outputs/vicuna_13b_v1.3/vicuna_13b_v1.3_zero_shot_all_samples_<YYYYMMDD>.csv,
    so a second run on the same day overwrites the first.

    Args:
        xlsx_path: Path to the XLSX file

    Returns:
        DataFrame with results for all samples

    Raises:
        ValueError: If there are rows to process but the transcription columns
            can neither be matched by name nor taken by position.
    """
    # Load data
    df = load_data_from_xlsx(xlsx_path, sheet_index=1)
    total = len(df)

    logging.info(f"Processing all {total} samples for zero-shot analysis (no splitting)")

    # Column labels are identical for every row, so resolve them once here
    # rather than repeating the search inside the loop.
    columns = list(df.columns)
    machine_col = None
    human_col = None
    for col in columns:
        # str() keeps non-string labels (e.g. integer headers) from crashing the match
        name = str(col).lower()
        if 'machine' in name and 'transcription' in name:
            machine_col = col
        elif 'human' in name and 'transcription' in name:
            human_col = col

    # Fallback to first two columns (only needed when there are rows to read)
    if total > 0 and (machine_col is None or human_col is None):
        # A missing human column needs columns[1]; a missing machine column only columns[0]
        needed = 2 if human_col is None else 1
        if len(columns) < needed:
            raise ValueError(f"Could not identify transcription columns. Available columns: {columns}")
        machine_col = columns[0] if machine_col is None else machine_col
        human_col = columns[1] if human_col is None else human_col

    separator = '=' * 80

    # Prepare results list
    results = []

    # Process each sample
    for idx, (_, row) in enumerate(df.iterrows(), 1):
        print(f"\n{separator}")
        print(f"Processing sentence {idx}/{total}")
        print(f"{separator}")

        ground_truth = row.get(machine_col, "")
        hypothesis = row.get(human_col, "")

        print(f"\nGround Truth: {ground_truth}")
        print(f"Hypothesis: {hypothesis}")

        # Build the zero-shot prompt (for display only; classify_with_vicuna builds its own copy)
        full_prompt = ZERO_SHOT_PROMPT.format(sentence=hypothesis)

        print(f"\nFull Input Prompt:\n{full_prompt}")

        # Classify
        try:
            raw_output, sentiment_label, reason = classify_with_vicuna(
                hypothesis,
                few_shot_examples=None,
                few_shot=False
            )

            print(f"\nModel Output:\n{raw_output}")
            print(f"\nParsed Sentiment: {sentiment_label}")
            print(f"Parsed Reason: {reason}")

        except Exception as e:
            # Best effort: one failing row must not abort the whole run
            print(f"\nERROR: {e}")
            raw_output = f"ERROR: {e}"
            sentiment_label = None
            reason = None

        # Store results; an unparsed label becomes "ERROR" and a missing reason
        # falls back to the raw model output (or the error text)
        results.append({
            'ground_truth': ground_truth,
            'hypothesis': hypothesis,
            'sentiment_label': sentiment_label if sentiment_label else "ERROR",
            'reason': reason if reason else raw_output
        })

    # Create results dataframe
    results_df = pd.DataFrame(results)

    # Save to CSV
    today = datetime.now().strftime("%Y%m%d")
    output_filename = f"vicuna_13b_v1.3_zero_shot_all_samples_{today}.csv"

    # Create output directory if it doesn't exist
    output_dir = r"data/model_outputs/vicuna_13b_v1.3"
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, output_filename)
    results_df.to_csv(output_path, index=False)
    print(f"\n{separator}")
    print(f"Results saved to: {output_path}")
    print(f"Total samples processed: {len(results_df)}")
    print(f"{separator}")

    return results_df


In [None]:
def process_sentiment_analysis(xlsx_path: str, few_shot: bool = False, random_seed: int = 42):
    """
    Main workflow:
    1. Load data from second sheet of XLSX
    2. Split into test and few-shot sets (50/50)
    3. Classify the Human Transcription text of each test row; in few-shot
       mode each sentence gets its own 5 random examples from the few-shot set
    4. Save results to CSV

    Columns are matched case-insensitively by name; when a match is missing the
    first column is used as machine transcription and the second as human
    transcription. A failure while classifying one row is recorded as an
    "ERROR" result and processing continues with the next row.

    The results are written to
    data/model_outputs/vicuna_13b_v1.3/vicuna_13b_v1.3_<few_shot|zero_shot>_<YYYYMMDD>.csv,
    so a second run of the same mode on the same day overwrites the first.

    Args:
        xlsx_path: Path to the XLSX file.
        few_shot: Use the few-shot prompt with sampled examples when True,
            otherwise the zero-shot prompt.
        random_seed: Seed for the split and, in few-shot mode, for the example
            selection. When not None and few_shot is True, NumPy's global
            random state is seeded as a side effect.

    Returns:
        DataFrame with one result row per test sentence.

    Raises:
        ValueError: If there are rows to process but the transcription columns
            can neither be matched by name nor taken by position.
    """
    # Load data
    df = load_data_from_xlsx(xlsx_path, sheet_index=1)

    # Split data
    test_df, few_shot_df = split_data_into_sets(df, test_ratio=0.5, random_seed=random_seed)
    total = len(test_df)

    if few_shot and random_seed is not None:
        # The examples are drawn with DataFrame.sample(random_state=None), which
        # uses NumPy's global random state; seeding Python's `random` module does
        # not affect it. Seed NumPy so the chosen examples are repeatable.
        import numpy as np
        np.random.seed(random_seed)

    # Column labels are identical for every row, so resolve them once here
    # rather than repeating the search inside the loop.
    columns = list(test_df.columns)
    machine_col = None
    human_col = None
    for col in columns:
        # str() keeps non-string labels (e.g. integer headers) from crashing the match
        name = str(col).lower()
        if 'machine' in name and 'transcription' in name:
            machine_col = col
        elif 'human' in name and 'transcription' in name:
            human_col = col

    # Fallback to first two columns (only needed when there are rows to read)
    if total > 0 and (machine_col is None or human_col is None):
        # A missing human column needs columns[1]; a missing machine column only columns[0]
        needed = 2 if human_col is None else 1
        if len(columns) < needed:
            raise ValueError(f"Could not identify transcription columns. Available columns: {columns}")
        machine_col = columns[0] if machine_col is None else machine_col
        human_col = columns[1] if human_col is None else human_col

    separator = '=' * 80

    # Prepare results list
    results = []

    # Process each test sentence
    for idx, (_, row) in enumerate(test_df.iterrows(), 1):
        print(f"\n{separator}")
        print(f"Processing sentence {idx}/{total}")
        print(f"{separator}")

        ground_truth = row.get(machine_col, "")
        hypothesis = row.get(human_col, "")

        print(f"\nGround Truth: {ground_truth}")
        print(f"Hypothesis: {hypothesis}")

        # Get few-shot examples if needed (a fresh random draw for every sentence)
        few_shot_examples = None
        if few_shot:
            few_shot_examples = format_few_shot_examples(few_shot_df, num_examples=5)
            print(f"\nFew-shot Examples:\n{few_shot_examples}")

        # Build the prompt to print it (classify_with_vicuna builds its own copy)
        if few_shot and few_shot_examples:
            examples_section = FEW_SHOT_EXAMPLES_TEMPLATE.format(examples=few_shot_examples)
            full_prompt = FEW_SHOT_PROMPT.format(examples=examples_section, sentence=hypothesis)
        else:
            full_prompt = ZERO_SHOT_PROMPT.format(sentence=hypothesis)

        print(f"\nFull Input Prompt:\n{full_prompt}")

        # Classify
        try:
            raw_output, sentiment_label, reason = classify_with_vicuna(
                hypothesis,
                few_shot_examples=few_shot_examples,
                few_shot=few_shot
            )

            print(f"\nModel Output:\n{raw_output}")
            print(f"\nParsed Sentiment: {sentiment_label}")
            print(f"Parsed Reason: {reason}")

        except Exception as e:
            # Best effort: one failing row must not abort the whole run
            print(f"\nERROR: {e}")
            raw_output = f"ERROR: {e}"
            sentiment_label = None
            reason = None

        # Store results; an unparsed label becomes "ERROR" and a missing reason
        # falls back to the raw model output (or the error text)
        results.append({
            'ground_truth': ground_truth,
            'hypothesis': hypothesis,
            'sentiment_label': sentiment_label if sentiment_label else "ERROR",
            'reason': reason if reason else raw_output
        })

    # Create results dataframe
    results_df = pd.DataFrame(results)

    # Save to CSV
    today = datetime.now().strftime("%Y%m%d")
    shot_type = "few_shot" if few_shot else "zero_shot"
    output_filename = f"vicuna_13b_v1.3_{shot_type}_{today}.csv"

    # Create output directory if it doesn't exist
    output_dir = r"data/model_outputs/vicuna_13b_v1.3"
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, output_filename)
    results_df.to_csv(output_path, index=False)
    print(f"\n{separator}")
    print(f"Results saved to: {output_path}")
    print(f"{separator}")

    return results_df


### Usage


In [None]:
# Specify the path to your XLSX file
# NOTE: this is an absolute, machine-specific Windows path; change it before running elsewhere.
xlsx_file_path = r"C:\Users\pryce\OneDrive\Desktop\Lost in Transcription\Text Inputs\Samples.xlsx"  # Update this path

# Set random seed for reproducibility
# Passed as random_seed to the split-based runs below.
RANDOM_SEED = 33


###### Zero-shot (Split)


In [None]:
# Run zero-shot sentiment analysis (50/50 split)
# Only the test half of the split is classified; the results are returned and also
# written to a dated CSV, so a same-day rerun of this mode overwrites the earlier file.
zero_shot_results = process_sentiment_analysis(
    xlsx_file_path, 
    few_shot=False,
    random_seed=RANDOM_SEED
)


###### Zero-shot (All Samples - No Split)


In [None]:
# Run zero-shot sentiment analysis on ALL samples (no splitting)
# Every row of the sheet is classified; the results are returned and also written to a dated CSV.
all_samples_zero_shot_results = process_all_samples_zero_shot(xlsx_file_path)


###### Few-shot


In [None]:
# Run few-shot sentiment analysis
# Uses the same seed as the zero-shot split run, so both runs classify the same test half;
# the other half supplies the few-shot examples.
few_shot_results = process_sentiment_analysis(
    xlsx_file_path, 
    few_shot=True,
    random_seed=RANDOM_SEED
)
