# Sentiment Classification tasks with Llama-2-13B (HF, transformers) over WER-binned sample sentences

### Setup

In [1]:
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from typing import List, Dict
import logging
import random
from datetime import datetime
import re
logging.basicConfig(level = logging.INFO)

  import pynvml  # type: ignore[import]


###### Memory Management Setup


In [2]:
# Allocator configuration, intended to help with memory fragmentation.
import os
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"})

# Drop whatever the CUDA cache currently holds (only when a GPU is present).
if torch.cuda.is_available():
    torch.cuda.empty_cache(); print("CUDA cache cleared")


CUDA cache cleared


###### Model Configuration

In [3]:
# First loading attempt: the checkpoint is loaded in float16 (float32 without
# CUDA) and the pipeline is asked to place it on GPU 0. The recorded output of
# this cell ends in a CUDA out-of-memory error.
model_id = "meta-llama/Llama-2-13b-hf"
# Read from the environment; note that it is not passed to any call in this cell.
hf_token = os.environ.get("HF_TOKEN")

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using:", device)

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast = False, trust_remote_code = True)

# Set pad_token for Llama models (they don't have one by default)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    #device_map = "auto",
    trust_remote_code = True,
    dtype = torch.float16 if torch.cuda.is_available() else torch.float32,
    # TODO: consider quantization settings based on hardware
)

# Initialize the generation pipeline
# device_map is commented out above, so the device is chosen here instead:
# index 0 when CUDA is available, otherwise -1.
gen_pipeline = pipeline(
    "text-generation",
    model = model,
    tokenizer = tokenizer,
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32,
    device = 0 if torch.cuda.is_available() else -1
)

Using: cuda


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cuda:0
  return t.to(


AcceleratorError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [3]:
# Model/tokenizer setup with 8-bit quantization and automatic device placement.
model_id = "meta-llama/Llama-2-13b-hf"
# Optional access token for the model repository (None when HF_TOKEN is unset).
hf_token = os.environ.get("HF_TOKEN")

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using:", device)

# Check GPU memory
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Total GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=False,
    trust_remote_code=True,
    token=hf_token,
)

# Set pad_token for Llama models (they don't have one by default)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Try 8-bit quantization first. 13B parameters at one byte each is roughly
# 13GB of weights, versus ~26GB in float16, so on a small GPU part of the
# model still has to be offloaded by device_map="auto".
# NOTE(review): the recorded runs further down fail with "Cannot copy out of
# meta tensor; no data!" - presumably related to offloaded weights; check the
# placement printed below to confirm.
try:
    from transformers import BitsAndBytesConfig

    # Configure 8-bit quantization
    quantization_config = BitsAndBytesConfig(
        load_in_8bit=True,
        llm_int8_threshold=6.0,
        llm_int8_has_fp16_weight=False,
        llm_int8_enable_fp32_cpu_offload=True
    )

    print("Loading model with 8-bit quantization (reduces weight memory from ~26GB to ~13GB)...")
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",  # Automatically offload to CPU if needed
        trust_remote_code=True,
        quantization_config=quantization_config,
        token=hf_token,
    )
    print("✓ Model loaded with 8-bit quantization")

except ImportError:
    print("⚠ bitsandbytes not available. Please install: pip install bitsandbytes")
    print("Falling back to CPU offloading with float16...")

    # Fallback: Use device_map="auto" to offload layers to CPU.
    # `dtype` replaces the deprecated `torch_dtype` keyword.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        trust_remote_code=True,
        dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        max_memory={0: "4GiB", "cpu": "30GiB"} if torch.cuda.is_available() else None,
        token=hf_token,
    )
    print("✓ Model loaded with CPU offloading")

# Report where the weights were placed so offloading is visible up front.
model_placement = getattr(model, "hf_device_map", None)
if model_placement:
    print("Model placement:", sorted({str(target) for target in model_placement.values()}))

# Initialize the generation pipeline
# No device is passed here because the model was loaded with device_map="auto"
gen_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

Using: cuda
GPU: NVIDIA GeForce RTX 3050 Laptop GPU
Total GPU Memory: 4.00 GB
Loading model with 8-bit quantization (reduces memory from ~26GB to ~7GB)...


  _ = torch.tensor([0], device=i)
INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


✓ Model loaded with 8-bit quantization


###### Test Pipeline (Optional - Run this to verify setup)


In [5]:
# Smoke test for the generation pipeline: a single short greedy generation.
# Expect this to finish in seconds rather than minutes.
print("Testing pipeline with a simple prompt...")
test_prompt = "Hello, how are you?"
test_output = gen_pipeline(
    test_prompt,
    **{
        "max_new_tokens": 20,
        "do_sample": False,
        "return_full_text": False,
        "eos_token_id": tokenizer.eos_token_id,
        "pad_token_id": tokenizer.pad_token_id,
    },
)
print(f"Test successful! Output: {test_output[0]['generated_text']}")
print("\nIf this test hangs or takes more than 30 seconds, there's a configuration issue.")


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Testing pipeline with a simple prompt...


NotImplementedError: Cannot copy out of meta tensor; no data!

###### Prompt Structure

In [6]:
# Zero-shot prompt for categorical labels; `{sentence}` is filled in with str.format.
ZERO_SHOT_PROMPT = "\n".join([
    "",
    "<<SYS>>",
    "You are an assistant that classifies the sentiment of user utterances.  You must respond with three parts:",
    "1) A single label: `Positive`, `Negative`, or `Neutral`",
    "2) A short explanation (1–2 sentences) of why you chose that label",
    "3) Format your response as follows: [Sentiment: <label>, Reason: <explanation>]",
    "4) (Optionally) any caveats or uncertainty if applicable",
    "<</SYS>>",
    "[INST]",
    "User: {sentence}",
    "[/INST]",
    "Assistant:",
    "",
])

In [4]:
# Zero-shot prompt asking for an integer score in [-10, 10]; placeholder: `{sentence}`.
ZERO_SHOT_PROMPT_CONTINUOUS = "\n".join([
    "",
    "<<SYS>>",
    "You are an assistant that classifies the sentiment of user utterances.  You must respond with three parts:",
    "1) An integer value for sentiment between -10 and 10, with -10 being the most negative and 10 being the most positive, and a score of 0 being neutral",
    "2) A short explanation (1–2 sentences) of why you chose that label",
    "3) Format your response as follows: [Sentiment: <label>, Reason: <explanation>]",
    "4) (Optionally) any caveats or uncertainty if applicable",
    "<</SYS>>",
    "[INST]",
    "User: {sentence}",
    "[/INST]",
    "Assistant:",
    "",
])

In [5]:
# Wrapper placed around the few-shot examples; `{examples}` is populated dynamically.
FEW_SHOT_EXAMPLES_TEMPLATE = "\n".join([
    "",
    "### EXAMPLES ###",
    "{examples}",
    "### END EXAMPLES ###",
    "",
])

In [6]:
# Few-shot prompt for categorical labels; placeholders: `{examples}` and `{sentence}`.
FEW_SHOT_PROMPT = "\n".join([
    "",
    "<<SYS>>",
    "You are an assistant that classifies the sentiment of user utterances.  You must respond with three parts:",
    "1) A single label: `Positive`, `Negative`, or `Neutral`",
    "2) A short explanation (1–2 sentences) of why you chose that label",
    "3) Format your response as follows: [Sentiment: <label>, Reason: <explanation>]",
    "4) (Optionally) any caveats or uncertainty if applicable",
    "<</SYS>>",
    "{examples}",
    "[INST]",
    "User: {sentence}",
    "[/INST]",
    "Assistant:",
    "",
])

In [7]:
# Few-shot prompt asking for an integer score in [-10, 10]; placeholders: `{examples}` and `{sentence}`.
# TODO: stipulate that the model should return in the format of the provided examples
FEW_SHOT_PROMPT_CONTINUOUS = "\n".join([
    "",
    "<<SYS>>",
    "You are an assistant that classifies the sentiment of user utterances.  You must respond with three parts:",
    "1) An integer value for sentiment between -10 and 10, with -10 being the most negative and 10 being the most positive, and a score of 0 being neutral",
    "2) A short explanation (1–2 sentences) of why you chose that label",
    "3) Format your response as follows: [Sentiment: <label>, Reason: <explanation>]",
    "4) (Optionally) any caveats or uncertainty if applicable",
    "<</SYS>>",
    "{examples}",
    "[INST]",
    "User: {sentence}",
    "[/INST]",
    "Assistant:",
    "",
])

### Data Loading and Processing

###### Load Data from XLSX

In [7]:
def load_data_from_xlsx(xlsx_path: str, sheet_index: int = 1) -> pd.DataFrame:
    """
    Load one worksheet of an XLSX workbook into a DataFrame.

    Args:
        xlsx_path: Path to the XLSX file.
        sheet_index: Zero-based position of the sheet to read; the default (1)
            selects the second sheet.

    Returns:
        DataFrame holding the rows of the selected sheet.

    Raises:
        ValueError: If the workbook has no sheet at ``sheet_index``.

    Expected columns (not validated here):
    - Machine Transcription (ground truth)
    - Human Transcription (hypothesis)
    - Sentiment Label
    - WER
    - CER
    - List of AAE features
    - List of transcription errors
    - Interview
    """
    # Open the workbook once and close it when done; the same handle serves
    # both the sheet-name check and the actual read.
    with pd.ExcelFile(xlsx_path) as workbook:
        sheet_names = workbook.sheet_names
        if len(sheet_names) <= sheet_index:
            raise ValueError(f"Sheet index {sheet_index} not available. Available sheets: {sheet_names}")
        df = workbook.parse(sheet_name=sheet_index)

    logging.info("Loaded %d rows from sheet '%s'", len(df), sheet_names[sheet_index])
    return df

In [8]:
def split_data_into_sets(df: pd.DataFrame, test_ratio: float = 0.5, random_seed: int = None) -> tuple:
    """
    Shuffle a dataframe and split it into a test set and a few-shot example set.

    Args:
        df: Rows to split.
        test_ratio: Fraction of rows (0.0-1.0) assigned to the test set. The
            test size is rounded down, so the default 0.5 gives equal halves
            (the few-shot set gets the extra row when the count is odd).
        random_seed: Seed for the shuffle. When given, Python's global
            ``random`` module is seeded with it as well.

    Returns:
        (test_df, few_shot_df), each with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``test_ratio`` is outside [0, 1].
    """
    if not 0.0 <= test_ratio <= 1.0:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio}")

    if random_seed is not None:
        random.seed(random_seed)

    # Shuffle the dataframe
    df_shuffled = df.sample(frac = 1, random_state = random_seed).reset_index(drop = True)

    # Honor the requested ratio. The small epsilon keeps products such as
    # 100 * 0.29 (28.999...) from being truncated one row short.
    split_idx = int(len(df_shuffled) * test_ratio + 1e-9)
    test_df = df_shuffled.iloc[:split_idx].copy()
    few_shot_df = df_shuffled.iloc[split_idx:].copy()

    logging.info(f"Split data: {len(test_df)} test samples, {len(few_shot_df)} few-shot examples")
    return test_df, few_shot_df

### Classify with Model


###### Classify with Llama


In [9]:
def format_few_shot_examples(few_shot_df: pd.DataFrame, num_examples: int = 5, random_state=None) -> str:
    """
    Format randomly selected rows of few_shot_df as numbered prompt examples.

    The example text comes from the Machine Transcription column and the label
    from the first-matching Sentiment column.

    Args:
        few_shot_df: Pool of rows to draw examples from.
        num_examples: Number of examples to draw; capped at the pool size.
        random_state: Passed through to ``DataFrame.sample``. Leave as None for
            a fresh random draw on every call, or pass a seed for a
            reproducible selection.

    Returns:
        The examples joined by newlines, or an empty string when nothing can
        be drawn.
    """
    num_examples = min(num_examples, len(few_shot_df))
    if num_examples <= 0:
        return ""

    # Randomly select examples
    selected = few_shot_df.sample(n=num_examples, random_state=random_state)

    # Column names are the same for every row, so resolve them once.
    # str() keeps non-string headers from breaking the lookup.
    machine_col = None
    human_col = None
    sentiment_col = None
    for col in selected.columns:
        col_name = str(col).lower()
        if 'machine' in col_name and 'transcription' in col_name:
            machine_col = col
        elif 'human' in col_name and 'transcription' in col_name:
            human_col = col
        elif 'sentiment' in col_name:
            sentiment_col = col

    # Positional fallback: treat the first column as the machine transcription
    # when no header matched and there are at least two columns.
    if machine_col is None and len(selected.columns) >= 2:
        machine_col = selected.columns[0]

    examples_text = []
    for idx, (_, row) in enumerate(selected.iterrows(), 1):
        machine_text = row[machine_col] if machine_col is not None else "N/A"
        sentiment = row[sentiment_col] if sentiment_col is not None else "N/A"

        # Format as example for the prompt
        example = f"{idx}. User: {machine_text}\n   Assistant: [Sentiment: {sentiment}, Reason: This is an example sentence.]"
        examples_text.append(example)

    return "\n".join(examples_text)


In [10]:
def parse_sentiment_output(output_text: str) -> tuple:
    """
    Parse sentiment label and reason from model output.

    Two strategies are tried in order:
    1. The structured form ``[Sentiment: <label>, Reason: <explanation>]``.
    2. The first occurrence of positive/negative/neutral anywhere in the text
       (case-insensitive); whatever follows it becomes the reason.

    Returns:
        (sentiment_label, reason). When nothing can be parsed the result is
        (None, output_text), so the raw text is not lost.
    """
    # Try to extract from [Sentiment: <label>, Reason: <explanation>] format
    structured = re.search(
        r'\[Sentiment:\s*([^,]+),\s*Reason:\s*([^\]]+)\]',
        output_text,
        re.IGNORECASE,
    )
    if structured:
        return structured.group(1).strip(), structured.group(2).strip()

    # Fallback: take whichever keyword appears FIRST in the text, so that
    # "Negative ... not positive" is labelled Negative. Searching the original
    # text (not a lowercased copy) keeps the slice offsets exact.
    keyword = re.search(r'positive|negative|neutral', output_text, re.IGNORECASE)
    if keyword:
        label = keyword.group(0).capitalize()
        reason = output_text[keyword.end():].strip()
        if not reason:
            reason = "No reason provided"
        return label, reason

    return None, output_text


In [11]:
def classify_with_llama(sentence: str, few_shot_examples: str = None, few_shot: bool = False) -> tuple:
    """
    Classify sentiment for a single sentence.

    Args:
        sentence: Text to classify.
        few_shot_examples: Pre-formatted examples to include in the prompt.
        few_shot: Use the few-shot prompt. If this is True but
            ``few_shot_examples`` is empty or None, the zero-shot prompt is
            used instead.

    Returns:
        (raw_output, sentiment_label, reason)
    """
    if few_shot and few_shot_examples:
        examples_section = FEW_SHOT_EXAMPLES_TEMPLATE.format(examples = few_shot_examples)
        prompt = FEW_SHOT_PROMPT.format(examples = examples_section, sentence = sentence)
    else:
        prompt = ZERO_SHOT_PROMPT.format(sentence = sentence)

    # Use torch.no_grad() for inference to save memory
    with torch.no_grad():
        # Greedy decoding (do_sample=False) does not use a temperature, so
        # none is passed.
        outputs = gen_pipeline(
            prompt,
            max_new_tokens = 100,
            do_sample = False,
            return_full_text = False,
            eos_token_id = tokenizer.eos_token_id,
            pad_token_id = tokenizer.pad_token_id,
        )

    # outputs is a list of dicts, we take the first
    raw_output = outputs[0]["generated_text"].strip()
    sentiment_label, reason = parse_sentiment_output(raw_output)

    return raw_output, sentiment_label, reason


### Main Workflow


###### Zero-shot (All Samples - No Splitting)


In [None]:
def process_all_samples_zero_shot(xlsx_path: str):
    """
    Process ALL samples for zero-shot sentiment analysis without splitting.
    This uses every row in the dataset as a test sample.

    The text sent to the model is the Human Transcription column (recorded as
    'hypothesis'); the Machine Transcription column is stored alongside it as
    'ground_truth'. A failure on one sentence is recorded in the results and
    does not stop the run.

    Args:
        xlsx_path: Path to the XLSX file

    Returns:
        DataFrame with results for all samples (columns: ground_truth,
        hypothesis, sentiment_label, reason). The same data is written to a
        dated CSV under data/model_outputs/llama_2_13b.

    Raises:
        ValueError: If the transcription columns cannot be identified.
    """
    # Load data
    df = load_data_from_xlsx(xlsx_path, sheet_index=1)

    logging.info(f"Processing all {len(df)} samples for zero-shot analysis (no splitting)")

    # The column names are identical for every row, so resolve them once
    # instead of once per sentence.
    machine_col = None
    human_col = None
    for col in df.columns:
        col_name = str(col).lower()
        if 'machine' in col_name and 'transcription' in col_name:
            machine_col = col
        elif 'human' in col_name and 'transcription' in col_name:
            human_col = col

    # Fallback to first two columns
    cols = list(df.columns)
    if machine_col is None:
        if not cols:
            raise ValueError("No columns available to use as the machine transcription")
        machine_col = cols[0]
    if human_col is None:
        if len(cols) < 2:
            raise ValueError("No column available to use as the human transcription")
        human_col = cols[1]

    # Prepare results list
    results = []

    # Process each sample
    for idx, (_, row) in enumerate(df.iterrows(), 1):
        print(f"\n{'='*80}")
        print(f"Processing sentence {idx}/{len(df)}")
        print(f"{'='*80}")

        ground_truth = row.get(machine_col, "")
        hypothesis = row.get(human_col, "")

        print(f"\nGround Truth: {ground_truth}")
        print(f"Hypothesis: {hypothesis}")

        # Build the zero-shot prompt (for display; classify_with_llama builds its own copy)
        full_prompt = ZERO_SHOT_PROMPT.format(sentence=hypothesis)

        print(f"\nFull Input Prompt:\n{full_prompt}")

        # Classify
        try:
            raw_output, sentiment_label, reason = classify_with_llama(
                hypothesis,
                few_shot_examples=None,
                few_shot=False
            )

            print(f"\nModel Output:\n{raw_output}")
            print(f"\nParsed Sentiment: {sentiment_label}")
            print(f"Parsed Reason: {reason}")

        except Exception as e:
            # Best effort: keep going and record the failure for this row.
            print(f"\nERROR: {e}")
            raw_output = f"ERROR: {e}"
            sentiment_label = None
            reason = None

        # Store results
        results.append({
            'ground_truth': ground_truth,
            'hypothesis': hypothesis,
            'sentiment_label': sentiment_label if sentiment_label else "ERROR",
            'reason': reason if reason else raw_output
        })

    # Create results dataframe
    results_df = pd.DataFrame(results)

    # Save to CSV
    today = datetime.now().strftime("%Y%m%d")
    output_filename = f"llama_2_13b_zero_shot_all_samples_{today}.csv"

    # Create output directory if it doesn't exist
    output_dir = r"data/model_outputs/llama_2_13b"
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, output_filename)
    results_df.to_csv(output_path, index=False)
    print(f"\n{'='*80}")
    print(f"Results saved to: {output_path}")
    print(f"Total samples processed: {len(results_df)}")
    print(f"{'='*80}")

    return results_df

In [12]:
def process_sentiment_analysis(xlsx_path: str, few_shot: bool = False, random_seed: int = 42):
    """
    Main workflow:
    1. Load data from second sheet of XLSX
    2. Split into test and few-shot sets (50/50)
    3. Classify the Human Transcription text of each test row; in few-shot
       mode, 5 examples are drawn from the few-shot set for every sentence
    4. Save results to CSV

    A failure on one sentence is recorded in the results and does not stop
    the run.

    Args:
        xlsx_path: Path to the XLSX file.
        few_shot: Include few-shot examples in the prompt.
        random_seed: Seed used for the test/few-shot split.

    Returns:
        DataFrame with columns ground_truth, hypothesis, sentiment_label, reason.

    Raises:
        ValueError: If the transcription columns cannot be identified.
    """
    # Load data
    df = load_data_from_xlsx(xlsx_path, sheet_index = 1)

    # Split data
    test_df, few_shot_df = split_data_into_sets(df, test_ratio=0.5, random_seed=random_seed)

    # The column names are identical for every row, so resolve them once
    # instead of once per sentence.
    machine_col = None
    human_col = None
    for col in test_df.columns:
        col_name = str(col).lower()
        if 'machine' in col_name and 'transcription' in col_name:
            machine_col = col
        elif 'human' in col_name and 'transcription' in col_name:
            human_col = col

    # Fallback to first two columns
    cols = list(test_df.columns)
    if machine_col is None:
        if not cols:
            raise ValueError("No columns available to use as the machine transcription")
        machine_col = cols[0]
    if human_col is None:
        if len(cols) < 2:
            raise ValueError("No column available to use as the human transcription")
        human_col = cols[1]

    # Prepare results list
    results = []

    # Process each test sentence
    for idx, (_, row) in enumerate(test_df.iterrows(), 1):
        print(f"\n{'='*80}")
        print(f"Processing sentence {idx}/{len(test_df)}")
        print(f"{'='*80}")

        ground_truth = row.get(machine_col, "")
        hypothesis = row.get(human_col, "")

        print(f"\nGround Truth: {ground_truth}")
        print(f"Hypothesis: {hypothesis}")

        # Get few-shot examples if needed
        # NOTE(review): no seed is passed here, so the examples drawn for each
        # sentence are not tied to random_seed - confirm whether that is intended.
        few_shot_examples = None
        if few_shot:
            few_shot_examples = format_few_shot_examples(few_shot_df, num_examples=5)
            print(f"\nFew-shot Examples:\n{few_shot_examples}")

        # Build the prompt to print it (classify_with_llama builds its own copy)
        if few_shot and few_shot_examples:
            examples_section = FEW_SHOT_EXAMPLES_TEMPLATE.format(examples=few_shot_examples)
            full_prompt = FEW_SHOT_PROMPT.format(examples=examples_section, sentence=hypothesis)
        else:
            full_prompt = ZERO_SHOT_PROMPT.format(sentence=hypothesis)

        print(f"\nFull Input Prompt:\n{full_prompt}")

        # Classify
        try:
            raw_output, sentiment_label, reason = classify_with_llama(
                hypothesis,
                few_shot_examples=few_shot_examples,
                few_shot=few_shot
            )

            print(f"\nModel Output:\n{raw_output}")
            print(f"\nParsed Sentiment: {sentiment_label}")
            print(f"Parsed Reason: {reason}")

        except Exception as e:
            # Best effort: keep going and record the failure for this row.
            print(f"\nERROR: {e}")
            raw_output = f"ERROR: {e}"
            sentiment_label = None
            reason = None

        # Store results
        results.append({
            'ground_truth': ground_truth,
            'hypothesis': hypothesis,
            'sentiment_label': sentiment_label if sentiment_label else "ERROR",
            'reason': reason if reason else raw_output
        })

    # Create results dataframe
    results_df = pd.DataFrame(results)

    # Save to CSV
    today = datetime.now().strftime("%Y%m%d")
    shot_type = "few_shot" if few_shot else "zero_shot"
    output_filename = f"llama_2_13b_{shot_type}_{today}.csv"

    # Create output directory if it doesn't exist. The path is assembled with
    # os.path.join so it resolves to nested directories on every platform; a
    # literal backslash path would create one oddly named folder outside Windows.
    output_dir = os.path.join("data", "model_outputs", "llama_2_13b")
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, output_filename)
    results_df.to_csv(output_path, index=False)
    print(f"\n{'='*80}")
    print(f"Results saved to: {output_path}")
    print(f"{'='*80}")

    return results_df


### Usage


In [13]:
# Path to the XLSX file. Set the SAMPLES_XLSX_PATH environment variable to
# point at your own copy; without it, the original hard-coded location is used.
xlsx_file_path = os.environ.get(
    "SAMPLES_XLSX_PATH",
    r"C:\Users\pryce\OneDrive\Desktop\Lost in Transcription\Text Inputs\Samples.xlsx",
)

# Set random seed for reproducibility
RANDOM_SEED = 33


###### Zero-shot (Split)


In [14]:
# Zero-shot run on the test half of the split, using the shared seed
zero_shot_results = process_sentiment_analysis(xlsx_file_path, few_shot=False, random_seed=RANDOM_SEED)


INFO:root:Loaded 124 rows from sheet 'Samples'
INFO:root:Split data: 62 test samples, 62 few-shot examples



Processing sentence 1/62

Ground Truth: Even when we come up from grade school, you know
Hypothesis: Even when we coming up in grade school.

Full Input Prompt:

<<SYS>>
You are an assistant that classifies the sentiment of user utterances.  You must respond with three parts:
1) A single label: `Positive`, `Negative`, or `Neutral`
2) A short explanation (1–2 sentences) of why you chose that label
3) Format your response as follows: [Sentiment: <label>, Reason: <explanation>]
4) (Optionally) any caveats or uncertainty if applicable
<</SYS>>
[INST]
User: Even when we coming up in grade school.
[/INST]
Assistant:


ERROR: Cannot copy out of meta tensor; no data!

Processing sentence 2/62

Ground Truth: Because you're supposed to give advice at home  first.
Hypothesis: ‘Cause you supposed to get advice from home first.

Full Input Prompt:

<<SYS>>
You are an assistant that classifies the sentiment of user utterances.  You must respond with three parts:
1) A single label: `Positive`, `Nega

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



ERROR: Cannot copy out of meta tensor; no data!

Processing sentence 8/62

Ground Truth: Oh, yeah. No problem with it.
Hypothesis: Oh, they didn’t have no problem with it.

Full Input Prompt:

<<SYS>>
You are an assistant that classifies the sentiment of user utterances.  You must respond with three parts:
1) A single label: `Positive`, `Negative`, or `Neutral`
2) A short explanation (1–2 sentences) of why you chose that label
3) Format your response as follows: [Sentiment: <label>, Reason: <explanation>]
4) (Optionally) any caveats or uncertainty if applicable
<</SYS>>
[INST]
User: Oh, they didn’t have no problem with it.
[/INST]
Assistant:


ERROR: Cannot copy out of meta tensor; no data!

Processing sentence 9/62

Ground Truth: I ain’t never, ain't never been prejudiced about me. Now, people, you sense it, they they they prejudiced against them n that, you know.
Hypothesis: I ain’t never been prejudiced about me; now, people, you sense it, they prejudiced against me, you know.

Ful

###### Zero-Shot (No Split)

In [None]:
# Zero-shot run over every row of the sheet (no test/few-shot split)
all_samples_zero_shot_results = process_all_samples_zero_shot(xlsx_path=xlsx_file_path)

###### Few-shot


In [None]:
# Few-shot run on the test half of the split, using the shared seed
few_shot_results = process_sentiment_analysis(xlsx_file_path, few_shot=True, random_seed=RANDOM_SEED)


### Usage (batch processing of CSV files)

In [None]:
# Locations for the CSV batch runs: inputs are read from data_path and
# results are written to save_directory.
data_path = r"data/reference"
save_directory = r"data/model_outputs/llama_2_13b"
input_files = list()  # include ".csv" in the path

###### Zero-shot

In [None]:
# run zero-shot over all csv files in input_files
# NOTE(review): run_sentiment_on_csv is not defined in the visible part of
# this notebook - confirm where it comes from before running with inputs.
zs_results = []
for file in input_files:
    print("Processing zero-shot on:", file)
    path = os.path.join(data_path, file)
    classifications = run_sentiment_on_csv(path, "Sentences", few_shot = False)
    zs_results.append(classifications)

# Write each result under a "zero_shot_" prefix so the few-shot run, which
# saves into the same directory, cannot overwrite these files.
os.makedirs(save_directory, exist_ok = True)
for file_name, classifications in zip(input_files, zs_results):
    sub_dir, base_name = os.path.split(file_name)
    out_file = os.path.join(save_directory, sub_dir, f"zero_shot_{base_name}")
    classifications.to_csv(out_file, index = False)

###### Few-shot

In [None]:
# run few-shot over all csv files in input_files
# NOTE(review): run_sentiment_on_csv is not defined in the visible part of
# this notebook - confirm where it comes from before running with inputs.
fs_results = []
for file in input_files:
    print("Processing few-shot on:", file)
    path = os.path.join(data_path, file)
    classifications = run_sentiment_on_csv(path, "Sentences", few_shot = True)
    fs_results.append(classifications)

# Write each result under a "few_shot_" prefix so these files do not collide
# with the zero-shot outputs saved into the same directory.
os.makedirs(save_directory, exist_ok = True)
for file_name, classifications in zip(input_files, fs_results):
    sub_dir, base_name = os.path.split(file_name)
    out_file = os.path.join(save_directory, sub_dir, f"few_shot_{base_name}")
    classifications.to_csv(out_file, index = False)