# Sentiment Classification tasks with Llama-2-13B (HF, transformers) over WER-binned sample sentences

### Setup

In [None]:
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from typing import List, Dict
import logging
logging.basicConfig(level = logging.INFO)

###### Model Configuration

In [None]:
# Checkpoint to load and the optional Hugging Face access token (None when the
# HF_TOKEN environment variable is unset).
model_id = "meta-llama/Llama-2-13b-hf"
hf_token = os.environ.get("HF_TOKEN")

# Informational only: weight placement is decided by device_map = "auto" below.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using:", device)

# The token is forwarded so that checkpoints requiring authentication can be
# downloaded; token = None falls back to any locally cached credentials.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast = False,
    trust_remote_code = True,
    token = hf_token,
)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map = "auto",
    trust_remote_code = True,
    token = hf_token,
    # TODO: consider quantization settings based on hardware
)

# Text-generation pipeline called by classify_with_llama. No `device` argument is
# passed because the model has already been placed via device_map = "auto".
gen_pipeline = pipeline("text-generation", model = model, tokenizer = tokenizer)

###### Prompt Structure

In [None]:
# Zero-shot prompt template; the `{sentence}` placeholder is filled with str.format.
# NOTE(review): the <<SYS>> / [INST] markers look like the Llama-2 chat layout, while
# model_id names the "-hf" base checkpoint -- confirm this pairing is intended.
ZERO_SHOT_PROMPT = (
    "\n"
    "<<SYS>>\n"
    "You are an assistant that classifies the sentiment of user utterances.  "
    "You must respond with three parts:\n"
    "1) A single label: `Positive`, `Negative`, or `Neutral`\n"
    "2) A short explanation (1–2 sentences) of why you chose that label\n"
    "3) (Optionally) any caveats or uncertainty if applicable\n"
    "<</SYS>>\n"
    "[INST]\n"
    "User: {sentence}\n"
    "[/INST]\n"
    "Assistant:\n"
)

In [None]:
# TODO: fill these manually
# Wrapper for the few-shot examples; the `{examples}` line marks where they belong.
FEW_SHOT_EXAMPLES = "\n".join([
    "",
    "### EXAMPLES ###",
    "{examples}",
    "### END EXAMPLES ###",
    "",
])

In [None]:
# Few-shot prompt template: same system instructions as the zero-shot prompt, with an
# `{examples}` slot before the `{sentence}` turn. Both are filled with str.format.
FEW_SHOT_PROMPT = "\n".join([
    "",
    "<<SYS>>",
    "You are an assistant that classifies the sentiment of user utterances.  You must respond with three parts:",
    "1) A single label: `Positive`, `Negative`, or `Neutral`",
    "2) A short explanation (1–2 sentences) of why you chose that label",
    "3) (Optionally) any caveats or uncertainty if applicable",
    "<</SYS>>",
    "{examples}",
    "[INST]",
    "User: {sentence}",
    "[/INST]",
    "Assistant:",
    "",
])

### Classify with Model

###### Classify with Llama

In [None]:
# TODO: separate sentiment label and provided reason
def classify_with_llama(sentence: str, few_shot: bool = False) -> str:
    """Run one sentence through the generation pipeline and return the raw reply.

    Args:
        sentence: The utterance to classify; substituted into the prompt template.
        few_shot: When True, use FEW_SHOT_PROMPT (with FEW_SHOT_EXAMPLES inserted);
            otherwise use ZERO_SHOT_PROMPT.

    Returns:
        The generated continuation only (the prompt is excluded), with
        surrounding whitespace stripped.
    """
    if few_shot:
        # NOTE: FEW_SHOT_EXAMPLES is inserted verbatim. str.format does not expand
        # placeholders inside substituted values, so any `{examples}` marker still
        # present in it reaches the model literally.
        prompt = FEW_SHOT_PROMPT.format(examples = FEW_SHOT_EXAMPLES, sentence = sentence)
    else:
        prompt = ZERO_SHOT_PROMPT.format(sentence = sentence)

    outputs = gen_pipeline(
        prompt,
        max_new_tokens = 100,
        # Greedy decoding keeps runs deterministic. `temperature` is deliberately
        # not passed: it only applies when do_sample = True, so combining it with
        # greedy decoding has no effect and makes transformers warn on every call.
        do_sample = False,
        return_full_text = False,
        eos_token_id = tokenizer.eos_token_id,
        # Set explicitly so generate() does not fall back to (and warn about)
        # using the EOS token as the padding token on every call.
        pad_token_id = tokenizer.eos_token_id,
    )

    # The pipeline returns a list of dicts; a single prompt yields one entry.
    return outputs[0]["generated_text"].strip()

In [None]:
def run_sentiment_on_csv(csv_path: str, inputs: str, few_shot: bool = False) -> pd.DataFrame:
    """Classify every sentence in one CSV column and attach the model output.

    Args:
        csv_path: Path of the CSV file to read.
        inputs: Header of the column that holds the sentences.
        few_shot: Forwarded to classify_with_llama to pick the prompt template.

    Returns:
        The DataFrame read from ``csv_path`` with an added ``llama_output``
        column. Rows whose classification raised hold ``"ERROR: <message>"``;
        rows with a missing sentence hold ``"ERROR: missing input sentence"``.

    Raises:
        KeyError: If ``inputs`` is not a column of the CSV.
    """
    df = pd.read_csv(csv_path)
    # Fail fast with a readable message instead of erroring on the first row.
    if inputs not in df.columns:
        raise KeyError(
            f"Column {inputs!r} not found in {csv_path}; available columns: {list(df.columns)}"
        )

    results = []
    for idx, sent in df[inputs].items():
        # Empty cells are read as NaN; do not send the text "nan" to the model.
        if pd.isna(sent):
            results.append("ERROR: missing input sentence")
            continue
        try:
            out = classify_with_llama(sent, few_shot = few_shot)
        except Exception as e:
            # Best-effort batch: keep going, but leave a traceback in the log so
            # systematic failures are not hidden inside the output column.
            logging.exception("Classification failed for row %s of %s", idx, csv_path)
            out = f"ERROR: {e}"
        results.append(out)
    df["llama_output"] = results
    return df

### Usage

In [None]:
# Directory that receives the per-file model outputs.
save_directory: str = "data/model_outputs/llama_2_13b"
# Directory that holds the input CSV files.
data_path: str = "data/reference"
# File names (relative to data_path) to process; include ".csv" in the path
input_files: List[str] = []

###### Zero-shot

In [None]:
# run zero-shot over all csv files in input_files
# Create the output directory up front so a missing folder cannot make the
# save step fail after inference has already been paid for.
os.makedirs(save_directory, exist_ok = True)

zs_results = []
for file in input_files:
    print("Processing zero-shot on:", file)
    path = os.path.join(data_path, file)
    classifications = run_sentiment_on_csv(path, "Sentences", few_shot = False)
    zs_results.append(classifications)

    # Write each result as soon as it is ready, so an error on a later file
    # does not discard the files that were already processed.
    out_file = os.path.join(save_directory, file)
    classifications.to_csv(out_file, index = False)

###### Few-shot

In [None]:
# run few-shot over all csv files in input_files
# Create the output directory up front so a missing folder cannot make the
# save step fail after inference has already been paid for.
os.makedirs(save_directory, exist_ok = True)

fs_results = []
for file in input_files:
    print("Processing few-shot on:", file)
    path = os.path.join(data_path, file)
    classifications = run_sentiment_on_csv(path, "Sentences", few_shot = True)
    fs_results.append(classifications)

    # Few-shot outputs get a "_few_shot" suffix: writing them under the plain
    # input file name would overwrite the zero-shot results saved to the same
    # directory. Each file is written as soon as it is ready.
    stem, ext = os.path.splitext(file)
    out_file = os.path.join(save_directory, f"{stem}_few_shot{ext}")
    classifications.to_csv(out_file, index = False)