In [1]:
import gc
import os
import pickle
import random
import re
from time import time

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from transformers import AutoModelForCausalLM, AutoTokenizer


In [2]:
import kagglehub

# Download train.csv of the Kaggle sentiment dataset through kagglehub.
# `ds_path` receives whatever path the download call returns; the experiment
# cells below pass it to load_train_dataset().
ds_path = kagglehub.dataset_download("abhi8923shriv/sentiment-analysis-dataset", path="train.csv")
print("Path to dataset files:", ds_path)
# Bare last expression: the notebook additionally renders the path as the cell's output value.
ds_path


Path to dataset files: /home/szymon/.cache/kagglehub/datasets/abhi8923shriv/sentiment-analysis-dataset/versions/9/train.csv


'/home/szymon/.cache/kagglehub/datasets/abhi8923shriv/sentiment-analysis-dataset/versions/9/train.csv'

### Text processing

In [3]:
def clean_text(text: str) -> str:
    """Normalize a raw tweet before it is placed in a prompt.

    The text is lower-cased; URLs, HTML tags and @mentions are removed; hashtags
    keep their word but lose the leading ``#``; whitespace runs are collapsed to
    a single space and the result is stripped.

    Args:
        text (str): Raw tweet text

    Returns:
        str: Normalized text, or an empty string when ``text`` is falsy

    Example:
        >>> clean_text("@user I LOVE this! #amazing")
        'i love this! amazing'

    """
    if not text:
        return ""

    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r"https?://\S+|www\.\S+", ""),  # URLs
        (r"<.*?>", ""),                  # HTML tags
        (r"@\w+", ""),                   # user mentions
        (r"#(\w+)", r"\1"),              # hashtags -> bare word
        (r"\s+", " "),                   # collapse whitespace
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()



def load_train_dataset(
    path: str,
    encoding: str = "latin1",
    samples: int = 512,
    seed: "int | None" = None,
) -> tuple[np.ndarray, np.ndarray]:
    """Load the sentiment CSV, clean the text and draw a random sample of rows.

    Args:
        path (str): Path to the CSV file
        encoding (str): File encoding (default: 'latin1')
        samples (int): Number of rows to draw without replacement
        seed (int | None): If given, sampling uses a private ``random.Random(seed)``
            so the draw is reproducible; if None, the module-level ``random``
            generator is used (previous behaviour).

    Returns:
        tuple: (X, y) where X is the array of cleaned ``selected_text`` values and
        y is the array of matching ``sentiment`` labels.

    Raises:
        ValueError: If ``samples`` exceeds the number of usable rows.

    Example:
        >>> X, y = load_train_dataset("data.csv", samples=100)
        >>> print(X[0], y[0])
        i love this product positive

    """
    df = pd.read_csv(path, encoding=encoding)
    # Only the two columns read below must be present. A blanket dropna() would also
    # discard rows whose unrelated columns happen to be empty.
    df = df.dropna(subset=["selected_text", "sentiment"])

    available = len(df)
    if samples > available:
        raise ValueError(
            f"Requested {samples} samples but only {available} usable rows are available"
        )

    texts = df["selected_text"].apply(clean_text).to_numpy()
    sentiments = df["sentiment"].to_numpy()

    # A dedicated generator keeps a seeded draw independent of other users of `random`.
    rng = random if seed is None else random.Random(seed)
    idxes = rng.sample(range(available), samples)
    return texts[idxes], sentiments[idxes]


### Prompt Engineering Functions for ICL

In [4]:
def inject_example(text: str, sentiment: str) -> str:
    """Format one labelled tweet as a few-shot example block ending in a blank line."""
    tweet_line = f"Tweet: {text}\n"
    label_line = f"Sentiment: {sentiment}\n\n"
    return tweet_line + label_line


def inject_sample(text: str) -> str:
    """Format the tweet to classify, leaving the prompt open right after ``Sentiment:``."""
    header = "### Now classify the following:\n"
    query = f"Tweet: {text}\nSentiment:"
    return header + query


### Label Extraction

In [5]:
def extract_label(result: str, labels: list[str]) -> str:
    """Map raw model output to one of the known labels.

    The output is lower-cased and searched for each label as a substring, in the
    order the labels are given; the first label found wins.

    Args:
        result (str): Text produced by the model
        labels (list[str]): Candidate labels, compared as-is against the lower-cased text

    Returns:
        str: The first matching label, or "unknown" when none occurs in ``result``

    """
    return next(
        (candidate for candidate in labels if candidate in result.lower()),
        "unknown",
    )


### IN-CONTEXT LEARNING EXPERIMENT

In [6]:
def run_experiment(
    base_prompt: str, X: np.ndarray, y: np.ndarray, model_name: str,
) -> tuple[list, list]:
    """Run ICL classification experiment on a test set.

    This function:
    1. Loads the specified language model
    2. For each test sample, constructs a full prompt
    3. Greedily generates a single new token
    4. Extracts the predicted label from that continuation
    5. Collects all predictions and ground truth labels
    6. Releases the model so repeated calls do not accumulate GPU memory

    Args:
        base_prompt (str): Base prompt with instructions and examples
        X (np.ndarray): Array of text samples to classify
        y (np.ndarray): Array of true labels; its unique values form the label set
        model_name (str): HuggingFace model identifier (e.g., 'facebook/opt-1.3b')

    Returns:
        tuple[list, list]: (predicted_labels, true_labels); a prediction is
        "unknown" when the generated text contains none of the labels.

    """
    labels = list(np.unique(y))

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, dtype=torch.float16, device_map="auto",
    )

    y_true = []
    y_pred = []

    try:
        for idx, x in enumerate(X):
            prompt = base_prompt + inject_sample(x)
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
            outputs = model.generate(
                **inputs,
                max_new_tokens=1,
                do_sample=False,
                eos_token_id=tokenizer.eos_token_id,
            )

            # generate() returns prompt + continuation; slice the prompt off so the
            # label is read from the model's answer, never from prompt text.
            prompt_len = inputs["input_ids"].shape[1]
            completion = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
            pred = extract_label(completion, labels)

            y_true.append(y[idx])
            y_pred.append(pred)
    finally:
        # This function is called once per (repetition, configuration), loading a fresh
        # model each time. Drop the reference and collect before emptying the CUDA cache,
        # otherwise the weights can stay allocated across calls.
        del model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return y_pred, y_true


###  Evaluation Function

In [7]:
def evaluate(
    model: str, base_prompt: str, X, y, shots_n: int = 3, experiment_func = run_experiment,
) -> tuple[float, float, float, float, list, list]:
    """Evaluate ICL performance with a specific number of examples (shots).

    Workflow:
    1. Use the first ``shots_n`` entries of ``X``/``y`` as examples in the prompt
    2. Use the remaining entries as the test set
    3. Run classification through ``experiment_func`` and compute metrics

    Args:
        model (str): HuggingFace model name, forwarded to ``experiment_func``
        base_prompt (str): Base instruction prompt
        X: Indexable, sliceable sequence of texts (examples first, then test samples)
        y: Labels aligned with ``X``
        shots_n (int): Number of examples to include (0=zero-shot, 1=one-shot, etc.)
        experiment_func: Callable ``(prompt, X_test, y_test, model)`` returning
            ``(y_pred, y_true)``

    Returns:
        tuple: (accuracy, f1_score, precision, recall, y_true, y_pred); f1,
        precision and recall are macro-averaged.

    Raises:
        ValueError: If ``shots_n`` is negative or leaves no samples to evaluate.

    """
    # Fail before any model is loaded when the split would be empty or would wrap around.
    if shots_n < 0 or shots_n >= len(X):
        raise ValueError(
            f"shots_n must be in [0, {len(X) - 1}] for {len(X)} samples, got {shots_n}"
        )

    if shots_n > 0:
        base_prompt += "### Examples\n"

    for i in range(shots_n):
        base_prompt += inject_example(X[i], y[i])

    X, y = X[shots_n:], y[shots_n:]

    y_pred, y_true = experiment_func(base_prompt, X, y, model)

    acc = accuracy_score(y_true, y_pred)
    # zero_division=0 matches precision/recall below and gives the same value as the
    # default, minus the warning when a class is never predicted.
    f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)
    precision = precision_score(y_true, y_pred, average="macro", zero_division=0)
    recall = recall_score(y_true, y_pred, average="macro", zero_division=0)

    return acc, f1, precision, recall, y_true, y_pred



# ICL experiment configuration

In [8]:
# Instruction header shared by every run; evaluate() appends the few-shot examples
# and run_experiment() appends the tweet to classify.
prompt = """Classify the sentiment of the tweet.
Choose only one label: positive, negative, neutral.
Reply ONLY with the label. Do not explain.

"""
shots = [0, 1, 3, 20]
reps = 10
samples_n = 300
result_path = f"result_{round(time())}.csv"
model_name = "facebook/opt-2.7b"
safe_name = model_name.replace("/", "_")

# Create the pickle output directory before the long run starts, so a missing
# folder cannot make the final dump fail after all repetitions have finished.
cm_dir = "../res/shots/results"
os.makedirs(cm_dir, exist_ok=True)

# Column-oriented accumulator: one list per metric and shot count, one entry per repetition.
df_prot = {
    "lp": [],
}
for shot in shots:
    df_prot[f"{shot}-shots f1"] = []
    df_prot[f"{shot}-shots acc"] = []
    df_prot[f"{shot}-shots precision"] = []
    df_prot[f"{shot}-shots recall"] = []


confusion_matrices = {shot: [] for shot in shots}

for i in range(reps):
    df_prot["lp"].append(i)
    for shot in shots:
        # A fresh random draw per (repetition, shot count); the first `shot` rows
        # become the in-prompt examples inside evaluate().
        X, y = load_train_dataset(
            ds_path,
            "latin1",
            samples=samples_n + shot,
        )
        acc, f1, precision, recall, y_true, y_pred = evaluate(
            model_name,
            prompt,
            X,
            y,
            shots_n=shot,
            experiment_func=run_experiment,
        )
        df_prot[f"{shot}-shots f1"].append(f1)
        df_prot[f"{shot}-shots acc"].append(acc)
        df_prot[f"{shot}-shots precision"].append(precision)
        df_prot[f"{shot}-shots recall"].append(recall)

        cm = confusion_matrix(y_true, y_pred, labels=["positive", "negative", "neutral"])
        confusion_matrices[shot].append(cm)
    print(f"Completed repetition {i + 1}/{reps}")

df = pd.DataFrame(df_prot)
df.to_csv(result_path)

with open(f"{cm_dir}/confusion_matrices_ICL_{safe_name}_{round(time())}.pkl", "wb") as f:
    pickle.dump(confusion_matrices, f)


Completed repetition 1/10
Completed repetition 2/10
Completed repetition 2/10
Completed repetition 3/10
Completed repetition 3/10
Completed repetition 4/10
Completed repetition 4/10
Completed repetition 5/10
Completed repetition 5/10
Completed repetition 6/10
Completed repetition 6/10
Completed repetition 7/10
Completed repetition 7/10
Completed repetition 8/10
Completed repetition 8/10
Completed repetition 9/10
Completed repetition 9/10
Completed repetition 10/10
Completed repetition 10/10


# Model size comparison (3-shot)

In [None]:
# Instruction header shared by every run; evaluate() appends the few-shot examples
# and run_experiment() appends the tweet to classify.
prompt = """Classify the sentiment of the tweet.
Choose only one label: positive, negative, neutral.
Reply ONLY with the label. Do not explain.

"""
shots = 3
reps = 10
samples_n = 300
result_path = f"../res/models/result_{round(time())}.csv"

# exist_ok avoids the check-then-create race and is a no-op when the folder exists.
os.makedirs("../res/models/", exist_ok=True)

models = [
    "facebook/opt-125m",
    "facebook/opt-350m",
    "facebook/opt-1.3b",
    "facebook/opt-2.7b",
    # "facebook/opt-6.7b",   # Out of memory on 8GB GPU
]
safe_name = "model_size_clf"


# Column-oriented accumulator: one list per metric and model, one entry per repetition.
df_prot = {
    "lp": [],
}
for model in models:
    df_prot[f"{model} f1"] = []
    df_prot[f"{model} acc"] = []
    df_prot[f"{model} precision"] = []
    df_prot[f"{model} recall"] = []


confusion_matrices = {model: [] for model in models}

# Start from a clean allocator state in case earlier cells left cached blocks behind.
gc.collect()
torch.cuda.empty_cache()

for i in range(reps):
    df_prot["lp"].append(i)
    # One draw per repetition, reused for every model so they see identical data.
    X, y = load_train_dataset(
        ds_path, "latin1", samples=samples_n + shots,
    )
    for model in models:
        acc, f1, precision, recall, y_true, y_pred = evaluate(
            model,
            prompt,
            X,
            y,
            shots_n=shots,
            experiment_func=run_experiment,
        )
        df_prot[f"{model} f1"].append(f1)
        df_prot[f"{model} acc"].append(acc)
        df_prot[f"{model} precision"].append(precision)
        df_prot[f"{model} recall"].append(recall)

        cm = confusion_matrix(y_true, y_pred, labels=["positive", "negative", "neutral"])
        confusion_matrices[model].append(cm)

        # Clear GPU memory after each model evaluation. Collect unreachable objects
        # first: empty_cache() can only return blocks that no live tensor still owns.
        print("Done model: ", model)
        gc.collect()
        torch.cuda.empty_cache()

    print(f"Completed repetition {i + 1}/{reps}")

df = pd.DataFrame(df_prot)
df.to_csv(result_path)

with open(f"../res/models/confusion_matrices_ICL_{safe_name}_{round(time())}.pkl", "wb") as f:
    pickle.dump(confusion_matrices, f)


Done model:  facebook/opt-125m
Done model:  facebook/opt-350m
Done model:  facebook/opt-350m
Done model:  facebook/opt-1.3b
Done model:  facebook/opt-1.3b
Done model:  facebook/opt-2.7b
Completed repetition 1/10
Done model:  facebook/opt-2.7b
Completed repetition 1/10
Done model:  facebook/opt-125m
Done model:  facebook/opt-125m
Done model:  facebook/opt-350m
Done model:  facebook/opt-350m
Done model:  facebook/opt-1.3b
Done model:  facebook/opt-1.3b
Done model:  facebook/opt-2.7b
Completed repetition 2/10
Done model:  facebook/opt-2.7b
Completed repetition 2/10
Done model:  facebook/opt-125m
Done model:  facebook/opt-125m
Done model:  facebook/opt-350m
Done model:  facebook/opt-350m
Done model:  facebook/opt-1.3b
Done model:  facebook/opt-1.3b
Done model:  facebook/opt-2.7b
Completed repetition 3/10
Done model:  facebook/opt-2.7b
Completed repetition 3/10
Done model:  facebook/opt-125m
Done model:  facebook/opt-125m
Done model:  facebook/opt-350m
Done model:  facebook/opt-350m
Done mo

In [None]:
# Experiment cell: compare generate-based extraction vs token-prob scoring
# This cell implements a helper `choose_label_by_prob` and runs a small test across models
import torch
import torch.nn.functional as F

def _encode_labels(tokenizer, labels):
    """Tokenize every label on its own, without special tokens, into a list of token-id lists."""
    return [tokenizer.encode(l, add_special_tokens=False) for l in labels]

@torch.no_grad()
def choose_label_by_prob(model, tokenizer, prompt: str, labels: list[str], device: str = None):
    """Pick the label whose token sequence has the highest log-probability after ``prompt``.

    Each label is scored with a single forward pass over ``prompt + label tokens``:
    the logits at position ``p`` predict the token at ``p + 1``, so the positions from
    the last prompt token onward yield the log-probability of each label token given
    everything before it. This assumes a causal model, where earlier positions are
    unaffected by later tokens. Scores are summed, not length-normalized.

    Args:
        model: Callable ``model(input_ids=..., attention_mask=...)`` returning an
            object with ``.logits`` of shape (1, seq_len, vocab).
        tokenizer: Tokenizer supporting ``__call__(prompt, return_tensors='pt')`` and
            ``encode(text, add_special_tokens=False)``.
        prompt (str): Text the label is scored as a continuation of.
        labels (list[str]): Candidate labels.
        device (str): Device for the input tensors. Defaults to ``model.device`` when the
            model exposes it, otherwise CUDA if available, else CPU.

    Returns:
        tuple: (best_label, scores) where ``scores[i]`` is the summed log-probability
        of ``labels[i]`` (0.0 for a label that tokenizes to nothing).

    Raises:
        ValueError: If ``labels`` is empty.

    """
    if not labels:
        raise ValueError("labels must not be empty")

    if device is None:
        # Put inputs where the model lives; guessing 'cuda' breaks for a CPU-loaded model.
        device = getattr(model, 'device', None)
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'

    enc = tokenizer(prompt, return_tensors='pt')
    input_ids = enc['input_ids'].to(device)
    attention_mask = enc.get('attention_mask', None)
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)
    prompt_len = input_ids.shape[1]

    scores = []
    for tokens in _encode_labels(tokenizer, labels):
        if not tokens:
            # Nothing to score; matches the neutral sum of an empty sequence.
            scores.append(0.0)
            continue

        label_ids = torch.tensor([tokens], dtype=input_ids.dtype, device=device)
        full_ids = torch.cat([input_ids, label_ids], dim=1)
        full_attn = None
        if attention_mask is not None:
            # Extend the mask in its own dtype so it does not get promoted to float.
            extra = torch.ones_like(label_ids, dtype=attention_mask.dtype)
            full_attn = torch.cat([attention_mask, extra], dim=1)

        logits = model(input_ids=full_ids, attention_mask=full_attn).logits
        # Positions that predict the label tokens: last prompt token .. second-to-last token.
        step_logits = logits[0, prompt_len - 1:prompt_len - 1 + len(tokens), :]
        # Normalize in float32; half-precision log-softmax over a large vocab is lossy.
        log_probs = F.log_softmax(step_logits.float(), dim=-1)
        targets = label_ids[0].to(log_probs.device).long().unsqueeze(1)
        scores.append(log_probs.gather(1, targets).sum().item())

    best_idx = int(torch.tensor(scores).argmax().item())
    return labels[best_idx], scores

def predict_and_eval(model_name: str, base_prompt: str, X: list, y: list, labels: list[str], shots_n: int = 1):
    """Compare generation-based and probability-based label prediction for one model.

    For every sample a prompt is built from ``base_prompt``, up to ``shots_n`` labelled
    examples taken from the front of ``X``/``y`` (never the sample being classified),
    and the sample itself. The label is then predicted twice: by greedy generation
    followed by ``extract_label``, and by ``choose_label_by_prob``.

    Args:
        model_name (str): HuggingFace model identifier
        base_prompt (str): Instruction text placed before the examples
        X (list): Texts to classify; also the pool the few-shot examples come from
        y (list): Labels aligned with ``X``
        labels (list[str]): Candidate labels
        shots_n (int): Number of in-prompt examples (0 disables them)

    Returns:
        tuple: (gen_metrics, score_metrics, y_gen, y_score) where the metric dicts hold
        accuracy and macro-F1 under the keys 'gen_acc'/'gen_f1' and
        'score_acc'/'score_f1', and the two lists hold one prediction per entry of ``X``.

    """
    # load tokenizer and model (fallback to cpu if needed)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    try:
        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map='auto')
    except Exception as e:
        print('Falling back to CPU load for', model_name, '->', e)
        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, device_map={'': 'cpu'})
    device = model.device

    y_gen = []
    y_score = []
    try:
        for i, text in enumerate(X):
            # Take the first `shots_n` samples as examples, skipping the one being
            # classified: a sample shown with its own label leaks the answer.
            example_idx = [j for j in range(min(len(X), shots_n + 1)) if j != i][:shots_n]
            prompt = base_prompt
            for j in example_idx:
                prompt += inject_example(X[j], y[j])
            prompt += inject_sample(text)

            # generation-based prediction
            inputs = tokenizer(prompt, return_tensors='pt').to(device)
            outputs = model.generate(**inputs, max_new_tokens=5, do_sample=False, eos_token_id=tokenizer.eos_token_id)
            # Decode only the generated tokens. Stripping the prompt by string prefix can
            # fail when decoding does not round-trip, and the prompt names every label.
            new_tokens = outputs[0][inputs['input_ids'].shape[1]:]
            gen_part = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
            y_gen.append(extract_label(gen_part, labels))

            # scoring-based prediction
            best_label, scores = choose_label_by_prob(model, tokenizer, prompt, labels, device=str(device))
            y_score.append(best_label)
    finally:
        # Release the weights before the caller loads the next model.
        del model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # metrics
    gen_acc = accuracy_score(y, y_gen)
    gen_f1 = f1_score(y, y_gen, average='macro', zero_division=0)
    score_acc = accuracy_score(y, y_score)
    score_f1 = f1_score(y, y_score, average='macro', zero_division=0)
    return { 'gen_acc': gen_acc, 'gen_f1': gen_f1 }, { 'score_acc': score_acc, 'score_f1': score_f1 }, y_gen, y_score


# Instruction header placed at the start of every prompt built by predict_and_eval().
prompt = """Classify the sentiment of the tweet.
Choose only one label: positive, negative, neutral.
Reply ONLY with the label. Do not explain.

"""
# Run a small comparison across small models (adjust samples and shots as needed)
models_to_test = ['facebook/opt-125m', 'facebook/opt-350m']
labels = ['positive', 'negative', 'neutral']
samples_for_test = 40
# Draw samples_for_test + 1 rows, then drop the first one from the evaluation arrays.
# NOTE(review): the dropped row (X_all[0]) is never passed to predict_and_eval, which
# takes its few-shot examples from the X it receives (X_test here) - confirm this is
# the intended source of the example.
X_all, y_all = load_train_dataset(ds_path, samples=samples_for_test + 1)
X_test = X_all[1:]
y_test = y_all[1:]
shots_n = 1

for m in models_to_test:
    print('Testing model', m)
    # Best-effort loop: a failure for one model is printed and the next model is tried.
    try:
        gen_metrics, score_metrics, y_gen, y_score = predict_and_eval(m, prompt, X_test, y_test, labels, shots_n=shots_n)
    except Exception as e:
        print('Error testing', m, e)
        continue
    print(f'Model: {m} | Generate -> acc: {gen_metrics["gen_acc"]:.3f}, f1: {gen_metrics["gen_f1"]:.3f} | Score -> acc: {score_metrics["score_acc"]:.3f}, f1: {score_metrics["score_f1"]:.3f}')

# End of experiment cell