In [11]:
import random
import re
from typing import Callable
from time import time

import numpy as np
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import os
import pickle

from scipy.stats import shapiro, f_oneway, kruskal

In [2]:

import kagglehub

# Download dataset from Kaggle
ds_path = kagglehub.dataset_download("abhi8923shriv/sentiment-analysis-dataset", path="train.csv")
print("Path to dataset files:", ds_path)
ds_path


Path to dataset files: /home/szymon/.cache/kagglehub/datasets/abhi8923shriv/sentiment-analysis-dataset/versions/9/train.csv


'/home/szymon/.cache/kagglehub/datasets/abhi8923shriv/sentiment-analysis-dataset/versions/9/train.csv'

### Text processing

In [3]:
def clean_text(text: str) -> str:
    """
    Clean and normalize tweet text.

    Args:
        text (str): Raw tweet text

    Returns:
        str: Cleaned text with URLs, HTML, mentions removed

    Example:
        >>> clean_text("@user I LOVE this! #amazing")
        'i love this! amazing'
    """
    if not text:
        return ""

    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#(\w+)', r'\1', text)
    text = re.sub(r'\s+', ' ', text).strip()

    return text


def load_train_dataset(
    path: str, encoding: str = "latin1", samples: int = 512,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Load and preprocess the sentiment dataset.

    Args:
        path (str): Path to the CSV file
        encoding (str): File encoding (default: 'latin1')
        samples (int): Number of samples to randomly select

    Returns:
        tuple: (X, y) where X is text array and y is label array

    Example:
        >>> X, y = load_train_dataset("data.csv", samples=100)
        >>> print(X[0], y[0])
        'i love this product' 'positive'
    """

    df = pd.read_csv(path, encoding=encoding)
    df = df.dropna()

    df['preprocessed_text'] = df['selected_text'].apply(clean_text)

    idxes = random.sample(range(df.shape[0]), samples)
    return df['preprocessed_text'].values[idxes], df['sentiment'].values[idxes]


### Prompt Engineering Functions for ICL

In [4]:
def inject_example(text: str, sentiment: str) -> str:
    return f"Tweet: {text}\nSentiment: {sentiment}\n\n"


def inject_sample(text: str) -> str:
    return (
        f"### Now classify the following:\n"
        f"Tweet: {text}\nSentiment:"
    )


### Label Extraction

In [5]:
def extract_label(result: str, labels: list[str]) -> str:

    for lab in labels:
        if lab in result.lower():
            return lab
    return "unknown"


### IN-CONTEXT LEARNING EXPERIMENT

In [6]:
def run_experiment(
    base_prompt: str, X: np.ndarray, y: np.ndarray, model_name: str,
) -> tuple[list, list]:
    """
    Run ICL classification experiment on a test set.

    This function:
    1. Loads the specified language model
    2. For each test sample, constructs a full prompt
    3. Generates a prediction using the model
    4. Extracts the predicted label
    5. Collects all predictions and ground truth labels

    Args:
        base_prompt (str): Base prompt with instructions and examples
        X (np.ndarray): Array of text samples to classify
        y (np.ndarray): Array of true labels
        model_name (str): HuggingFace model identifier (e.g., 'facebook/opt-1.3b')

    Returns:
        tuple[list, list]: (predicted_labels, true_labels)

    """
    labels = list(np.unique(y))

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, dtype=torch.float16, device_map="auto",
    )

    y_true = []
    y_pred = []

    for idx, x in enumerate(X):
        prompt = base_prompt + inject_sample(x)
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(
            **inputs,
            max_new_tokens=3,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
        )

        result = tokenizer.decode(outputs[0], skip_special_tokens=True)
        pred = result.split()[-1]
        pred = extract_label(pred, labels)

        y_true.append(y[idx])
        y_pred.append(pred)

    return y_pred, y_true


###  Evaluation Function

In [7]:
def evaluate(
    model: str, base_prompt: str, path: str, encoding: str = "latin1",
    samples: int = 512, shots_n: int = 3,
) -> tuple[float, float]:
    """
    Evaluate ICL performance with a specific number of examples (shots).

    Workflow:
    1. Load (samples + shots_n) data points
    2. Use first shots_n samples as examples in the prompt
    3. Use remaining samples as the test set
    4. Run classification and compute metrics

    Args:
        model (str): HuggingFace model name
        base_prompt (str): Base instruction prompt
        path (str): Path to dataset CSV
        encoding (str): File encoding
        samples (int): Number of test samples
        shots_n (int): Number of examples to include (0=zero-shot, 1=one-shot, etc.)

    Returns:
        tuple[float, float, float, float]: (accuracy, f1_score, precision, recall)

    """
    X, y = load_train_dataset(
        path, encoding=encoding, samples=samples + shots_n,
    )

    if shots_n > 0:
        base_prompt += "### Examples\n"

    for i in range(shots_n):
        base_prompt += inject_example(X[i], y[i])

    X, y = X[shots_n:], y[shots_n:]

    y_pred, y_true = run_experiment(base_prompt, X, y, model)

    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average="macro")
    precision = precision_score(y_true, y_pred, average="macro", zero_division=0)
    recall = recall_score(y_true, y_pred, average="macro", zero_division=0)

    return acc, f1, precision, recall, y_true, y_pred



# Eksperymnet ICL konfiguracja

In [None]:
prompt = """Classify the sentiment of the tweet.
Choose only one: positive, negative, neutral.
Reply ONLY with the label. Do not explain.

"""
shots = [0, 1, 3, 4, 20]
reps = 10
samples_n = 300
result_path = f"result_{round(time())}.csv"
model_name = "facebook/opt-1.3b"
safe_name = model_name.replace("/", "_")


df_prot = {
    "lp": [],
}
for shot in shots:
    df_prot[f"{shot}-shots f1"] = []
    df_prot[f"{shot}-shots acc"] = []
    df_prot[f"{shot}-shots precision"] = []
    df_prot[f"{shot}-shots recall"] = []


confusion_matrices = {shot: [] for shot in shots}

for i in range(reps):
    df_prot["lp"].append(i)
    for shot in shots:
        acc, f1, precision, recall, y_true, y_pred = evaluate(model_name, prompt, ds_path, shots_n=shot, samples=samples_n)
        df_prot[f"{shot}-shots f1"].append(f1)
        df_prot[f"{shot}-shots acc"].append(acc)
        df_prot[f"{shot}-shots precision"].append(precision)
        df_prot[f"{shot}-shots recall"].append(recall)

        cm = confusion_matrix(y_true, y_pred, labels=["positive", "negative", "neutral"])
        confusion_matrices[shot].append(cm)

df = pd.DataFrame(df_prot)
df.to_csv(result_path)

with open(f"confusion_matrices_ICL_{safe_name}_{round(time())}.pkl", "wb") as f:
    pickle.dump(confusion_matrices, f)


KeyboardInterrupt: 

# Model size comparasion (with 3-shot)

In [8]:
prompt = """Classify the sentiment of the tweet.
Choose only one: positive, negative, neutral.
Reply ONLY with the label. Do not explain.

"""
shots = 3
reps = 10
samples_n = 300
result_path = f"../res/models/result_{round(time())}.csv"

if not os.path.exists("../res/models/"):
    os.makedirs("../res/models/")

models = [
    "facebook/opt-125m",
    "facebook/opt-350m",
    "facebook/opt-1.3b",
    "facebook/opt-2.7b",
    # "facebook/opt-6.7b",   # Out of memory on 8GB GPU
]
safe_name = "model_size_clf"


df_prot = {
    "lp": [],
}
for model in models:
    df_prot[f"{model} f1"] = []
    df_prot[f"{model} acc"] = []
    df_prot[f"{model} precision"] = []
    df_prot[f"{model} recall"] = []


confusion_matrices = {model: [] for model in models}

with torch.no_grad():
    torch.cuda.empty_cache()
    
for i in range(reps):
    df_prot["lp"].append(i)
    for model in models:
        acc, f1, precision, recall, y_true, y_pred = evaluate(model, prompt, ds_path, shots_n=shots, samples=samples_n)
        df_prot[f"{model} f1"].append(f1)
        df_prot[f"{model} acc"].append(acc)
        df_prot[f"{model} precision"].append(precision)
        df_prot[f"{model} recall"].append(recall)

        cm = confusion_matrix(y_true, y_pred, labels=["positive", "negative", "neutral"])
        confusion_matrices[model].append(cm)
        
        # Clearn GPU memory after each model evaluation, CUDA memory leakage!
        print("Done model: ", model)
        with torch.no_grad():
            torch.cuda.empty_cache()
    
    print(f"Completed repetition {i+1}/{reps}")

df = pd.DataFrame(df_prot)
df.to_csv(result_path)

with open(f"confusion_matrices_ICL_{safe_name}_{round(time())}.pkl", "wb") as f:
    pickle.dump(confusion_matrices, f)


Done model:  facebook/opt-125m
Done model:  facebook/opt-350m
Done model:  facebook/opt-1.3b
Done model:  facebook/opt-2.7b
Completed repetition 1/10
Done model:  facebook/opt-125m
Done model:  facebook/opt-350m
Done model:  facebook/opt-1.3b
Done model:  facebook/opt-2.7b
Completed repetition 2/10
Done model:  facebook/opt-125m
Done model:  facebook/opt-350m
Done model:  facebook/opt-1.3b
Done model:  facebook/opt-2.7b
Completed repetition 3/10
Done model:  facebook/opt-125m
Done model:  facebook/opt-350m
Done model:  facebook/opt-1.3b
Done model:  facebook/opt-2.7b
Completed repetition 4/10
Done model:  facebook/opt-125m
Done model:  facebook/opt-350m
Done model:  facebook/opt-1.3b
Done model:  facebook/opt-2.7b
Completed repetition 5/10
Done model:  facebook/opt-125m
Done model:  facebook/opt-350m
Done model:  facebook/opt-1.3b
Done model:  facebook/opt-2.7b
Completed repetition 6/10
Done model:  facebook/opt-125m
Done model:  facebook/opt-350m
Done model:  facebook/opt-1.3b
Done mo

## Wizualizacja macierzy pomyłek - TODO

In [10]:
with open("confusion_matrices_XXX.pkl", "rb") as f:
    confusion_matrices = pickle.load(f)

FileNotFoundError: [Errno 2] No such file or directory: 'confusion_matrices_XXX.pkl'

## Analiza statystyczna - ANOVA + post-hoc


?? dla porónania klasyfikacji  w zależności od różnej ilości przykładów w prompcie to do analizy statystycznej tylko F1?

In [15]:
def stat_test(result_path: str, variants: list[str], formaterr: Callable[[str], str]) -> None:
    # load data
    df_results = pd.read_csv(result_path)

    # only for F1
    print("Test normalności (Shapiro-Wilk)")
    normality_results = {}

    for var in variants:
        f1_values = df_results[formaterr(var)].values
        stat, p = shapiro(f1_values)
        is_normal = "Normalny" if p > 0.05 else "Nienormalny"
        normality_results[var] = p > 0.05
        print(f"{var}-shot: stat={stat:.6f}, p={p:.6f}, {is_normal}")
    all_normal = all(normality_results.values())

    print("Test porównawczy - ANOVA/Kruskal-Wallis")

    groups = [df_results[formaterr(var)].values for var in variants]

    if all_normal:
        f_stat, p_value = f_oneway(*groups)
        test_name = "ANOVA"
    else:
        f_stat, p_value = kruskal(*groups)
        test_name = "Kruskal-Wallis"

    print(f"Test: {test_name}")
    print(f"Statystyka: {f_stat:.4f}")
    print(f"p-value: {p_value:.8f}")

    if p_value < 0.05:
        print("Istnieje wpływ na F1 (p < 0.05)")
    else:
        print("Brak wpływu na F1 (p >= 0.05)")

    # TODO post-hoc


print(f"{'n-shots testing':=^50}")
stat_test("../res/shots/results/result_1764935332.csv", [0, 1, 3, 4, 20], lambda var: f"{var}-shots f1")

print(f"{'models size testing':=^50}")
stat_test("../res/models/result_1764938794.csv", models, lambda var: f"{var} f1")

Test normalności (Shapiro-Wilk)
0-shot: stat=0.912754, p=0.300448, Normalny
1-shot: stat=0.803410, p=0.015949, Nienormalny
3-shot: stat=0.909448, p=0.277222, Normalny
4-shot: stat=0.950597, p=0.675571, Normalny
20-shot: stat=0.864116, p=0.085312, Normalny
Test porównawczy - ANOVA/Kruskal-Wallis
Test: Kruskal-Wallis
Statystyka: 18.5939
p-value: 0.00094427
Istnieje wpływ na F1 (p < 0.05)
Test normalności (Shapiro-Wilk)
facebook/opt-125m-shot: stat=0.945496, p=0.615622, Normalny
facebook/opt-350m-shot: stat=0.944266, p=0.601376, Normalny
facebook/opt-1.3b-shot: stat=0.903584, p=0.239759, Normalny
facebook/opt-2.7b-shot: stat=0.889719, p=0.168344, Normalny
Test porównawczy - ANOVA/Kruskal-Wallis
Test: ANOVA
Statystyka: 8.1898
p-value: 0.00027657
Istnieje wpływ na F1 (p < 0.05)
