### Information Retrieval

In [1]:
import pandas as pd
import numpy as np
from math import log2

In [2]:
# Input/output files for the IR evaluation section.
# The system-results constant names ttdssystemresults.csv, the file the
# recorded evaluation run was produced from.
INPUT_SYSTEM_RESULTS = "ttdssystemresults.csv"
INPUT_QRELS = "qrels.csv"
OUTPUT_EVAL = "ir_eval.csv"

In [3]:
def precision_at_k(ranked_docs, rel_set, k):
    """Precision at cutoff ``k``: relevant documents in the top ``k``, divided by ``k``.

    Args:
        ranked_docs: Document ids in rank order (best first).
        rel_set: Collection of relevant document ids (membership is tested with ``in``).
        k: Rank cutoff.

    Returns:
        A float in [0, 1]; 0.0 when ``k <= 0`` or nothing was retrieved.
    """
    if k <= 0:
        return 0.0
    hits = sum(1 for doc in ranked_docs[:k] if doc in rel_set)
    # Divide by k, not by the number of documents actually returned: a system
    # that retrieves fewer than k documents must not get an inflated score.
    return hits / k


In [4]:
def recall_at_k(ranked_docs, rel_set, k):
    """Recall at cutoff ``k``: fraction of the relevant documents found in the top ``k``.

    Args:
        ranked_docs: Document ids in rank order (best first).
        rel_set: Collection of relevant document ids.
        k: Rank cutoff.

    Returns:
        A float; 0.0 when there are no relevant documents or ``k <= 0``.
    """
    if not rel_set:
        return 0.0
    # Guard the cutoff like precision_at_k does: a negative k would otherwise
    # slice from the end of the list and count documents it should not.
    if k <= 0:
        return 0.0
    hits = sum(1 for doc in ranked_docs[:k] if doc in rel_set)
    return hits / len(rel_set)

In [5]:
def r_precision(ranked_docs, rel_set):
    """R-precision: precision at rank R, where R is the number of relevant documents.

    Args:
        ranked_docs: Document ids in rank order (best first).
        rel_set: Collection of relevant document ids.

    Returns:
        Relevant documents among the top R divided by R; 0.0 when R is 0.
    """
    R = len(rel_set)
    if R == 0:
        return 0.0
    hits = sum(1 for doc in ranked_docs[:R] if doc in rel_set)
    # The denominator is always R, even when fewer than R documents were
    # retrieved; dividing by the retrieved count would overstate the score.
    return hits / R

In [6]:
def average_precision(ranked_docs, rel_set):
    """Average precision of one ranking.

    Sums the precision measured at the rank of every relevant document that
    appears in ``ranked_docs`` and divides by the total number of relevant
    documents, so relevant documents that were never retrieved contribute 0.
    Returns 0.0 when ``rel_set`` is empty.
    """
    total_relevant = len(rel_set)
    if not total_relevant:
        return 0.0

    hits = 0
    precision_sum = 0.0
    for index, doc_id in enumerate(ranked_docs):
        if doc_id not in rel_set:
            continue
        hits += 1
        # index is 0-based, ranks are 1-based
        precision_sum += hits / (index + 1)
    return precision_sum / total_relevant

In [7]:
def dcg_at_k(ranked_docs, rel_grades, k):
    """Discounted cumulative gain over the first ``k`` ranked documents.

    ``rel_grades`` maps document id to graded relevance; ids missing from it
    count as grade 0. The document at rank 1 contributes its grade unchanged,
    every later rank ``i`` contributes ``grade / log2(i)``.
    """
    total = 0.0
    rank = 0
    for doc_id in ranked_docs[:k]:
        rank += 1
        gain = rel_grades.get(doc_id, 0)
        total += gain if rank == 1 else gain / log2(rank)
    return total

In [8]:
def ndcg_at_k(ranked_docs, rel_grades, k):
    """Normalised DCG at cutoff ``k``.

    The system DCG is divided by the DCG of the ideal ranking, i.e. the judged
    documents ordered by decreasing grade. Returns 0.0 when there are no
    judgements or when the ideal DCG is 0.
    """
    if not rel_grades:
        return 0.0

    system_dcg = dcg_at_k(ranked_docs, rel_grades, k)

    # Ideal ordering: judged documents from highest to lowest grade.
    by_grade_desc = sorted(rel_grades.items(), key=lambda pair: pair[1], reverse=True)
    best_possible = dcg_at_k([doc_id for doc_id, _grade in by_grade_desc], rel_grades, k)

    return 0.0 if best_possible == 0 else system_dcg / best_possible

In [9]:
def evaluate_systems(system_results_path, qrels_path, output_path):
    """Score every (system, query) ranking against the qrels and write a CSV.

    Args:
        system_results_path: CSV with columns ``system_number``,
            ``query_number``, ``doc_number`` and ``rank_of_doc``.
        qrels_path: CSV with columns ``query_id``, ``doc_id`` and ``relevance``.
        output_path: Destination CSV; floats are written with 3 decimals.

    Returns:
        DataFrame (unrounded) with one row per (system, query) followed by a
        ``"mean"`` row per system, with columns ``system_number``,
        ``query_number``, ``P@10``, ``R@50``, ``r-precision``, ``AP``,
        ``nDCG@10``, ``nDCG@20``.
    """
    id_columns = ["system_number", "query_number"]
    metric_columns = ["P@10", "R@50", "r-precision", "AP", "nDCG@10", "nDCG@20"]

    # Read the files the caller asked for; hard-coded file names here would
    # silently ignore the function's own arguments.
    system_results = pd.read_csv(system_results_path)
    qrels = pd.read_csv(qrels_path)

    # Per query: the binary relevant set (relevance > 0) and the graded judgements.
    qrels_binary = {}
    qrels_graded = {}
    for q, sub in qrels.groupby("query_id"):
        qrels_binary[q] = set(sub.loc[sub["relevance"] > 0, "doc_id"])
        qrels_graded[q] = dict(zip(sub["doc_id"], sub["relevance"]))

    rows = []

    for system_id, sys_group in system_results.groupby("system_number"):
        system_rows = []

        for query_id, q_group in sys_group.groupby("query_number"):
            ranked_docs = q_group.sort_values("rank_of_doc")["doc_number"].tolist()

            # Queries without judgements score against empty relevance data.
            rel_set = qrels_binary.get(query_id, set())
            rel_grades = qrels_graded.get(query_id, {})

            # One row per (system, query)
            system_rows.append(
                {
                    "system_number": system_id,
                    "query_number": query_id,
                    "P@10": precision_at_k(ranked_docs, rel_set, 10),
                    "R@50": recall_at_k(ranked_docs, rel_set, 50),
                    "r-precision": r_precision(ranked_docs, rel_set),
                    "AP": average_precision(ranked_docs, rel_set),
                    "nDCG@10": ndcg_at_k(ranked_docs, rel_grades, 10),
                    "nDCG@20": ndcg_at_k(ranked_docs, rel_grades, 20),
                }
            )

        rows.extend(system_rows)

        # Mean row per system across its queries
        if system_rows:
            metrics_array = np.array(
                [[row[metric] for metric in metric_columns] for row in system_rows]
            )
            means = metrics_array.mean(axis=0)
            mean_row = {"system_number": system_id, "query_number": "mean"}
            mean_row.update(zip(metric_columns, means))
            rows.append(mean_row)

    # Passing the columns fixes the required order and keeps an empty result
    # well-formed instead of failing on column selection.
    out_df = pd.DataFrame(rows, columns=id_columns + metric_columns)

    # Sort: for each system_number, numeric queries ascending, then "mean"
    def query_sort_key(q):
        return 9999 if isinstance(q, str) else int(q)

    out_df["query_sort"] = out_df["query_number"].apply(query_sort_key)
    out_df = out_df.sort_values(["system_number", "query_sort"]).drop(columns=["query_sort"])

    # Write CSV with values rounded to 3 decimal places
    out_df.to_csv(output_path, index=False, float_format="%.3f")

    return out_df


# Run the evaluation with the configured paths; df_eval keeps the returned
# table and the message reports where the CSV was written.
if __name__ == "__main__":
    df_eval = evaluate_systems(INPUT_SYSTEM_RESULTS, INPUT_QRELS, OUTPUT_EVAL)
    print(f"Saved evaluation results to {OUTPUT_EVAL}")

Saved evaluation results to ir_eval.csv


In [10]:
# Preview the table computed by the previous cell. Re-running
# evaluate_systems here would redo all the work and rewrite the output CSV
# just to print the first rows.
if __name__ == "__main__":
    print(df_eval.head(12))

Saved evaluation results to ir_eval.csv
    system_number query_number  P@10      R@50  r-precision        AP  \
0               1            1  0.40  0.666667     0.166667  0.244022   
1               1            2  0.30  1.000000     0.250000  0.405211   
2               1            3  0.00  1.000000     0.000000  0.050000   
3               1            4  0.60  0.875000     0.700000  0.625316   
4               1            5  0.20  0.428571     0.285714  0.119048   
5               1            6  0.70  1.000000     0.750000  0.692321   
6               1            7  0.20  0.666667     0.333333  0.233333   
7               1            8  0.60  1.000000     0.625000  0.704088   
8               1            9  0.90  0.900000     0.900000  0.850429   
9               1           10  0.00  0.800000     0.000000  0.078333   
10              1         mean  0.39  0.833690     0.401071  0.400210   
11              2            1  0.10  0.666667     0.000000  0.088808   

     nDCG@

### Text Analysis

In [11]:
import re
import nltk
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import mutual_info_classif, chi2
from sklearn.decomposition import LatentDirichletAllocation

In [12]:
DATA_PATH = "bible_and_quran.tsv"   # tab-separated file, read below as (corpus, text) rows
STOPWORDS_PATH = "stopwords.txt"    # stopword list, one word per line
N_TOPICS = 20  # number of LDA topics fitted below

In [13]:
def load_stopwords(path: str) -> set:
    """Read a UTF-8 stopword file (one word per line) into a set.

    Each line is stripped of surrounding whitespace and lower-cased; lines
    that are empty after stripping are ignored.
    """
    with open(path, encoding="utf-8") as handle:
        normalised = (raw_line.strip().lower() for raw_line in handle)
        return {word for word in normalised if word}


# Module-level stopword set; preprocess() below filters tokens against it.
stop_words = load_stopwords(STOPWORDS_PATH)
print(f"Loaded {len(stop_words)} custom stopwords from {STOPWORDS_PATH}")

Loaded 570 custom stopwords from stopwords.txt


In [14]:
# The file has no header row: column 0 is the corpus label, column 1 the verse text.
df = pd.read_csv(DATA_PATH, sep="\t", header=None, names=["corpus", "text"])
print("Loaded data shape:", df.shape)
print("Example rows:")
print(df.head(), "\n")

Loaded data shape: (30367, 2)
Example rows:
  corpus                                               text
0     OT  In the beginning God created the heavens and t...
1     OT  The earth was without form, and void; and dark...
2     OT  Then God said, "Let there be light"; and there...
3     OT  And God saw the light, that it was good; and G...
4     OT  God called the light Day, and the darkness He ... 



In [15]:
stemmer = PorterStemmer()

def preprocess(text: str) -> str:
    """Normalise one verse into a space-joined string of stemmed tokens.

    Steps: lower-case, blank out digit runs and every character that is not
    a-z or whitespace, split on whitespace, drop stopwords and one-character
    tokens, then Porter-stem what remains.
    """
    lowered = str(text).lower()
    without_digits = re.sub(r"\d+", " ", lowered)
    letters_only = re.sub(r"[^a-z\s]", " ", without_digits)

    # Stopwords are matched on the unstemmed token, before stemming.
    kept_tokens = [
        tok for tok in letters_only.split()
        if len(tok) > 1 and tok not in stop_words
    ]
    return " ".join(stemmer.stem(tok) for tok in kept_tokens)


# Add the preprocessed text as a new "clean" column next to the raw verse.
df["clean"] = df["text"].apply(preprocess)

print("After preprocessing, example cleaned verses:")
print(df[["corpus", "text", "clean"]].head(), "\n")

After preprocessing, example cleaned verses:
  corpus                                               text  \
0     OT  In the beginning God created the heavens and t...   
1     OT  The earth was without form, and void; and dark...   
2     OT  Then God said, "Let there be light"; and there...   
3     OT  And God saw the light, that it was good; and G...   
4     OT  God called the light Day, and the darkness He ...   

                                               clean  
0                       begin god creat heaven earth  
1  earth form void dark face deep spirit god hove...  
2                                    god light light  
3                god light good god divid light dark  
4   god call light day dark call night even morn day   



In [16]:
# Bag-of-words counts over the cleaned verses; X is the document-term matrix
# and feature_names[j] is the token for column j.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df["clean"])
feature_names = np.array(vectorizer.get_feature_names_out())

print("Document–term matrix shape:", X.shape)
print("Number of unique tokens:", len(feature_names), "\n")

Document–term matrix shape: (30367, 8765)
Number of unique tokens: 8765 



In [17]:
# For every corpus, score each token by mutual information and chi-square
# against a one-vs-rest label, and save both rankings (highest score first).
corpus_labels = sorted(df["corpus"].unique())
print("Detected corpus labels:", corpus_labels, "\n")

for corpus_label in corpus_labels:
    print(f"=== Computing MI and χ² for corpus: {corpus_label} ===")

    # Binary target: this corpus vs all others
    y_binary = (df["corpus"] == corpus_label).astype(int).values

    # Mutual Information (token counts are passed as discrete features)
    mi_scores = mutual_info_classif(
        X, y_binary, discrete_features=True, random_state=0
    )

    mi_df = pd.DataFrame({
        "token": feature_names,
        "score": mi_scores
    }).sort_values("score", ascending=False)

    # Chi-square (the second return value is discarded)
    chi2_scores, _ = chi2(X, y_binary)

    chi2_df = pd.DataFrame({
        "token": feature_names,
        "score": chi2_scores
    }).sort_values("score", ascending=False)

    # Save to CSV as required: token,score
    # File names use the lower-cased label with spaces replaced by underscores.
    safe_label = corpus_label.lower().replace(" ", "_")
    mi_filename = f"mi_{safe_label}.csv"
    chi2_filename = f"chi2_{safe_label}.csv"

    mi_df.to_csv(mi_filename, index=False)
    chi2_df.to_csv(chi2_filename, index=False)

    print(f"Saved MI scores to   {mi_filename}")
    print(f"Saved χ² scores to   {chi2_filename}\n")

print("MI/χ² computation finished.\n")


Detected corpus labels: ['NT', 'OT', 'Quran'] 

=== Computing MI and χ² for corpus: NT ===
Saved MI scores to   mi_nt.csv
Saved χ² scores to   chi2_nt.csv

=== Computing MI and χ² for corpus: OT ===
Saved MI scores to   mi_ot.csv
Saved χ² scores to   chi2_ot.csv

=== Computing MI and χ² for corpus: Quran ===
Saved MI scores to   mi_quran.csv
Saved χ² scores to   chi2_quran.csv

MI/χ² computation finished.



In [18]:
# Fit one LDA model on the whole document-term matrix; the seed is fixed so
# topic indices are repeatable between runs.
print(f"Fitting LDA with {N_TOPICS} topics...")
lda = LatentDirichletAllocation(
    n_components=N_TOPICS,
    random_state=0,
    learning_method="batch"
)

# One row per document, one column per topic.
doc_topic_probs = lda.fit_transform(X)
print("LDA fitted.\n")

Fitting LDA with 20 topics...
LDA fitted.



In [19]:
# Per corpus: the mean topic distribution of its documents and the index of
# the topic with the highest mean.
topic_avgs = {}
top_topic_index = {}

for corpus_label in corpus_labels:
    mask = (df["corpus"] == corpus_label).values
    avg = doc_topic_probs[mask].mean(axis=0)  # average over docs
    topic_avgs[corpus_label] = avg
    best_topic = int(np.argmax(avg))
    top_topic_index[corpus_label] = best_topic

    print(f"Corpus: {corpus_label}")
    print("  First few average topic scores:", np.round(avg[:5], 4))
    print(f"  Top topic index for this corpus: {best_topic}\n")

Corpus: NT
  First few average topic scores: [0.0266 0.0483 0.0281 0.0309 0.0449]
  Top topic index for this corpus: 5

Corpus: OT
  First few average topic scores: [0.0496 0.0392 0.0837 0.0561 0.0464]
  Top topic index for this corpus: 19

Corpus: Quran
  First few average topic scores: [0.0216 0.0163 0.0207 0.0164 0.0695]
  Top topic index for this corpus: 8



In [20]:
def top_tokens_for_topic(topic_idx: int, n: int = 10):
    """Return the ``n`` highest-weighted tokens of one LDA topic.

    Reads the fitted ``lda`` model and ``feature_names`` from module scope.
    Returns ``(tokens, weights)``, two aligned arrays ordered by decreasing
    weight.
    """
    component = lda.components_[topic_idx]
    ascending_order = component.argsort()
    # Reverse to get heaviest-first, then keep the first n positions.
    best_positions = ascending_order[::-1][:n]
    return feature_names[best_positions], component[best_positions]


# Print and save the top 10 tokens of each corpus's dominant topic.
for corpus_label in corpus_labels:
    t_idx = top_topic_index[corpus_label]
    tokens, weights = top_tokens_for_topic(t_idx, n=10)

    print(f"=== Corpus: {corpus_label} — Dominant Topic {t_idx} ===")
    for tok, w in zip(tokens, weights):
        print(f"{tok:15s}  {w:.4f}")
    print()

    # Save to CSV for report tables
    safe_label = corpus_label.lower().replace(" ", "_")
    out_df = pd.DataFrame({"token": tokens, "weight": weights})
    out_df.to_csv(f"lda_top_topic_tokens_{safe_label}.csv", index=False)

print("LDA topic analysis complete.")
print("Top-topic tokens per corpus saved as lda_top_topic_tokens_<corpus>.csv")

=== Corpus: NT — Dominant Topic 5 ===
jesu             881.0184
thing            811.7532
god              605.4264
christ           501.3980
spirit           299.6900
faith            265.0224
work             236.4003
man              187.6697
discipl          180.6544
receiv           171.1061

=== Corpus: OT — Dominant Topic 19 ===
lord             1799.4951
god              880.4480
word             807.6088
hear             604.6053
sin              576.0461
command          563.7937
nt               408.1103
heart            400.7994
law              394.7199
israel           388.7234

=== Corpus: Quran — Dominant Topic 8 ===
god              3084.6533
lord             915.4582
peopl            523.8709
fear             494.8636
merci            453.4935
messeng          444.2564
worship          398.6101
believ           397.0213
forgiv           248.3855
seek             243.0494

LDA topic analysis complete.
Top-topic tokens per corpus saved as lda_top_topic_tokens_<corpus>.c

### Text Classification

In [21]:
import csv
import string
import scipy.sparse

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

In [22]:
import scipy.sparse

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

In [23]:
train_path = "train.txt"                # labelled tweets, split into train/dev below
test_path  = "ttds_2025_cw2_test.txt"   # labelled test tweets

# train/dev data
# Headerless TSV with (id, sentiment, tweet) columns. QUOTE_NONE keeps quote
# characters inside tweets literal.
# NOTE(review): on_bad_lines="skip" drops malformed lines without reporting
# them — confirm the resulting row counts are the expected ones.
train_df = pd.read_csv(
    train_path,
    sep="\t",
    header=None,
    names=["id", "sentiment", "tweet"],
    quoting=csv.QUOTE_NONE,
    on_bad_lines="skip"
)

# keep only the 3 sentiment labels
train_df = train_df[train_df["sentiment"].isin(["positive", "negative", "neutral"])]

print("Loaded train/dev data:", train_df.shape)
print(train_df["sentiment"].value_counts(), "\n")

# labelled test data (same format and filtering as the train/dev file)
test_df = pd.read_csv(
    test_path,
    sep="\t",
    header=None,
    names=["id", "sentiment", "tweet"],
    quoting=csv.QUOTE_NONE,
    on_bad_lines="skip"
)

test_df = test_df[test_df["sentiment"].isin(["positive", "negative", "neutral"])]

print("Loaded test data:", test_df.shape)
print(test_df["sentiment"].value_counts(), "\n")

Loaded train/dev data: (18646, 3)
sentiment
neutral     8789
positive    5979
negative    3878
Name: count, dtype: int64 

Loaded test data: (4662, 3)
sentiment
neutral     2197
positive    1495
negative     970
Name: count, dtype: int64 



In [24]:
# Stratified split so train and dev keep the same sentiment proportions;
# the fixed seed makes the split repeatable.
train_split_df, dev_split_df = train_test_split(
    train_df,
    test_size=0.1,             # 90% train, 10% dev
    random_state=42,
    stratify=train_df["sentiment"]
)

print("Train split size:", len(train_split_df))
print("Dev split size:  ", len(dev_split_df), "\n")

Train split size: 16781
Dev split size:   1865 



In [25]:
punct_re = re.compile(f"[{re.escape(string.punctuation)}]")

def tokenize(text: str):
    """Split a tweet into lower-cased tokens.

    Every ASCII punctuation character is replaced by a space first, so
    punctuation both disappears and acts as a token boundary. Non-string
    input is converted with ``str()``.
    """
    without_punct = punct_re.sub(" ", str(text))
    return without_punct.lower().split()

# tokenised docs: one token list per tweet, in the row order of each frame
train_tokens = [tokenize(t) for t in train_split_df["tweet"]]
dev_tokens   = [tokenize(t) for t in dev_split_df["tweet"]]
test_tokens  = [tokenize(t) for t in test_df["tweet"]]


In [26]:
def build_vocab(docs_tokens):
    """Map every distinct token to a column index.

    Indices follow the sorted order of the tokens, so the mapping does not
    depend on the order in which documents are seen.
    """
    distinct_tokens = {token for tokens in docs_tokens for token in tokens}
    ordered_tokens = sorted(distinct_tokens)
    return dict(zip(ordered_tokens, range(len(ordered_tokens))))

# The vocabulary comes from the training split only.
word2id = build_vocab(train_tokens)
vocab_size = len(word2id)
print("Vocabulary size:", vocab_size)

def convert_to_bow_matrix(preprocessed_docs, word2id):
    """Build a sparse bag-of-words count matrix.

    Args:
        preprocessed_docs: Sequence of token lists, one per document.
        word2id: Mapping from token to column index.

    Returns:
        CSR matrix of dtype int32 with shape ``(n_docs, len(word2id) + 1)``.
        The extra last column counts tokens that are not in ``word2id``.
    """
    n_docs = len(preprocessed_docs)
    oov_index = len(word2id)

    # Collect one (row, column) pair per token occurrence and build the matrix
    # in a single step; updating a DOK matrix entry by entry is much slower.
    row_ids = []
    col_ids = []
    for doc_id, doc in enumerate(preprocessed_docs):
        for word in doc:
            row_ids.append(doc_id)
            col_ids.append(word2id.get(word, oov_index))

    occurrences = np.ones(len(row_ids), dtype=np.int32)
    X_coo = scipy.sparse.coo_matrix(
        (
            occurrences,
            (np.asarray(row_ids, dtype=np.int64), np.asarray(col_ids, dtype=np.int64)),
        ),
        shape=(n_docs, oov_index + 1),
        dtype=np.int32,
    )
    # Converting to CSR sums repeated (row, column) pairs into counts.
    return X_coo.tocsr()

# All three splits share the training vocabulary, so their columns line up.
X_train = convert_to_bow_matrix(train_tokens, word2id)
X_dev   = convert_to_bow_matrix(dev_tokens,   word2id)
X_test  = convert_to_bow_matrix(test_tokens,  word2id)

print("Feature matrix shapes:")
print("  X_train:", X_train.shape)
print("  X_dev:  ", X_dev.shape)
print("  X_test: ", X_test.shape, "\n")

Vocabulary size: 37127
Feature matrix shapes:
  X_train: (16781, 37128)
  X_dev:   (1865, 37128)
  X_test:  (4662, 37128) 



In [27]:
# Integer class ids for the three sentiment labels, plus the reverse lookup.
sentiment2id = {"positive": 0, "negative": 1, "neutral": 2}
id2sentiment = {v: k for k, v in sentiment2id.items()}

# Label vectors aligned row-for-row with the corresponding data frames.
y_train = train_split_df["sentiment"].map(sentiment2id).to_numpy()
y_dev   = dev_split_df["sentiment"].map(sentiment2id).to_numpy()
y_test  = test_df["sentiment"].map(sentiment2id).to_numpy()


In [28]:
def compute_scores(y_true, y_pred):
    """Per-class and macro-averaged precision, recall and F1.

    Classes are scored in the fixed order positive, negative, neutral; the
    macro values are the unweighted means of the three per-class values. A
    class with no predictions or no true instances scores 0.

    Returns:
        Dict keyed ``p-/r-/f-`` + ``pos|neg|neu|macro``.
    """
    class_ids = [sentiment2id[name] for name in ("positive", "negative", "neutral")]
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, labels=class_ids, zero_division=0
    )

    pos, neg, neu = 0, 1, 2  # positions within class_ids
    return {
        "p-pos": precision[pos], "r-pos": recall[pos], "f-pos": f1[pos],
        "p-neg": precision[neg], "r-neg": recall[neg], "f-neg": f1[neg],
        "p-neu": precision[neu], "r-neu": recall[neu], "f-neu": f1[neu],
        "p-macro": precision.mean(),
        "r-macro": recall.mean(),
        "f-macro": f1.mean(),
    }

In [29]:
# Baseline: linear-kernel SVC on raw bag-of-words counts, fitted on the train
# split and used to predict all three splits.
print("================================")
print(" BASELINE MODEL: SVC (BOW)     ")
print("================================")

baseline_clf = SVC(C=1000, kernel="linear")

baseline_clf.fit(X_train, y_train)

# Train predictions are kept because the results table written later reports
# the train split as well.
y_train_pred_baseline = baseline_clf.predict(X_train)
y_dev_pred_baseline   = baseline_clf.predict(X_dev)
y_test_pred_baseline  = baseline_clf.predict(X_test)

baseline_dev_scores = compute_scores(y_dev, y_dev_pred_baseline)
baseline_test_scores = compute_scores(y_test, y_test_pred_baseline)

print("Baseline dev macro F1:",
      f"{baseline_dev_scores['f-macro']:.4f}")
print("Baseline test macro F1:",
      f"{baseline_test_scores['f-macro']:.4f}", "\n")

 BASELINE MODEL: SVC (BOW)     
Baseline dev macro F1: 0.5603
Baseline test macro F1: 0.5583 



In [30]:
def train_and_eval_model(name, clf, X_train, y_train, X_dev, y_dev, X_test, y_test,
                         use_dense=False):
    """Fit ``clf`` on the train split and evaluate it on dev and test.

    Args:
        name: Label printed in the log and stored in the result.
        clf: Estimator exposing ``fit`` and ``predict``.
        X_train, X_dev, X_test: Feature matrices for the three splits.
        y_train, y_dev, y_test: Matching label vectors.
        use_dense: When true, each matrix is converted with ``.toarray()``
            before it is handed to the estimator.

    Returns:
        Dict with the name, the fitted estimator, the predictions for all
        three splits and the dev/test score dicts from ``compute_scores``.
    """
    splits = [X_train, X_dev, X_test]
    if use_dense:
        splits = [matrix.toarray() for matrix in splits]
    train_input, dev_input, test_input = splits

    print(f"----- {name} -----")
    clf.fit(train_input, y_train)

    train_pred = clf.predict(train_input)
    dev_pred = clf.predict(dev_input)
    test_pred = clf.predict(test_input)

    dev_scores = compute_scores(y_dev, dev_pred)
    test_scores = compute_scores(y_test, test_pred)

    print("Dev macro F1: ", f"{dev_scores['f-macro']:.4f}")
    print("Test macro F1:", f"{test_scores['f-macro']:.4f}", "\n")

    result = dict(name=name, clf=clf)
    result["y_train_pred"] = train_pred
    result["y_dev_pred"] = dev_pred
    result["y_test_pred"] = test_pred
    result["dev_scores"] = dev_scores
    result["test_scores"] = test_scores
    return result

# Train four alternative classifiers on the same bag-of-words features and
# collect their results for model selection.
improved_candidates = []

# 1) Logistic Regression
logreg = LogisticRegression(max_iter=2000, n_jobs=-1)
improved_candidates.append(
    train_and_eval_model(
        "LogisticRegression (BOW)",
        logreg,
        X_train, y_train,
        X_dev,   y_dev,
        X_test,  y_test,
        use_dense=False
    )
)

# 2) Random Forest
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)
improved_candidates.append(
    train_and_eval_model(
        "RandomForestClassifier (BOW)",
        rf,
        X_train, y_train,
        X_dev,   y_dev,
        X_test,  y_test,
        use_dense=True  # NOTE(review): densifies every split, which is memory-heavy; confirm the forest really needs dense input
    )
)

# 3) Multinomial Naive Bayes
mnb = MultinomialNB()
improved_candidates.append(
    train_and_eval_model(
        "MultinomialNB (BOW)",
        mnb,
        X_train, y_train,
        X_dev,   y_dev,
        X_test,  y_test,
        use_dense=False
    )
)

# 4) LinearSVC
linsvm = LinearSVC(C=1.0)
improved_candidates.append(
    train_and_eval_model(
        "LinearSVC (BOW)",
        linsvm,
        X_train, y_train,
        X_dev,   y_dev,
        X_test,  y_test,
        use_dense=False
    )
)

----- LogisticRegression (BOW) -----
Dev macro F1:  0.6147
Test macro F1: 0.6094 

----- RandomForestClassifier (BOW) -----
Dev macro F1:  0.4843
Test macro F1: 0.4783 

----- MultinomialNB (BOW) -----
Dev macro F1:  0.5926
Test macro F1: 0.5986 

----- LinearSVC (BOW) -----
Dev macro F1:  0.5957
Test macro F1: 0.5810 



In [31]:

# Pick the candidate with the highest dev-set macro F1. The comparison is
# strict, so on a tie the earlier candidate in the list wins.
best_candidate = None
best_f_macro = -1.0

for cand in improved_candidates:
    f_macro = cand["dev_scores"]["f-macro"]
    if f_macro > best_f_macro:
        best_f_macro = f_macro
        best_candidate = cand

print("=============================================")
print(" BEST IMPROVED MODEL (among 4 candidates)")
print(" Model:       ", best_candidate["name"])
print(" Dev macro F1:", f"{best_candidate['dev_scores']['f-macro']:.4f}")
print(" Test macro F1:",
      f"{best_candidate['test_scores']['f-macro']:.4f}")
print("=============================================\n")

# Expose the winner's estimator and predictions under the "improved" names
# used when the results table is written.
improved_name = best_candidate["name"]
improved_clf = best_candidate["clf"]
y_train_pred_improved = best_candidate["y_train_pred"]
y_dev_pred_improved   = best_candidate["y_dev_pred"]
y_test_pred_improved  = best_candidate["y_test_pred"]

 BEST IMPROVED MODEL (among 4 candidates)
 Model:        LogisticRegression (BOW)
 Dev macro F1: 0.6147
 Test macro F1: 0.6094



In [32]:
def add_result_row(result_list, system_name, split_name, y_true, y_pred):
    """Append one results-table row to ``result_list`` in place.

    The row holds the system name, the split name and every metric that
    ``compute_scores`` returns for the given labels and predictions.
    """
    metrics = compute_scores(y_true, y_pred)
    result_list.append({"system": system_name, "split": split_name, **metrics})

# Assemble the six result rows (baseline and improved x train/dev/test) and
# write them to classification.csv.
results = []

# baseline rows
add_result_row(results, "baseline", "train", y_train, y_train_pred_baseline)
add_result_row(results, "baseline", "dev",   y_dev,   y_dev_pred_baseline)
add_result_row(results, "baseline", "test",  y_test,  y_test_pred_baseline)

# improved rows (using best candidate)
add_result_row(results, "improved", "train", y_train, y_train_pred_improved)
add_result_row(results, "improved", "dev",   y_dev,   y_dev_pred_improved)
add_result_row(results, "improved", "test",  y_test,  y_test_pred_improved)

# Column order of the output file.
header = [
    "system", "split",
    "p-pos", "r-pos", "f-pos",
    "p-neg", "r-neg", "f-neg",
    "p-neu", "r-neu", "f-neu",
    "p-macro", "r-macro", "f-macro"
]

# newline="" leaves line endings to the csv module.
with open("classification.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=header)
    writer.writeheader()
    for row in results:
        writer.writerow(row)

print("Saved classification.csv with rows:")
for row in results:
    print(f"{row['system']:8s} {row['split']:5s}  f-macro={row['f-macro']:.4f}")

Saved classification.csv with rows:
baseline train  f-macro=0.9993
baseline dev    f-macro=0.5603
baseline test   f-macro=0.5583
improved train  f-macro=0.9721
improved dev    f-macro=0.6147
improved test   f-macro=0.6094
