# 2025 COMP90042 Project #ellina
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement the program in an Object Oriented Programming style, please put that code at the bottom of this ipynb file.*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [1]:
# Standard library imports for file handling, data types, and math
import os, json, pickle, math
from typing import Dict, List, Tuple

# Third-party libraries for NLP and retrieval
# spaCy is used for tokenising text and filtering stopwords/punctuation
# rank_bm25 provides the BM25 algorithm for evidence retrieval
# sentence_transformers is used for SBERT encoding and cosine similarity
import spacy                                # We use this for Tokenisation
from rank_bm25 import BM25Okapi             # We use this for BM25 retrieval (Supposed to be better than TF-IDF)
from sentence_transformers import SentenceTransformer, util  # SBERT encoding & cos-sim utils

# File paths to the datasets (relative paths, resolved against the current working directory)
TRAIN_CLAIMS_PATH  = "data/train-claims.json"
DEV_CLAIMS_PATH    = "data/dev-claims.json"
EVIDENCE_PATH      = "data/evidence.json"

# Load a small English spaCy model for tokenisation
nlp = spacy.load("en_core_web_sm") # Shared spaCy pipeline; tokenise_cached() below reads this global
# Sanity check: tokenise one sample sentence and print the resulting tokens
doc = nlp("The Earth’s climate sensitivity is important.")
print([token.text for token in doc])

# Load claims from the given JSON file
def load_claims(claims_path: str) -> Dict:
    """Load a claims file and return its parsed JSON content.

    Args:
        claims_path: Path to a JSON file of claims.

    Returns:
        The decoded JSON object. The rest of this file treats it as a dict
        keyed by claim ID.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8: without it open() falls back to the
    # platform's locale encoding, which can mangle or reject non-ASCII text
    # (e.g. curly apostrophes) on some systems.
    with open(claims_path, encoding="utf-8") as f:
        return json.load(f)

# Load evidence passages from the given JSON file
def load_evidences(evidence_path: str) -> Dict:
    """Load the evidence file and return its parsed JSON content.

    Args:
        evidence_path: Path to a JSON file of evidence passages.

    Returns:
        The decoded JSON object. The rest of this file treats it as a dict
        mapping evidence ID to passage text.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(evidence_path, encoding="utf-8") as f:
        return json.load(f)

# Tokenise a list of texts using spaCy and save the result to a cache file.
# If the cache already exists, load tokenised data directly from disk.
# This speeds up repeated runs by avoiding re-processing the same data.
def tokenise_cached(texts: List[str], cache_file: str) -> List[List[str]]:
    """Tokenise texts with spaCy, caching the result on disk.

    If ``cache_file`` already exists its content is returned as-is and
    ``texts`` is not inspected, so the caller must choose a cache path that
    uniquely identifies the input.

    Args:
        texts: Raw strings to tokenise.
        cache_file: Path of the pickle file used as cache. May be a bare
            file name (no directory component).

    Returns:
        One token list per input text, with stopwords and punctuation removed.
    """
    if os.path.exists(cache_file):
        # NOTE(security): pickle.load can execute arbitrary code. Only point
        # this at cache files produced by this notebook.
        with open(cache_file, "rb") as f:
            return pickle.load(f)

    # Cache miss: run the shared spaCy pipeline in batches.
    out: List[List[str]] = [
        [t.text for t in doc if not t.is_stop and not t.is_punct]
        for doc in nlp.pipe(texts, batch_size=64)
    ]

    # os.makedirs("") raises, so only create a directory when the path has one.
    cache_dir = os.path.dirname(cache_file)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    # Write to a temporary file and rename it into place, so an interrupted
    # run cannot leave a truncated pickle that later loads would choke on.
    tmp_file = cache_file + ".tmp"
    with open(tmp_file, "wb") as f:
        pickle.dump(out, f)
    os.replace(tmp_file, cache_file)
    return out


['The', 'Earth', '’s', 'climate', 'sensitivity', 'is', 'important', '.']


# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [4]:
# import json
# import math
# from typing import Dict, List, Tuple
# from itertools import islice
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics import precision_score, recall_score, f1_score
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.preprocessing import MultiLabelBinarizer
# from rank_bm25 import BM25Okapi
# import pandas as pd

# # === Load data ===
# with open("data/dev-claims.json") as f:
#     dev_claims = json.load(f)
# with open("data/evidence.json") as f:
#     evidence_data = json.load(f)

# # === Use first X claims for testing ===
# X = 10
# claims = dict(islice(dev_claims.items(), X))
# evidences = evidence_data  # Use full evidence set


# # === Build gold evidence map ===
# gold_map = {
#     cid: list(map(str, data["evidences"]))
#     for cid, data in claims.items()
#     if data["evidences"]
# }

# # === TF-IDF retrieval ===
# def tfidf_candidates(claims, evidences, top_k=3) -> Dict[str, List[str]]:
#     claim_ids = list(claims)
#     claim_texts = [claims[cid]["claim_text"] for cid in claim_ids]
#     evidence_ids, evidence_texts = zip(*[(eid, txt) for eid, txt in evidences.items() if txt])

#     all_texts = claim_texts + list(evidence_texts)
#     vectorizer = TfidfVectorizer()
#     tfidf_matrix = vectorizer.fit_transform(all_texts)

#     claim_vecs = tfidf_matrix[:len(claim_texts)]
#     evidence_vecs = tfidf_matrix[len(claim_texts):]

#     result = {}
#     for i, cid in enumerate(claim_ids):
#         sims = (claim_vecs[i] @ evidence_vecs.T).toarray().flatten()
#         top_idx = sims.argsort()[::-1][:top_k]
#         result[cid] = [evidence_ids[j] for j in top_idx]
#     return result

# # === BM25 retrieval ===
# def bm25_candidates(claims, evidences, top_k=3) -> Dict[str, List[str]]:
#     claim_ids = list(claims)
#     claim_texts = [claims[cid]["claim_text"] for cid in claim_ids]
#     evidence_ids, evidence_texts = zip(*[(eid, txt) for eid, txt in evidences.items() if txt])

#     tok_e = [text.lower().split() for text in evidence_texts]
#     tok_c = [text.lower().split() for text in claim_texts]

#     bm25 = BM25Okapi(tok_e)

#     result = {}
#     for cid, toks in zip(claim_ids, tok_c):
#         scores = bm25.get_scores(toks)
#         top_idx = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)[:top_k]
#         result[cid] = [evidence_ids[i] for i in top_idx]
#     return result

# # === Evaluation ===
# def evaluate(gold: Dict[str, List[str]], predicted: Dict[str, List[str]]):
#     mlb = MultiLabelBinarizer()
#     y_true = mlb.fit_transform([gold[cid] for cid in gold])
#     y_pred = mlb.transform([predicted.get(cid, []) for cid in gold])

#     p = precision_score(y_true, y_pred, average='micro', zero_division=0)
#     r = recall_score(y_true, y_pred, average='micro', zero_division=0)
#     f1 = f1_score(y_true, y_pred, average='micro', zero_division=0)
#     return p, r, f1

# # === Run retrieval and evaluate ===
# tfidf_preds = tfidf_candidates(claims, evidences, top_k=3)
# bm25_preds = bm25_candidates(claims, evidences, top_k=3)

# # Filter claims that exist in all maps
# common_ids = [cid for cid in gold_map if cid in tfidf_preds and cid in bm25_preds]
# gold_eval = {cid: gold_map[cid] for cid in common_ids}
# tfidf_eval = {cid: tfidf_preds[cid] for cid in common_ids}
# bm25_eval = {cid: bm25_preds[cid] for cid in common_ids}

# tfidf_scores = evaluate(gold_eval, tfidf_eval)
# bm25_scores = evaluate(gold_eval, bm25_eval)

# # === Show results ===
# df = pd.DataFrame([
#     {"Method": "TF-IDF", "Precision": tfidf_scores[0], "Recall": tfidf_scores[1], "F1 Score": tfidf_scores[2]},
#     {"Method": "BM25", "Precision": bm25_scores[0], "Recall": bm25_scores[1], "F1 Score": bm25_scores[2]},
# ])
# print(df.to_string(index=False))


Method  Precision   Recall  F1 Score
TF-IDF        1.0 0.027027  0.052632
  BM25        1.0 0.081081  0.150000




In [5]:
# === BM25 filtering =================================================
def bm25_candidates(
    claims: Dict[str, dict], evidences: Dict[str, str],
    top_k: int, ratio: float
) -> Tuple[List[str], List[str], Dict[str, List[str]]]:
    """Select BM25 candidate evidence passages for every claim.

    Args:
        claims: Claim ID -> claim record; each record must have "claim_text".
        evidences: Evidence ID -> passage text. Empty passages are skipped.
        top_k: Number of candidates per claim when at least 500 passages
            are available.
        ratio: For smaller corpora, keep ``ceil(n_passages * ratio)``
            candidates per claim instead (at least 1).

    Returns:
        ``(claim_ids, claim_texts, cand_map)`` where ``claim_ids`` and
        ``claim_texts`` are parallel lists in the iteration order of
        ``claims`` and ``cand_map`` maps each claim ID to its candidate
        evidence IDs, best BM25 score first. When there is no non-empty
        passage, every claim maps to an empty list.
    """
    import hashlib
    import heapq

    claim_ids   = list(claims)
    claim_texts = [claims[cid]["claim_text"] for cid in claim_ids]

    usable = [(eid, txt) for eid, txt in evidences.items() if txt]
    if not usable:
        # Nothing to retrieve from; an empty corpus cannot be indexed.
        return claim_ids, claim_texts, {cid: [] for cid in claim_ids}
    evidence_ids   = [eid for eid, _ in usable]
    evidence_texts = [txt for _, txt in usable]

    def _cache_path(prefix: str, texts: List[str]) -> str:
        # Key the cache on content, not only on count: two different inputs
        # of the same size would otherwise silently share stale tokens.
        digest = hashlib.sha1()
        for text in texts:
            digest.update(text.encode("utf-8", "surrogatepass"))
            digest.update(b"\x00")  # separator so ["ab", "c"] != ["a", "bc"]
        return f"cache/{prefix}_{len(texts)}_{digest.hexdigest()[:12]}.pkl"

    tok_e = tokenise_cached(evidence_texts, _cache_path("evid", evidence_texts))
    tok_c = tokenise_cached(claim_texts,    _cache_path("claim", claim_texts))

    bm25 = BM25Okapi(tok_e)
    # Fixed top_k for large corpora; a proportional cut for small ones.
    k = top_k if len(evidence_ids) >= 500 else max(1, math.ceil(len(evidence_ids) * ratio))

    cand_map: Dict[str, List[str]] = {}
    for cid, toks in zip(claim_ids, tok_c):
        scores = bm25.get_scores(toks)
        # nlargest is documented as equivalent to sorted(..., reverse=True)[:k]
        # but avoids fully sorting the whole corpus for every claim.
        top_idx = heapq.nlargest(k, range(len(scores)), key=scores.__getitem__)
        cand_map[cid] = [evidence_ids[i] for i in top_idx]
    return claim_ids, claim_texts, cand_map
  
# === Sentence-Bert re-ranking =================================================
def sbert_rerank(
    claim_ids: List[str], claim_texts: List[str], cand_map: Dict[str, List[str]],
    evidences: Dict[str, str], model: SentenceTransformer, score_th: float
) -> Dict[str, dict]:
    """Rerank BM25 candidates with SBERT cosine similarity.

    Args:
        claim_ids: Claim IDs, parallel to ``claim_texts``.
        claim_texts: Raw claim strings.
        cand_map: Claim ID -> candidate evidence IDs.
        evidences: Evidence ID -> passage text; candidates missing from this
            mapping are ignored.
        model: Sentence encoder used for both claims and passages.
        score_th: Minimum cosine similarity for a candidate to be kept.

    Returns:
        Claim ID -> ``{"evidences": [...], "scores": [...]}`` with parallel
        lists ordered by descending similarity (scores rounded to 4 places).
        If no candidate reaches ``score_th`` the single best one is kept;
        a claim with no usable candidates gets two empty lists.
    """
    results: Dict[str, dict] = {}
    if not claim_ids:
        return results

    emb_claims = model.encode(claim_texts, convert_to_tensor=True)

    for i, cid in enumerate(claim_ids):
        cand_ids = [eid for eid in cand_map[cid] if eid in evidences]
        ranked: List[Tuple[str, float]] = []
        if cand_ids:
            # Encode all candidates of this claim in one batch instead of
            # one model call per passage.
            e_mat = model.encode(
                [evidences[eid] for eid in cand_ids], convert_to_tensor=True
            )
            # cos_sim of a single vector against a matrix yields one row.
            sims = util.cos_sim(emb_claims[i], e_mat)[0].tolist()
            # Stable sort: ties keep their BM25 order.
            ranked = sorted(zip(cand_ids, sims), key=lambda pair: pair[1], reverse=True)

        # Keep everything above the threshold, or fall back to the top hit.
        kept = [pair for pair in ranked if pair[1] >= score_th] or ranked[:1]
        results[cid] = {
            "evidences": [eid for eid, _ in kept],
            "scores":    [round(s, 4) for _, s in kept],
        }
    return results

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [6]:

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import precision_recall_fscore_support

# === Parameters ================================================
BASE_MODEL_NAME    = "all-MiniLM-L6-v2"     # SBERT checkpoint loaded in main() (no fine-tune), TODO: Fine tune

TOP_K_FIXED        = 100    # BM25 candidates kept per claim when the corpus has >= 500 passages
TOP_K_RATIO        = 0.20   # For smaller corpora: keep ceil(corpus_size * ratio) candidates (at least 1)
SBERT_SCORE_TH     = 0.93   # Minimum cosine similarity to keep a candidate; if none pass, the single best is kept

# Quick-iteration switch: when True, main() keeps only the first LIMIT_COUNT
# dev claims and shrinks the evidence corpus to the passages those claims
# list as gold evidence, for faster processing.
LIMIT_DEV_CLAIMS   = True
LIMIT_COUNT        = 100

def evaluate(pred: dict, actual: dict):
    """Score predicted evidence sets against gold, micro-averaged over claims.

    Counts are taken directly from set intersections and differences, so
    every predicted ID that is not gold for its claim counts as a false
    positive. (Binarising predictions against labels fitted on the gold
    sets only would drop IDs that never appear as gold and so overstate
    precision.)

    Args:
        pred: Claim ID -> record with an "evidences" list. Claims missing
            from ``pred`` are treated as having no predictions.
        actual: Claim ID -> record with the gold "evidences" list. Only
            claims present here are scored.

    Returns:
        ``(recall, precision, f1)`` as floats. Note the order: recall first.
        Any metric whose denominator is zero is reported as 0.0.
    """
    tp = fp = fn = 0
    for cid, claim in actual.items():
        gold = set(claim["evidences"])
        guessed = set(pred.get(cid, {}).get("evidences", []))
        tp += len(gold & guessed)
        fp += len(guessed - gold)
        fn += len(gold - guessed)

    prec = tp / (tp + fp) if tp + fp else 0.0
    rec = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * tp / (2 * tp + fp + fn) if tp else 0.0
    return rec, prec, f1

def main():
    """Run the dev-set pipeline end to end.

    Loads the datasets, optionally shrinks them for quick iteration, picks
    BM25 candidates, reranks them with SBERT, prints recall/precision/F1 and
    writes the predictions to ``dev-claims-predictions.json``.
    """
    print("Loading datasets…")
    _train = load_claims(TRAIN_CLAIMS_PATH)  # read but not used yet (Todo: TRAIN S-BERT)
    dev = load_claims(DEV_CLAIMS_PATH)
    corpus = load_evidences(EVIDENCE_PATH)

    if LIMIT_DEV_CLAIMS:
        # Quick-iteration mode: first LIMIT_COUNT claims only ...
        dev = {cid: dev[cid] for cid in list(dev)[:LIMIT_COUNT]}
        # ... and only the passages those claims cite as gold evidence.
        gold_ids = {eid for claim in dev.values() for eid in claim["evidences"]}
        corpus = {eid: corpus[eid] for eid in gold_ids if eid in corpus}

    # --- BM25 retrieval ---
    # ids:      claim IDs in the iteration order of the dev-claims dict
    # texts:    raw claim strings, parallel to ids
    # cand_map: claim ID -> its BM25 candidate evidence IDs
    print("BM25 candidate selection")
    ids, texts, cand_map = bm25_candidates(
        claims=dev, evidences=corpus, top_k=TOP_K_FIXED, ratio=TOP_K_RATIO
    )

    # --- SBERT rerank ---
    print("Loading SBERT model")
    encoder = SentenceTransformer(BASE_MODEL_NAME)  # embeds both claims and passages

    print("SBERT reranking")
    predictions = sbert_rerank(
        claim_ids=ids,
        claim_texts=texts,
        cand_map=cand_map,
        evidences=corpus,
        model=encoder,
        score_th=SBERT_SCORE_TH,
    )

    # --- evaluation ---
    # evaluate() returns recall first, then precision, then F1.
    rec, prec, f1 = evaluate(predictions, dev)
    print(f"\nRecall: {rec:.4f} Precision: {prec:.4f} F1: {f1:.4f}")

    # --- save file ---
    with open("dev-claims-predictions.json", "w") as out_file:
        json.dump(predictions, out_file, indent=2)
    print("Exported predictions to dev-claims-predictions.json")


if __name__ == "__main__":
    main()


## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*