# 2025 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those the bottom of this ipynb file*

## 0. Setup


In [None]:
!pip install -q tqdm
!pip install -q rank_bm25
!pip install -q sentence-transformers hnswlib
!pip install -U bitsandbytes transformers accelerate

In [None]:
import os
import re
import gzip 
import json
import torch
import pickle
import hnswlib
import pathlib 
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from rank_bm25 import BM25Okapi
from torch.utils.data import DataLoader
from nltk.tokenize import wordpunct_tokenize
from sentence_transformers import InputExample
from sentence_transformers import CrossEncoder
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

### 0.1 Log in to huggingface

In [None]:
from huggingface_hub import login

login(os.getenv("HUGGING_FACE_API_KEY"))

  from .autonotebook import tqdm as notebook_tqdm


### 0.2 Check GPU availability

In [157]:
# Make sure CUDA is available
assert torch.cuda.is_available(), "CUDA is not available!"
print(torch.cuda.get_device_name())

NVIDIA GeForce GTX 1660 Ti


# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## DATA COLLECTION

In [None]:
DATA_DIR   = "./data"
TRAIN_FILE = os.path.join(DATA_DIR, "train-claims.json")
DEV_FILE   = os.path.join(DATA_DIR, "dev-claims.json")
TEST_FILE  = os.path.join(DATA_DIR, "test-claims-unlabelled.json")
EVID_FILE  = os.path.join(DATA_DIR, "evidence.json")


def load_claims(path: str, labelled: bool = True) -> pd.DataFrame:
    '''
    Load the claims from the given path.
    Save into dataframes.
    '''
    with open(path, "r", encoding="utf-8") as f:
        raw = json.load(f)

    rows = []
    for cid, info in raw.items():
        row = {
            "claim_id":   cid,
            "claim_text": info.get("claim_text", "")
        }
        if labelled:
            row["label"]      = info["claim_label"]
            row["evid_ids"]   = info["evidences"]
        rows.append(row)

    df = pd.DataFrame(rows)

   
    if labelled:
        df["label"] = df["label"].astype("category")

    return df


def load_evidence(path: str):
    '''
    Load the evidence from the given path.
    Save into dataframes.
    '''
    with open(path, "r", encoding="utf-8") as f:
        raw = json.load(f)

    df   = pd.DataFrame([{"evid_id": k, "evid_text": v} for k, v in raw.items()])
    edict = {k: v for k, v in raw.items()}
    return df, edict



df_train = load_claims(TRAIN_FILE, labelled=True)
df_dev   = load_claims(DEV_FILE,   labelled=True)
df_test  = load_claims(TEST_FILE,  labelled=False)

df_evid, evid_dict = load_evidence(EVID_FILE)    


LABEL2ID = {"SUPPORTS": 0, "REFUTES": 1, "NOT_ENOUGH_INFO": 2, "DISPUTED": 3}
ID2LABEL = {v: k for k, v in LABEL2ID.items()}

for df in (df_train, df_dev):
    df["label_id"] = df["label"].map(LABEL2ID).astype("int8")

print(f"Train size: {len(df_train):,}")
print(f"Dev   size: {len(df_dev):,}")
print(f"Test  size: {len(df_test):,}")
print(f"Evidence passages: {len(df_evid):,}")

display(df_train.head())
display(df_evid.head())

print("Train label distribution:")
display(df_train["label"].value_counts())

print("Dev label distribution:")
display(df_dev["label"].value_counts())


Train size: 1,228
Dev   size: 154
Test  size: 153
Evidence passages: 1,208,827


Unnamed: 0,claim_id,claim_text,label,evid_ids,label_id
0,claim-1937,Not only is there no scientific evidence that ...,DISPUTED,"[evidence-442946, evidence-1194317, evidence-1...",3
1,claim-126,El Niño drove record highs in global temperatu...,REFUTES,"[evidence-338219, evidence-1127398]",1
2,claim-2510,"In 1946, PDO switched to a cool phase.",SUPPORTS,"[evidence-530063, evidence-984887]",0
3,claim-2021,Weather Channel co-founder John Coleman provid...,DISPUTED,"[evidence-1177431, evidence-782448, evidence-5...",3
4,claim-2449,"""January 2008 capped a 12 month period of glob...",NOT_ENOUGH_INFO,"[evidence-1010750, evidence-91661, evidence-72...",2


Unnamed: 0,evid_id,evid_text
0,evidence-0,"John Bennet Lawes, English entrepreneur and ag..."
1,evidence-1,Lindberg began his professional career at the ...
2,evidence-2,``Boston (Ladies of Cambridge)'' by Vampire We...
3,evidence-3,"Gerald Francis Goyer (born October 20, 1936) w..."
4,evidence-4,He detected abnormalities of oxytocinergic fun...


Train label distribution:


label
SUPPORTS           519
NOT_ENOUGH_INFO    386
REFUTES            199
DISPUTED           124
Name: count, dtype: int64

Dev label distribution:


label
SUPPORTS           68
NOT_ENOUGH_INFO    41
REFUTES            27
DISPUTED           18
Name: count, dtype: int64

In [5]:
CLIMATE_CONCEPTS = {
    "physical_mechanisms": [
        "greenhouse effect", "carbon dioxide", "CO2", "methane", "CH4", 
        "greenhouse gas", "GHG", "emissions", "fossil fuel", "carbon cycle",
        "radiative forcing", "albedo", "feedback", "sensitivity"
    ],
    "observations": [
        "temperature", "warming", "cooling", "precipitation", "sea level", 
        "ice sheet", "glacier", "sea ice", "ocean acidification", "drought",
        "flood", "extreme weather", "heat wave", "storm", "hurricane"
    ],
    "climate_systems": [
        "atmosphere", "ocean", "cryosphere", "biosphere", "El Niño", "La Niña", 
        "jet stream", "gulf stream", "AMOC", "PDO", "AMO", "ENSO", "monsoon"
    ],
    "time_periods": [
        "pre-industrial", "industrial", "holocene", "anthropocene", "pleistocene",
        "ice age", "medieval warm period", "little ice age", "paleoclimate"
    ]
}

## Feature Extraction Phase 1 --> 600
Uses BM25 along with MiniLM to retrieve top 600 most relevant evidences


### BM25

In [158]:
# Top 600 relevant evidences
K_BM25   = 600    
N_DENSE  = 600    
EF_QUERY = 200    

def tokenize(text: str):
    return [tok.lower() for tok in wordpunct_tokenize(text) if tok.isalnum()]


corpus_tokens = [tokenize(t) for t in evid_dict.values()]
bm25          = BM25Okapi(corpus_tokens)
evid_list     = list(evid_dict.keys())         


def get_bm25_topk(df, K=K_BM25):
    # Get cached result if exists
    cache_path = pathlib.Path(f"bm25_top{K}.pkl.gz")
    if cache_path.exists():
        with gzip.open(cache_path, "rb") as f:
            print(f"✓ load BM25 cache ({cache_path})")
            return pickle.load(f)

    # Calculate BM25 scores for each claim
    claim_topk = {}
    for _, row in tqdm(df.iterrows(), total=len(df), desc=f"BM25 Top-{K}"):
        idxs = np.argsort(bm25.get_scores(tokenize(row.claim_text)))[::-1][:K]
        claim_topk[row.claim_id] = [evid_list[i] for i in idxs]

    # Save cached result
    with gzip.open(cache_path, "wb") as f:
        pickle.dump(claim_topk, f)
    print(f"⌛ save cache to {cache_path}")
    return claim_topk


bm25_topk = get_bm25_topk(df_dev, K_BM25)

✓ load BM25 cache (bm25_top600.pkl.gz)


### MiniLM

In [None]:
# Load MiniLM sentence transformer model
MODEL = "all-MiniLM-L12-v2"
minilm = SentenceTransformer(MODEL)

# Get embedding dimensionality
DIM = minilm.get_sentence_embedding_dimension()

# Define file paths for HNSW index and evidence ID list
hnsw_bin = pathlib.Path("evidence_hnsw.bin")
ids_npy = pathlib.Path("evid_ids.npy")

# Load evidence IDs
evid_ids = np.load(ids_npy)

# If the HNSW index doesn't exist, build and save it
if not hnsw_bin.exists():
    print("Building hnsw bin...")
    # Encode evidence sentences into dense vectors
    ev_emb = minilm.encode(
        list(evid_dict.values()),
        batch_size=256,
        normalize_embeddings=True,
        show_progress_bar=True
    )

    # Save the list of evidence IDs
    np.save(ids_npy, np.array(evid_list))

    # Create and initialize HNSW index
    idx = hnswlib.Index(space="cosine", dim=DIM)
    idx.init_index(max_elements=len(ev_emb), ef_construction=200, M=32)

    # Add encoded vectors to the index and save it
    idx.add_items(ev_emb)
    idx.save_index(str(hnsw_bin))

# If index file already exists, load it
else:
    print("Loading hnsw bin...")
    idx = hnswlib.Index(space="cosine", dim=DIM)
    idx.load_index(str(hnsw_bin), max_elements=len(evid_ids))
    print("Loaded hnsw bin")

# Set query-time ef parameter for HNSW search
idx.set_ef(EF_QUERY)

# Function to retrieve top-k evidence IDs using MiniLM + HNSW
def dense_retrieve(text, k=N_DENSE):
    # Encode query using both models
    q_vec = minilm.encode(text, normalize_embeddings=True)

    # Search the fused index
    I, _ = idx.knn_query(q_vec, k=k)
    return [evid_ids[i] for i in I[0]]


Loading hnsw bin...


### Combine BM25 and MiniLM

In [None]:
# Retrieve top evidence IDs for each claim using union of BM25 and dense
claim_topk = {}
for _, row in tqdm(df_dev.iterrows(), total=len(df_dev), desc="BM25∪HNSW retrieval"):
    # Merge BM25 and dense retrieval results (remove duplicates while preserving order)
    merged = list(dict.fromkeys(
        bm25_topk[row.claim_id] + dense_retrieve(row.claim_text)))
    # Store merged results for the current claim
    claim_topk[row.claim_id] = merged

### Evaluate result

In [None]:
# Function to compute recall@k for development set
def recall_at_k(dev_df, topk_map):
    rs = []
    for _, r in dev_df.iterrows():
        gold = set(r.evid_ids)                            # Ground truth evidence
        found = len(gold & set(topk_map[r.claim_id]))     # Correctly retrieved evidence
        rs.append(found / len(gold))                      # Recall for this claim
    return sum(rs) / len(rs)                              # Average recall

print("Computing recall...")
# Evaluate recall on development set and print result
recall_val = recall_at_k(df_dev, claim_topk)
print(f"\nRecall(BM25∪HNSW) on dev: {recall_val:.3f}")

## Feature Extraction Phase 2 600 -> 200

In [None]:
def simple_tokenize(text):
    
    words = re.findall(r'\w+', text.lower())
    return [w for w in words if len(w) > 3]

def calculate_recall(gold_ids, pred_ids):
    
    if not gold_ids:
        return 1.0
    return len(set(gold_ids) & set(pred_ids)) / len(gold_ids)

def improved_coarse_retrieval(df,
                              evid_dict,
                              bm25: BM25Okapi,
                              evid_list,
                              sbert: SentenceTransformer,
                              hnsw_index: hnswlib.Index,
                              evid_ids,
                              k_bm25=100,
                              k_dense=100,
                              rrf_k=60,
                              mmr_lambda=0.7,
                              candidates_per_claim=200,
                              cache_path="improved_coarse_candidates.pkl.gz"):

    cache_file = pathlib.Path(cache_path)
    if cache_file.exists():
        with gzip.open(cache_file, "rb") as f:
            data = pickle.load(f)
        print(f"✓ load from cache: {cache_file}")
        return data['candidates'], data['recall']
    
    claim_candidates = {}
    recall_vals = []
    
    for _, row in tqdm(df.iterrows(), total=len(df), desc="improved"):
        cid = row['claim_id']
        text = row['claim_text']
        gold = row.get('evid_ids', [])
        
        # 1. BM25
        tokens = simple_tokenize(text)
        bm25_scores = bm25.get_scores(tokens)
        top_bm25 = np.argsort(-bm25_scores)[:k_bm25]
        bm25_ids = [evid_list[i] for i in top_bm25]
        bm25_rank = {evid_list[i]: r for r,i in enumerate(top_bm25)}
        
        # 2. Dense 
        q_emb = sbert.encode(text, normalize_embeddings=True)
        labels, _ = hnsw_index.knn_query(q_emb.reshape(1, -1), k=k_dense)
        dense_idx = labels[0]
        dense_ids = [evid_ids[i] for i in dense_idx]
        dense_rank = {evid_ids[i]: r for r,i in enumerate(dense_idx)}
        
        # 3. RRF
        fusion = {}
        for eid in set(bm25_ids + dense_ids):
            score = 0.0
            if eid in bm25_rank:
                score += 1/(bm25_rank[eid] + rrf_k)
            if eid in dense_rank:
                score += 1/(dense_rank[eid] + rrf_k)
            fusion[eid] = score
        fused = [eid for eid,_ in sorted(fusion.items(), key=lambda x: -x[1])]
        
        # 4. MMR
        pool = fused[:2*candidates_per_claim]
        pool_embs = sbert.encode([evid_dict[e] for e in pool], normalize_embeddings=True)
        sim_q = cosine_similarity(q_emb.reshape(1,-1), pool_embs)[0]
        
        selected, cand_ids, cand_embs = [], pool.copy(), pool_embs.copy()
        sim_to_q = sim_q.copy()
        while len(selected) < candidates_per_claim and cand_ids:
            if not selected:
                idx0 = int(np.argmax(sim_to_q))
            else:
                sims = cosine_similarity(q_emb.reshape(1,-1), cand_embs)[0]
                sel_embs = sbert.encode([evid_dict[e] for e in selected], normalize_embeddings=True)
                sim_sel = cosine_similarity(cand_embs, sel_embs)
                max_sel = np.max(sim_sel, axis=1)
                mmr_scores = mmr_lambda * sims - (1-mmr_lambda) * max_sel
                idx0 = int(np.argmax(mmr_scores))
            selected.append(cand_ids.pop(idx0))
            cand_embs = np.delete(cand_embs, idx0, axis=0)
            sim_to_q = np.delete(sim_to_q, idx0)
        
        claim_candidates[cid] = selected
        recall_vals.append(calculate_recall(gold, selected))
    
    mean_rec = float(np.mean(recall_vals))
    
    with gzip.open(cache_file, "wb") as f:
        pickle.dump({'candidates': claim_candidates, 'recall': mean_rec}, f)
    print(f"⌛ cache: {cache_file}")
    print(f"average recall : {mean_rec:.4f}  |  candicate: {candidates_per_claim}")
    
   
    print("\n sample:")
    for i, row in df.head(3).iterrows():
        cid = row['claim_id']
        gold = row.get('evid_ids', [])
        found = len(set(gold)&set(claim_candidates[cid]))
        print(f"{cid}: {found}/{len(gold)} = {found/len(gold):.2f}")
    
    return claim_candidates, mean_rec


improved_cands, improved_rec = improved_coarse_retrieval(
    df_dev, evid_dict, bm25, evid_list, minilm, idx, evid_ids,
    k_bm25=100, k_dense=100, rrf_k=60,
    mmr_lambda=0.7, candidates_per_claim=200
)
print(f"\nfinal Recall: {improved_rec:.4f}")


✓ load from cache: improved_coarse_candidates.pkl.gz

final Recall: 0.7158


## Feature Extraction Phase 3 200 -> 10

In [99]:


training_sample = []
for key, evid_ids in improved_cands.items():
    filtered_df = df_dev[df_dev['claim_id'] == key]
    for evid_id in evid_ids:
        relevant = 0
        if str(evid_id) in filtered_df['evid_ids'].values[0]:
            relevant = 1
        training_sample.append(InputExample(texts=[filtered_df["claim_text"].values[0], evid_dict[evid_id]], label=float(relevant)))


In [100]:
print(training_sample[:10])

[<sentence_transformers.readers.InputExample.InputExample object at 0x000002B64017D580>, <sentence_transformers.readers.InputExample.InputExample object at 0x000002B63F619100>, <sentence_transformers.readers.InputExample.InputExample object at 0x000002B640384B30>, <sentence_transformers.readers.InputExample.InputExample object at 0x000002B640384530>, <sentence_transformers.readers.InputExample.InputExample object at 0x000002B58D7ACE30>, <sentence_transformers.readers.InputExample.InputExample object at 0x000002B58D7ADE50>, <sentence_transformers.readers.InputExample.InputExample object at 0x000002B58D7AFE90>, <sentence_transformers.readers.InputExample.InputExample object at 0x000002B64916A240>, <sentence_transformers.readers.InputExample.InputExample object at 0x000002B649168B00>, <sentence_transformers.readers.InputExample.InputExample object at 0x000002B64916BEF0>]


In [None]:
model_name = "cross-encoder/ms-marco-MiniLM-L6-v2"
model = CrossEncoder(model_name, num_labels=1)


In [102]:

dataloader = DataLoader(training_sample, shuffle=True, batch_size=8)



In [103]:
model.fit(
    train_dataloader=dataloader,
    epochs=5,
    warmup_steps=10,
)

Step,Training Loss
500,0.074
1000,0.0497
1500,0.0694
2000,0.0691
2500,0.0641
3000,0.0658
3500,0.0556
4000,0.0468
4500,0.0421
5000,0.0482


In [105]:
model.save("./cross_encoder_model")

In [130]:


def softmax(x):
    e_x = np.exp(x - np.max(x))  # for numerical stability
    return e_x / e_x.sum()
    

def rerank_with_cross_encoder(claims_df, evid_dict, candidate_map):
    reranked = {}
    for _, row in tqdm(claims_df.iterrows(), total=len(claims_df), desc="Re-ranking"):
        claim_id = row['claim_id']
        claim_text = row['claim_text']
        candidates = candidate_map[claim_id]

        # Prepare input pairs for cross-encoder
        pairs = [(claim_text, evid_dict[eid]) for eid in candidates]
        
        # Predict relevance scores
        scores = model.predict(pairs, batch_size=32)

        reranked[claim_id] = (candidates, scores)
    return reranked


reranked_candidates = rerank_with_cross_encoder(df_dev, evid_dict, improved_cands)


Re-ranking: 100%|██████████| 154/154 [00:31<00:00,  4.88it/s]


In [154]:
EXTRACTION_THRESHOLD = 0.00005

def extract_top_evid(candidates, scores):   
    scores = np.array(scores)
    probs = softmax(scores)
    indices_to_keep = np.where(probs > EXTRACTION_THRESHOLD)[0]
    # Take top-N evidence IDs
    top_evids = [candidates[idx] for idx in indices_to_keep ]
    return top_evids

final_candidates = {}
for claim_id, (candidates, scores) in reranked_candidates.items():
    final_candidates[claim_id] = extract_top_evid(candidates, scores)

def precision_recall_f1_at_k(dev_df, topk_map, k=None):
    precisions, recalls, f1s = [], [], []
    
    for _, row in dev_df.iterrows():
        gold = set(row.evid_ids)
        pred = topk_map.get(row.claim_id, [])
        
        if k:
            pred = pred[:k]
        
        pred_set = set(pred)
        correct = len(gold & pred_set)
        
        precision = correct / len(pred) if pred else 1.0
        recall = correct / len(gold) if gold else 1.0
        if precision + recall > 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    avg_precision = sum(precisions) / len(precisions)
    avg_recall = sum(recalls) / len(recalls)
    avg_f1 = sum(f1s) / len(f1s)
    
    return {
        'precision@k': avg_precision,
        'recall@k': avg_recall,
        'f1@k': avg_f1
    }

print(precision_recall_f1_at_k(df_dev, final_candidates))

{'precision@k': 0.5761024103113274, 'recall@k': 0.6918831168831169, 'f1@k': 0.5500537015904301}


In [107]:
df_dev

Unnamed: 0,claim_id,claim_text,label,evid_ids,label_id
0,claim-752,[South Australia] has the most expensive elect...,SUPPORTS,"[evidence-67732, evidence-572512]",0
1,claim-375,when 3 per cent of total annual global emissio...,NOT_ENOUGH_INFO,"[evidence-996421, evidence-1080858, evidence-2...",2
2,claim-1266,This means that the world is now 1C warmer tha...,SUPPORTS,"[evidence-889933, evidence-694262]",0
3,claim-871,"“As it happens, Zika may also be a good model ...",NOT_ENOUGH_INFO,"[evidence-422399, evidence-702226, evidence-28...",2
4,claim-2164,Greenland has only lost a tiny fraction of its...,REFUTES,"[evidence-52981, evidence-264761, evidence-947...",1
...,...,...,...,...,...
149,claim-2400,"'To suddenly label CO2 as a ""pollutant"" is a d...",REFUTES,"[evidence-409365, evidence-127519, evidence-85...",1
150,claim-204,"after a natural orbitally driven warming, atmo...",NOT_ENOUGH_INFO,"[evidence-368192, evidence-261690, evidence-20...",2
151,claim-1426,Many of the world’s coral reefs are already ba...,NOT_ENOUGH_INFO,"[evidence-1124018, evidence-995813, evidence-1...",2
152,claim-698,A recent study led by Lawrence Livermore Natio...,REFUTES,[evidence-660755],1


In [109]:
reranked_candidates

for _, row in df_dev.iterrows():
    print("needed: ", set(row.evid_ids))
    print("got: ", reranked_candidates[row.claim_id])
    count = 0
    for each in set(row.evid_ids):
        if each in reranked_candidates[row.claim_id]:
            count += 1
    print(f'Needed: {len(row.evid_ids)}, got: {count}')


needed:  {'evidence-67732', 'evidence-572512'}
got:  ['evidence-572512', 'evidence-67732', np.str_('evidence-1061888'), np.str_('evidence-169170'), np.str_('evidence-1172687'), np.str_('evidence-780332'), np.str_('evidence-666596'), 'evidence-723533', 'evidence-472012', 'evidence-48256']
Needed: 2, got: 2
needed:  {'evidence-1080858', 'evidence-699212', 'evidence-832334', 'evidence-208053', 'evidence-996421'}
got:  [np.str_('evidence-699212'), 'evidence-996421', np.str_('evidence-559290'), 'evidence-215052', 'evidence-631053', np.str_('evidence-1101862'), np.str_('evidence-949910'), np.str_('evidence-584694'), 'evidence-537951', np.str_('evidence-295448')]
Needed: 5, got: 2
needed:  {'evidence-694262', 'evidence-889933'}
got:  ['evidence-694262', 'evidence-889933', 'evidence-38305', np.str_('evidence-1017598'), np.str_('evidence-622225'), np.str_('evidence-594052'), np.str_('evidence-246929'), np.str_('evidence-590642'), 'evidence-94670', np.str_('evidence-158510')]
Needed: 2, got: 2
n

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [9]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tom\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
if torch.cuda.is_available():
    print("CUDA is available.")

    num_gpus = torch.cuda.device_count()
    print(f"Number of available GPUs: {num_gpus}")

    for i in range(num_gpus):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

    print(f"Current GPU device: {torch.cuda.current_device()}")
else:
    print("CUDA is not available. Running on CPU.")

CUDA is available.
Number of available GPUs: 1
GPU 0: NVIDIA GeForce RTX 4070 SUPER
Current GPU device: 0


# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*