# 2025 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those at the bottom of this ipynb file*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [None]:
import json
import os
import pandas as pd
import torch

## DATA COLLECTION

In [4]:

# Dataset locations: every input file is looked up directly inside DATA_DIR.
DATA_DIR = "./data"
TRAIN_FILE, DEV_FILE, TEST_FILE, EVID_FILE = (
    os.path.join(DATA_DIR, file_name)
    for file_name in (
        "train-claims.json",            # claims loaded with labels
        "dev-claims.json",              # claims loaded with labels
        "test-claims-unlabelled.json",  # claims loaded without labels
        "evidence.json",                # evidence id -> evidence text
    )
)


def load_claims(path, labelled=True):
    """Read a claims JSON file into a DataFrame with one row per claim.

    Args:
        path: Location of a UTF-8 JSON file mapping claim id -> claim record.
        labelled: When true, each record's ``claim_label`` and ``evidences``
            entries are copied into the ``label`` and ``evid_ids`` columns;
            a record missing either key raises ``KeyError``.

    Returns:
        DataFrame with ``claim_id`` and ``claim_text`` columns (``claim_text``
        falls back to ``""`` when the record has none), plus ``label`` and
        ``evid_ids`` when ``labelled`` is true.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        claims_by_id = json.load(handle)

    def to_row(claim_id, fields):
        # Key order here fixes the column order of the resulting frame.
        row = {"claim_id": claim_id, "claim_text": fields.get("claim_text", "")}
        if labelled:
            row["label"] = fields["claim_label"]
            row["evid_ids"] = fields["evidences"]
        return row

    rows = [to_row(claim_id, fields) for claim_id, fields in claims_by_id.items()]
    return pd.DataFrame(rows)

def load_evidence(path):
    """Read the evidence JSON file into a DataFrame.

    Args:
        path: Location of a UTF-8 JSON file mapping evidence id -> evidence text.

    Returns:
        DataFrame with one row per entry and the columns ``evid_id`` and
        ``evid_text``, in the order the entries appear in the file.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        passages = json.load(handle)

    rows = []
    for evid_id, evid_text in passages.items():
        rows.append({"evid_id": evid_id, "evid_text": evid_text})
    return pd.DataFrame(rows)

# Load every claim split and the evidence corpus into DataFrames.
df_train = load_claims(TRAIN_FILE, labelled=True)
df_dev   = load_claims(DEV_FILE,   labelled=True)
df_test  = load_claims(TEST_FILE,  labelled=False)
df_evid  = load_evidence(EVID_FILE)

# Row count of each frame.
for size_caption, frame in (
    ("Train size：", df_train),
    ("Dev size：", df_dev),
    ("Test  size：", df_test),
    ("Evidence size：", df_evid),
):
    print(size_caption, len(frame))

# Preview the first rows of the training claims and of the evidence.
for frame in (df_train, df_evid):
    display(frame.head())

# Label frequencies of the two labelled splits.
for dist_caption, frame in (
    ("Train label distribution：", df_train),
    ("Dev label distribution：", df_dev),
):
    print(dist_caption)
    display(frame["label"].value_counts())




Train size： 1228
Dev size： 154
Test  size： 153
Evidence size： 1208827


Unnamed: 0,claim_id,claim_text,label,evid_ids
0,claim-1937,Not only is there no scientific evidence that ...,DISPUTED,"[evidence-442946, evidence-1194317, evidence-1..."
1,claim-126,El Niño drove record highs in global temperatu...,REFUTES,"[evidence-338219, evidence-1127398]"
2,claim-2510,"In 1946, PDO switched to a cool phase.",SUPPORTS,"[evidence-530063, evidence-984887]"
3,claim-2021,Weather Channel co-founder John Coleman provid...,DISPUTED,"[evidence-1177431, evidence-782448, evidence-5..."
4,claim-2449,"""January 2008 capped a 12 month period of glob...",NOT_ENOUGH_INFO,"[evidence-1010750, evidence-91661, evidence-72..."


Unnamed: 0,evid_id,evid_text
0,evidence-0,"John Bennet Lawes, English entrepreneur and ag..."
1,evidence-1,Lindberg began his professional career at the ...
2,evidence-2,``Boston (Ladies of Cambridge)'' by Vampire We...
3,evidence-3,"Gerald Francis Goyer (born October 20, 1936) w..."
4,evidence-4,He detected abnormalities of oxytocinergic fun...


Train label distribution：


label
SUPPORTS           519
NOT_ENOUGH_INFO    386
REFUTES            199
DISPUTED           124
Name: count, dtype: int64

Dev label distribution：


label
SUPPORTS           68
NOT_ENOUGH_INFO    41
REFUTES            27
DISPUTED           18
Name: count, dtype: int64

## TF-IDF

In [23]:

from sklearn.feature_extraction.text import TfidfVectorizer
import re


def preprocess_text(text):
    """Strip angle-bracket tags from ``text`` and normalise its whitespace.

    Each ``<...>`` span that does not cross a line break is removed (shortest
    match first); afterwards every run of whitespace becomes a single space
    and leading/trailing whitespace is trimmed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    without_tags = re.sub(r'<.*?>', '', text)
    single_spaced = re.sub(r'\s+', ' ', without_tags)
    return single_spaced.strip()


# Keep a cleaned copy of every passage next to the raw text; the embedding
# and retrieval code further down reads this column.
df_evid['processed_text'] = df_evid['evid_text'].map(preprocess_text)

# Fit a unigram + bigram TF-IDF model on the cleaned passages, with the
# vocabulary capped at 50,000 features.
print("TF-IDF")
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=50000)
tfidf_matrix = tfidf.fit_transform(df_evid['processed_text'])
print(f"TF-IDF shape: {tfidf_matrix.shape}")

TF-IDF
TF-IDF shape: (1208827, 50000)


# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [1]:
# Report which accelerators, if any, PyTorch can see.
if not torch.cuda.is_available():
    print("CUDA is not available. Running on CPU.")
else:
    print("CUDA is available.")

    num_gpus = torch.cuda.device_count()
    print(f"Number of available GPUs: {num_gpus}")

    # One line per visible device, then the index of the active one.
    for i in range(num_gpus):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

    print(f"Current GPU device: {torch.cuda.current_device()}")

CUDA is available.
Number of available GPUs: 1
GPU 0: NVIDIA GeForce RTX 4070 SUPER
Current GPU device: 0


In [25]:

!pip install -q sentence-transformers transformers

import torch
import numpy as np
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import json
import os


# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"device: {device}")

# Class index <-> label name. The reverse lookup is derived from label_map so
# the two tables cannot drift apart.
label_map = dict(enumerate(["SUPPORTS", "REFUTES", "NOT_ENOUGH_INFO", "DISPUTED"]))
label_to_id = {name: index for index, name in label_map.items()}

print("opensource model")
# Sentence encoder used to embed claims and evidence passages.
retriever = SentenceTransformer('paraphrase-mpnet-base-v2')
retriever = retriever.to(device)
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
# NOTE(review): loading 'roberta-base' with a classification head makes
# Transformers warn that the head weights are newly initialised and that the
# model should be trained before use. No fine-tuning step is visible in this
# notebook, so labels predicted with this head are presumably uninformative
# until it is trained - confirm.
classifier = AutoModelForSequenceClassification.from_pretrained(
    'roberta-base', num_labels=len(label_map)
)
classifier = classifier.to(device)


# Embed every evidence passage once, up front; retrieve_evidence() later
# scores each claim against evidence_embeddings.
# NOTE(review): the recorded run of this cell took about 12 minutes and the
# result is not cached, so every fresh kernel pays that cost again.
print("evidence embading")
batch_size = 1000  # number of passages handed to retriever.encode() per call
all_embeddings = []
for i in tqdm(range(0, len(df_evid), batch_size), desc="Encoding evidence"):
    # Slice the next chunk of cleaned passages as a plain list of strings.
    batch = df_evid['processed_text'][i:i+batch_size].tolist()
    # The encoder's own progress bar is switched off; presumably because the
    # outer tqdm bar already reports one step per chunk.
    embeddings = retriever.encode(batch, convert_to_tensor=True, show_progress_bar=False)
    all_embeddings.append(embeddings)
# Join the per-chunk tensors along dim 0 (torch.cat's default), presumably
# giving one row per passage in df_evid order - TODO confirm shape.
evidence_embeddings = torch.cat(all_embeddings)

# Combine hand-written rules with the model
def classify_with_rules_and_model(claim_text, evidence_texts):
    """Label a claim by combining simple rules with the sequence classifier.

    Steps:
      1. If no evidence passage shares at least three lower-cased,
         whitespace-separated tokens with the claim, return
         "NOT_ENOUGH_INFO" without running the model.
      2. Otherwise join the claim and all passages with the tokenizer's
         separator token, truncate to 512 tokens and run ``classifier``.
      3. If the model looks unsure (DISPUTED probability above 0.2, or less
         than 0.3 between its highest and lowest class probability) and any
         passage contains "contrary", "oppose", "disagree" or "conflict",
         return "DISPUTED".
      4. Otherwise return the label of the highest-scoring class.

    Args:
        claim_text: The claim as a string.
        evidence_texts: List of evidence passage strings.

    Returns:
        One of the label strings held in ``label_map``.
    """
    # Rule 1: require some lexical overlap before trusting the model at all.
    claim_tokens = set(claim_text.lower().split())
    enough_overlap = any(
        len(claim_tokens & set(passage.lower().split())) >= 3
        for passage in evidence_texts
    )
    if not enough_overlap:
        return "NOT_ENOUGH_INFO"

    # Claim first, so truncation drops trailing evidence rather than the claim.
    separator = tokenizer.sep_token
    combined_text = claim_text + separator + separator.join(evidence_texts)
    encoded = tokenizer(combined_text, return_tensors="pt", truncation=True, max_length=512).to(device)

    with torch.no_grad():
        logits = classifier(**encoded).logits
        probs = torch.softmax(logits, dim=1)[0]
        predicted_class = torch.argmax(logits, dim=1).item()

    # Rule 3: when the model is unsure, contradiction cue words in the
    # evidence override its prediction.
    disputed_prob = probs[label_to_id["DISPUTED"]]
    if disputed_prob > 0.2 or (probs.max() - probs.min() < 0.3):
        contradiction_cues = ("contrary", "oppose", "disagree", "conflict")
        for passage in evidence_texts:
            lowered = passage.lower()
            if any(cue in lowered for cue in contradiction_cues):
                return "DISPUTED"

    return label_map[predicted_class]


def retrieve_evidence(claim_text, top_k=5, top_tfidf=20, tfidf_boost=0.2):
    """Return the ``top_k`` evidence passages for a claim using a hybrid ranking.

    Every passage is scored by the cosine similarity between its embedding
    and the claim's embedding. The ``top_tfidf`` passages with the highest
    TF-IDF score against the claim then have ``tfidf_boost`` added to that
    score, and the ``top_k`` best passages are returned.

    Only passages whose TF-IDF score is strictly positive are boosted. Without
    that check, a claim sharing no vocabulary with the corpus would still hand
    the bonus to ``top_tfidf`` arbitrary zero-score passages, letting them
    outrank better semantic matches.

    Args:
        claim_text: The claim as a string.
        top_k: Number of passages to return.
        top_tfidf: Maximum number of TF-IDF matches that receive the bonus.
        tfidf_boost: Amount added to the cosine score of each boosted passage.

    Returns:
        ``(evidence_ids, evidence_texts)``: two parallel lists, best match
        first, read from the ``evid_id`` and ``processed_text`` columns of
        ``df_evid``.
    """
    # Dense similarity of the claim against every pre-computed embedding.
    claim_embedding = retriever.encode(claim_text, convert_to_tensor=True)
    cos_scores = util.cos_sim(claim_embedding, evidence_embeddings)[0]
    semantic_scores = cos_scores.cpu().numpy()

    # Lexical similarity: the claim is cleaned exactly like the evidence was
    # before being projected into the fitted TF-IDF space.
    claim_tfidf = tfidf.transform([preprocess_text(claim_text)])
    tfidf_scores = (claim_tfidf @ tfidf_matrix.T).toarray()[0]

    # Boost the passages that match well under TF-IDF; zero-score passages
    # did not match at all and are left untouched.
    tfidf_top_indices = np.argsort(-tfidf_scores)[:top_tfidf]
    matched_indices = tfidf_top_indices[tfidf_scores[tfidf_top_indices] > 0]
    semantic_scores[matched_indices] += tfidf_boost

    # Take the final top-K results.
    final_top_indices = np.argsort(-semantic_scores)[:top_k]
    best_rows = df_evid.iloc[final_top_indices]
    evidence_ids = best_rows['evid_id'].tolist()
    evidence_texts = best_rows['processed_text'].tolist()

    return evidence_ids, evidence_texts

device: cuda
opensource model


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


evidence embading


Encoding evidence: 100%|██████████| 1209/1209 [11:53<00:00,  1.69it/s]


In [27]:

print("generate result")
results = {}
dev_claims = zip(df_dev['claim_id'], df_dev['claim_text'])
for claim_id, claim_text in tqdm(dev_claims, total=len(df_dev), desc="Processing claims"):
    # Retrieve the evidence first, then label the claim against it.
    evidence_ids, evidence_texts = retrieve_evidence(claim_text, top_k=5)
    label = classify_with_rules_and_model(claim_text, evidence_texts)

    # One entry per claim id: predicted label plus the retrieved evidence ids.
    results[claim_id] = {"claim_label": label, "evidences": evidence_ids}

# Make sure the output directory exists before writing into it.
if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)

output_path = os.path.join(DATA_DIR, 'dev-predictions.json')
with open(output_path, 'w') as f:
    json.dump(results, f, indent=2)

print(f"saved into {output_path}")



generate result


Processing claims: 100%|██████████| 154/154 [01:49<00:00,  1.41it/s]

saved into ./data\dev-predictions.json





# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*