In [1]:
import json
import re
from collections import Counter, defaultdict
import unicodedata

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
print(torch.version.cuda)
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

12.4
True
NVIDIA GeForce RTX 3060 Ti


In [None]:
train_claims_path = 'data/train-claims.json'
dev_claims_path = 'data/dev-claims.json'
evidence_path = 'data/evidence.json'

# loading the data
with open(train_claims_path, 'r', encoding='utf-8') as f:
    train_claims = json.load(f)
with open(dev_claims_path, 'r', encoding='utf-8') as f:
    dev_claims = json.load(f)
with open(evidence_path, 'r', encoding='utf-8') as f:
    evidences = json.load(f)

# Extract the first 10 claims from the evidence data
# evidences = dict(list(evidences.items())[:20000])

print(f"Loaded {len(train_claims)} training claims.")
print(f"Loaded {len(dev_claims)} dev claims.")
print(f"Loaded {len(evidences)} evidences.")

Loaded 1228 training claims.
Loaded 154 dev claims.
Loaded 1208827 evidences.


In [None]:
def clean_text(text):
    """
    Clean the input text by:
    - Removing non-ASCII characters
    - Lowercasing
    - Removing extra spaces
    - Removing unwanted punctuations (optional, adjustable)

    Args:
        text (str): The raw text to clean.

    Returns:
        str: The cleaned text.
    """
    # Normalize unicode characters (e.g., replace \u2019 with real symbols)
    text = unicodedata.normalize('NFKD', text)
    # Encode to ASCII bytes, ignore non-ASCII characters, then decode back to string
    text = text.encode('ascii', 'ignore').decode('utf-8')
    # Lowercase the text
    text = text.lower()
    # Remove unwanted punctuations (keep basic ones if needed)
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    # Replace multiple spaces with a single space
    text = re.sub(r"\s+", " ", text).strip()

    return text

def build_inverted_index(evidences):
    """
    Build:
    1. inverted index: word -> set of evidence IDs
    2. word sets for each evidence (not just cleaned text)
    """
    from collections import defaultdict
    import re

    inverted_index = defaultdict(set)
    evidence_word_sets = {}

    for evidence_id, text in evidences.items():
        clean_text_ = clean_text(text)
        word_list = re.findall(r'\w+', clean_text_.lower())
        word_set = set(word_list)

        # Save set of words directly
        evidence_word_sets[evidence_id] = word_set

        # Build inverted index
        for word in word_set:
            inverted_index[word].add(evidence_id)

    return inverted_index, evidence_word_sets

#     return top_evidence_ids

def simple_retrieve_fast(claim_text, inverted_index, evidence_word_sets, top_k=5, max_candidates=10000):
    """
    Fast retrieval using inverted index + limited candidates.
    Uses preprocessed evidence word sets directly.

    Args:
        claim_text (str): The claim text.
        inverted_index (dict): Prebuilt word -> evidence_ids map.
        evidence_word_sets (dict): {evidence_id: set of words}.
        top_k (int): Number of evidences to retrieve.
        max_candidates (int): Max number of evidence candidates to consider.

    Returns:
        List of evidence IDs.
    """
    clean_claim = clean_text(claim_text)
    claim_words = re.findall(r'\w+', clean_claim.lower())
    claim_word_set = set(claim_words)

    # Gather candidate evidence IDs
    candidate_ids = set()
    for word in claim_word_set:
        if word in inverted_index:
            candidate_ids.update(inverted_index[word])

    # Truncate to max_candidates
    if len(candidate_ids) > max_candidates:
        candidate_ids = set(list(candidate_ids)[:max_candidates])

    # Fine-grained scoring
    scores = []
    for evidence_id in candidate_ids:
        evidence_words = evidence_word_sets[evidence_id]
        common_word_count = len(claim_word_set & evidence_words)
        scores.append((evidence_id, common_word_count))

    scores.sort(key=lambda x: x[1], reverse=True)
    top_evidence_ids = [evidence_id for evidence_id, score in scores[:top_k]]

    return top_evidence_ids


In [16]:
inverted_index, evidence_word_sets = build_inverted_index(evidences)
print("Inverted index and evidence word sets built.")
print(f"Number of unique words in inverted index: {len(inverted_index)}")
print(f"Number of unique evidence IDs: {len(evidence_word_sets)}")

Inverted index and evidence word sets built.
Number of unique words in inverted index: 568713
Number of unique evidence IDs: 1208827


In [17]:
claim_id = list(train_claims)[0]
print(f"Example claim ID: {claim_id}")
claim_text = train_claims[claim_id]["claim_text"]
print(f"Example claim text: {claim_text}")
cleaned_claim_text = clean_text(claim_text)
print(f"Example cleaned claim text: {cleaned_claim_text}")
cleaned_evidences = {evidence_id: clean_text(evidence_text) for evidence_id, evidence_text in evidences.items()}
print(f"Example cleaned evidence: {list(cleaned_evidences.values())[0]}")
retrieved_evidence_ids = simple_retrieve_fast(cleaned_claim_text, inverted_index, evidence_word_sets, top_k=5, max_candidates=100000)
print(f"Retrieved evidence IDs: {retrieved_evidence_ids}")
print(f"Retrieved evidence texts: {[cleaned_evidences[evidence_id] for evidence_id in retrieved_evidence_ids]}")

Example claim ID: claim-1937
Example claim text: Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.
Example cleaned claim text: not only is there no scientific evidence that co2 is a pollutant higher co2 concentrations actually help ecosystems support more plant and animal life
Example cleaned evidence: john bennet lawes english entrepreneur and agricultural scientist
Retrieved evidence IDs: ['evidence-367539', 'evidence-20656', 'evidence-888023', 'evidence-499734', 'evidence-832031']
Retrieved evidence texts: ['omnivores also consume both animal and non animal food and apart from the more general definition there is no clearly defined ratio of plant to animal material that would distinguish a facultative carnivore from an omnivore', 'james huneker remarked that the four movements of the sonata have no common life and that the sonata is not more a sonata than it is a sequence of ballad

In [19]:
class SimpleClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        """
        Initialize the SimpleClassifier model.

        Args:
            vocab_size (int): Size of the vocabulary.
            embedding_dim (int): Dimension of the embedding vectors.
            hidden_dim (int): Dimension of the hidden LSTM state.
            output_dim (int): Number of output classes.
        """
        super(SimpleClassifier, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, input_ids):
        """
        Forward pass of the model.

        Args:
            input_ids (Tensor): Tensor of token ids with shape (batch_size, seq_length).

        Returns:
            logits (Tensor): Raw output scores for each class (before softmax).
        """
        embedded = self.embedding(input_ids)  # Shape: (batch_size, seq_length, embedding_dim)
        _, (hidden, _) = self.lstm(embedded)   # hidden shape: (1, batch_size, hidden_dim)
        hidden = hidden.squeeze(0)             # Shape: (batch_size, hidden_dim)
        logits = self.fc(hidden)                # Shape: (batch_size, output_dim)
        return logits


In [20]:
class SimpleTokenizer:
    def __init__(self):
        """
        Initialize the SimpleTokenizer.
        """
        self.word2id = {}
        self.id2word = {}
        self.pad_token = "<PAD>"
        self.unk_token = "<UNK>"

    def build_vocab(self, texts, min_freq=1):
        """
        Build vocabulary from a list of texts.

        Args:
            texts (list): List of text strings.
            min_freq (int): Minimum frequency a word must have to be included.
        """
        word_freq = {}

        for text in texts:
            words = re.findall(r'\w+', text.lower())
            for word in words:
                word_freq[word] = word_freq.get(word, 0) + 1

        # Initialize vocabulary with special tokens
        self.word2id = {
            self.pad_token: 0,
            self.unk_token: 1
        }
        self.id2word = {
            0: self.pad_token,
            1: self.unk_token
        }

        idx = 2  # Start indexing words from 2
        for word, freq in word_freq.items():
            if freq >= min_freq:
                self.word2id[word] = idx
                self.id2word[idx] = word
                idx += 1

    def encode(self, text):
        """
        Encode a text string into a list of token ids.

        Args:
            text (str): Input text.

        Returns:
            List[int]: List of token ids.
        """
        words = re.findall(r'\w+', text.lower())
        ids = [self.word2id.get(word, self.word2id[self.unk_token]) for word in words]
        return ids

    def decode(self, ids):
        """
        Decode a list of token ids back into a text string.

        Args:
            ids (list): List of token ids.

        Returns:
            str: Decoded text string.
        """
        words = [self.id2word.get(id_, self.unk_token) for id_ in ids]
        return ' '.join(words)

    def vocab_size(self):
        """
        Get the size of the vocabulary.

        Returns:
            int: Number of tokens in the vocabulary.
        """
        return len(self.word2id)


In [21]:
def prepare_training_data_fast(claims, inverted_index, evidence_word_sets, tokenizer, top_k=5, max_seq_length=128, max_candidates=1000):
    """
    Prepare the training dataset using fast retrieval with preprocessed word sets.

    Args:
        claims (dict): Dictionary of claim instances.
        inverted_index (dict): Prebuilt word -> evidence_ids map.
        evidence_word_sets (dict): Preprocessed word sets for each evidence.
        tokenizer (SimpleTokenizer): The tokenizer to encode texts.
        top_k (int): Number of evidences to retrieve per claim.
        max_seq_length (int): Maximum length of input sequence.
        max_candidates (int): Max candidate evidences for filtering.

    Returns:
        List of (input_ids, label_id) pairs.
    """
    data = []

    label_map = {
        "SUPPORTS": 0,
        "REFUTES": 1,
        "NOT_ENOUGH_INFO": 2,
        "DISPUTED": 3
    }

    for claim_id, claim_data in claims.items():
        claim_text = claim_data['claim_text']
        claim_label = claim_data['claim_label']

        clean_claim = clean_text(claim_text)

        # Retrieve top evidence IDs
        retrieved_ids = simple_retrieve_fast(
            clean_claim,
            inverted_index,
            evidence_word_sets,
            top_k=top_k,
            max_candidates=max_candidates
        )

        # Convert evidence word sets back to text (space-joined)
        evidence_texts = [
            " ".join(sorted(evidence_word_sets[ev_id])) for ev_id in retrieved_ids if ev_id in evidence_word_sets
        ]
        combined_text = clean_claim + " " + " ".join(evidence_texts)
        combined_text = clean_text(combined_text)

        # Encode
        input_ids = tokenizer.encode(combined_text)

        # Pad or truncate
        if len(input_ids) > max_seq_length:
            input_ids = input_ids[:max_seq_length]
        else:
            padding_length = max_seq_length - len(input_ids)
            input_ids = input_ids + [tokenizer.word2id[tokenizer.pad_token]] * padding_length

        # Label
        label_id = label_map.get(claim_label, 2)  # Default to NOT_ENOUGH_INFO

        data.append((input_ids, label_id))

    return data


In [22]:
class ClaimDataset(Dataset):
    def __init__(self, data):
        """
        Custom Dataset for claim data.

        Args:
            data (list): List of (input_ids, label_id) pairs.
        """
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_ids, label = self.data[idx]
        return torch.tensor(input_ids, dtype=torch.long), torch.tensor(label, dtype=torch.long)


In [23]:
def evaluate_model(model, val_dataset, batch_size=16, device='cpu'):
    """
    Evaluate the model on validation dataset.

    Args:
        model (nn.Module): The classification model.
        val_dataset (Dataset): Validation dataset.
        batch_size (int): Batch size.
        device (str): 'cpu' or 'cuda'.

    Returns:
        avg_loss (float): Average loss on validation set.
        accuracy (float): Accuracy on validation set.
    """
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    model.eval()  # Set model to evaluation mode

    criterion = nn.CrossEntropyLoss()

    total_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():  # Turn off gradient tracking
        for batch in val_loader:
            input_ids, labels = batch
            input_ids = input_ids.to(device)
            labels = labels.to(device)

            outputs = model(input_ids)
            loss = criterion(outputs, labels)

            total_loss += loss.item()

            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    avg_loss = total_loss / len(val_loader)
    accuracy = correct / total

    return avg_loss, accuracy

def train_model_with_validation(model, train_dataset, val_dataset, epochs=5, batch_size=16, learning_rate=1e-3, device='cpu'):
    """
    Train the model with validation after each epoch.

    Args:
        model (nn.Module): The classification model.
        train_dataset (Dataset): Training dataset.
        val_dataset (Dataset): Validation dataset.
        epochs (int): Number of epochs.
        batch_size (int): Batch size.
        learning_rate (float): Learning rate for the optimizer.
        device (str): 'cpu' or 'cuda'.
    """
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        model.train()  # Set model to train mode

        total_loss = 0.0
        correct = 0
        total = 0

        for batch in train_loader:
            input_ids, labels = batch
            input_ids = input_ids.to(device)
            labels = labels.to(device)

            outputs = model(input_ids)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

        avg_train_loss = total_loss / len(train_loader)
        train_accuracy = correct / total

        # Evaluate on validation set
        val_loss, val_accuracy = evaluate_model(model, val_dataset, batch_size, device)

        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {avg_train_loss:.4f} | Train Acc: {train_accuracy:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_accuracy:.4f}")


In [24]:
# Step 1: Collect all texts for building vocabulary
texts_for_vocab = []

for claim in train_claims.values():
    texts_for_vocab.append(claim['claim_text'])
for ev in evidences.values():
    texts_for_vocab.append(ev)

# Step 2: Create and build tokenizer
tokenizer = SimpleTokenizer()
tokenizer.build_vocab(texts_for_vocab)

# Step 3: initialize the model
model = SimpleClassifier(
    vocab_size=tokenizer.vocab_size(), 
    embedding_dim=128, 
    hidden_dim=256, 
    output_dim=4
)


In [None]:
# Move to GPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Prepare train and validation datasets
train_data = prepare_training_data_fast(train_claims, inverted_index, evidence_word_sets, tokenizer, top_k=5, max_seq_length=128, max_candidates=100000)
val_data = prepare_training_data_fast(dev_claims, inverted_index, evidence_word_sets, tokenizer, top_k=5, max_seq_length=128, max_candidates=100000)
print(f"Prepared {len(train_data)} training samples and {len(val_data)} validation samples.")

train_dataset = ClaimDataset(train_data)
val_dataset = ClaimDataset(val_data)

print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")


Using device: cuda
Prepared 1228 training samples and 154 validation samples.
Training dataset size: 1228
Validation dataset size: 154


In [35]:
# Start training with validation
train_model_with_validation(model, train_dataset, val_dataset, epochs=5, batch_size=16, learning_rate=1e-3, device=device)

Epoch 1/5 | Train Loss: 1.5758 | Train Acc: 0.3550 | Val Loss: 1.3378 | Val Acc: 0.4286
Epoch 2/5 | Train Loss: 1.1711 | Train Acc: 0.4593 | Val Loss: 1.3964 | Val Acc: 0.3571
Epoch 3/5 | Train Loss: 1.0179 | Train Acc: 0.5619 | Val Loss: 1.5263 | Val Acc: 0.3247
Epoch 4/5 | Train Loss: 0.9011 | Train Acc: 0.6124 | Val Loss: 1.5658 | Val Acc: 0.3831
Epoch 5/5 | Train Loss: 0.7952 | Train Acc: 0.6555 | Val Loss: 1.7516 | Val Acc: 0.3312


In [None]:
def generate_submission(
    model,
    test_claims,
    inverted_index,
    evidence_word_sets,
    tokenizer,
    output_file="test-output.json",
    top_k=5,
    max_seq_length=128,
    max_candidates=1000,
    device="cpu"
):
    model.eval()
    id_to_label = {
        0: "SUPPORTS",
        1: "REFUTES",
        2: "NOT_ENOUGH_INFO",
        3: "DISPUTED"
    }

    output_dict = {}

    with torch.no_grad():
        for claim_id, claim_data in test_claims.items():
            claim_text = claim_data["claim_text"]

            # Retrieve top evidence IDs
            retrieved_ids = simple_retrieve_fast(
                claim_text,
                inverted_index,
                evidence_word_sets,
                top_k=top_k,
                max_candidates=max_candidates
            )

            # Join evidence words (for input encoding)
            evidence_texts = [
                " ".join(sorted(evidence_word_sets[ev_id])) for ev_id in retrieved_ids if ev_id in evidence_word_sets
            ]
            combined_text = clean_text(claim_text) + " " + " ".join(evidence_texts)
            input_ids = tokenizer.encode(combined_text)

            # Pad/truncate
            if len(input_ids) > max_seq_length:
                input_ids = input_ids[:max_seq_length]
            else:
                padding = max_seq_length - len(input_ids)
                input_ids += [tokenizer.word2id[tokenizer.pad_token]] * padding

            # Predict
            input_tensor = torch.tensor([input_ids], dtype=torch.long).to(device)
            output = model(input_tensor)
            pred_label_id = torch.argmax(output, dim=1).item()
            pred_label = id_to_label[pred_label_id]

            # Construct output
            output_dict[claim_id] = {
                "claim_text": claim_text,
                "claim_label": pred_label,
                "evidences": retrieved_ids  # top_k evidences
            }

    # Save to JSON
    import json
    with open(output_file, "w") as f:
        json.dump(output_dict, f, indent=2)

    print(f"✅ Submission file saved to: {output_file}")


In [31]:
test_path = 'data/test-claims-unlabelled.json'
with open(test_path, 'r', encoding='utf-8') as f:
    test_claims = json.load(f)

generate_submission(
    model=model,
    test_claims=test_claims,
    inverted_index=inverted_index,
    evidence_word_sets=evidence_word_sets,
    tokenizer=tokenizer,
    output_file="submission.json",
    device="cuda"
)


✅ Submission file saved to: submission.json
