In [34]:
import json
import numpy as np
from collections import Counter
from nltk.corpus import stopwords
from scipy.spatial.distance import cdist
from sentence_transformers import SentenceTransformer
from nlp_function import pick_random_keys, stopwords_func, lower_processing

In [35]:
## Read in data
def _read_json(path):
    """Load one JSON document from `path`.

    The file is decoded explicitly as UTF-8 instead of relying on the
    platform's locale encoding, so the notebook reads the same text on every
    machine (the locale default is not UTF-8 on all platforms).

    Args:
        path: Location of the JSON file.

    Returns:
        The deserialised JSON content.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# Claims: training, development and unlabelled test splits
tclaim_data = _read_json('../project-data/train-claims.json')
dclaim_data = _read_json('../project-data/dev-claims.json')
uclaim_data = _read_json('../project-data/test-claims-unlabelled.json')

# Evidence passages, looked up by key when claim/evidence pairs are built
evi_data = _read_json('../project-data/evidence.json')

## Preprocessing - lowercase the claim and evidence text
# NOTE(review): lower_processing comes from the project module nlp_function;
# it presumably lower-cases the field named by the second argument -- confirm there.
tclaim_data = lower_processing(tclaim_data, "claim_text")
dclaim_data = lower_processing(dclaim_data, "claim_text")
uclaim_data = lower_processing(uclaim_data, "claim_text")
evi_data = lower_processing(evi_data, 'evidence')

# ## Remove stopwords from claims and evidence (optional)
# stop_words = set(stopwords.words('english'))
# tclaim_data = stopwords_func(stop_words, "claim", tclaim_data)
# dclaim_data = stopwords_func(stop_words, "claim", dclaim_data)
# uclaim_data = stopwords_func(stop_words, "claim", uclaim_data)
# evi_data = stopwords_func(stop_words, "evidence", evi_data)

## Build labelled (claim text, evidence text, label) triples from the training set
# Positive pairs (label 1): every training claim with each of its listed evidences.
train_pairs = [
    (claim["claim_text"], evi_data[evidence_key], 1)
    for claim in tclaim_data.values()
    for evidence_key in claim["evidences"]
]

## Append negative samples (label 0) after all the positive ones
# For each claim, request as many random evidence keys as it has true evidences.
# NOTE(review): pick_random_keys lives in nlp_function; presumably it never
# returns a key contained in excluded_keys -- confirm there.
for claim in tclaim_data.values():
    excluded_keys = claim["evidences"]
    random_keys = pick_random_keys(evi_data, excluded_keys, len(excluded_keys))
    train_pairs.extend(
        (claim["claim_text"], evi_data[evidence_key], 0)
        for evidence_key in random_keys
    )

In [36]:
# Obtain sentence list: all training claims first, then all test claims, so the
# embedding matrix can be split back into the two sets by position.
train_sentences = [claim["claim_text"] for claim in tclaim_data.values()]
test_sentences = [claim["claim_text"] for claim in uclaim_data.values()]
sentence_list = train_sentences + test_sentences
sentence_dict = {"train": len(train_sentences), "test": len(test_sentences)}

# Load pre-trained SBERT model
model_name = 'distilbert-base-nli-mean-tokens'
model = SentenceTransformer(model_name)

# Embed sentences and split the rows back into training / test matrices
embeddings = model.encode(sentence_list)
train_matrix = embeddings[:sentence_dict["train"]]
test_matrix = embeddings[sentence_dict["train"]:]

# Capture the closest training instance (index) for every test claim.
# A single cdist call yields the full (n_test, n_train) Euclidean distance
# matrix; argmin along axis 1 then picks, per test row, the nearest training
# row (the first one on ties, exactly like a per-row np.argmin would).
distances = cdist(test_matrix, train_matrix, metric='euclidean')
test_train_index = distances.argmin(axis=1).tolist()

In [48]:
# 0-R Classification: find the label that occurs most often in the training claims
label_list = [claim["claim_label"] for claim in tclaim_data.values()]
strings = label_list
counter = Counter(strings)
most_common = counter.most_common(1)
most_frequent_string, frequency = most_common[0]

# Assign label and evidence to the test set:
# every test claim gets the majority label plus the evidence list of the
# training claim whose index was stored for it in test_train_index.
train_key_list = list(tclaim_data.keys())
for position, test_claim in enumerate(uclaim_data.values()):
    test_claim["claim_label"] = most_frequent_string
    nearest_key = train_key_list[test_train_index[position]]
    test_claim["evidences"] = tclaim_data[nearest_key]["evidences"]
count = len(uclaim_data)  # number of test claims processed

# Save the test set result
file_path = '../test-claims-predictions.json'
with open(file_path, 'w') as json_file:
    json_file.write(json.dumps(uclaim_data))

In [None]:
import torch
from transformers import BertTokenizer, BertModel
from torch import nn, optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

# Set up a simple classifier for text matching
classifier = nn.Sequential(
    nn.Linear(2 * model.config.hidden_size, 256),
    nn.ReLU(),
    nn.Linear(256, 2)  # Assuming binary classification (matched vs not matched)
).to(device)

# Set up optimizer and loss function
optimizer = optim.Adam(classifier.parameters(), lr=0.0001)
loss_fn = nn.CrossEntropyLoss()

# Training data
train_data = train_pairs

# Training loop
for epoch in range(20):
    total_loss = 0
    for claim, evidence, label in train_data:
        # Tokenize and encode claim and evidence
        claim_tokens = tokenizer.tokenize(claim)
        evidence_tokens = tokenizer.tokenize(evidence)
        encoded_data = tokenizer.encode_plus(claim_tokens, evidence_tokens, padding='max_length', truncation=True, return_tensors='pt')
        input_ids = encoded_data['input_ids'].to(device)
        attention_mask = encoded_data['attention_mask'].to(device)

        # Generate BERT embeddings
        # Generate BERT embeddings
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            claim_embedding = outputs.last_hidden_state[:, 0, :]
            evidence_embedding = outputs.last_hidden_state[:, 0, :]

        # Concatenate claim and evidence embeddings
        combined_embedding = torch.cat((claim_embedding, evidence_embedding), dim=1)

        # Make predictions
        logits = classifier(combined_embedding)
        predictions = torch.argmax(logits, dim=1)

        # Calculate loss
        loss = loss_fn(logits, torch.tensor([label]).to(device))

        # Backpropagation and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}: Average Loss = {total_loss / len(train_data)}")

In [None]:
# Test data
test_data = []
for i in dclaim_data.values():
    for j in i["evidences"]:
        test_data.append((i["claim_text"], evi_data[j]))

count = 0
# Testing loop
for claim, evidence in test_data:
    # Tokenize and encode claim and evidence
    claim_tokens = tokenizer.tokenize(claim)
    evidence_tokens = tokenizer.tokenize(evidence)
    encoded_data = tokenizer.encode_plus(claim_tokens, evidence_tokens, padding='max_length', truncation=True, return_tensors='pt')
    input_ids = encoded_data['input_ids'].to(device)
    attention_mask = encoded_data['attention_mask'].to(device)

    # Generate BERT embeddings
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        claim_embedding = outputs.last_hidden_state[:, 0, :]
        evidence_embedding = outputs.last_hidden_state[:, 0, :]

    # Concatenate claim and evidence embeddings
    combined_embedding = torch.cat((claim_embedding, evidence_embedding), dim=1)

    # Make predictions
    logits = classifier(combined_embedding)
    predictions = torch.argmax(logits, dim=1)
    if predictions.item() == 1:
        count += 0

    # Print results
    print(f"Claim: {claim}")
    print(f"Evidence: {evidence}")
    print(f"Prediction: {predictions.item()}")
    print()

count / len(test_data)