In [1]:
import json
import torch
import numpy as np
from sklearn.svm import SVC
from collections import Counter
from nltk.corpus import stopwords
from scipy.spatial.distance import cdist
from sentence_transformers import SentenceTransformer
from nlp_function import pick_random_keys, stopwords_func, lower_processing

In [2]:
## Read in data
def _load_json(path):
    """Return the parsed JSON document stored at `path`.

    The file is decoded as UTF-8 explicitly so that loading does not depend
    on the platform's default locale encoding.
    """
    with open(path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)


# Training, development and (unlabelled) test claims
tclaim_data = _load_json('../project-data/train-claims.json')
dclaim_data = _load_json('../project-data/dev-claims.json')
uclaim_data = _load_json('../project-data/test-claims-unlabelled.json')

# Evidence passages (indexed by evidence key when building pairs below)
evi_data = _load_json('../project-data/evidence.json')

## Preprocessing - lowercase the text of every claim and evidence
# NOTE(review): lower_processing is a project helper from nlp_function; judging
# by its name it lowercases the named field - confirm against its definition.
tclaim_data = lower_processing(tclaim_data, "claim_text")
dclaim_data = lower_processing(dclaim_data, "claim_text")
uclaim_data = lower_processing(uclaim_data, "claim_text")
evi_data = lower_processing(evi_data, 'evidence')

# ## Remove stopwords from claims and evidence (optional)
# stop_words = set(stopwords.words('english'))
# tclaim_data = stopwords_func(stop_words, "claim", tclaim_data)
# dclaim_data = stopwords_func(stop_words, "claim", dclaim_data)
# uclaim_data = stopwords_func(stop_words, "claim", uclaim_data)
# evi_data = stopwords_func(stop_words, "evidence", evi_data)

## Build labelled (claim text, evidence text, label) triples from the training set
# Positive pairs (label 1): every training claim with each of its listed evidences.
train_pairs = [
    (claim["claim_text"], evi_data[evidence_key], 1)
    for claim in tclaim_data.values()
    for evidence_key in claim["evidences"]
]

# Negative pairs (label 0): for each claim, request as many keys from
# pick_random_keys as the claim has listed evidences, passing those evidences
# as the keys to exclude.
# NOTE(review): presumably pick_random_keys returns keys of evi_data that are
# not in the excluded list - confirm against nlp_function.
for claim in tclaim_data.values():
    gold_keys = claim["evidences"]
    for evidence_key in pick_random_keys(evi_data, gold_keys, len(gold_keys)):
        train_pairs.append((claim["claim_text"], evi_data[evidence_key], 0))

In [8]:
# Additional imports for the BERT embedding cells below.
# (torch is already imported in the first cell; re-importing it is harmless.)
import nltk
from nltk.tokenize import word_tokenize
import string
from sklearn.feature_extraction.text import CountVectorizer
import torch
from transformers import BertModel, BertTokenizer
# Download the NLTK 'stopwords' and 'punkt' packages; the recorded output
# shows both were already up-to-date on the machine that ran this cell.
nltk.download('stopwords')
nltk.download('punkt')
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to /Users/relax/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/relax/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
# Flatten the claim dictionaries (train / dev / test) into plain lists of claim
# text, and the evidence dictionary into a list of its values. Dictionary
# iteration order is kept, so list positions line up with key order.
claim_train_sentence = [claim["claim_text"] for claim in tclaim_data.values()]
claim_dep_sentence = [claim["claim_text"] for claim in dclaim_data.values()]
claim_test_sentence = [claim["claim_text"] for claim in uclaim_data.values()]
evidence_sentence = list(evi_data.values())

In [13]:
# Embed every training claim with a pre-trained BERT encoder.
model_name = "bert-large-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Tokenise all training claims as a single batch, padded to the longest claim
# and returned as PyTorch tensors.
encoded_inputs = tokenizer.batch_encode_plus(
    claim_train_sentence,
    add_special_tokens=True,
    padding='longest',
    truncation=True,
    return_tensors='pt',
)

# Forward pass only; no gradients are needed for feature extraction.
with torch.no_grad():
    model_output = model(**encoded_inputs)

# Use the hidden state at token position 0 of each sequence (the [CLS] token
# that BERT's tokenizer prepends) as the sentence embedding.
sentence_embeddings = model_output.last_hidden_state[:, 0, :]

print(sentence_embeddings.shape)

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

torch.Size([1228, 1024])


In [15]:
# Display the claim embeddings as a NumPy array (tensor is moved to CPU first).
sentence_embeddings.cpu().numpy()

array([[-0.39607003, -0.37619188, -0.7186759 , ..., -0.8095297 ,
        -0.6317222 ,  0.10742587],
       [-0.23124436,  0.28230086, -0.56114906, ..., -0.9730543 ,
        -0.5901787 ,  0.35957515],
       [-0.08257185, -0.06848834, -0.19044401, ..., -0.20861049,
        -0.58043325,  0.23053607],
       ...,
       [ 0.12806794,  0.30706802, -0.61280346, ..., -1.0072852 ,
        -0.7599472 ,  0.20259118],
       [-0.1665917 ,  0.2650316 , -0.94764674, ..., -0.8275705 ,
        -0.5272113 , -0.48972073],
       [-0.51467323, -0.12119047, -1.2438874 , ..., -0.06886862,
        -1.4306076 ,  0.40594798]], dtype=float32)

In [4]:
def _claim_texts(claims):
    """Return the "claim_text" of every entry in `claims`, in dictionary order."""
    return [entry["claim_text"] for entry in claims.values()]


# Plain-text lists that are fed to the sentence encoder below.
claim_train_sentence = _claim_texts(tclaim_data)
claim_dep_sentence = _claim_texts(dclaim_data)
claim_test_sentence = _claim_texts(uclaim_data)
evidence_sentence = [evi_data[evidence_key] for evidence_key in evi_data]

# Load pre-trained SBERT model
model_name = 'distilbert-base-nli-mean-tokens'

# Prefer Apple's Metal backend ("mps") as before, but fall back to CUDA or CPU
# so the cell also runs on machines where MPS is not available.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    embedding_device = torch.device("mps")
elif torch.cuda.is_available():
    embedding_device = torch.device("cuda")
else:
    embedding_device = torch.device("cpu")
model = SentenceTransformer(model_name, device=embedding_device)

# Embed the train / dev / test claims and all evidence passages.
# (The misspelt name `deveplop_claim_embeddings` is kept because other cells
# refer to it.)
train_claim_embeddings = model.encode(claim_train_sentence)
deveplop_claim_embeddings = model.encode(claim_dep_sentence)
test_claim_embeddings = model.encode(claim_test_sentence)
evidence_embeddings = model.encode(evidence_sentence)

# Persist the embeddings to disk. Each file is opened in a `with` block so the
# handle is flushed and closed as soon as the array has been written.
# NOTE: np.save writes the single-array .npy format even though the paths end
# in ".npz"; the paths are left unchanged so existing readers keep working.
with open("../train_claim.npz", "wb") as train_out_file:
    np.save(train_out_file, train_claim_embeddings)
with open("../dev_claim.npz", "wb") as dev_out_file:
    np.save(dev_out_file, deveplop_claim_embeddings)
with open("../test_claim.npz", "wb") as test_out_file:
    np.save(test_out_file, test_claim_embeddings)
with open("../evidence.npz", "wb") as evi_out_file:
    np.save(evi_out_file, evidence_embeddings)

In [None]:
# Gather the claim texts: all training claims first, then all test claims, so
# the two groups can be separated again by position after encoding.
train_sentences = [claim["claim_text"] for claim in tclaim_data.values()]
test_sentences = [claim["claim_text"] for claim in uclaim_data.values()]
sentence_dict = {"train": len(train_sentences), "test": len(test_sentences)}
sentence_list = train_sentences + test_sentences

# Load pre-trained SBERT model
model_name = 'distilbert-base-nli-mean-tokens'
model = SentenceTransformer(model_name)

# Encode everything in one call, then split the rows back into the training
# part (first sentence_dict["train"] rows) and the test part (the rest).
embeddings = model.encode(sentence_list)
train_row_count = sentence_dict["train"]
train_matrix, test_matrix = embeddings[:train_row_count], embeddings[train_row_count:]

# Capture the closest training instance (index) to each test instance.
# All test-to-train Euclidean distances are computed in a single cdist call
# (rows = test claims, columns = training claims); argmin along axis 1 then
# yields, per test claim, the index of its nearest training claim.
if test_matrix.shape[0] == 0:
    # No test claims: nothing to match.
    test_train_index = []
else:
    distances = cdist(test_matrix, train_matrix, metric='euclidean')
    test_train_index = list(np.argmin(distances, axis=1))

In [None]:
# 0-R Classification: find the label that occurs most often in the training claims.
label_list = [claim["claim_label"] for claim in tclaim_data.values()]
strings = label_list
counter = Counter(strings)
most_common = counter.most_common(1)
# most_common holds a single (label, count) tuple.
most_frequent_string, frequency = most_common[0]

# Give every test claim the majority label and the evidence list of its
# nearest training claim (test_train_index is positionally aligned with
# uclaim_data's iteration order).
train_key_list = list(tclaim_data.keys())
for position, test_claim in enumerate(uclaim_data.values()):
    nearest_train_key = train_key_list[test_train_index[position]]
    nearest_train_claim = tclaim_data[nearest_train_key]
    test_claim["claim_label"] = most_frequent_string
    test_claim["evidences"] = nearest_train_claim["evidences"]

# Write the annotated test claims out as the prediction file.
file_path = '../test-claims-predictions.json'
with open(file_path, 'w') as predictions_file:
    json.dump(uclaim_data, predictions_file)