In [9]:
import json
import random
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nlp_function import pick_random_keys, stopwords_func, lower_processing, most_frequent_element

## Task 1

In [10]:
## Read in data
# Every JSON file is opened with an explicit UTF-8 encoding so that loading
# does not depend on the platform's default locale encoding.

# Read in training data (claim)
with open('../project-data/train-claims.json', 'r', encoding='utf-8') as tclaim_file:
    tclaim_data = json.load(tclaim_file)

# Read in development data (claim)
with open('../project-data/dev-claims.json', 'r', encoding='utf-8') as dclaim_file:
    dclaim_data = json.load(dclaim_file)

# Read in test data (claim)
with open('../project-data/test-claims-unlabelled.json', 'r', encoding='utf-8') as uclaim_file:
    uclaim_data = json.load(uclaim_file)

# Read in evidence data
with open('../project-data/evidence.json', 'r', encoding='utf-8') as evi_file:
    evi_data = json.load(evi_file)

## Preprocessing - lowercase the claim and evidence text
# lower_processing comes from the project module nlp_function; its result
# replaces the loaded data under the same names.
tclaim_data = lower_processing(tclaim_data, "claim_text")
dclaim_data = lower_processing(dclaim_data, "claim_text")
uclaim_data = lower_processing(uclaim_data, "claim_text")
evi_data = lower_processing(evi_data, 'evidence')

# ## Remove stopwords from claims and evidence (optional)
# NOTE(review): `stopwords` is not imported in this notebook (presumably
# nltk.corpus.stopwords) - add the import before enabling this block.
# stop_words = set(stopwords.words('english'))
# tclaim_data = stopwords_func(stop_words, "claim", tclaim_data)
# dclaim_data = stopwords_func(stop_words, "claim", dclaim_data)
# uclaim_data = stopwords_func(stop_words, "claim", uclaim_data)
# evi_data = stopwords_func(stop_words, "evidence", evi_data)

## Create claim-evidence pairs based on the training set
# Each pair is (claim id, claim text, evidence text, evidence id).
# evi_keys and labels are appended in lockstep with train_pairs, so the three
# lists stay index-aligned.
tkey_list = list(tclaim_data.keys())
train_pairs = []
evi_keys = []
labels = []

# Positive samples: every evidence id listed for a claim gets label 1.
for claim_id, claim in tclaim_data.items():
    for evi_id in claim["evidences"]:
        evi_keys.append(evi_id)
        train_pairs.append((claim_id, claim["claim_text"], evi_data[evi_id], evi_id))
        labels.append(1)

## Insert negative samples into the training set
# Seed the RNG so the sampled negatives are reproducible across runs.
random.seed(1)
for claim_id, claim in tclaim_data.items():
    excluded_keys = claim["evidences"]
    # Ask for as many evidence ids as the claim has listed evidences, passing
    # the listed ids as the exclusion list (presumably so they are never
    # drawn as negatives - confirm in nlp_function.pick_random_keys).
    random_keys = pick_random_keys(evi_data, excluded_keys, len(excluded_keys))
    for evi_id in random_keys:
        evi_keys.append(evi_id)
        train_pairs.append((claim_id, claim["claim_text"], evi_data[evi_id], evi_id))
        labels.append(0)

In [11]:
# Flatten the claim / evidence texts into plain lists for the vectorizers.
# Each list follows the iteration order of the dict it is built from.
claim_train_sentence = [claim["claim_text"] for claim in tclaim_data.values()]
claim_dep_sentence = [claim["claim_text"] for claim in dclaim_data.values()]
claim_test_sentence = [claim["claim_text"] for claim in uclaim_data.values()]
evidence_full_sentence = list(evi_data.values())

# Evidence text (tuple slot 2) and evidence id (tuple slot 3) of the first
# half of train_pairs.
# NOTE(review): the label-1 pairs are appended to train_pairs before the
# label-0 pairs, so this half is exactly the positive pairs only if the
# sampler returned one negative per positive - confirm.
leading_pairs = train_pairs[:int(len(train_pairs)/2)]
evidence_sample_sentence = [pair[2] for pair in leading_pairs]
tfidf_keys = [pair[3] for pair in leading_pairs]

In [12]:
# Corpus used to build the vocabulary: training claims, development claims
# and every evidence text
train_data = claim_train_sentence + claim_dep_sentence + evidence_full_sentence

# The development claims are the queries in this cell
test_data = claim_dep_sentence

# Bag-of-words vectorizer fitted on the corpus (fit() returns the fitted
# vectorizer, so construction and fitting are chained)
vectorizer = CountVectorizer().fit(train_data)

# Count vectors for the training claims, all evidences and the queries
train_embeddings = vectorizer.transform(claim_train_sentence)
evi_embeddings = vectorizer.transform(evidence_full_sentence)
test_embeddings = vectorizer.transform(test_data)

# For every query keep the positions of the k evidences with the highest
# cosine similarity, most similar first
k = 3
evi_label_list = []
for row in range(test_embeddings.shape[0]):
    cosine_sim = cosine_similarity(test_embeddings[row], evi_embeddings)
    # argsort is ascending, so reverse it before taking the first k entries
    ranked = np.argsort(cosine_sim.flatten())[::-1]
    evi_label_list.append(list(ranked[:k]))

# Evidence ids in the same order as the rows of evi_embeddings
evi_key_list = list(evi_data.keys())

# Translate the row positions back into evidence ids
test_evi_list = [[evi_key_list[pos] for pos in top_k] for top_k in evi_label_list]

# Attach the retrieved evidences to each development claim together with a
# constant "SUPPORTS" label; any existing values under these two keys are
# overwritten in place.
for position, claim_id in enumerate(dclaim_data):
    dclaim_data[claim_id]["claim_label"] = "SUPPORTS"
    dclaim_data[claim_id]["evidences"] = test_evi_list[position]

# Save the development-set predictions
file_path = '../project-data/dev-claims-test.json'
with open(file_path, 'w') as json_file:
    json.dump(dclaim_data, json_file)

## Task 2

In [13]:
# Corpus for fitting TF-IDF: training claims followed by development claims
train_data = claim_train_sentence + claim_dep_sentence

# The unlabelled test claims are the ones to be classified
test_data = claim_test_sentence

# Build the TF-IDF vectorizer and fit it on the corpus (fit() returns the
# fitted vectorizer, so construction and fitting are chained)
tfidf_vectorizer = TfidfVectorizer().fit(train_data)

# Dense TF-IDF matrices, one row per sentence
train_embeddings = tfidf_vectorizer.transform(train_data).toarray()
test_embeddings = tfidf_vectorizer.transform(test_data).toarray()

# train_data lists the training claims first, so the leading
# len(claim_train_sentence) rows are exactly the training-claim vectors
train_np = train_embeddings[:len(claim_train_sentence)]
test_np = test_embeddings

In [14]:
# Collect the nearest training claims for the majority-voting operation.
# One cosine_similarity call produces the whole (test x train) similarity
# matrix instead of one sklearn call per (test, train) pair.
if test_np.shape[0] == 0 or train_np.shape[0] == 0:
    # cosine_similarity rejects empty inputs; with nothing to compare, every
    # test claim simply has no neighbours.
    label_list = [[] for _ in range(test_np.shape[0])]
else:
    similarity = cosine_similarity(test_np, train_np)
    # argsort is ascending, so the last 11 columns of each row are the
    # positions of the 11 most similar training claims (most similar last).
    top_index = np.argsort(similarity, axis=1)[:, -11:]
    label_list = [list(row) for row in top_index]

# Load the existing predictions file; it must already exist, and this cell
# only rewrites the "claim_label" of its entries.
with open('../Performance/BoW/test-claims-predictions.json', 'r', encoding='utf-8') as final_json:
    final_test = json.load(final_json)

# Obtain the most frequent label from the closest claim(s)
potential_label_list = []
# Rows of train_np follow the iteration order of tclaim_data.
train_key_list = list(tclaim_data.keys())
# Rows of test_np follow the iteration order of uclaim_data, so predictions
# are matched to claims by id rather than by the key order of the loaded file.
test_key_list = list(uclaim_data.keys())

# Fail before modifying anything if the predictions file lacks a test claim.
missing_keys = [key for key in test_key_list if key not in final_test]
if missing_keys:
    raise KeyError(
        "test claim ids missing from the predictions file: %s" % missing_keys[:5]
    )

for test_key, neighbours in zip(test_key_list, label_list):
    label_list_potential = [
        tclaim_data[train_key_list[j]]["claim_label"] for j in neighbours
    ]
    potential_label_list.append(label_list_potential)
    # most_frequent_element is a project helper (nlp_function) that picks the
    # winning label among the neighbours' labels.
    test_class = most_frequent_element(label_list_potential)
    final_test[test_key]["claim_label"] = test_class

# Store to json
file_path = '../Performance/BoW/test-claims-predictions.json'
with open(file_path, 'w') as json_file:
    json.dump(final_test, json_file)