In [97]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

import json
import torch
import random
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from nltk.corpus import stopwords
from scipy.spatial.distance import cdist
from transformers import BertModel, BertTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package punkt to /Users/relax/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/relax/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [98]:
def stopwords_func(stop_words, text_type, text_data):
    """Strip stop words from evidence passages or claim texts, in place.

    Dropping stop words shortens the text and so reduces the computational
    cost of later processing.

    Args:
        stop_words: container of stop words; each whitespace-separated token
            is lowercased before being looked up in it.
        text_type: ``"evidence"`` when ``text_data`` maps keys directly to
            sentences; any other value means every value of ``text_data`` is
            a dict holding the sentence under ``"claim_text"``.
        text_data: dictionary to filter. It is modified in place.

    Returns:
        The same ``text_data`` object, after filtering.
    """
    def _drop_stop_words(sentence):
        # The lookup is case-insensitive, but surviving tokens keep their
        # original casing and are re-joined with single spaces.
        return " ".join(
            token for token in sentence.split() if token.lower() not in stop_words
        )

    if text_type != "evidence":
        for record in text_data.values():
            record["claim_text"] = _drop_stop_words(record["claim_text"])
        return text_data

    for key in text_data:
        text_data[key] = _drop_stop_words(text_data[key])
    return text_data

def pick_random_keys(dictionary, excluded_keys, num_keys):
    """Sample distinct keys from ``dictionary`` while avoiding ``excluded_keys``.

    Args:
        dictionary: mapping whose keys form the candidate pool.
        excluded_keys: iterable of keys that must not be returned.
        num_keys: how many keys to draw (without replacement).

    Returns:
        A list of ``num_keys`` keys chosen by ``random.sample`` from the keys
        of ``dictionary`` that are not excluded. The candidate pool keeps the
        dictionary's iteration order, so results are reproducible for a given
        ``random.seed``.

    Raises:
        ValueError: propagated from ``random.sample`` when ``num_keys`` is
            negative or larger than the number of available keys.
    """
    # Materialise the exclusions once. This gives constant-time membership
    # tests for every dictionary key and also makes one-shot iterables (e.g.
    # generators) work, which would otherwise be consumed by the first lookup.
    try:
        excluded = frozenset(excluded_keys)
    except TypeError:
        # Unhashable entries cannot go in a set; fall back to a linear scan.
        excluded = list(excluded_keys)

    available_keys = [key for key in dictionary if key not in excluded]
    return random.sample(available_keys, num_keys)

def lower_processing(data, text_type):
    """Lowercase the text stored in ``data``, modifying it in place.

    Args:
        data: dictionary to process.
        text_type: when equal to ``"claim_text"``, each value of ``data`` is
            treated as a dict and its ``"claim_text"`` entry is lowercased;
            for any other value, each value of ``data`` is itself a string
            that gets lowercased.

    Returns:
        The same ``data`` object, after lowercasing.
    """
    nested_claims = text_type == "claim_text"
    for key in data:
        if nested_claims:
            data[key][text_type] = data[key][text_type].lower()
        else:
            data[key] = data[key].lower()
    return data

In [99]:
## Read in data
# The JSON files are opened with an explicit UTF-8 encoding (the JSON default)
# so that loading does not depend on the platform's locale encoding.

# Read in training data (claim)
with open('../project-data/train-claims.json', 'r', encoding='utf-8') as tclaim_file:
    tclaim_data = json.load(tclaim_file)

# Read in development data (claim)
with open('../project-data/dev-claims.json', 'r', encoding='utf-8') as dclaim_file:
    dclaim_data = json.load(dclaim_file)

# Read in test data (claim)
with open('../project-data/test-claims-unlabelled.json', 'r', encoding='utf-8') as uclaim_file:
    uclaim_data = json.load(uclaim_file)

# Read in evidence data
with open('../project-data/evidence.json', 'r', encoding='utf-8') as evi_file:
    evi_data = json.load(evi_file)

## Preprocessing - lowercase every claim text and every evidence passage
# lower_processing modifies the dictionaries in place and returns them.
tclaim_data = lower_processing(tclaim_data, "claim_text")
dclaim_data = lower_processing(dclaim_data, "claim_text")
uclaim_data = lower_processing(uclaim_data, "claim_text")
evi_data = lower_processing(evi_data, 'evidence')

## Build (claim id, claim text, evidence text, evidence id) pairs from the training set
tkey_list = list(tclaim_data)
train_pairs, evi_keys, labels = [], [], []

# Positive samples (label 1): every evidence passage listed for a claim.
for claim_id, claim in tclaim_data.items():
    for evidence_id in claim["evidences"]:
        evi_keys.append(evidence_id)
        train_pairs.append((claim_id, claim["claim_text"], evi_data[evidence_id], evidence_id))
        labels.append(1)

## Negative samples (label 0): for each claim, draw as many random evidence
## passages as it has listed ones, never picking one of the listed passages.
## All negatives are appended after all positives.
random.seed(1)  # fixed seed so the negative samples are reproducible
for claim_id, claim in tclaim_data.items():
    excluded_keys = claim["evidences"]
    random_keys = pick_random_keys(evi_data, excluded_keys, len(excluded_keys))
    for evidence_id in random_keys:
        evi_keys.append(evidence_id)
        train_pairs.append((claim_id, claim["claim_text"], evi_data[evidence_id], evidence_id))
        labels.append(0)

In [100]:
# Store the claim, evidence sentence(s) to list for embedded usage
claim_train_sentence = [claim["claim_text"] for claim in tclaim_data.values()]
claim_dep_sentence = [claim["claim_text"] for claim in dclaim_data.values()]
claim_test_sentence = [claim["claim_text"] for claim in uclaim_data.values()]

# Unique evidence texts taken from the first half of train_pairs (the pair
# construction appends every positive pair before an equal number of negative
# ones). When several keys share the same text, only the first key is kept.
evidence_sample_sentence = []
tfidf_keys = []
seen_evidence_text = set()  # constant-time duplicate check instead of scanning the list
for pair in train_pairs[:len(train_pairs) // 2]:
    evidence_text, evidence_key = pair[2], pair[3]
    if evidence_text not in seen_evidence_text:
        seen_evidence_text.add(evidence_text)
        evidence_sample_sentence.append(evidence_text)
        tfidf_keys.append(evidence_key)

In [129]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Corpus the vectorizer is fitted on: training claims followed by dev claims
train_data = claim_train_sentence + claim_dep_sentence

# Texts to classify in this run: the dev claims
test_data = claim_dep_sentence

# Create the TF-IDF vectorizer and learn its vocabulary / IDF weights from the corpus
tfidf_vectorizer = TfidfVectorizer().fit(train_data)

# Encode both text collections as dense TF-IDF matrices
train_embeddings = tfidf_vectorizer.transform(train_data).toarray()
test_embeddings = tfidf_vectorizer.transform(test_data).toarray()

# Split the fitted corpus back into its two parts.
# NOTE(review): despite its name, evi_np holds the rows of the dev claims
# (the tail of train_data), not evidence passages - confirm this is intended.
n_train_claims = len(claim_train_sentence)
train_np, evi_np = train_embeddings[:n_train_claims], train_embeddings[n_train_claims:]
test_np = test_embeddings

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Corpus the vectorizer is fitted on: training claims followed by dev claims
train_data = claim_train_sentence + claim_dep_sentence

# Texts to classify in this run: the unlabelled test claims
test_data = claim_test_sentence

# Initialize BOW vectorizer and fit on training text data
vectorizer = CountVectorizer()
vectorizer.fit(train_data)

# Transform the sentences into dense BOW count matrices.
# Both matrices are densified with .toarray() so that train_np and test_np are
# plain np.ndarray objects of the same kind, as the TF-IDF cell produces.
train_embeddings = vectorizer.transform(train_data).toarray()
test_embeddings = vectorizer.transform(test_data).toarray()

# Create training and test np.ndarray
# Only the rows of the training claims are kept as neighbours candidates.
train_np = train_embeddings[:len(claim_train_sentence)]
test_np = test_embeddings

In [108]:
# Weighted KNN: each test claim receives the label whose nearest training
# claims have the largest sum of squared cosine similarities.
WEIGHTED_KNN_K = 9  # number of nearest training claims that vote

# Collect similarity value for weighted operation
# A single call returns the whole (n_test, n_train) cosine-similarity matrix,
# replacing one cosine_similarity call per (test, train) pair.
similarity_matrix = cosine_similarity(test_np, train_np)

label_list = []  # per test claim: row indices (into train_np) of its neighbours
sim_list = []    # per test claim: similarities of those neighbours, same order
for similarity in similarity_matrix:
    # argsort is ascending, so the tail holds the most similar training claims
    top_index = np.argsort(similarity)[-WEIGHTED_KNN_K:]
    sim_list.append([similarity[k] for k in top_index])
    label_list.append(list(top_index))

# Collect the potential label
# Row j of train_np corresponds to the j-th key of tclaim_data.
train_key_list = list(tclaim_data.keys())
potential_label_list = [
    [tclaim_data[train_key_list[j]]["claim_label"] for j in neighbours]
    for neighbours in label_list
]

# Compute the weighted KNN result (dictionary) and select the label with the greatest value
final_label = []
for neighbour_labels, neighbour_sims in zip(potential_label_list, sim_list):
    score_dict = {}
    for label, sim in zip(neighbour_labels, neighbour_sims):
        score_dict[label] = score_dict.get(label, 0.0) + sim ** 2
    # On equal scores, max keeps the label that entered score_dict first.
    final_label.append(max(score_dict, key=score_dict.get))

# Store the predicted label into the json file
# The existing predictions file is loaded, updated and written back, so any
# other fields it already holds for a claim are preserved.
file_path = '../test-claims-predictions.json'
with open(file_path, 'r', encoding='utf-8') as final_json:
    final_test = json.load(final_json)
test_key_list = list(final_test.keys())

# Labels are matched to claims purely by position, so a length mismatch means
# test_np was not built from the claims in this file; fail loudly rather than
# write misaligned labels.
if len(test_key_list) != len(final_label):
    raise ValueError(
        "Cannot align predictions with claims: %d predicted labels but %d claims in %s"
        % (len(final_label), len(test_key_list), file_path)
    )
for claim_key, label in zip(test_key_list, final_label):
    final_test[claim_key]["claim_label"] = label

with open(file_path, 'w', encoding='utf-8') as json_file:
    json.dump(final_test, json_file)

In [130]:
# Find, for every claim in test_np, its nearest training claims by cosine similarity
MAJORITY_KNN_K = 11  # number of nearest training claims that vote

# A single call returns the whole (n_test, n_train) cosine-similarity matrix,
# replacing one cosine_similarity call per (test, train) pair.
similarity_matrix = cosine_similarity(test_np, train_np)

label_list = []  # per claim: row indices (into train_np) of its neighbours
for similarity in similarity_matrix:
    # argsort is ascending, so the tail holds the most similar training claims
    top_index = np.argsort(similarity)[-MAJORITY_KNN_K:]
    label_list.append(list(top_index))

# Load the file that will receive the predicted labels
with open('../project-data/dev-claims-test.json', 'r', encoding='utf-8') as final_json:
    final_test = json.load(final_json)

def most_frequent_element(nums):
    """Return the value that occurs most often in ``nums``.

    Ties are broken deterministically in favour of the value that appears
    first in ``nums``, so the result does not depend on set iteration order
    (which varies between runs for strings).

    Args:
        nums: iterable of hashable values.

    Returns:
        The most frequent value.

    Raises:
        ValueError: if ``nums`` is empty.
    """
    counts = Counter(nums)
    # Counter keeps first-seen order and max returns the first maximum, which
    # gives the first-seen tie-break; counting is a single pass over nums.
    return max(counts, key=counts.get)

# Majority vote: each claim gets the most frequent label among its neighbours.
# Row j of train_np corresponds to the j-th key of tclaim_data.
train_key_list = list(tclaim_data.keys())
test_key_list = list(final_test.keys())

# Neighbour lists are matched to claims purely by position, so a length
# mismatch means test_np was not built from the claims in final_test; fail
# before any label is written rather than mislabel or stop half-way.
if len(test_key_list) != len(label_list):
    raise ValueError(
        "Cannot align predictions with claims: %d neighbour lists but %d claims in final_test"
        % (len(label_list), len(test_key_list))
    )

potential_label_list = []
for claim_key, neighbours in zip(test_key_list, label_list):
    label_list_potential = [
        tclaim_data[train_key_list[j]]["claim_label"] for j in neighbours
    ]
    potential_label_list.append(label_list_potential)
    test_class = most_frequent_element(label_list_potential)
    final_test[claim_key]["claim_label"] = test_class

# Store to json
file_path = '../project-data/dev-claims-test.json'
with open(file_path, 'w', encoding='utf-8') as json_file:
    json.dump(final_test, json_file)

In [123]:
# Reload the saved predictions file into `test`
with open('../test-claims-predictions.json', 'r') as predictions_file:
    test = json.load(predictions_file)

In [127]:
# Check whether the predictions held in `final_test` are identical to the ones
# reloaded into `test`; the recorded output shows they were not.
final_test == test

False