Subtask2

In [0]:
import numpy as np
from fever_io import load_dataset_json
from math import *
import re
import json

In [10]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
PATH = '/content/gdrive/My Drive/IRDM_CHFX2/'

In [0]:
def cosine_similarity(x, y, norm=False):
    '''
    The input x is the TF-IDF of the claim.
    The input y is the TF-IDF of each documents.
    The output is the cosine similarity with normalization between 0 and 1.
    '''
    assert len(x) == len(y), "len(x) != len(y)"
    zero_list = [0] * len(x)
    if x == zero_list or y == zero_list:
        return float(1) if x == y else float(0)
      
    res = np.array([[x[i] * y[i], x[i] * x[i], y[i] * y[i]] for i in range(len(x))])
    cos = sum(res[:, 0]) / (np.sqrt(sum(res[:, 1])) * np.sqrt(sum(res[:, 2])))
    
    return 0.5 * cos + 0.5 if norm else cos

In [0]:
def extract_wiki(wikipedia_dir): 
    '''
    The input is the path of wiki-pages.
    The output is a dictionary which the key is the 'id' and the value is the 'text' which 'text' is not empty.
    The output is 3 GB so it is not including in the file.
    '''
    diction = dict()
    for i in range(1,110): ## jsonl file number from 001 to 109
        jnum="{:03d}".format(i)
        fname=wikipedia_dir+"wiki-"+jnum+".jsonl"
        with open(fname) as f:
            line=f.readline()
            while line:
                data=json.loads(line.rstrip("\n"))
                doc_id=data["id"]
                text = data["text"]
                if text != "":
                    diction[doc_id]=text
                line=f.readline()
    np.save(PATH + "diction_Subtask2.npy",diction)
    print("save complete")

In [0]:
extract_wiki(PATH + "data/wiki-pages/wiki-pages/")

save complete


In [0]:
def create_dictory(diction):
    '''
    The purpose of this function is inverted index
    The input is the path of 'diction_Subtask2.npy'
    The output is a dictionary which the key is the term and the value is the id of the documents including this term. It also print the total number of documents.
    The output is 2 GB so it is not including in the file.
    '''
    data = np.load(diction, allow_pickle=True).item()
    dictory = {}
    n_ducuments = len(data)
    for d in data.items():
        text = list(set(d[1].split(' ')))[1:]
        for t in text:
            t = re.sub("[,.。:_=+*&^%$#@!?()<>/`';|]", "", t)
            if t.isdigit():
                continue
            if not t in dictory:
                dictory[t] = [d[0]]
            else:
                dictory[t].append(d[0])
    np.save(PATH + "dictory_Subtask2.npy",dictory)
    print(n_ducuments)

In [0]:
create_dictory(PATH + 'diction_Subtask2.npy')

5396106


In [0]:
def Subtask2_cossim(claim_id, numberofducuments):
    '''
    The input is the list of claim 'id' and the total number of documents.
    The output is the claim, the top 5 TF-IDF terms in the claim, the TF-IDF of these terms, the five most similar documents with the claim and the cosine similarity between them.
    The top 5 TF-IDF terms in the claim and the TF-IDF of these terms is save in the 'Q2_claim_TF-IDF.csv' and the claim with the five most similar documents with the claim is save in the 'Q2_vector_space.csv'.
    '''
    data = np.load(PATH + 'diction_Subtask2.npy', allow_pickle=True).item()
    dictory = np.load(PATH + 'dictory_Subtask2.npy', allow_pickle=True).item()
    train_data = load_dataset_json(PATH + 'data/train.jsonl', instance_num=20)

    claim = None
    for d in train_data:
        if d['id'] == claim_id:
            d['claim'] = re.sub("[,.。:_=+*&^%$#@!?()<>/`';|]", "", d['claim'])
            claim = d['claim'].split(' ')
            break
    print(d['id'])
    print(d['claim'])

    claim_tfidf = []
    keys = []
    for c in claim:
        tf = claim.count(c) / len(claim)
        idf = log((numberofducuments / (1 + (len(dictory[c]) if c in dictory else 0))))
        keys.append((c, tf * idf, idf, tf))
    keys.sort(key=lambda x: x[1], reverse=True)
    keys = keys[:5]
    word = [k[0] for k in keys]
    vec1 = [k[1] for k in keys] ## vec1 is the list of tf*IDF of top 5 words in the claim.
    print(word)
    print(vec1)
    
    document_tfidf = []
    for d in data.items():
        text = d[1].split(' ')
        vec2 = []
        for k in keys:
            tf = text.count(k[0]) / len(text)
            idf = k[2]
            vec2.append(tf * idf) ## vec2 is the list of tf*IDF of top 5 words in the document.
        sim = cosine_similarity(vec1, vec2)
        document_tfidf.append([d[0], sim])
    document_tfidf.sort(key=lambda x: x[1], reverse=True)
    return document_tfidf[:5]

In [0]:
index_list = [
75397,
150448,
214861,
156709,
129629,
33078,
6744,
226034,
40190,
76253]
numberofducuments = 5396106

for index in index_list:
    print(Subtask2_cossim(index, numberofducuments))

75397
Nikolaj Coster-Waldau worked with the Fox Broadcasting Company
['Coster-Waldau', 'Nikolaj', 'Broadcasting', 'Fox', 'Company']
[1.4897086502302004, 1.2657387165766936, 0.7486763413296768, 0.7230640262285507, 0.5452877241794746]
[['New_Amsterdam_-LRB-TV_series-RRB-', 0.9138333600221531], ['Nikolaj_Coster-Waldau', 0.8763048817667467], ['The_Other_Woman_-LRB-2014_film-RRB-', 0.8763048817667466], ['Game_of_Thrones_-LRB-season_1-RRB-', 0.8570808064872153], ['Simon_Staho', 0.8570808064872153]]
150448
Roman Atwood is a content creator
['Atwood', 'creator', 'content', 'Roman', 'is']
[1.5428368732723297, 1.0768405242196075, 0.9376490536887279, 0.7449620407932371, 0.05058654380892037]
[['The_O.C._-LRB-season_3-RRB-', 0.8436960639251242], ['Genre_fiction', 0.8433921562727289], ['Joel_Spolsky', 0.8433921562727289], ['Premiere_-LRB-The_O.C.-RRB-', 0.8433921562727289], ['Quetzal_-LRB-disambiguation-RRB-', 0.8433912773406878]]
214861
History of art includes architecture dance sculpture music pai

In [0]:
### The code below is the original data pre-processing code. Because of the limited memory, I divided it into two function as above.

In [0]:
# def create_dictory_1():
#     dictory = {}
#     path = PATH + 'data/wiki-pages/wiki-pages/'
#     files = os.listdir(path)
#     D = 0
#     documents = []
#     for f in files:
#         data = load_dataset_json(os.path.join(path, f))
#         documents += data
#         for d in data:
#             D += 1
#             text = list(set(d['text'].split(' ')))[1:]
#             for t in text:
#                 t = re.sub("[,.。:_=+*&^%$#@!?()<>/`';|]", "", t)
#                 if t.isdigit():
#                     continue
#                 if not t in dictory:
#                     dictory[t] = [d['id']]
#                 else:
#                     dictory[t].append(d['id'])
#     return dictory, documents, D

In [0]:
# def Subtask2(claim_id, dictory, documents, D):
# 
#     train_data = load_dataset_json(PATH + 'data/train.jsonl', instance_num=20)
# 
#     claim = None
#     for d in train_data:
#         if d['id'] == claim_id:
#             d['claim'] = re.sub("[,.。:_=+*&^%$#@!?()<>/`';|]", "", d['claim'])
#             d['claim'] = d['claim'].lower()
#             claim = d['claim'].split(' ')
#             break
#     #print(d['id'] , d['claim'])
# 
#     claim_tfidf = []
#     keys = []
#     for c in claim:
#         tf = claim.count(c) / len(claim)
#         idf = log((D / (1 + (len(dictory[c]) if c in dictory else 0))))
#         keys.append((c, tf * idf, idf, tf, (len(dictory[c]) if c in dictory else 0)))
#     keys.sort(key=lambda x: x[1], reverse=True)
#     keys = keys[:5]
#     vec1 = [k[1] for k in keys]
#     print(keys)
# 
#     document_tfidf = []
#     for d in documents:
#         text = d['text'].split(' ')
#         vec2 = []
#         for k in keys:
#             tf = text.count(k[0]) / len(text)
#             idf = k[2]
#             vec2.append(tf * idf)
#         sim = cosine_similarity(vec1, vec2)
#         document_tfidf.append([d['id'], sim])
#     document_tfidf.sort(key=lambda x: x[1], reverse=True)
#     return document_tfidf[:5]

In [0]:
# dictory, documents, D = create_dictory()
#
# print(Subtask2(75397, dictory, documents, D))
# print(Subtask2(150448, dictory, documents, D))
# print(Subtask2(214861, dictory, documents, D))
# print(Subtask2(156709, dictory, documents, D))
# print(Subtask2(129629, dictory, documents, D))
# print(Subtask2(33078, dictory, documents, D))
# print(Subtask2(6744, dictory, documents, D))
# print(Subtask2(226034, dictory, documents, D))
# print(Subtask2(40190, dictory, documents, D))
# print(Subtask2(76253, dictory, documents, D))