In [18]:
import numpy as np
import os
import json
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix, save_npz
from collections import Counter
from tqdm.notebook import tqdm

In [2]:
def load_entity2id(file_path):
    """Read an ``entity2id.txt``-style file into a dict.

    The first line of the file is a header holding a single integer; every
    following non-blank line is ``<entity> <id>`` separated by whitespace.

    Args:
        file_path: Path of the mapping file.

    Returns:
        dict mapping each entity string to its id as an ``int``, so the values
        can be used directly as matrix row indices. An empty file yields ``{}``.

    Raises:
        ValueError: If the header or an id is not an integer, or if a data
            line does not contain at least two whitespace-separated fields.
    """
    entity2id_dict = {}
    with open(file_path) as in_file:
        header = next(in_file, None)
        if header is None:
            return entity2id_dict
        # The count itself is not needed, but parsing it rejects files that
        # lack the header line instead of silently dropping the first entity.
        int(header)
        for line in in_file:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            # Split on the LAST whitespace run so entity names that contain
            # whitespace themselves are kept intact.
            entity, entity_id = stripped.rsplit(None, 1)
            entity2id_dict[entity] = int(entity_id)
    return entity2id_dict

In [3]:
def load_text(dir_path):
    """Load per-entity relative term frequencies from a directory of JSON files.

    Only the first line of each file is read; it must hold a JSON array of
    objects with a ``"text"`` and an ``"entity_id"`` key. The text is split on
    whitespace and every word count is divided by the number of words.

    Args:
        dir_path: Directory containing the JSON files.

    Returns:
        dict mapping ``entity_id`` to a ``Counter`` of ``word -> count / len``.
        An entity with empty text maps to an empty ``Counter``. If an entity
        id occurs more than once, the last occurrence (in sorted file-name
        order) wins.
    """
    entity2tf = {}
    # os.listdir returns entries in arbitrary order; sorting makes both the
    # "last duplicate wins" rule and the insertion order of the result
    # reproducible across runs and machines.
    for text_file in sorted(os.listdir(dir_path)):
        file_path = os.path.join(dir_path, text_file)
        if not os.path.isfile(file_path):
            # Skip sub-directories such as .ipynb_checkpoints.
            continue
        with open(file_path, encoding="utf-8") as text_in:
            content = text_in.readline()
        for json_line in json.loads(content):
            words = json_line["text"].split()
            length = len(words)
            # With no words the comprehension is empty, so there is never a
            # division by zero here.
            entity2tf[json_line["entity_id"]] = Counter(
                {word: raw_count / length
                 for word, raw_count in Counter(words).items()}
            )
    return entity2tf

In [12]:
def build_tfidf(entity2tf, entity2id):
    """Assemble a sparse entity-by-word TF-IDF matrix.

    Args:
        entity2tf: Mapping ``entity -> {word: term frequency}``.
        entity2id: Mapping ``entity -> row index``. Every key of ``entity2tf``
            must be present; its length fixes the number of matrix rows.

    Returns:
        Tuple ``(matrix, word2idx)``. ``matrix`` is a ``csr_matrix`` of shape
        ``(len(entity2id), vocabulary size)`` holding
        ``tf * log(n_docs / document_frequency)``; ``word2idx`` maps each word
        to its column, numbered in order of first appearance.

    Raises:
        KeyError: If an entity of ``entity2tf`` is missing from ``entity2id``.
    """
    # First pass: assign column indices and count, per word, how many
    # entities mention it.
    vocab = {}
    doc_freq = defaultdict(int)
    for term_weights in entity2tf.values():
        for term in term_weights:
            doc_freq[term] += 1
            vocab.setdefault(term, len(vocab))

    # Second pass: emit one coordinate triple per (entity, word) occurrence.
    n_docs = len(entity2tf)
    rows, cols, values = [], [], []
    for entity_key, term_weights in entity2tf.items():
        row_idx = entity2id[entity_key]
        for term, tf in term_weights.items():
            rows.append(row_idx)
            cols.append(vocab[term])
            values.append(tf * np.log(n_docs / doc_freq[term]))

    matrix_shape = (len(entity2id), len(vocab))
    return csr_matrix((values, (rows, cols)), shape=matrix_shape), vocab

In [30]:
def compute_sim_matrix(train_pairs, entity_tfidf, entity2id):
    """Compute TF-IDF dot-product similarities for the given entity pairs.

    Args:
        train_pairs: Iterable of ``(i, j)`` row indices into ``entity_tfidf``.
            Indices may be ints or integer-like strings; they are coerced
            with ``int``.
        entity_tfidf: Sparse matrix with one TF-IDF row per entity.
        entity2id: Entity mapping; only its length is used, as the side
            length of the square result.

    Returns:
        ``csr_matrix`` of shape ``(len(entity2id), len(entity2id))`` with one
        stored entry per distinct ``(i, j)`` pair, holding the (unnormalized)
        dot product of rows ``i`` and ``j``.
    """
    entity_num = len(entity2id)
    # The same (head, tail) pair can appear several times in the input (e.g.
    # once per relation). scipy sums duplicate (row, col) coordinates when
    # building the matrix, which would multiply the stored similarity by the
    # repeat count, so keep each pair once (first-seen order is preserved).
    unique_pairs = list(dict.fromkeys((int(i), int(j)) for i, j in train_pairs))
    row = []
    col = []
    data = []
    for i, j in tqdm(unique_pairs):
        row.append(i)
        col.append(j)
        # (1 x V) . (V x 1) -> 1 x 1 sparse matrix; extract the scalar.
        sim = entity_tfidf.getrow(i).dot(entity_tfidf.getrow(j).transpose()).toarray().item()
        data.append(sim)
    sim_matrix = csr_matrix((data, (row, col)), shape=(entity_num, entity_num))
    return sim_matrix

In [22]:
def get_train_pairs(file_path):
    """Read the (head, tail) id pairs from a ``train2id.txt``-style file.

    The first line of the file is skipped. Every following non-blank line
    must hold exactly three whitespace-separated fields; the first two are
    returned and the third is ignored.

    Args:
        file_path: Path of the triples file.

    Returns:
        list of ``(head, tail)`` tuples of ``int``, in file order (duplicates
        are kept).

    Raises:
        ValueError: If a data line does not have exactly three fields or a
            head/tail id is not an integer.
    """
    pairs = []
    with open(file_path) as file_in:
        next(file_in, None)  # skip the header line
        for line in file_in:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            head, tail, _ = fields
            # Parse to int so the ids can be used directly as matrix indices.
            pairs.append((int(head), int(tail)))
    return pairs

In [19]:
# Directory holding the FB60K benchmark files. Set the FB60K_DIR environment
# variable to point elsewhere; the original location is kept as the default.
base_path = os.environ.get(
    'FB60K_DIR',
    '/home/ubuntu/text-pwrd-kg-reasoning/OpenKE/benchmarks/FB60K/',
)

In [6]:
# Load the entity -> id mapping from the benchmark directory, and the
# per-entity term frequencies from the "tokenized" directory (a relative
# path, so it is resolved against the current working directory).
entity2id = load_entity2id(os.path.join(base_path, "entity2id.txt"))
entity2tf = load_text("tokenized")

In [13]:
# Build the sparse entity-by-word TF-IDF matrix and the word -> column map.
entity_tfidf, word2idx = build_tfidf(entity2tf, entity2id)

In [23]:
# Read the (head, tail) pairs from the training triples file.
train_pairs = get_train_pairs(os.path.join(base_path, "train2id.txt"))

In [31]:
# Long-running cell: one sparse row product is computed per training pair.
# The result is written as sparse_matrix.npz into the benchmark directory.
sim_matrix = compute_sim_matrix(train_pairs, entity_tfidf, entity2id)
save_npz(os.path.join(base_path, "sparse_matrix.npz"), sim_matrix)

HBox(children=(FloatProgress(value=0.0, max=268280.0), HTML(value='')))


