In [7]:
import json
import numpy as np
from nltk.corpus import wordnet as wn

In [54]:
# All inputs and outputs live in the same processed-data directory.
processed_dir = '../data/processed_data/'

# Path templates; the `{}` placeholders are filled in by the similarity loop:
#   cn_vec -> (s1),  dt_vec -> (s1, s2),  output -> (metric name, s1, s2).
cn_vec = processed_dir + 'codenames_vecs{}.json'
dt_vec = processed_dir + 'dictionary_vecs{}{}.json'
output = processed_dir + '{}_sim_mat{}{}.npz'

# First file-name suffix (s1). The empty string selects the unsuffixed files.
# NOTE(review): presumably names the embedding source -- confirm.
suffix = [
    '',
    '_dblp',
    '_wiki',
]

# Second file-name suffix (s2); the loop only pairs '' with '' and a
# non-empty s1 with a non-empty s2.
# NOTE(review): presumably a vocabulary-size tag -- confirm.
suffix_2 = [
    '',
    '_3k',
    '_5k',
    '_10k',
    '_20k',
    '_30k',
]

In [55]:
def load_data(file):
    """Load word vectors from a JSON file, keeping only words known to WordNet.

    Parameters
    ----------
    file : str or path-like
        JSON file holding one object that maps each word to its vector
        (a list of numbers).

    Returns
    -------
    words : np.ndarray
        1-D NumPy string array of the words for which ``wn.synsets`` returns
        at least one synset, in file order. Being a plain string array (not
        an object array) it can be written by ``np.savez`` without pickling.
    vecs : np.ndarray
        Array whose row ``k`` is the vector stored for ``words[k]``.

    If no word has a synset, two empty arrays are returned instead of raising.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file, encoding='utf-8') as f:
        content = json.load(f)
    words = np.array(list(content.keys()))
    vecs = np.array(list(content.values()))
    # Boolean mask instead of a round-trip through an object array: it keeps
    # the dtypes intact and also works when nothing (or everything) matches.
    in_wordnet = np.array([len(wn.synsets(w)) > 0 for w in words], dtype=bool)
    return words[in_wordnet], vecs[in_wordnet]

In [58]:
# WordNet similarity functions keyed by the label used in the output file
# name. Built once instead of on every iteration. Only 'path' and 'wup' are
# computed below; 'lch' is listed but never called in this cell.
wn_fn_dict = {
    'path': wn.path_similarity,
    'lch': wn.lch_similarity,
    'wup': wn.wup_similarity
}


def _wordnet_sim_mat(sim_fn, row_synsets, col_synsets):
    """Return the matrix whose [i, j] entry is sim_fn(row_synsets[i], col_synsets[j]).

    WordNet similarity functions return None when the two synsets cannot be
    connected; such entries are stored explicitly as NaN.
    """
    sim_mat = np.zeros([len(row_synsets), len(col_synsets)])
    for i, row_synset in enumerate(row_synsets):
        for j, col_synset in enumerate(col_synsets):
            score = sim_fn(row_synset, col_synset)
            sim_mat[i, j] = np.nan if score is None else score
    return sim_mat


# For every valid (s1, s2) combination, compare each dictionary word with each
# codenames word and save one .npz file per similarity measure.
for s1 in suffix:
    # The codenames file depends only on s1, so load it and resolve its
    # synsets once per s1 rather than once per (s1, s2) pair.
    cn_file = cn_vec.format(s1)
    cn_words, cn_vecs = load_data(cn_file)
    # load_data keeps only words with at least one synset, so [0] is safe.
    # The first synset is used as the word's representative sense.
    cn_synsets = [wn.synsets(word)[0] for word in cn_words]

    for s2 in suffix_2:
        # Only pair the unsuffixed files with each other and the suffixed
        # files with each other; mixed combinations are skipped.
        if (s1 == '') != (s2 == ''):
            continue

        dt_file = dt_vec.format(s1, s2)
        dt_words, dt_vecs = load_data(dt_file)
        # One synset lookup per word instead of one per matrix cell.
        dt_synsets = [wn.synsets(word)[0] for word in dt_words]

        # Rows index dictionary words, columns index codenames words.
        # NOTE(review): this is a plain dot product; it equals cosine
        # similarity only if the stored vectors are already L2-normalised
        # -- confirm against the code that produced the JSON files.
        cosine_sim_mat = np.matmul(dt_vecs, cn_vecs.T)

        # WordNet path-based similarities between the first synsets.
        path_sim_mat = _wordnet_sim_mat(wn_fn_dict['path'], dt_synsets, cn_synsets)
        wup_sim_mat = _wordnet_sim_mat(wn_fn_dict['wup'], dt_synsets, cn_synsets)

        # Every output file carries both word lists so that the matrix rows
        # and columns can be mapped back to words.
        for metric, sim_mat in (
            ('cosine', cosine_sim_mat),
            ('path', path_sim_mat),
            ('wup', wup_sim_mat),
        ):
            np.savez(
                output.format(metric, s1, s2),
                codenames_words=cn_words,
                dictionary_words=dt_words,
                matrix=sim_mat
            )