In [45]:
import numpy as np
import json
import re
from scipy.sparse import coo_matrix
from sklearn.neighbors import NearestNeighbors
from scipy.sparse.linalg import svds

In [46]:
# Load the vocabulary dictionary. Later cells treat `data` as a mapping from a
# word to a sequence of passages, where passage[0] is a document id and
# passage[1] is the passage text.
# The context manager closes the file handle as soon as parsing finishes
# (the bare open(...) call left it open), and the explicit encoding keeps the
# result independent of the platform's default locale encoding.
with open("vocab_dict.json", "r", encoding="utf-8") as vocab_file:
    data = json.load(vocab_file)

In [47]:
# Largest raw document id (first element of each occurrence) referenced by any
# word's occurrence list.
max_doc_index = max(
    occurrence[0]
    for occurrences in data.values()
    for occurrence in occurrences
)
print(max_doc_index)

614627


In [48]:
# Build a sparse term-document count matrix from `data`.
#
# Keys of `data` are lower-cased, so case variants of the same word share one
# row. Each distinct document id (passage[0]) gets one column, assigned in order
# of first appearance. The stored value is the number of whole-word,
# case-insensitive matches of the word in the passage text (passage[1]).
# Note that a (word, document) pair is stored even when the regex finds no
# match, i.e. an explicit 0 can end up in the matrix.
word_list = []      # row index -> lower-cased word
rows = []           # word indices
cols = []           # document indices
vals = []           # counts
doc_lenghts = []    # column index -> number of whitespace-separated tokens

word2ind = {}           # lower-cased word -> row index

docId2ind = {}          # raw document id -> column index

words_encountered = {}  # lower-cased word -> set of document ids already recorded

for raw_word, passages in data.items():
    word = raw_word.lower()
    if word not in words_encountered:
        # First time this (lower-cased) word is seen: allocate a new row.
        word2ind[word] = len(word_list)
        word_list.append(word)
        words_encountered[word] = set()
    scanned_passages_for_given_word = words_encountered[word]
    row_id = word2ind[word]

    # The pattern depends only on the word, so compile it once per word
    # instead of rebuilding it for every passage.
    word_pattern = re.compile(rf"\b{re.escape(word)}\b", re.IGNORECASE)

    for passage in passages:
        doc_id = passage[0]
        if doc_id in scanned_passages_for_given_word:
            # Same (word, document) pair reached again, e.g. through a case
            # variant of the key; skip it so the pair is stored only once.
            print(f"Already encountered this passage for this word: {word}")
            continue
        scanned_passages_for_given_word.add(doc_id)

        text = passage[1]
        colId = docId2ind.get(doc_id)
        if colId is None:
            # First time this document is seen: allocate a column and record
            # its length.
            colId = len(docId2ind)
            docId2ind[doc_id] = colId
            doc_lenghts.append(len(text.split()))

        rows.append(row_id)
        cols.append(colId)
        vals.append(len(word_pattern.findall(text)))

m = len(word_list)    # number of distinct lower-cased words (matrix rows)
n = len(docId2ind)    # number of distinct documents (matrix columns)

# Sanity checks on the bookkeeping. The row check is a bound rather than
# `rows[-1] == m - 1`, because the last key processed may be a case variant of
# an earlier word (or contribute no new passage) and then legitimately maps to
# an earlier row.
assert len(doc_lenghts) == n
assert len(rows) == len(cols) == len(vals)
assert max(cols, default=-1) == n - 1
assert max(rows, default=-1) < m

term_doc = coo_matrix((vals, (rows, cols)), shape=(m, n))

Already encountered this passage for this word: ababa
Already encountered this passage for this word: abu
Already encountered this passage for this word: aires
Already encountered this passage for this word: alert
Already encountered this passage for this word: amman
Already encountered this passage for this word: amount
Already encountered this passage for this word: amount
Already encountered this passage for this word: amount
Already encountered this passage for this word: an
Already encountered this passage for this word: and
Already encountered this passage for this word: angeles
Already encountered this passage for this word: antonio
Already encountered this passage for this word: athens
Already encountered this passage for this word: atlantic
Already encountered this passage for this word: beijing
Already encountered this passage for this word: berlin
Already encountered this passage for this word: bond
Already encountered this passage for this word: bonds
Already encountered th

In [49]:
# Term frequency: divide every column of the count matrix by the length of the
# corresponding document. The types are printed before and after to show what
# kind of object the division returns.
print(type(term_doc))
doc_length_array = np.array(doc_lenghts)
tf = term_doc / doc_length_array
print(type(tf))

<class 'scipy.sparse._coo.coo_matrix'>
<class 'scipy.sparse._coo.coo_matrix'>


In [50]:
# Inverse document frequency per word: log(total documents / entries stored in
# that word's row). getnnz counts stored entries, so every (word, document)
# pair recorded in term_doc contributes to the document frequency.
total_docs = len(doc_lenghts)
stored_per_word = term_doc.getnnz(axis=1)
eidf = total_docs / stored_per_word
idf = np.log(eidf)
print(type(idf))

<class 'numpy.ndarray'>


In [51]:
# Weight each word's row of tf by that word's idf; shaping idf as a column
# vector lets it broadcast across the document axis.
idf_column = np.reshape(idf, (-1, 1))
tf_idf = tf.multiply(idf_column)

In [52]:
# Truncated SVD of the tf-idf matrix, followed by unit-normalised term
# embeddings in the top_k-dimensional latent space.
top_k = 50
U_k, S_k, Vt_k = svds(tf_idf, k=top_k)

# svds does not guarantee the order of the singular values, so sort the
# triplets explicitly from largest to smallest singular value.
idx = np.argsort(S_k)[::-1]
U_k, S_k, Vt_k = U_k[:, idx], S_k[idx], Vt_k[idx]

# Project terms into the latent space. Broadcasting over the columns gives the
# same result as U_k @ np.diag(S_k) without building a dense top_k x top_k
# matrix.
term_proj = U_k * S_k

# Normalise each row to unit length for cosine similarity. Rows whose
# projection is exactly zero are left as zero vectors: dividing them by their
# zero norm would produce NaN rows.
row_norms = np.linalg.norm(term_proj, axis=1, keepdims=True)
row_norms[row_norms == 0] = 1.0
term_proj = term_proj / row_norms

In [53]:
# Exhaustive (brute-force) cosine-distance neighbour index over the projected
# term vectors; each query returns 5 neighbours.
nn = NearestNeighbors(n_neighbors=5, metric="cosine", algorithm="brute")

# Keep fit() as the final expression so the cell displays the fitted estimator.
nn.fit(term_proj)

0,1,2
,n_neighbors,5
,radius,1.0
,algorithm,'brute'
,leaf_size,30
,metric,'cosine'
,p,2
,metric_params,
,n_jobs,


In [56]:
# Words whose nearest neighbours are inspected in the next cell.
test_words = [
    "help",
    "break",
    "india",
    "pakistan",
    "nepal",
    "china",
]

# Turn the vocabulary into an array so it can be indexed with an array of
# row indices.
word_list = np.array(word_list)

In [57]:
# Look up the embedding row of every test word and query the neighbour index
# for all of them in one call.
query_rows = [word2ind[w] for w in test_words]
distances, indices = nn.kneighbors(term_proj[query_rows])

# Each row of `indices` holds the vocabulary rows of the neighbours found for
# the corresponding test word.
for w, neighbour_rows in zip(test_words, indices):
    print(f"Five words closest to {w}: {word_list[neighbour_rows]}")

Five words closest to help: ['help' 'happy' 'improve' 'accept' 'west']
Five words closest to break: ['break' 'metro' 'opens' 'football' 'joe']
Five words closest to india: ['india' 'rtrs' 'hindus' 'censor' 'reshaped']
Five words closest to pakistan: ['pakistan' '08:40' 'begum' 'kilometres' 'chaos']
Five words closest to nepal: ['nepal' 'kathmandu' 'para' 'regular' 'uml']
Five words closest to china: ['china' 'santander' 'hainan' 'hu' 'squeeze']
