In [1]:
from gensim.models.keyedvectors import KeyedVectors
import os
import numpy as np
from tqdm import tqdm
import pandas as pd 
from numpy import dot
from numpy.linalg import norm


In [2]:
# Embeddings are read from a text-format word2vec file (hence binary=False).
WORD2VEC_PATH = '/Castor-data/embeddings/word2vec/GoogleNews-vectors-negative300.txt'
model = KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=False)


In [3]:
from nltk.tokenize import TweetTokenizer
# Shared tokenizer instance; the similarity and IDF helpers in later cells
# call tknzr.tokenize() on their inputs.
tknzr = TweetTokenizer()
# Sample string mixing a hashtag, emoticons and arrows to sanity-check the tokenizer.
s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
# Bare last expression so the notebook displays the resulting token list.
tknzr.tokenize(s0)

['This',
 'is',
 'a',
 'cooool',
 '#dummysmiley',
 ':',
 ':-)',
 ':-P',
 '<3',
 'and',
 'some',
 'arrows',
 '<',
 '>',
 '->',
 '<--']

In [5]:
def cos_sim(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    Computed as ``dot(a, b) / (norm(a) * norm(b))``.

    Cosine similarity is undefined when either vector has zero norm; in that
    case ``0.0`` is returned instead of dividing by zero (which would produce
    ``nan`` and a RuntimeWarning).
    """
    denom = norm(a) * norm(b)
    if denom == 0:
        return 0.0
    return dot(a, b) / denom

def calculate_cosine_similarity_matrix(query, document):
    """Build the query-term x document-term cosine similarity matrix.

    Both strings are tokenized with the notebook-level ``tknzr``; each token
    is mapped to its embedding in the notebook-level ``model``, and tokens
    that are not in ``model`` use the ``"UNK"`` entry instead.

    Args:
        query: Query text.
        document: Document text.

    Returns:
        A list of rows, one per query token; row ``i`` holds
        ``cos_sim(query_vec_i, doc_vec_j)`` for every document token ``j``.
        An empty query gives ``[]``; an empty document gives empty rows.

    Raises:
        KeyError: propagated from ``model`` if a token is out of vocabulary
            and ``"UNK"`` is not in ``model`` either.
    """
    def _embedding(term):
        # Out-of-vocabulary tokens fall back to the "UNK" embedding.
        return model[term] if term in model else model["UNK"]

    # Resolve every token to its vector once, rather than repeating the
    # vocabulary check and lookup for each (query, doc) pair.
    query_vectors = [_embedding(term) for term in tknzr.tokenize(query)]
    doc_vectors = [_embedding(term) for term in tknzr.tokenize(document)]

    return [
        [cos_sim(q_vec, d_vec) for d_vec in doc_vectors]
        for q_vec in query_vectors
    ]



In [6]:
# Generate similarity matrices from the new ql results (disabled; the tqdm progress lines below are from an earlier run of this cell)
# for year in range(2011, 2015):
#     f = open("/u4/w85yang/MatchZoo/data/tweets/TweetCorpus/test_ql_{}.txt".format(year)) 
#     for l in tqdm(f):
#         sim, a, b, qid, docid = l.replace("\n", "").split("\t")
#         directory = "/mnt/collections/w85yang/cosine/topic_doc_mat/{}".format(qid)
#         out = os.path.join(directory, "{}.npy".format(docid))
#         if not os.path.exists(directory):
#             os.makedirs(directory)
#         sim_mat = calculate_cosine_similarity_matrix(a, b)
#         np.save(out, sim_mat)


46037it [00:42, 1071.10it/s]
54986it [00:47, 1167.68it/s]
60000it [00:51, 1154.78it/s]
55000it [00:57, 955.66it/s] 


In [14]:
%%time
# For every (query, document) row of each split, compute the cosine
# similarity matrix and save it as <qid>/<docid>.npy.
for year in ["train_2011", "test_2011", "train_2013", "test_2013"]:
    folder = "/u4/w85yang/deep-tweet-search/data/twitter/order_by_rel/{}".format(year)
    # Context managers close the four parallel files once the split is done;
    # previously the handles were opened and never closed.
    with open(os.path.join(folder, "a.toks")) as fa, \
            open(os.path.join(folder, "b.toks")) as fb, \
            open(os.path.join(folder, "sim.txt")) as fsim, \
            open(os.path.join(folder, "id.txt")) as fid:
        # The files are read in lockstep; zip stops at the shortest one.
        for a, b, sim, ids in zip(fa, fb, fsim, fid):
            # id.txt lines must have exactly six whitespace-separated fields;
            # only the 1st (qid) and 3rd (docid) are used here.
            # split() with no argument also discards the trailing newline.
            qid, _, docid, _, score, _ = ids.split()
            directory = "/mnt/collections/w85yang/cosine/topic_doc_mat/{}".format(qid)
            out = os.path.join(directory, "{}.npy".format(docid))
            # exist_ok avoids the separate exists() check.
            os.makedirs(directory, exist_ok=True)
            a = a.rstrip("\n")
            b = b.rstrip("\n")
            sim_mat = calculate_cosine_similarity_matrix(a, b)
            np.save(out, sim_mat)


CPU times: user 2min 27s, sys: 8.32 s, total: 2min 36s
Wall time: 2min 36s


## Generate IDF files

In [15]:

def load_idf(path="/mnt/collections/w85yang/wikiextractor/idf_all/idf_terms.csv"):
    """Load a token -> idf mapping from a CSV file.

    Args:
        path: CSV file with at least a ``token`` and an ``idf`` column.
            Defaults to the location this notebook has always read from.

    Returns:
        dict mapping each value of the ``token`` column to the value of the
        ``idf`` column on the same row.
    """
    # NOTE(review): with read_csv's default NA handling, tokens spelled like
    # "null", "NA" or "nan" are parsed as missing values rather than strings.
    # Confirm whether such tokens matter for this table.
    df = pd.read_csv(path)
    return df.set_index('token')["idf"].to_dict()

def get_qidf(a, default_idf=15):
    """Return the idf value of every token in the query string ``a``.

    The string is tokenized with the notebook-level ``tknzr`` and each token
    is looked up in the notebook-level ``idfs`` dict.

    Args:
        a: Query text.
        default_idf: Value used for tokens missing from ``idfs``. Defaults
            to 15, the constant this function previously hard-coded.

    Returns:
        list with one idf value per token, in token order.
    """
    return [idfs.get(token, default_idf) for token in tknzr.tokenize(a)]

In [16]:
# Build the notebook-level token -> idf lookup that get_qidf() reads.
idfs = load_idf()

In [20]:
# Save one idf vector per query id (<qid>.npy); qid2idf also keeps them in
# memory and doubles as the "already written" set.
qid2idf = {}
# The output directory is the same for every row, so create it once up front.
directory = "/mnt/collections/w85yang/cosine/query_idf/topic_term_idf"
os.makedirs(directory, exist_ok=True)
for year in ["train_2011", "test_2011", "train_2013", "test_2013"]:
    folder = "/u4/w85yang/deep-tweet-search/data/twitter/order_by_rel/{}".format(year)
    # Context managers close the four parallel files once the split is done;
    # previously the handles were opened and never closed.
    with open(os.path.join(folder, "a.toks")) as fa, \
            open(os.path.join(folder, "b.toks")) as fb, \
            open(os.path.join(folder, "sim.txt")) as fsim, \
            open(os.path.join(folder, "id.txt")) as fid:
        # b.toks and sim.txt are not used below, but they stay in the zip so
        # iteration still stops at the shortest of the four files.
        for a, b, sim, ids in zip(fa, fb, fsim, fid):
            # id.txt lines must have exactly six whitespace-separated fields;
            # split() with no argument also discards the trailing newline.
            qid, _, docid, _, score, _ = ids.split()
            if qid in qid2idf:
                # Only the first row seen for a query id is written.
                continue
            out = os.path.join(directory, "{}.npy".format(qid))
            idf = get_qidf(a.rstrip("\n"))
            np.save(out, idf)
            qid2idf[qid] = idf