In [1]:
from gensim.models import KeyedVectors
import numpy as np

In [2]:
ft_file = "/Users/i337036/Documents/Data/wiki.en.vec"
word_vecs = KeyedVectors.load_word2vec_format(ft_file, limit=50000)

In [3]:
import pandas as pd
csv_file = "/Users/i337036/Downloads/train.csv"
df = pd.read_csv(csv_file, encoding='utf-8')
len(df)

404290

In [4]:
df.columns.values

array(['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate'],
      dtype=object)

In [5]:
df = df[df['is_duplicate']==1]

In [6]:
df = df.drop_duplicates(subset=['question1'], keep='first')
df = df.drop_duplicates(subset=['question2'], keep='first')
len(df)

73776

In [7]:
len((df["question1"] + df["question2"]).unique())

73776

In [8]:
q1_list = df.question1.tolist()
q2_list = df.question2.tolist()
print(len(q1_list), len(q2_list))

73776 73776


In [9]:
def clean_sentence(sentence):
    sentence = [word for word in sentence.strip().split() if word in word_vecs.vocab.keys()]
    
    return sentence

In [10]:
def get_sentence_matrix(sentences):
    sentence_mat = np.zeros((len(sentences), 300), dtype=float)
    for idx in tqdm(range(len(sentences)), desc="Building vectors for %d sentences" % len(sentences)):
        sentence = sentences[idx]
        for word in sentence:
            sentence_mat[idx, :] += word_vecs[word]
            
    return sentence_mat

In [11]:
from tqdm import tqdm

In [12]:
q1_word_list = [clean_sentence(txt1) for txt1 in q1_list]
q2_word_list = [clean_sentence(txt2) for txt2 in q2_list]

In [13]:
q2_mat = get_sentence_matrix(q1_word_list)
q1_mat = get_sentence_matrix(q2_word_list)

Building vectors for 73776 sentences: 100%|██████████| 73776/73776 [00:03<00:00, 22654.80it/s]
Building vectors for 73776 sentences: 100%|██████████| 73776/73776 [00:02<00:00, 25265.31it/s]


In [14]:
from sklearn.metrics.pairwise import cosine_similarity

def find_similar(q1_mat, q2_mat, index, top_n=5):
    qd_sims = cosine_similarity(q1_mat[index: index+1], q2_mat).flatten()
    qd_indices = qd_sims.argsort()[:-top_n:-1]
    return qd_indices

In [15]:
from random import randint

In [16]:
search_idx = randint(0, len(q1_list))
print("Searched for: %s at [%d]" % (q1_list[search_idx], search_idx))
for index in find_similar(q1_mat, q2_mat, search_idx):
    print ("%s %d" % (q2_list[index], index))

Searched for: What are some obvious forms of time travel crimes? at [55652]
What are the most over-hyped travel destinations? 26085
What are some time travel crimes? 55652
What's your best travel hack? 21884
Why should I travel alone? 26826


In [17]:
q1_list[search_idx], q2_list[search_idx]

('What are some obvious forms of time travel crimes?',
 'What are some time travel crimes?')

In [18]:
from tqdm import tqdm

In [19]:
correct_index = 0
for search_idx in tqdm(range(500)):
    for index in find_similar(q1_mat, q2_mat, search_idx):
        if search_idx==index or q1_list[search_idx]==q2_list[index]:
            correct_index += 1
            break

100%|██████████| 500/500 [01:45<00:00,  4.72it/s]


In [20]:
print(correct_index)

198
