## Question to Doc2Vec Training

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_pickle('input/train.p')
df.head(3)

Unnamed: 0,is_duplicate,question1,question2,diff_avg_word,tf_distance1,tf_distance2,cnt_distance,jaccard_dist,word_match
0,0,step step guid invest share market india,step step guid invest share market,-0.035714,0.034913,0.074441,0.100368,0.230769,0.727273
1,0,stori kohinoor kohinoor diamond,would happen indian govern stole kohinoor kohi...,-0.394231,0.396372,0.689242,0.572472,0.75,0.307692
2,0,increas speed internet connect use vpn,internet speed increas hack dns,-0.685714,0.542446,0.729503,0.519389,0.857143,0.363636


## Save Word2Vec model with Gensim
- Training + TestSet (5500172)
- Google News Corpus

In [19]:
from gensim.models import word2vec
from gensim.models import KeyedVectors
import multiprocessing

In [20]:
cores = multiprocessing.cpu_count()

In [9]:
train_questions = pd.concat([df.question1, df.question2])
test_questions = pd.concat([df_test.question1, df_test.question2])
all_questions = pd.concat([train_questions, test_questions])
len(all_questions)

5500172

In [16]:
model = KeyedVectors.load_word2vec_format('input/GoogleNews-vectors-negative300.bin', binary=True)  

In [17]:
model.similarity('woman', 'man')

0.76640122309953518

In [18]:
model.most_similar('programmer')

[('programmers', 0.6646655201911926),
 ('Programmer', 0.6181110739707947),
 ('coder', 0.6113354563713074),
 ('programer', 0.6070292592048645),
 ('computer_programmer', 0.5910932421684265),
 ('Jon_Shiring', 0.5551139116287231),
 ('sysadmin', 0.5432621836662292),
 ('animator', 0.5348355174064636),
 ('coders', 0.5315176248550415),
 ('engineer', 0.5284372568130493)]

## Doc2Vec

In [74]:
from gensim.models.doc2vec import Doc2Vec
from gensim.models import doc2vec
from nltk import word_tokenize
from tqdm import tqdm
# nltk.download('punkt')

In [23]:
df_train = pd.read_csv('input/train.csv')

In [56]:
class LabeledLineSentence(object):

    def __init__(self, doc_list, labels_list):
        self.labels_list = labels_list
        self.doc_list = doc_list

    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
            yield doc2vec.TaggedDocument(words=word_tokenize(doc), tags=[self.labels_list[idx]])

In [None]:
labels = []
for label in df_train['id'].tolist():
    labels.append('SENT_%s_1' % label)
for label in df_train['id'].tolist():
    labels.append('SENT_%s_2' % label)

In [60]:
docs.doc_list[:5]

['step step guid invest share market india',
 'stori kohinoor kohinoor diamond',
 'increas speed internet connect use vpn',
 'mental lone solv',
 'one dissolv water quick sugar salt methan carbon di oxid']

In [48]:
docs = LabeledLineSentence(train_questions.tolist(), labels)
it = docs.__iter__()
model1 = Doc2Vec(it, size=12, window=8, min_count=5, workers=cores)

In [61]:
model1.corpus_count

808580

In [63]:
%time model1.train(it, total_examples=model1.corpus_count, epochs=20)

CPU times: user 2.52 ms, sys: 1.73 ms, total: 4.25 ms
Wall time: 3.99 ms


0

In [64]:
model1.most_similar('step')

[('npcomplet', 0.9148457646369934),
 ('focusconcentr', 0.8950409889221191),
 ('3050', 0.8718961477279663),
 ('antiag', 0.8639283180236816),
 ('alma', 0.8621370792388916),
 ('handicap', 0.8450130224227905),
 ('coitus', 0.8414425849914551),
 ('nake', 0.8406568169593811),
 ('mite', 0.8279911875724792),
 ('npo', 0.8273588418960571)]

In [None]:
X_test.index = np.arange(0, X_test['question1'].shape[0])
y_test.index = np.arange(0, X_test['question1'].shape[0])

count = 0
for i in range(X_test['question1'].shape[0]):
    doc1 = word_tokenize(X_test['question1'][i])
    doc2 = word_tokenize(X_test['question2'][i])

    docvec1 = model1.infer_vector(doc1)
    docvec2 = model1.infer_vector(doc2)

In [72]:
def cosine_similarity(v1, v2):
    v1 = np.array(v1)
    v2 = np.array(v2)
    return np.dot(v1, v2) / (np.sqrt(np.sum(v1**2)) * np.sqrt(np.sum(v2**2)))

In [79]:
doc_similarity = []
for i in tqdm(range(df['question1'].shape[0])):
    doc1 = word_tokenize(df['question1'][i])
    doc2 = word_tokenize(df['question2'][i])

    docvec1 = model1.infer_vector(doc1)
    docvec2 = model1.infer_vector(doc2)
    
    similarity = cosine_similarity(docvec1, docvec2)
    doc_similarity.append(similarity)

100%|██████████| 404290/404290 [04:58<00:00, 1352.22it/s]


In [70]:
docvec1

array([ 0.00578766,  0.03320243,  0.02880182,  0.04073226,  0.01728419,
        0.01202335,  0.00750114, -0.03486169, -0.02347461,  0.02026902,
       -0.01370297,  0.01564579], dtype=float32)

In [81]:
doc_similarity[:3]

[-0.49335498, -0.6562255, -0.15725894]

In [80]:
df['doc_similarity'] = doc_similarity 

In [82]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404290 entries, 0 to 404289
Data columns (total 10 columns):
is_duplicate      404290 non-null int64
question1         404290 non-null object
question2         404290 non-null object
diff_avg_word     404290 non-null float64
tf_distance1      404290 non-null float64
tf_distance2      404290 non-null float64
cnt_distance      404290 non-null float64
jaccard_dist      404290 non-null float64
word_match        404290 non-null float64
doc_similarity    404290 non-null float64
dtypes: float64(7), int64(1), object(2)
memory usage: 33.9+ MB


In [83]:
df.to_pickle('input/train.p')