In [1]:
from gensim.models.doc2vec import Doc2Vec
from gensim.models import doc2vec

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine, jaccard, hamming, correlation
from collections import Counter
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import re

from sklearn.model_selection import train_test_split
import math
import gensim
import json
import sys
import pandas as pd
import numpy as np
import gc

import multiprocessing
import functools
from tqdm import tqdm

In [2]:
def cosine(v1, v2):
    """Return the cosine similarity of two vectors.

    Both arguments are converted with ``np.array``. The result is their dot
    product divided by the product of their Euclidean norms.

    Note: this definition rebinds the name ``cosine`` that the import cell
    brings in from ``scipy.spatial.distance`` (SciPy's version is a distance,
    i.e. one minus this similarity).
    """
    a = np.array(v1)
    b = np.array(v2)
    norm_a = np.sqrt(np.sum(a ** 2))
    norm_b = np.sqrt(np.sum(b ** 2))
    return np.dot(a, b) / (norm_a * norm_b)

def concatenate(data):
    """Stack the two question columns of ``data`` into a single Series.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``question1`` and ``question2`` columns.

    Returns
    -------
    pandas.Series
        All ``question1`` values followed by all ``question2`` values, with
        a fresh 0..n-1 index.
    """
    # Series.append was removed in pandas 2.0; pd.concat gives the same
    # result and works on old and new pandas alike.
    return pd.concat([data['question1'], data['question2']], ignore_index=True)

class LabeledLineSentence(object):
    """Iterable pairing raw documents with tags for Doc2Vec.

    Every pass over the object yields one ``doc2vec.TaggedDocument`` per
    entry of ``doc_list``: the text is split with ``word_tokenize`` and the
    single tag is the entry of ``labels_list`` at the same position.
    """

    def __init__(self, doc_list, labels_list):
        self.doc_list = doc_list
        self.labels_list = labels_list

    def __iter__(self):
        for position, text in enumerate(self.doc_list):
            tokens = word_tokenize(text)
            # labels are looked up by position, so labels_list must be
            # indexable with 0..len(doc_list)-1
            tag = self.labels_list[position]
            yield doc2vec.TaggedDocument(words=tokens, tags=[tag])
            


In [3]:
def get_data(src_train, src_test):
    """Load the question pairs of the train and test CSVs into one frame.

    Both files are read in full, reduced to the ``question1`` and
    ``question2`` columns, and have missing values replaced by the string
    ``'NULL'``. Train rows come first, followed by the test rows; each part
    keeps the row index it was read with.
    """
    question_cols = ['question1', 'question2']
    frames = [pd.read_csv(path) for path in (src_train, src_test)]
    frames = [frame.loc[:, question_cols].fillna('NULL') for frame in frames]
    data = pd.concat(frames)
    # release the per-file frames before handing back the combined one
    del frames
    gc.collect()
    return data
    
def get_dists_doc2vec(data, model=None, alpha=None, steps=None, vector_size=300):
    """Infer a Doc2Vec vector for both questions of every row in ``data``.

    Despite the name, no distances are computed: the function returns the
    raw inferred vectors.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose last two columns hold the two question texts.
    model : optional
        Object exposing ``infer_vector``. Defaults to the notebook-level
        ``model1``.
    alpha : float, optional
        Passed to ``infer_vector`` as ``alpha``. Defaults to the
        notebook-level ``start_alpha``.
    steps : int, optional
        Passed to ``infer_vector`` as ``steps``. Defaults to the
        notebook-level ``infer_epoch``.
    vector_size : int, optional
        Width of the output arrays; must equal the length of the vectors
        the model returns. Defaults to 300, the previously hard-coded value.

    Returns
    -------
    tuple of numpy.ndarray
        ``(docvec1s, docvec2s)``, two float32 arrays of shape
        ``(len(data), vector_size)`` for the second-to-last and last column
        respectively.
    """
    # Fall back to the notebook globals so existing one-argument calls
    # behave exactly as before.
    if model is None:
        model = model1
    if alpha is None:
        alpha = start_alpha
    if steps is None:
        steps = infer_epoch

    n_rows = data.shape[0]
    docvec1s = np.zeros((n_rows, vector_size), dtype='float32')
    docvec2s = np.zeros((n_rows, vector_size), dtype='float32')

    # Pull both text columns out once so the loop does not pay for a pandas
    # scalar lookup on every iteration.
    first_texts = data.iloc[:, -2].values
    second_texts = data.iloc[:, -1].values

    for i in tqdm(range(n_rows)):
        doc1 = word_tokenize(first_texts[i])
        doc2 = word_tokenize(second_texts[i])
        # NOTE(review): gensim >= 4.0 names this keyword ``epochs`` instead
        # of ``steps`` -- confirm against the installed gensim version.
        docvec1s[i, :] = model.infer_vector(doc1, alpha=alpha, steps=steps)
        docvec2s[i, :] = model.infer_vector(doc2, alpha=alpha, steps=steps)
    return docvec1s, docvec2s

In [4]:
# Question-pair CSVs, resolved relative to the working directory.
src_train = 'df_train_spacylemmat_fullclean.csv'
src_test = 'df_test_spacylemmat_fullclean.csv'

# Directory and file of the saved Doc2Vec model.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/embeddings/doc2vec/enwiki_dbow/'
model_path = src + 'doc2vec.bin'

# English stop words from NLTK, kept as a set.
eng_stopwords = set(stopwords.words('english'))

# Load the model and build the combined train + test question frame.
model1 = Doc2Vec.load(model_path)
data = get_data(src_train, src_test)

In [5]:
# Inference settings read by get_dists_doc2vec when it calls infer_vector.
infer_epoch = 10     # forwarded as ``steps``
start_alpha = 0.01   # forwarded as ``alpha``

# Long-running cell: the recorded run below took about 46 minutes for
# 2,750,086 rows.
results = get_dists_doc2vec(data)

100%|██████████| 2750086/2750086 [46:20<00:00, 989.05it/s] 


In [7]:
# The combined frame was built as train rows followed by test rows, so the
# train row count is the split point for the inferred vectors.
df_train = pd.read_csv(src_train)
n_train = df_train.shape[0]

# np.asarray instead of np.array: when the inputs are already ndarrays this
# avoids duplicating two (rows x 300) float32 matrices in memory, while
# still converting list-like inputs.
docvec1s = np.asarray(results[0])
docvec2s = np.asarray(results[1])

docvec1s_tr, docvec1s_te = docvec1s[:n_train], docvec1s[n_train:]
docvec2s_tr, docvec2s_te = docvec2s[:n_train], docvec2s[n_train:]

In [8]:
# Write each split to its own .npy file (np.save appends the extension).
for out_name, vectors in (
    ('train_q1_doc2vec_vectors_pretrained_fullcleanDF', docvec1s_tr),
    ('test_q1_doc2vec_vectors_pretrained_fullcleanDF', docvec1s_te),
    ('train_q2_doc2vec_vectors_pretrained_fullcleanDF', docvec2s_tr),
    ('test_q2_doc2vec_vectors_pretrained_fullcleanDF', docvec2s_te),
):
    np.save(out_name, vectors)