In [None]:
import numpy as np
import pandas as pd
import gensim
import re
import nltk
import json
import sys
import datetime
import operator
import matplotlib.pyplot as plt
import math
import csv
import timeit

from collections import Counter
from nltk.corpus import stopwords
from nltk import word_tokenize, ngrams
from gensim import corpora, models, similarities
from gensim.models.doc2vec import Doc2Vec
from gensim.models import doc2vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors, NearestCentroid, LSHForest
from pylab import plot, show, subplot, specgram, imshow, savefig
from tqdm import tqdm

%matplotlib inline

In [None]:
def cosine(v1, v2):
    """Return the cosine similarity of two vectors.

    Parameters
    ----------
    v1, v2 : array-like of numbers
        Vectors of equal length.

    Returns
    -------
    float
        ``dot(v1, v2) / (||v1|| * ||v2||)``. When either vector has zero
        norm the ratio is undefined (0/0); ``0.0`` is returned in that case
        instead of NaN plus a RuntimeWarning.
    """
    # Cast to float so squaring integer input cannot overflow its int dtype.
    v1 = np.asarray(v1, dtype=float)
    v2 = np.asarray(v2, dtype=float)
    denominator = np.sqrt(np.sum(v1 ** 2)) * np.sqrt(np.sum(v2 ** 2))
    if denominator == 0:
        # At least one vector is all zeros: no direction to compare.
        return 0.0
    return np.dot(v1, v2) / denominator

def concatenate(data):
    """Stack the ``question1`` and ``question2`` columns into one Series.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'question1'`` and ``'question2'``.

    Returns
    -------
    pandas.Series
        All ``question1`` values followed by all ``question2`` values,
        re-indexed from 0 (length is twice the number of rows in ``data``).
    """
    # Series.append was removed in pandas 2.0; pd.concat yields the same
    # result and is available in older pandas releases as well.
    return pd.concat([data['question1'], data['question2']], ignore_index=True)

class LabeledLineSentence(object):
    """Corpus wrapper that pairs each raw document with a single tag.

    Iterating yields one ``doc2vec.TaggedDocument`` per entry of
    ``doc_list``: its words are ``word_tokenize`` applied to the entry and
    its only tag is the element of ``labels_list`` at the same position.
    Every call to ``__iter__`` starts a new pass over ``doc_list``.
    """

    def __init__(self, doc_list, labels_list):
        # Both sequences are stored as given; tokenization happens lazily
        # during iteration.
        self.doc_list = doc_list
        self.labels_list = labels_list

    def __iter__(self):
        for position, text in enumerate(self.doc_list):
            tokens = word_tokenize(text)
            tag = self.labels_list[position]
            yield doc2vec.TaggedDocument(words=tokens, tags=[tag])
            
def get_dists_doc2vec(data):
    """Infer one doc2vec vector for each of the two trailing columns of every row.

    Despite the name, no distances are computed; the inferred vectors are
    returned. The function reads the module-level names ``model``,
    ``start_alpha`` and ``infer_epoch``, so they must be defined before it
    is called.

    Parameters
    ----------
    data : pandas.DataFrame
        The second-to-last and the last column of each row are passed to
        ``word_tokenize``.

    Returns
    -------
    tuple of (list, list)
        Vectors inferred from the second-to-last column, and vectors
        inferred from the last column, both in row order.
    """
    vectors_by_column = ([], [])
    for row in tqdm(range(data.shape[0])):
        # Tokenize both texts of the row before running any inference.
        token_lists = [word_tokenize(data.iloc[row, column]) for column in (-2, -1)]
        for collected, tokens in zip(vectors_by_column, token_lists):
            collected.append(
                model.infer_vector(tokens, alpha=start_alpha, steps=infer_epoch)
            )
    return vectors_by_column

In [None]:
src_train = 'df_train_spacylemmat_fullclean.csv'
src_test = 'df_test_spacylemmat_fullclean.csv'

# Keep only the id and the two question columns; missing values become the
# literal string 'NULL'.
df_train = pd.read_csv(src_train)[['id', 'question1', 'question2']].fillna('NULL')
df_test = (
    pd.read_csv(src_test)[['test_id', 'question1', 'question2']]
    .fillna('NULL')
    .rename(columns={'test_id': 'id'})  # align with the train column name
)

# Train rows first, then test rows, under a fresh 0..n-1 index.
data = pd.concat([df_train, df_test], ignore_index=True)
X_train = data[['id', 'question1', 'question2']]
X = concatenate(X_train)

# One tag per entry of X: every question1 tag first, then every question2
# tag, matching the order in which concatenate() stacks the two columns.
labels = (
    ['SENT_%s_1' % label for label in X_train['id'].tolist()]
    + ['SENT_%s_2' % label for label in X_train['id'].tolist()]
)

docs = LabeledLineSentence(X.tolist(), labels)
it = iter(docs)

In [None]:
# NOTE(review): ``size`` and ``iter`` (and ``model.iter`` below) are the
# keyword names of older gensim releases; confirm the installed gensim
# version still accepts them.
model = Doc2Vec(
    size=100,
    window=10,
    min_count=2,
    sample=1e-5,
    workers=8,
    iter=20,
)

# The vocabulary is built from `docs` first, then the same corpus is used
# for training.
model.build_vocab(docs)
print('Model built.')

model.train(
    docs,
    total_examples=model.corpus_count,
    epochs=model.iter,
)
print('Model trained.')

In [None]:
# Inference settings; get_dists_doc2vec reads both as module-level globals.
start_alpha = 0.01
infer_epoch = 10

results = get_dists_doc2vec(data)
docvec1s, docvec2s = (np.array(vectors) for vectors in results)

# `data` was concatenated with the train rows first, so the first
# len(df_train) vectors belong to train and the remainder to test.
docvec1s_tr, docvec1s_te = docvec1s[:len(df_train)], docvec1s[len(df_train):]
docvec2s_tr, docvec2s_te = docvec2s[:len(df_train)], docvec2s[len(df_train):]

In [None]:
# Persist the four vector arrays (train/test x question1/question2).
# np.save adds the ``.npy`` extension because the names do not carry one.
for _npy_name, _npy_vectors in (
    ('train_q1_doc2vec_vectors_trainquora', docvec1s_tr),
    ('test_q1_doc2vec_vectors_trainquora', docvec1s_te),
    ('train_q2_doc2vec_vectors_trainquora', docvec2s_tr),
    ('test_q2_doc2vec_vectors_trainquora', docvec2s_te),
):
    np.save(_npy_name, _npy_vectors)