In [18]:
from nltk import WordPunctTokenizer
from tqdm import tqdm
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
import pickle as pkl

In [19]:
# Shared tokenizer instance; acquire_documents() uses it to split each line into tokens.
tokenizer = WordPunctTokenizer()

In [20]:
def acquire_documents(filename):
    """Load a line-per-document text file and wrap each line as a TaggedDocument.

    Parameters
    ----------
    filename : str
        Path to the corpus file *without* its extension; ``'.txt'`` is
        appended before opening. The file is read as UTF-8.

    Returns
    -------
    list of TaggedDocument
        One entry per newline-separated line, in file order. Each entry holds
        the tokens produced for that line by the module-level ``tokenizer``
        and a single integer tag equal to the line's 0-based position.
    """
    with open(filename + '.txt', 'r', encoding='utf-8') as corpus_file:
        # NOTE(review): splitting on '\n' means a file that ends with a newline
        # yields one trailing empty document. Kept as-is so the number of
        # documents (and therefore of output vectors) does not change.
        lines = corpus_file.read().split('\n')
    # The with-block has already closed the file, so no explicit close() is needed.
    token_lists = tokenizer.tokenize_sents(lines)
    return [TaggedDocument(tokens, [idx]) for idx, tokens in enumerate(token_lists)]

In [21]:
# Read 'Train_lemmas.txt' (acquire_documents appends the extension) as tagged documents.
train_documents = acquire_documents('Train_lemmas')

In [22]:
# Document-vector size: passed to Doc2Vec as vector_size and embedded in the output .pkl filenames.
dim = 100

In [23]:
# Build the Doc2Vec model from the training documents only; vectors for the
# other splits are inferred from this model in later cells.
# NOTE(review): workers=4 means multi-threaded training, which gensim documents
# as not exactly reproducible run-to-run — confirm whether that matters here.
model = Doc2Vec(
    train_documents,
    vector_size=dim,
    window=2,
    min_count=1,
    workers=4,
)

In [24]:
# Persist the model to 'Train_only.model' (relative path, so it lands in the working directory).
model.save('Train_only.model')

In [25]:
# Infer one vector per training document from its token list; tqdm reports progress.
train_vectors = [model.infer_vector(doc.words) for doc in tqdm(train_documents)]

100%|██████████████████████████████████| 87719/87719 [00:58<00:00, 1491.21it/s]


In [26]:
# Read 'Test_lemmas.txt' (acquire_documents appends the extension) as tagged documents.
test_documents = acquire_documents('Test_lemmas')

In [27]:
# Infer one vector per test document using the model built from the training data.
test_vectors = [model.infer_vector(doc.words) for doc in tqdm(test_documents)]

100%|██████████████████████████████████████| 999/999 [00:00<00:00, 1660.83it/s]


In [28]:
# Read 'Val_lemmas.txt' (acquire_documents appends the extension) as tagged documents.
val_documents = acquire_documents('Val_lemmas')

In [29]:
# Infer one vector per validation document using the model built from the training data.
val_vectors = [model.infer_vector(doc.words) for doc in tqdm(val_documents)]

100%|██████████████████████████████████████| 998/998 [00:00<00:00, 1635.92it/s]


In [30]:
# Pack the list of inferred training vectors into one array
# (presumably shape (n_documents, dim) — TODO confirm).
train = np.array(train_vectors)

In [31]:
# Pack the list of inferred test vectors into one array
# (presumably shape (n_documents, dim) — TODO confirm).
test = np.array(test_vectors)

In [32]:
# Pack the list of inferred validation vectors into one array
# (presumably shape (n_documents, dim) — TODO confirm).
val = np.array(val_vectors)

In [33]:
# Pickle the training array; the vector size is part of the filename (e.g. Train_100.pkl).
with open(f'Train_{dim}.pkl', 'wb') as tr:
    pkl.dump(train, tr)

In [34]:
# Pickle the test array; the vector size is part of the filename (e.g. Test_100.pkl).
with open(f'Test_{dim}.pkl', 'wb') as tst:
    pkl.dump(test, tst)

In [35]:
# Pickle the validation array; the vector size is part of the filename (e.g. Val_100.pkl).
with open(f'Val_{dim}.pkl', 'wb') as vl:
    pkl.dump(val, vl)