In [1]:
import json
import os
import re

import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument 

import multiprocessing

from doc2vec_prep import stem_text

In [34]:
def read_input(filename, type):
    """Load a JSON file and report how many top-level entries it contains.

    Parameters
    ----------
    filename : str or path-like
        Path of the JSON file to read.
    type : str
        Human-readable label used only in the progress message
        (for example ``'publications'``). The name shadows the ``type``
        builtin inside this function; it is kept so that keyword callers
        continue to work.

    Returns
    -------
    The object decoded by ``json.load`` (it must support ``len``).
    """
    # Decode explicitly as UTF-8 rather than relying on the platform's default
    # encoding, which is not UTF-8 on every system.
    with open(filename, encoding='utf-8') as f:
        docs = json.load(f)
    print(f'Loaded {len(docs)} {type}')
    return docs

In [None]:
# Input files, given as relative paths (resolved against the kernel's working directory).
publications_file = 'epcc_inf_publications.json'
projects_file = 'epcc_inf_projects.json'
# NOTE(review): staff_file is not read by any cell visible in this notebook.
staff_file = 'epcc_inf_staff.json'

# Minimum number of stemmed words a document needs to be kept as training data
# (read as a global by create_documents).
min_words = 10

# Both collections are indexed with .items() later on, so the JSON files are
# expected to decode to dicts — presumably id -> record; verify against the data.
publications = read_input(publications_file, 'publications')
projects = read_input(projects_file, 'projects')

In [65]:
# Number of worker threads handed to Doc2Vec through the 'workers' hyperparameter.
# First report what the OS says is available...
num_cores = multiprocessing.cpu_count()
print(f'#cores = {num_cores}')
# ...then override it with a fixed value. The recorded run detected 1 core, so
# this asks for more workers than were detected.
num_cores = 8 # NOTE(review): hard-coded override ("should be 8??") — confirm against the actual host
print(f'#cores = {num_cores}')

#cores = 1
#cores = 8


In [33]:
def _tagged_documents(records, first_field, second_field):
    """Yield one TaggedDocument per ``(doc_id, record)`` pair that has enough words.

    The two named text fields are joined, stemmed with ``stem_text`` and kept
    only when at least ``min_words`` (module-level setting) tokens remain.
    """
    for doc_id, record in records:
        # Join with a space: plain concatenation would fuse the last word of the
        # first field to the first word of the second one before stemming.
        text = record[first_field] + ' ' + record[second_field]
        words = stem_text(text)
        if len(words) >= min_words:
            yield TaggedDocument(words=words, tags=[doc_id])


def create_documents(publications, projects):
    """Build the Doc2Vec training corpus from publications and projects.

    Parameters
    ----------
    publications : iterable of (pub_id, dict)
        Each dict must provide ``'abstract'`` and ``'title'`` strings.
    projects : iterable of (proj_id, dict)
        Each dict must provide ``'title'`` and ``'description'`` strings.

    Returns
    -------
    list of TaggedDocument
        Publications first, then projects, each tagged with its own id.
        Documents with fewer than ``min_words`` stemmed words are dropped.
    """
    training_data = list(_tagged_documents(publications, 'abstract', 'title'))
    training_data.extend(_tagged_documents(projects, 'title', 'description'))
    return training_data

In [35]:
def create_training_data(publications, projects):
    """Select the records that carry text and turn them into training documents.

    Parameters
    ----------
    publications : dict
        Maps publication id -> record; records with an empty ``'abstract'``
        are skipped.
    projects : dict
        Maps project id -> record; records with an empty ``'description'``
        are skipped.

    Returns
    -------
    list
        Whatever ``create_documents`` returns for the selected records.
    """
    docs_with_abstract = [
        (pub_id, publication)
        for pub_id, publication in publications.items()
        if publication['abstract'] != ''
    ]
    docs_with_description = [
        (proj_id, project)
        for proj_id, project in projects.items()
        if project['description'] != ''
    ]
    # The document builder in this notebook is create_documents; the previously
    # used name create_stem_tagged_document is not defined anywhere in it and
    # fails with NameError on a fresh kernel.
    return create_documents(docs_with_abstract, docs_with_description)

In [37]:
# Build the corpus once; the same training_data object is passed to every
# create_model call below.
print('Creating training data')
training_data = create_training_data(publications, projects)
# len() is used here, so training_data must be a sized collection, not a lazy iterator.
print(f'Training data contains {len(training_data)} documents')

Creating training data
Training data contains 16497 documents


In [17]:
def create_model(training_data, hyperparams):
    """Construct a Doc2Vec model from ``hyperparams`` and train it.

    Parameters
    ----------
    training_data
        Corpus of tagged documents. It is passed to both ``build_vocab`` and
        ``train``, so it has to be re-iterable (e.g. a list).
    hyperparams : dict
        Keyword arguments forwarded unchanged to the ``Doc2Vec`` constructor.

    Returns
    -------
    The trained ``Doc2Vec`` instance.
    """
    print('Creating the model')
    doc2vec = Doc2Vec(**hyperparams)

    # The vocabulary is built before training; train() then reads the corpus
    # size and epoch count back from the model itself.
    doc2vec.build_vocab(training_data)
    doc2vec.train(
        training_data,
        total_examples=doc2vec.corpus_count,
        epochs=doc2vec.epochs,
    )
    return doc2vec

In [64]:
%%time
# Model 1: dm=0. In gensim this selects PV-DBOW (distributed bag of words);
# the "cbow" variable and file names are kept because other code may refer to them.
hyperparams_cbow = {
    'vector_size': 300,
    'min_count': 1,
    'epochs': 100,
    'window': 15,
    'negative': 5,
    # gensim's keyword for the frequent-word down-sampling threshold is
    # `sample`; 'sampling_threshold' is not a Doc2Vec parameter, so the 1e-5
    # value never reached the model under that name.
    'sample': 1e-5,
    'workers': num_cores,
    'dm': 0
}
model_cbow = create_model(training_data, hyperparams_cbow)
model_cbow.save('cbow.model')
print('complete')

Creating the model
complete
CPU times: user 8min 44s, sys: 661 ms, total: 8min 44s
Wall time: 9min 9s


In [66]:
%%time
# Model 2: dm=1 (PV-DM, "distributed memory") with a wide context window.
hyperparams_dmv1 = {
    'vector_size': 300,
    'min_count': 1,
    'epochs': 100,
    'window': 15,
    'negative': 5,
    # gensim's keyword for the frequent-word down-sampling threshold is
    # `sample`; 'sampling_threshold' is not a Doc2Vec parameter, so the 1e-5
    # value never reached the model under that name.
    'sample': 1e-5,
    'workers': num_cores,
    'dm': 1
}

model_dmv1 = create_model(training_data, hyperparams_dmv1)
model_dmv1.save('dmv1.model')
print('complete')

Creating the model
complete
CPU times: user 15min 37s, sys: 1.07 s, total: 15min 38s
Wall time: 16min 23s


In [67]:
# Model 3: dm=1 (PV-DM) with a narrow window, negative sampling only (hs=0)
# and a negative ns_exponent.
hyperparams_dmv2 = {
    'vector_size': 300,
    'min_count': 1,
    'epochs': 100,
    'window': 3,
    'hs': 0,
    'negative': 5,
    'ns_exponent': -0.5,
    # gensim's keyword for the frequent-word down-sampling threshold is
    # `sample`; 'sampling_threshold' is not a Doc2Vec parameter, so the 1e-5
    # value never reached the model under that name.
    'sample': 1e-5,
    # NOTE(review): this cell uses the detected CPU count directly, whereas the
    # other two models use num_cores — confirm the difference is intended.
    'workers': multiprocessing.cpu_count(),
    'dm': 1
}

model_dmv2 = create_model(training_data, hyperparams_dmv2)
model_dmv2.save('dmv2.model')
print('complete')

Creating the model
complete
