In [2]:
import glob
import json
from tqdm import tqdm

from gensim.models.doc2vec import Doc2Vec, TaggedDocument 

from doc2vec_prep import stem_text

In [2]:
import configparser
from neo4j import GraphDatabase

In [3]:
# Init the Doc2Vec model
# The dict is unpacked straight into Doc2Vec(**hyperparams), so every key must
# be spelled exactly like the corresponding gensim keyword argument.
hyperparams = {
    'vector_size': 300,
    'min_count': 1,
    'epochs': 100,
    'window': 15,
    'negative': 5,
    # gensim names the frequent-word down-sampling threshold `sample`;
    # the previous key 'sampling_threshold' is not a Doc2Vec argument.
    'sample': 1e-5,
    'workers': 8,
    'dm': 0,
}
min_length = 10  # number of stemmed words in document

In [4]:
def generate_documents(driver, query):
    """Stream Doc2Vec training documents for the research outputs returned by a query.

    Runs ``query`` in a new session and, for every record, reads the node bound
    to ``r``. Nodes with an empty or missing abstract are skipped, as are nodes
    whose uuid was already seen earlier in the same result. The abstract and the
    title are joined with a space (so the last abstract word and the first title
    word stay separate tokens), stemmed with ``stem_text``, and yielded when the
    result has at least ``min_length`` words.

    Args:
        driver: Object whose ``session()`` returns a context manager exposing
            ``run(query)``.
        query: Query string; each returned record must expose a node under the
            key ``'r'`` with ``uuid``, ``abstract_value`` and ``title`` entries.

    Yields:
        TaggedDocument: ``words`` is the stemmed token list, ``tags`` is a
        one-element list holding the node's uuid.

    Side effects:
        Prints the number of generated documents once the result is exhausted.
    """
    seen_ids = set()
    gen_docs = 0
    with driver.session() as session:
        for record in session.run(query):
            resout_node = record['r']
            resout_id = resout_node['uuid']

            # Emit at most one document per research output.
            if resout_id in seen_ids:
                continue
            seen_ids.add(resout_id)

            # Treat a missing (None) abstract the same as an empty one.
            abstract = resout_node['abstract_value'] or ''
            if not abstract:
                continue

            title = resout_node['title'] or ''
            # The separator keeps the abstract's last word and the title's
            # first word from being merged into a single token.
            words = stem_text(f'{abstract} {title}')
            if len(words) >= min_length:
                gen_docs += 1
                yield TaggedDocument(words=words, tags=[resout_id])
    print(f'Generated {gen_docs} documents')

In [5]:
# Credentials are read from the [auth] section of the local '.env' file so
# they never appear in the notebook; only the server URI is written here.
config = configparser.ConfigParser()
config.read('.env')
uri = "bolt://roag.is.ed.ac.uk:7687"
driver = GraphDatabase.driver(
    uri,
    auth=(
        config['auth']['USERNAME'],
        config['auth']['PASSWORD'],
    ),
)

Build the model with data read directly from ROAG. Removing the "LIMIT 1000" clause from the query trains on all data (105,403 documents as of 25 June).

In [6]:
# Query selecting the research outputs that form the training corpus.
query = 'MATCH (r:PURE:ResearchOutput) RETURN r LIMIT 1000'
# Fresh, untrained model configured from the hyperparams dict.
model = Doc2Vec(**hyperparams)

In [7]:
# Build the model vocabulary from the documents streamed out of the database;
# tqdm only adds a progress counter around the generator.
model.build_vocab(
    tqdm(
        generate_documents(driver, query)
    )
)

845it [00:02, 354.29it/s]


Generated 845 documents


In [8]:
# gensim walks the corpus once per epoch, so the corpus must be restartable.
# A generator (even when wrapped in tqdm) is exhausted after the first pass,
# which leaves every later epoch with no data, so materialise it into a list
# before training.
train_corpus = list(tqdm(generate_documents(driver, query)))
model.train(
    train_corpus,
    total_examples=model.corpus_count,
    epochs=model.epochs)

845it [00:02, 306.22it/s]


Generated 845 documents


## Load existing model

In [4]:
# Restore a previously saved model from disk instead of training one;
# mmap='r' is passed through to Doc2Vec.load as given.
model = Doc2Vec.load(
    'doc2vec_neo4j_v1.model',
    mmap='r',
)

## Test model

In [5]:
def get_document_neo4j(driver, doc_id):
    """Fetch one research output and its distinct authors by uuid.

    The uuid is sent as a query parameter rather than being formatted into the
    Cypher text, so ids containing quotes cannot break or alter the query.
    Persons are matched optionally, so a research output without any linked
    person is still returned (with an empty author list).

    Args:
        driver: Object whose ``session()`` returns a context manager exposing
            ``run(query, **parameters)``.
        doc_id: uuid of the research output to look up.

    Returns:
        dict with keys ``id``, ``url``, ``abstract``, ``title``, ``keywords``
        and ``authors`` (a list of dicts with ``name``, ``url``, ``uuid``, one
        per distinct person uuid, in result order), or ``None`` when no
        research output has the given uuid.
    """
    query = """
        MATCH (r:PURE:ResearchOutput)
        WHERE r.uuid = $doc_id
        OPTIONAL MATCH (r) -- (p:PURE:Person)
        RETURN r, p
    """
    document = None
    authors = []
    author_ids = set()
    with driver.session() as session:
        for record in session.run(query, doc_id=doc_id):
            if document is None:
                # Every row carries the same research output; read it once.
                resout_node = record['r']
                document = {
                    'id': resout_node['uuid'],
                    'url': resout_node['info_portalUrl'],
                    'abstract': resout_node['abstract_value'],
                    'title': resout_node['title'],
                    'keywords': resout_node['keywords'],
                }

            person = record['p']
            if person is None:
                # OPTIONAL MATCH yields a null person when none is linked.
                continue

            author_id = person['uuid']
            if author_id in author_ids:
                continue
            author_ids.add(author_id)
            authors.append({
                'name': f"{person['name_firstName']} {person['name_lastName']}",
                'url': person['info_portalUrl'],
                'uuid': author_id,
            })

    # Only attach authors when a document was actually found.
    if document is not None:
        document['authors'] = authors

    return document

In [6]:
# Sample free-text input (a multi-paragraph project description) whose stemmed
# form is fed to the model to look up similar research outputs.
text = '''
DARE will deliver a new working environment for the teams of professionals wrestling with the challenge of extreme data, computing and complexity. It will present methods, in abstract terms, so that domain experts can understand, change and use them effectively. It will provide a set of tools that visualise the runs of these methods in summary form still without distracting technical detail. Those tools will allow drill down for diagnostics and validation, and help with the organisation of campaigns involving multiple runs and immense amount of data. This holistic abstract presentation together with automation that eliminates chores will push back the complexity barrier, accelerate innovation and improve the productivity of our hard-pressed expert teams.

The data-scale barrier will be pushed by a combination of optimised mappings and automation. To achieve this, we depend on learning the critical parameters in the cost functions dynamically, taking into account data movement, storage costs, limits and other resource costs in formulae weighted by community choices and priorities. The computational scale barrier will be pushed by a similar strategy. However, the methods we enable often have a mixture of computationally challenging parts and data challenging parts, best allocated to different platforms. In today’s R&D the practitioners have to organise this and the inherent data movement themselves. DARE’s optimised mappings will automatically partition parts of the work to different platforms and organise the coupled use of those platforms including any necessary data movements and adaptations. Most professional R&D requires sustained use of such methods. Sustaining their meaning across platforms means that working practices do not need to change and that the original investment in learning and in method development is retained.
DARE will work with two research infrastructures: EPOS (European Plate Observing System) and IS-ENES (Infrastructure for the European Network of Earth System Modelling), engaging in the co-design and production use of extreme methods that address these challenges. With our partners, we will show:

Accelerated innovation in the face of all three extremes.
Significantly increased productivity for expert teams and a wide range of users.
Substantial advances in the science and applications achievable in campaigns'''

In [7]:
# Embed the stemmed sample text with the model, then ask the stored document
# vectors for the entries most similar to it as (doc_id, similarity) pairs.
vector = model.infer_vector(
    stem_text(text)
)
simdocs = model.docvecs.most_similar(
    positive=[vector],
)

In [10]:
# Show each similar document's id followed by its similarity, one per line.
for doc_id, sim in simdocs:
    print(doc_id, sim, sep='\n')

18ee4a19-f576-4b9b-9fc3-2e9ad8c85a1e
0.42743241786956787
758e5620-f1ae-45ce-8613-1e0851866f51
0.4259878993034363
8929dd86-1c96-4b1d-925f-aabae06e8145
0.4253643751144409
933114c3-7f7d-4d79-a308-b3cda487198d
0.42303478717803955
0421859d-052c-40d9-a509-e6c9378115ea
0.4205206334590912
a76fb7a1-81e0-484d-a8ba-bfa7de06aa05
0.418972373008728
782eca7d-db87-4188-8a7d-cc84cc6c938d
0.41811078786849976
138d79ec-9f1f-40c9-9537-7ae9df7f8315
0.41767820715904236
70c477a5-824b-4d81-ac56-42e51b440c68
0.41694849729537964
61708493-f117-43ed-8a16-f73adb344299
0.4168996214866638


In [8]:
# Resolve every similar document id to its title and author names.
# Needs `driver` from the connection cell to exist in the current kernel;
# running this cell without it raises NameError.
for doc_id, sim in simdocs:
    doc = get_document_neo4j(driver, doc_id)
    author_names = [author['name'] for author in doc['authors']]
    # Title first, then one author per line, then a blank separator line.
    print(doc['title'], *author_names, sep='\n')
    print()

NameError: name 'driver' is not defined