##### Loading packages

In [1]:
from pymongo import MongoClient
import json
import time
from bson.json_util import dumps
from elasticsearch import Elasticsearch
import pandas as pd
from py2neo import Graph, Node, Relationship
import re
import numpy as np
from time import time, ctime

In [2]:
# Wall-clock start of the run; the final cell subtracts it from a second time() reading.
start = time()

##### Connection to MongoDB

In [3]:
# MongoDB server.
# NOTE(review): the connection URI is an empty string in this copy - presumably redacted; confirm the intended host.
server = MongoClient('')
# Database
db = server['aura_pmi_dl']
# Collection queried by the later cells through aggregate() / find()
collection = db['document_profiles']

##### Connection to Neo4j

In [4]:
# Neo4j graph reached over Bolt; every graph.run() call in this notebook goes through this handle.
# NOTE(review): host and (empty) password are hard-coded - consider reading them from the environment.
graph = Graph("bolt://159.84.108.111:7687", auth=("neo4j", ""))

In [5]:
# Reset the Neo4j database before rebuilding the graph.
# Cypher alternatives kept for reference:
#   MATCH (n) DETACH DELETE n
#   MATCH (n) OPTIONAL MATCH (n)-[r]-() DELETE r,n
graph.delete_all()

##### Loading metadata

In [6]:
# Load the semicolon-separated catalogue, derive the columns used for grouping
# (domain code, year, month, calendar date) and index the frame by file path so
# that rows can be looked up with data.loc[<path>, <column>].
data = (
    pd.read_csv("/home/ubuntu/hal/catalogue.csv", sep=";")
    .assign(
        # Keep characters 2..5 of the raw domain string.
        domain=lambda frame: frame['domain'].str.slice(2, 6),
        submitted_date=lambda frame: pd.to_datetime(frame['submitted_date']),
        # The lambdas below see the already-converted submitted_date column.
        year=lambda frame: frame['submitted_date'].dt.year,
        month=lambda frame: frame['submitted_date'].dt.month,
        date=lambda frame: frame['submitted_date'].dt.date,
    )
    # filepath stays available as a regular column as well as being the index.
    .set_index('filepath', drop=False)
    .rename_axis("")
)
data.head()

Unnamed: 0,docid,domain,submitted_date,title,file_url,inst,language,filepath,year,month,date
,,,,,,,,,,,
/home/ubuntu/hal/data/0.scco/19.pdf,19.0,scco,2002-10-10 15:18:34,Self-motion and the perception of stationary o...,https://hal.archives-ouvertes.fr/hal-00000019/...,CdF (institution),en,/home/ubuntu/hal/data/0.scco/19.pdf,2002.0,10.0,2002-10-10
/home/ubuntu/hal/data/0.scco/20.pdf,20.0,scco,2002-10-10 15:33:58,The stationarity hypothesis: An allocentric cr...,https://hal.archives-ouvertes.fr/hal-00000020/...,CdF (institution),en,/home/ubuntu/hal/data/0.scco/20.pdf,2002.0,10.0,2002-10-10
/home/ubuntu/hal/data/0.phys/29.pdf,29.0,phys,2002-10-25 17:07:11,Statistics of lowest droplets in two-dimension...,https://hal.archives-ouvertes.fr/hal-00000029/...,UPMC,en,/home/ubuntu/hal/data/0.phys/29.pdf,2002.0,10.0,2002-10-25
/home/ubuntu/hal/data/0.math/36.pdf,36.0,math,2002-11-04 10:28:04,The Selberg zeta function for convex co-compac...,https://hal.archives-ouvertes.fr/hal-00000036/...,UN,en,/home/ubuntu/hal/data/0.math/36.pdf,2002.0,11.0,2002-11-04
/home/ubuntu/hal/data/0.scco/62.pdf,62.0,scco,2002-11-19 19:52:53,Perception and reconstruction of two-dimension...,https://hal.archives-ouvertes.fr/hal-00000062/...,CdF (institution),en,/home/ubuntu/hal/data/0.scco/62.pdf,2002.0,11.0,2002-11-19


In [7]:
# Show the dtype of each catalogue column.
data.dtypes

docid                      int64
domain                    object
submitted_date    datetime64[ns]
title                     object
file_url                  object
inst                      object
language                  object
filepath                  object
year                       int64
month                      int64
date                      object
dtype: object

##### Document objects

In [8]:
# Aggregation pipeline with a single $project stage: path, language and page count.
mongo_query = [
    {
        "$project": {
            "path": "$path",
            "language": "$language",
            "nbPages": "$nbPages",
        }
    }
]


In [9]:
# Create one (:Object:Document) node per MongoDB document.
# Property values are sent as Cypher parameters instead of being spliced into the
# query text: titles keep their quotes, apostrophes and backslashes, and no value
# can break (or inject into) the statement.
mongo_cursor = collection.aggregate(mongo_query)
for doc in mongo_cursor:
    neo_query_params = {}
    submission_date = None
    if doc.get("_id") is not None:
        neo_query_params["identifier"] = str(doc["_id"])
    if doc.get("nbPages") is not None:
        # Stored as a string property, as before.
        neo_query_params["nbPages"] = str(doc["nbPages"])
    if doc.get("path") is not None:
        # Catalogue metadata is looked up by file path (the index of `data`);
        # a path missing from the catalogue raises KeyError.
        catalogue_date = data.loc[doc.get("path"), 'date']
        if not pd.isna(catalogue_date):
            # A missing date (NaT) is skipped: date("NaT") is not a valid Cypher date.
            submission_date = str(catalogue_date)
        neo_query_params["title"] = str(data.loc[doc.get("path"), 'title'])
    if submission_date is None:
        graph.run(
            '''CREATE (c:Object:Document $props) RETURN c''',
            {"props": neo_query_params},
        )
    else:
        # submissionDate is converted server-side so it is stored as a Neo4j date.
        graph.run(
            '''CREATE (c:Object:Document $props)
            SET c.submissionDate = date($submissionDate)
            RETURN c''',
            {"props": neo_query_params, "submissionDate": submission_date},
        )

##### Raw representations

In [10]:
# Aggregation pipeline with a single $project stage that keeps the file path.
mongo_query = [
    {"$project": {"path": "$path"}},
]

In [11]:
# Create one (:Raw:Document) node per MongoDB document and link it to the Object
# node that carries the same identifier.
# Identifier and path are passed as Cypher parameters, so paths containing quotes
# or backslashes are stored verbatim and cannot break the statement.
mongo_cursor = collection.aggregate(mongo_query)
for doc in mongo_cursor:
    identifier = str(doc['_id'])
    graph.run(
        '''CREATE (r:Raw:Document {identifier: $identifier, path: $path})''',
        {"identifier": identifier, "path": str(doc['path'])},
    )
    graph.run(
        '''MATCH (o:Object {identifier: $identifier}),
        (r:Raw {identifier: $identifier})
                CREATE (o)-[l:REPRESENTATION]->(r)''',
        {"identifier": identifier},
    )

In [12]:
##### GROUPING CREATION
def generate_grouping(grouping_name, neo):
    """Create a (:Grouping) node and return its Neo4j internal id.

    TAKES: grouping_name (stored as the node's `name`), neo4j graph
    RETURNS: id of the generated node

    The name is sent as a Cypher parameter, so names containing quotes or
    backslashes are stored verbatim instead of breaking the statement.
    """
    result = neo.run(
        'CREATE (c:Grouping {name: $name}) RETURN ID(c) AS id',
        {"name": grouping_name},
    )
    return result.data()[0]['id']

def generate_groups(grouping_name, neo, mongo):
    """Create one (:Group) node per distinct value of a MongoDB field.

    TAKES: grouping_name (MongoDB field to group on), neo4j graph, mongodb collection
    RETURNS: list of ids of the generated nodes, in cursor order

    Each group is named str(<distinct value>); the name is sent as a Cypher
    parameter so that values containing quotes cannot break the statement.
    """
    # Distinct values of the field, obtained with a $group stage.
    mongo_query = [{"$match":{}},
        {"$group":{"_id":"$"+grouping_name ,"count":{"$sum":1}}}]
    mongo_cursor = mongo.aggregate(pipeline=mongo_query,allowDiskUse=True)

    group_ids = []
    # Insertion of each group in Neo4j
    for item in mongo_cursor:
        result = neo.run(
            'CREATE (c:Group {name: $name}) RETURN ID(c) AS id',
            {"name": str(item['_id'])},
        )
        group_ids.append(result.data()[0]['id'])
    return group_ids

def generate_groups_membership(grouping_id, group_ids, neo):
    """Attach Group nodes to their Grouping node with MEMBER relationships.

    TAKES: grouping_id (Neo4j id of the Grouping), list of group ids, neo4j graph
    RETURNS: nothing; creates (Group)-[:MEMBER]->(Grouping) for every listed group

    Ids are sent as Cypher parameters rather than formatted into the query text.
    """
    neo_query = '''MATCH (a:Grouping),(b:Group)
    WHERE ID(a) = $grouping_id AND ID(b) IN $group_ids
    CREATE (a)<-[r:MEMBER]-(b)
    RETURN a,b'''
    neo.run(neo_query, {"grouping_id": grouping_id, "group_ids": list(group_ids)})
    
def generate_docs_classification(grouping_name, neo, mongo):
    """Link every document to the Group holding its value of a MongoDB field.

    TAKES: grouping_name (MongoDB field / Grouping name), neo4j graph, mongodb collection
    RETURNS: nothing; creates (Object)-[:CLASSIFIED]->(Group) relationships

    The target Group is matched by its Neo4j id, not by name: group names are not
    unique across groupings, so matching on the name alone would also link
    documents to same-named groups that belong to other groupings.
    """
    # Identification of the groups that are members of this grouping
    groups = neo.run(
        '''MATCH (p:Group)-[r:MEMBER]->(q:Grouping {name: $grouping_name})
                RETURN p.name AS name, ID(p) AS id''',
        {"grouping_name": grouping_name},
    ).data()

    link_query = '''MATCH (g:Group),(d:Object {identifier: $identifier})
            WHERE ID(g) = $group_id
            CREATE (g)<-[r:CLASSIFIED]-(d)
            RETURN g,d'''

    # For each group we get the related docs from mongo
    for group in groups:
        # NOTE(review): group names were stored as str(value); fields whose values
        # are not strings will not be matched by this equality filter.
        mongo_cursor = mongo.find({grouping_name: group['name']}, {"_id": 1})
        # For each doc we add a relationship with the related group
        for doc in mongo_cursor:
            # str() keeps this working when _id is not already a string.
            neo.run(link_query, {"group_id": group['id'], "identifier": str(doc['_id'])})

In [13]:
def generate_grouping_metadata(grouping_name, neo, mongo):
    """Build a complete grouping from a MongoDB field.

    Creates the Grouping node, then its Group nodes, attaches the groups to the
    grouping, and finally links the documents to their group.
    """
    generate_groups_membership(
        generate_grouping(grouping_name, neo),
        generate_groups(grouping_name, neo, mongo),
        neo,
    )
    generate_docs_classification(grouping_name, neo, mongo)

##### Grouping by Language

In [14]:
#generate_grouping_metadata(grouping_name="language", neo=graph, mongo=collection)

##### Grouping by YEAR

In [15]:
def generate_groups_year_month(grouping_name, neo, mongo):
    """Create one (:Group) node per distinct value of a numeric catalogue column.

    TAKES: grouping_name (column of the global `data` frame, e.g. "year" or
           "month"), neo4j graph, mongodb collection (unused, kept for interface
           symmetry with the other generators)
    RETURNS: list of ids of the generated nodes

    Missing values are gathered in a single group named "UNKNOWN". Whole-number
    floats (what pandas yields when the column contains NaN) are named "2002",
    not "2002.0", so the names can be parsed back with int().
    """
    # Distinct values of the catalogue column, missing values labelled UNKNOWN
    distinct_values = data[grouping_name].fillna("UNKNOWN").value_counts()

    group_ids = []
    # Insertion of each group in Neo4j
    for item in distinct_values.index:
        if isinstance(item, float) and item.is_integer():
            item = int(item)
        group_name = str(item)
        if group_name == 'None':  # Special group node UNKNOWN
            group_name = 'UNKNOWN'
        # The name is sent as a Cypher parameter rather than spliced into the query.
        result = neo.run(
            'CREATE (c:Group {name: $name}) RETURN ID(c) AS id',
            {"name": group_name},
        )
        group_ids.append(result.data()[0]['id'])
    return group_ids


def generate_docs_classification_year_month(grouping_name, neo, mongo):
    """Link every document to the Group holding its year/month catalogue value.

    TAKES: grouping_name (numeric column of the global `data` frame and name of
           the Grouping node), neo4j graph, mongodb collection
    RETURNS: nothing; creates (Object)-[:CLASSIFIED]->(Group) relationships

    The MongoDB collection is scanned once and each document is routed to its
    group through a lookup table, instead of re-reading the whole collection for
    every group. Documents whose catalogue value is missing go to the "UNKNOWN"
    group (a NaN never compares equal to None, so they used to be skipped).
    A document path absent from the catalogue raises KeyError.
    """
    # Identification of the groups that are members of this grouping
    groups = neo.run(
        '''MATCH (p:Group)-[r:MEMBER]->(q:Grouping {name: $grouping_name})
                RETURN p.name AS name, ID(p) AS id''',
        {"grouping_name": grouping_name},
    ).data()

    # Lookup table: integer value (or 'UNKNOWN') -> ids of the matching groups
    group_ids_by_key = {}
    for group in groups:
        name = group['name']
        # int(float(...)) also accepts names that were written as "2002.0".
        key = name if name == 'UNKNOWN' else int(float(name))
        group_ids_by_key.setdefault(key, []).append(group['id'])
    if not group_ids_by_key:
        return

    link_query = '''MATCH (g:Group),(d:Object {identifier: $identifier})
                WHERE ID(g) = $group_id
                CREATE (g)<-[r:CLASSIFIED]-(d)
                RETURN g,d'''

    mongo_query = [{"$project":{
                      "path":"$path",
        }}]
    mongo_cursor = mongo.aggregate(mongo_query)
    # For each doc we add a relationship with the related group
    for doc in mongo_cursor:
        value = data.loc[doc['path'], grouping_name]
        key = 'UNKNOWN' if pd.isna(value) else int(value)
        for group_id in group_ids_by_key.get(key, ()):
            neo.run(link_query, {"group_id": group_id, "identifier": str(doc['_id'])})

In [16]:
def generate_grouping_metadata_year_month(grouping_name, neo, mongo):
    """Build a complete grouping from a year/month catalogue column.

    Creates the Grouping node, then its Group nodes, attaches the groups to the
    grouping, and finally links the documents to their group.
    """
    generate_groups_membership(
        generate_grouping(grouping_name, neo),
        generate_groups_year_month(grouping_name, neo, mongo),
        neo,
    )
    generate_docs_classification_year_month(grouping_name, neo, mongo)

In [17]:
# Build the "year" grouping: Grouping node, one Group per distinct year, and the document links.
generate_grouping_metadata_year_month(grouping_name="year", neo=graph, mongo=collection)

##### Grouping by MONTH

In [18]:
# Build the "month" grouping: Grouping node, one Group per distinct month, and the document links.
generate_grouping_metadata_year_month(grouping_name="month", neo=graph, mongo=collection)

##### Grouping by DOMAIN

In [19]:
def generate_groups_from_metadata(grouping_name, neo, mongo):
    """Create one (:Group) node per distinct value of a catalogue column.

    TAKES: grouping_name (column of the global `data` frame, e.g. "domain" or
           "language"), neo4j graph, mongodb collection (unused, kept for
           interface symmetry with the other generators)
    RETURNS: list of ids of the generated nodes

    Missing values are gathered in a single group named "UNKNOWN". Whole-number
    floats are named without the trailing ".0".
    """
    # Distinct values of the catalogue column, missing values labelled UNKNOWN
    distinct_values = data[grouping_name].fillna("UNKNOWN").value_counts()

    group_ids = []
    # Insertion of each group in Neo4j
    for item in distinct_values.index:
        if isinstance(item, float) and item.is_integer():
            item = int(item)
        group_name = str(item)
        if group_name == 'None':  # Special group node UNKNOWN
            group_name = 'UNKNOWN'
        # The name is sent as a Cypher parameter, so values containing quotes
        # cannot break the statement.
        result = neo.run(
            'CREATE (c:Group {name: $name}) RETURN ID(c) AS id',
            {"name": group_name},
        )
        group_ids.append(result.data()[0]['id'])
    return group_ids


def generate_docs_classification_from_metadata(grouping_name, neo, mongo):
    """Link every document to the Group holding its catalogue value.

    TAKES: grouping_name (column of the global `data` frame and name of the
           Grouping node), neo4j graph, mongodb collection
    RETURNS: nothing; creates (Object)-[:CLASSIFIED]->(Group) relationships

    The MongoDB collection is scanned once and each document is routed to its
    group through a lookup table, instead of re-reading the whole collection for
    every group. Documents whose catalogue value is missing go to the "UNKNOWN"
    group (a NaN never compares equal to None, so they used to be skipped).
    A document path absent from the catalogue raises KeyError.
    """
    # Identification of the groups that are members of this grouping
    groups = neo.run(
        '''MATCH (p:Group)-[r:MEMBER]->(q:Grouping {name: $grouping_name})
                RETURN p.name AS name, ID(p) AS id''',
        {"grouping_name": grouping_name},
    ).data()

    # Lookup table: group name -> ids of the matching groups
    group_ids_by_name = {}
    for group in groups:
        group_ids_by_name.setdefault(group['name'], []).append(group['id'])
    if not group_ids_by_name:
        return

    link_query = '''MATCH (g:Group),(d:Object {identifier: $identifier})
                WHERE ID(g) = $group_id
                CREATE (g)<-[r:CLASSIFIED]-(d)
                RETURN g,d'''

    mongo_query = [{"$project":{
                      "path":"$path",
        }}]
    mongo_cursor = mongo.aggregate(mongo_query)
    # For each doc we add a relationship with the related group
    for doc in mongo_cursor:
        value = data.loc[doc['path'], grouping_name]
        if pd.isna(value):
            key = 'UNKNOWN'
        else:
            # Whole-number floats are rendered without the trailing ".0".
            if isinstance(value, float) and value.is_integer():
                value = int(value)
            key = str(value)
        for group_id in group_ids_by_name.get(key, ()):
            neo.run(link_query, {"group_id": group_id, "identifier": str(doc['_id'])})

In [20]:
def generate_grouping_metadata_from_metadata(grouping_name, neo, mongo):
    """Build a complete grouping from a catalogue column.

    Creates the Grouping node, then its Group nodes, attaches the groups to the
    grouping, and finally links the documents to their group.
    """
    generate_groups_membership(
        generate_grouping(grouping_name, neo),
        generate_groups_from_metadata(grouping_name, neo, mongo),
        neo,
    )
    generate_docs_classification_from_metadata(grouping_name, neo, mongo)

In [21]:
# Build the "domain" grouping from the catalogue's domain column.
generate_grouping_metadata_from_metadata("domain", neo=graph, mongo=collection)

#### Grouping by Language

In [22]:
# Build the "language" grouping from the catalogue's language column.
generate_grouping_metadata_from_metadata("language", neo=graph, mongo=collection)

#### Grouping by Institution country

In [1]:
#generate_grouping_metadata_from_metadata("inst", neo=graph, mongo=collection)

##### Indexing

In [None]:
# Drop the three named indexes if they exist.
# Each index is dropped in its own try block: one missing index no longer prevents
# the remaining ones from being dropped, and the reason for a failure is reported
# instead of being swallowed by a bare `except:`.
dropped_indexes = []
for index_name in ("idx_grouping_name", "idx_group_name", "idx_object_identifier"):
    try:
        graph.run('''DROP INDEX ''' + index_name)
    except Exception as exc:
        print("Could not drop " + index_name + ": " + str(exc))
    else:
        dropped_indexes.append(index_name)

if dropped_indexes:
    print("Indexes droped successfully")
else:
    print("No indexes")

In [None]:
# Create the lookup indexes used by the MATCH clauses of this notebook.
# Each index is created in its own try block so that one failure does not skip
# the others, and the cause of the failure is printed.
# NOTE(review): the MATCH-by-identifier / MATCH-by-name queries of the earlier
# cells run before these indexes exist; creating them right after the database
# reset would probably shorten the run considerably.
index_statements = {
    "idx_grouping_name": '''CREATE INDEX idx_grouping_name
    FOR (n:Grouping)
    ON (n.name)''',
    "idx_group_name": '''CREATE INDEX idx_group_name 
    FOR (n:Group)
    ON (n.name)''',
    "idx_object_identifier": '''CREATE INDEX idx_object_identifier 
    FOR (n:Object)
    ON (n.identifier)''',
}

failed_indexes = []
for index_name, statement in index_statements.items():
    try:
        graph.run(statement)
    except Exception as exc:
        failed_indexes.append(index_name)
        print("Could not create " + index_name + ": " + str(exc))

if failed_indexes:
    print("Unable to create indexes")
else:
    print("Indexes created successfully")

In [24]:
# Report when the run started and ended, and how long it took in minutes.
done = time()
elapsed = done - start
report_lines = [
    "*"*20,
    "STARTED : "+ctime(start),
    "END : "+ctime(done),
    "TIME ELAPSED:" + str(elapsed/60) + " MINUTES",
]
for report_line in report_lines:
    print(report_line)

********************
STARTED : Wed Jan 27 09:13:43 2021
END : Wed Jan 27 13:19:48 2021
TIME ELAPSED:246.07996703386306 MINUTES
