<a href="https://colab.research.google.com/github/SolanaO/Blogs_Content/blob/master/1_ArXiv_Knowledge_Graph_and_Queries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installs & Imports

In [None]:
!pip install neo4j

In [None]:
import pandas as pd
import numpy as np
import json
from datetime import datetime
import hashlib
import copy

In [None]:
# Import the Colab helper used to attach Google Drive to this runtime
from google.colab import drive

# Mount Drive at /content/drive; the dataset path used later in the notebook
# points below this directory. This will prompt for authorization.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the file containing the selected publications.
# The path points into the mounted Google Drive, so the mount cell must have
# been run before this one.
filename = "/content/drive/MyDrive/codeLlama/arxiv_math_reduced.csv"
df = pd.read_csv(filename)

In [None]:
# Check the dataset: preview the first two rows and the available columns
df.head(2)

Unnamed: 0,id,authors,title,journal,update_date,abstract
0,math/0001015,A. Chakrabarti and R. Chakrabarti,The Gervais-Neveu-Felder equation for the Jord...,,2009-10-31,"Using a contraction procedure, we construct ..."
1,math/0001024,"Piotr Kobak (Krakow), Andrew Swann (SDU, Odense)","HyperK\""ahler Potentials in Cohomogeneity Two",,2007-05-23,"A hyperK\""ahler potential is a function rho ..."


In [None]:
# Parse authors

# Flatten the per-article author strings into one list of name fragments.
lsa = list(df.authors)
ls_authors = []

for raw_authors in lsa:
    # A ";" anywhere in the string means it is the delimiter; otherwise use ",".
    delimiter = ";" if ";" in raw_authors else ","
    ls_authors += raw_authors.split(delimiter)

# Number of fragments, duplicates included
print(len(ls_authors))

# Deduplicate. Fragments are compared verbatim (surrounding whitespace is kept),
# which is the same form extract_authors() produces for the Article nodes.
ls_authors = list(set(ls_authors))
print(len(ls_authors))

def hash_text(text):
    """Return the hexadecimal SHA-256 digest of ``str(text)``.

    The value is converted with ``str()`` and encoded as UTF-8 before hashing,
    so non-string inputs are hashed through their string representation.
    """
    digest = hashlib.sha256()
    digest.update(str(text).encode('utf-8'))
    return digest.hexdigest()

# One record per unique author fragment; the id is the SHA-256 hash of the name
authors = [{"name": name, "id": hash_text(name)} for name in ls_authors]

12746
9600


In [None]:
# Parse journals

# Build one record per distinct journal reference.
# Missing values are dropped first: unique() would otherwise keep NaN and
# produce a Journal record whose name is NaN and whose id is the hash of the
# string "nan".
journals = [
    {"name": name, "id": hash_text(name)}
    for name in df.journal.dropna().unique()
]
journals[20]

{'name': 'J. Algebra 231, 67-85 (2000)',
 'id': '23af3d6d450a2a8eb71d4baf5b25057daaada9fa050f130f5d4267965d77d660'}

In [None]:
# Parse articles

# Convert the DataFrame into a list of per-row dicts keyed by column name
records = df.to_dict("records")

def extract_authors(text):
    """Split a raw authors string into a list of name fragments.

    If the string contains a ";" it is split on ";"; otherwise it is split
    on ",". Fragments are returned verbatim, so any whitespace around the
    delimiter is preserved.
    """
    delimiter = ";" if ";" in text else ","
    return text.split(delimiter)


# Build the article records from a copy of `records`, replacing the raw
# authors string with the list of name fragments.
articles = [
    {**record, "authors": extract_authors(record["authors"])}
    for record in copy.deepcopy(records)
]

articles[20]

{'id': 'math/0001082',
 'authors': ['Alain Lascoux', ' Michel Lassalle (CNRS', 'Paris)'],
 'title': "Une identit\\'e remarquable en th\\'eorie des partitions",
 'journal': 'Math. Annalen, 318 (2000), 299-313',
 'update_date': '2007-05-23',
 'abstract': '  We prove an identity about partitions, previously conjectured in the study of\nshifted Jack polynomials (math.CO/9903020). The proof given is using\n$\\lambda$-ring techniques. It would be interesting to obtain a bijective proof.\n'}

## Establish Neo4j Connection

In [None]:
# Initialize the driver for the hosted Neo4j instance.
# The password is not embedded in the notebook: it is read from the
# NEO4J_PASSWORD environment variable, falling back to an interactive prompt,
# so the secret does not end up in version control or shared copies.
# NOTE(review): the password that used to be committed here should be rotated.

import getpass
import os

from neo4j import GraphDatabase

# Connection settings can be overridden through the environment; the defaults
# keep the notebook pointing at the same instance and user as before.
uri = os.environ.get("NEO4J_URI", 'neo4j+s://86428ec8.databases.neo4j.io')
user = os.environ.get("NEO4J_USER", 'neo4j')
pwd = os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: ")
driver = GraphDatabase.driver(uri, auth=(user, pwd))

In [None]:
# Helper that runs a Cypher query and returns the result as a DataFrame
def read_query(query, params=None):
    """Run a Cypher query and return its records as a pandas DataFrame.

    Args:
        query: Cypher statement to execute.
        params: Optional query parameters, forwarded to ``session.run``.

    Returns:
        A DataFrame with one row per returned record and one column per
        result key.
    """
    with driver.session() as session:
        result = session.run(query, params)
        # Materialize the records while the session is still open.
        rows = [record.values() for record in result]
        column_names = result.keys()
        return pd.DataFrame(rows, columns=column_names)

In [None]:
# Create Journal nodes.
# MERGE is keyed on the hashed id, so re-running the cell matches existing
# nodes instead of duplicating them; the name is (re)set on every run.
read_query("""
UNWIND $data as journal
MERGE (j:Journal {id:journal.id})
SET j.name = journal.name
RETURN count(j)
""",{"data":journals})

Unnamed: 0,count(j)
0,2550


In [None]:
# Create Author nodes.
# MERGE is keyed on the hashed id, so re-running the cell matches existing
# nodes instead of duplicating them; the name is (re)set on every run.

read_query("""
UNWIND $data as author
MERGE (a:Author {id:author.id})
SET a.name = author.name
RETURN count(a)
""",{"data":authors})

Unnamed: 0,count(a)
0,9600


In [None]:
# Create Article nodes, one per record, keyed by the record id.
# Properties are only written when a node is first created (ON CREATE SET),
# so re-running the cell leaves existing articles untouched.
# a.date prefers row.created_date and falls back to row.update_date, because
# the records built from the loaded CSV only carry an update_date field.
# NOTE(review): confirm which of the two dates a.date is meant to represent.
read_query("""
UNWIND $data as row
MERGE (a:Article{id:row.id})
ON CREATE SET a.title = row.title, a.abstract = row.abstract,
            a.authors = row.authors, a.journal=row.journal,
            a.date=date(coalesce(row.created_date, row.update_date))
RETURN count(*)
""", {'data': articles})

Unnamed: 0,count(*)
0,8446


In [None]:
# Match articles with their authors.
# Each entry of an article's `authors` list property is compared verbatim
# with Author.name; a WROTE relationship is merged for every match.
# The query has no RETURN clause, so the returned DataFrame is empty.
read_query("""
MATCH (a:Article)
WITH a
UNWIND a.authors as name
MATCH (author:Author) where author.name = name
MERGE (author)-[:WROTE]->(a)
""")

In [None]:
# Match articles with their journals.
# An article's `journal` property is compared verbatim with Journal.name;
# a PUBLISHED relationship (Journal -> Article) is merged for every match.
# The query has no RETURN clause, so the returned DataFrame is empty.
read_query("""
MATCH (a:Article)
WITH a
MATCH (j:Journal) where j.name = a.journal
MERGE (j)-[:PUBLISHED]->(a)
""")

## Extract Graph Schema

In [None]:
# Queries to extract the graph schema.
# Both call the APOC procedure apoc.meta.data(), so APOC must be available on
# the target database.

# Node labels with the list of their property names (relationship entries
# reported by apoc.meta.data() are filtered out).
node_properties_query = """
CALL apoc.meta.data()
YIELD label, other, elementType, type, property
WHERE NOT type = "RELATIONSHIP" AND elementType = "node"
WITH label AS nodeLabels, collect(property) AS properties
RETURN {labels: nodeLabels, properties: properties} AS output

"""

node_props = read_query(node_properties_query)

# One {source, relationship, target} map per relationship entry
rel_query = """
CALL apoc.meta.data()
YIELD label, other, elementType, type, property
WHERE type = "RELATIONSHIP" AND elementType = "node"
RETURN {source: label, relationship: property, target: other} AS output
"""
rels = read_query(rel_query)

In [None]:
# Create schema information

def schema_text(node_props, rels):
    """Render a plain-text description of the graph schema.

    Args:
        node_props: Object describing node labels and properties; it is
            interpolated into the text through its string formatting.
        rels: Object describing the relationships; interpolated the same way.

    Returns:
        A multi-line string that starts with a newline and in which every
        line is indented by two spaces.
    """
    lines = [
        "",
        "  This is the schema representation of the Neo4j database.",
        "  Node properties are the following:",
        f"  {node_props}",
        "  Relationship point from source to target nodes",
        f"  {rels}",
        "  Make sure to respect relationship types and directions",
        "  ",
    ]
    return "\n".join(lines)

In [None]:
# Render the schema description. The result is stored under the same name as
# the helper function and therefore shadows it; the callable() guard keeps the
# cell re-runnable (a second run would otherwise call a str and raise
# TypeError). To regenerate the text, re-run the cell that defines the
# function first.
if callable(schema_text):
    schema_text = schema_text(node_props, rels)

## Sample Queries

In [None]:
# Find 5 articles that contain algebra in the title and abstract.
# CONTAINS is case-sensitive, so only the lowercase spelling 'algebra' matches.

read_query("""
MATCH (a:Article)
WHERE a.abstract CONTAINS 'algebra' AND a.title CONTAINS 'algebra'
RETURN a.title as Title
LIMIT 5
""")


Unnamed: 0,Title
0,The Gervais-Neve...
1,Deformations of ...
2,On the approxima...
3,Unitary represen...
4,Entropy in type ...


In [None]:
# Basic node retrieval
# Fetch 5 journals in the database.
# There is no ORDER BY, so which five names come back is unspecified.

query = """
MATCH (j:Journal)
RETURN j.name LIMIT 5
"""

read_query(query)

Unnamed: 0,j.name
0,
1,J. Combin. Theor...
2,Geom. Topol. 4 (...
3,Complex geometry...
4,"Math. Annalen, 3..."


In [None]:
# Find the most published authors (top 5).
# Only articles linked to a Journal are counted, so the totals can be lower
# than a plain count of WROTE relationships.
# Relationship types and directions are spelled out, in line with the advice
# embedded in the schema description and with the other sample queries.

read_query("""
MATCH (a:Author)-[:WROTE]->(p:Article)<-[:PUBLISHED]-(j:Journal)
RETURN a.name as author, count(p) as freq
ORDER BY freq DESC
LIMIT 5
""")

Unnamed: 0,author,freq
0,Saharon Shelah,57
1,Friedrich Wehru...,16
2,Maks A. Akivis a...,15
3,Florentin Smara...,13
4,Peter W. Michor,12


In [None]:
# Node retrieval with property filtering
# Fetch articles whose date is after a specific date.
# The cutoff is built with date() so it is compared as a Date value.
# No LIMIT is applied, so every matching article is returned.

query = """
MATCH (a:Article)
WHERE a.date > date("2000-01-01")
RETURN a.title, a.date
"""

read_query(query)

Unnamed: 0,a.title,a.date
0,The Gervais-Neve...,2000-01-04
1,"HyperK\""ahler Po...",2000-01-05
2,"The HyperK\""ahle...",2000-01-05
3,"HyperK\""ahler Po...",2000-01-05
4,Knuth-Bendix for...,2000-01-06
...,...,...
7013,Efficient import...,2007-03-30
7014,Randomly growing...,2007-03-30
7015,On Nichols algeb...,2007-03-30
7016,Primes in a pres...,2007-03-30


In [None]:
# Relationship retrieval
# Fetch all articles published in a specific journal.
# The name is matched exactly; "Nature" is a placeholder to replace with a
# journal name that is present in the graph.

query = """
MATCH (j:Journal {name: "Nature"})-[:PUBLISHED]->(a:Article)
RETURN a.title
"""
read_query(query)

Unnamed: 0,a.title


In [None]:
# Nodes and relationships
# Fetch all authors who wrote a particular article.
# The title is matched exactly; "Graph Theory Basics" is a placeholder to
# replace with a title that is present in the graph.

query = """
MATCH (a:Author)-[:WROTE]->(art:Article {title: "Graph Theory Basics"})
RETURN a.name
"""

read_query(query)

Unnamed: 0,a.name


In [None]:
# Using paths
# Find the journals in which a given author's articles were published.
# The whole pattern is bound to `path`, but only the journal name is returned.
# The author name is matched exactly.

query = """
MATCH path = (a:Author {name: "Saharon Shelah"})-[:WROTE]->(:Article)<-[:PUBLISHED]-(j:Journal)
RETURN j.name
"""
read_query(query)

Unnamed: 0,j.name
0,Math. Sci. Res. ...
1,Israel J. Math. ...
2,Fund. Math. 145 ...
3,J. Symbolic Logi...
4,Adv. Math. 126 (...


In [None]:
# Aggregations
# Count the number of articles each author has written.
# Rows are grouped by a.name, compared verbatim, and sorted by descending count.
# No LIMIT is applied, so one row per distinct name is returned.

query = """
MATCH (a:Author)-[:WROTE]->(art:Article)
RETURN a.name, COUNT(art) AS articles_written
ORDER BY articles_written DESC
"""

read_query(query)

Unnamed: 0,a.name,articles_written
0,Saharon Shelah,83
1,Friedrich Wehru...,16
2,Florentin Smara...,16
3,Maks A. Akivis a...,16
4,Saharon Shelah,15
...,...,...
9595,Satoshi Koike an...,1
9596,Mireille Bousque...,1
9597,Ahmad R. Sharafa...,1
9598,Richard Cleyton ...,1


In [None]:
# Relationships with property filtering
# Fetch articles written by a specific author and published after a certain date.
# The cutoff is wrapped in date(): Article.date is written with date(...) when
# the articles are created, and comparing a Date with a plain string never
# evaluates to true, so the filter would reject every row.
# "John Doe" is a placeholder to replace with an author present in the graph.

query = """
MATCH (a:Author {name: "John Doe"})-[:WROTE]->(art:Article)
WHERE art.date > date("1980-01-01")
RETURN art.title, art.date
"""
read_query(query)

Unnamed: 0,art.title,art.date


In [None]:
# Multiple paths
# Find authors who have written articles for a specific journal.
# The journal is selected by substring (CONTAINS), and DISTINCT removes the
# repeated names of authors with several matching articles.

query = """
MATCH (a:Author)-[:WROTE]->(:Article)<-[:PUBLISHED]-(j:Journal)
WHERE j.name CONTAINS "Topology 41"
RETURN DISTINCT a.name
"""
read_query(query)

Unnamed: 0,a.name
0,F. Loeser
1,J. Denef


In [None]:
# Combining Aggregations and Paths
# Find the journal that has published the most articles.
# Articles are grouped by the full j.name string; LIMIT 1 keeps a single row
# even when several journals share the highest count.

query = """
    MATCH (j:Journal)-[:PUBLISHED]->(a:Article)
    RETURN j.name, COUNT(a) AS number_of_articles
    ORDER BY number_of_articles DESC
    LIMIT 1
    """
read_query(query)

Unnamed: 0,j.name,number_of_articles
0,Some results obt...,2


In [None]:
# Complex Aggregations with Filtering
# Find authors who have written more than 5 articles and for whom at least one
# of those articles was published in a journal whose name contains 'Topology'.
# The second MATCH yields one row per matching article, so DISTINCT is needed
# to list each author only once.

query = """
MATCH (a:Author)-[:WROTE]->(art:Article)
WITH a, COUNT(art) AS article_count
WHERE article_count > 5
MATCH (a)-[:WROTE]->(:Article)<-[:PUBLISHED]-(j:Journal)
WHERE j.name CONTAINS 'Topology'
RETURN DISTINCT a.name, article_count
"""
read_query(query)

Unnamed: 0,a.name,article_count
0,F. Loeser,8
1,J. Denef,8
2,Peter Teichner,6
3,Tomek Bartoszynski,11
