In [1]:
# Define Neo4j connections
import os

from neo4j import GraphDatabase

# Connection settings are read from the environment when present so that real
# credentials never have to be saved inside the notebook. The fallbacks are the
# values this notebook originally hard-coded, so an unconfigured run behaves
# exactly as before.
host = os.environ.get('NEO4J_URI', 'neo4j://localhost:7687')
user = os.environ.get('NEO4J_USER', 'neo4j')
password = os.environ.get('NEO4J_PASSWORD', 'letmein')

# A single driver object is reused by the read_query / drop_graph helpers below.
driver = GraphDatabase.driver(host, auth=(user, password))

In [2]:
# Import libraries
import pandas as pd

def read_query(query, params=None):
    """Run a Cypher statement and return its result as a pandas DataFrame.

    Args:
        query: Cypher text to execute.
        params: Optional dict of query parameters, referenced from the Cypher
            text as ``$key``. Defaults to ``None`` (no parameters), which is
            how every existing call in this notebook uses the helper.

    Returns:
        pandas.DataFrame with one row per returned record and one column per
        result key.
    """
    with driver.session() as session:
        result = session.run(query, params)
        # Pull every record while the session is still open; the result cannot
        # be read once the ``with`` block has closed the session.
        rows = [record.values() for record in result]
        return pd.DataFrame(rows, columns=result.keys())
    
def drop_graph(name):
    """Drop a named graph via the ``gds.graph.drop`` procedure.

    Args:
        name: Name of the projected graph to remove.
    """
    # The name is sent as a query parameter rather than formatted into the
    # Cypher text, so names containing quotes or other special characters can
    # neither break the statement nor change what it executes.
    drop_graph_query = "CALL gds.graph.drop($name)"
    with driver.session() as session:
        session.run(drop_graph_query, {"name": name})

## Agenda
- Graph import
- Text classification
- Named-entity recognition
- Sentiment analysis
- Unipartite projection of a bipartite network
- Community detection
- Data enrichment

### Graph import

We will be using the Kaggle News dataset made available by Kevin Toms. It contains the content of around 10500 news articles. The dataset was prepared in January 2019. Unfortunately, the author offers no information about the source and the timeline of the dataset. We have to download the dataset and copy it to the import folder.


We will import each article as a separate node with the title and the content stored as the properties of the node.

In [None]:
# Read text_summarizer_data.csv (referenced with a file:/// URL, so the file has
# to be in Neo4j's import folder) and create one :Article node per CSV row,
# keeping the `title` and `content` columns as node properties.
# NOTE: CREATE is unconditional, so running this cell a second time adds a
# second copy of every article.
graph_import_query = """

LOAD CSV WITH HEADERS FROM "file:///text_summarizer_data.csv" as row 
CREATE (a:Article{title:row.title, content: row.content})

"""

read_query(graph_import_query)

### Text classification with Google

We will start our analysis with the APOC text classification procedure powered by Google's Natural Language API. It classifies the input text into distinct content categories. As far as I have noticed, it works best for news articles and not so well on fictional literature. Google offers 30,000 free classification requests per month, so that's quite enough to start a hobby project.

As we need to process around 10,000 articles, we will be using the apoc.periodic.iterate
procedure to handle the batching process. Learn more about it in the documentation. The process will take a couple of minutes, so feel free to grab a coffee or do your favorite yoga pose while you wait.

In [None]:
# Classify every :Article with apoc.nlp.gcp.classify.graph.
# apoc.periodic.iterate takes two Cypher strings: the first yields the articles,
# the second is run for each of them, committed in batches of 10 (batchSize).
# The API key is looked up with apoc.static.get('gcp.apiKey'), and write:true
# asks the procedure to persist the returned graph.
text_classification = """

CALL apoc.periodic.iterate(" 
    // get all articles 
    MATCH (node:Article) RETURN node 
    "," 
    // classify each article 
    CALL apoc.nlp.gcp.classify.graph(node, { 
    // we retrieve gcp api key from static value storage
    key: apoc.static.get('gcp.apiKey'),
    // node property that contains the text
    nodeProperty: 'content',
    write:true }) YIELD graph RETURN distinct 'done'", 
    {batchSize:10})

"""

read_query(text_classification)

While you sip your delicious coffee, we can take a look at the results of the classification. Let's start by viewing some example category names just so we know what we are dealing with.

In [3]:
# Peek at five :Category names to see what the classification output looks like.
# There is no ORDER BY, so which five rows come back is not specified.
inspect_categories_query = """

MATCH (n:Category) 
RETURN n.name as category LIMIT 5

"""

read_query(inspect_categories_query)

Unnamed: 0,category
0,/Arts & Entertainment/Movies
1,/Finance/Insurance
2,/News
3,/Arts & Entertainment/Celebrities & Entertainm...
4,/Sports/Team Sports/Cricket


Category names are between one and three levels deep. Levels are separated by the slash character ( / ). For example, "/News" contains only one level, while "/Finance/Insurance" contains two levels of categories. We will extract the top-level category for each classification name and store it back to our graph. This will enable us to filter and group by top-level categories more straightforwardly.

When dealing with hierarchical trees in Neo4j, there are some rules I learned along the way that can help us simplify and optimize our queries. One of them is that we should have a single relationship type throughout the whole tree. This way, we can easily query one or two levels deep, as we will see.

Let's define a unique constraint for the top-level category node.

In [None]:
# Enforce unique TopCategory names before the MERGE in the next cell creates them.
# NOTE(review): presumably fails if the constraint already exists on a re-run - confirm for the Neo4j version in use.
read_query("CREATE CONSTRAINT ON (t:TopCategory) ASSERT t.name IS UNIQUE;")

We can extract and store the top-level category by merely splitting the classification name.

In [None]:
# Link every :Category to its top-level category.
# Category names begin with '/', so split(...)[0] is the empty string and
# split(...)[1] is the first path segment (e.g. "/Finance/Insurance" -> "Finance").
# MERGE is used for both the node and the relationship, so re-running the cell
# does not create duplicates.
top_category_extraction_query = """

MATCH (n:Category) 
WITH n, split(n.name,'/')[1] as top_level
MERGE (t:TopCategory{name:top_level})
MERGE (n)-[:CATEGORY]->(t)

"""

read_query(top_category_extraction_query)

Let's check out the number of articles by the top-level category. You can observe that with only a single relationship type in our category tree, traversing two levels deep is very simple.

In [4]:
# Top ten top-level categories by number of incoming paths of exactly two
# CATEGORY hops (the article -> Category -> TopCategory chain described in the
# surrounding text). Each such path is counted, so an article reaching the same
# top-level category through two classifications is counted twice.
inspect_top_level_categories_query = """

MATCH (t:TopCategory) 
RETURN t.name as category,
       size((t)<-[:CATEGORY*2..2]-()) as number_of_articles 
ORDER BY number_of_articles DESC 
LIMIT 10

"""

read_query(inspect_top_level_categories_query)

Unnamed: 0,category,number_of_articles
0,News,4041
1,Arts & Entertainment,2487
2,Law & Government,2036
3,Sports,1470
4,Business & Industrial,1044
5,Sensitive Subjects,837
6,People & Society,538
7,Science,337
8,Finance,295
9,Travel,226


One thing to notice is that the results of the query show more articles than there actually are in our graph. This is due to some articles having more than a single classification. We will quickly inspect results by looking at the content of two articles from the Science category.

In [5]:
# Fetch the content of two nodes that sit two CATEGORY hops below the
# "Science" top-level category, as a quick sanity check of the classification.
example_science_articles = """

MATCH (t:TopCategory{name:"Science"})<-[:CATEGORY*2..2]-(article) 
RETURN article.content as text LIMIT 2

"""

read_query(example_science_articles)

Unnamed: 0,text
0,"Scientists in the UK have demonstrated a ""comm..."
1,The SI unit of mass is redefined using the Pla...


Both articles seem to be heavily focused on science and technology. We didn't really do an in-depth analysis of the results, but we will assume that the Natural Language API does its job of classifying news articles well.

### Named-entity recognition

In essence, NER is a process of identifying various entities in text and grouping them into categories such as persons, organizations, locations, and more. Which entity types we are looking for depends entirely on the use-case. Sometimes we want to know which persons and organizations are mentioned in a given article, and other times we might be more interested in knowing which genes and proteins are referred to. There exists a plethora of pre-trained models that you can use, and if none works for you, you can always train your own NER model, but this is beyond the scope of this blog post.

Both AWS and GCP offer their own NER models in their cloud API portfolio. We will compare them on two random stories and choose the more fitting for our use-case.

In [6]:
# Return the content of two :Article nodes to use as sample input for the
# NER comparison. No ordering is specified, so the pick is arbitrary.
example_ner_articles = """

MATCH (node:Article) 
WITH node LIMIT 2 
RETURN node.content as text

"""

read_query(example_ner_articles)

Unnamed: 0,text
0,Aegon Life iTerm insurance plan helps you save...
1,An 81-year-old woman named Eileen Macken who g...


At first glance, we can observe that AWS found fewer entities than GCP. It also does not include words like "woman" and "mother" under person entities. My opinion is that if you are trying to do some sort of topic modeling, then maybe GCP might be a better fit, while for creating a knowledge graph, AWS might be better. We will use AWS to analyze and extract entities from all articles in our database. As we need to batch our NER process, we will use the apoc.periodic.iterate procedure.

In [None]:
# Run AWS entity extraction over every :Article.
# The first apoc.periodic.iterate statement collects all articles and splits
# them into lists of 25 with apoc.coll.partition; each list is one "row".
# batchSize:1 therefore means one 25-article list per transaction, and each
# list is passed as a whole to apoc.nlp.aws.entities.graph.
# Credentials come from apoc.static ('aws.apiKey' / 'aws.apiSecret'); results
# are written back with AWS_ENTITY relationships (write:true).
ner_batch_process_query = """

CALL apoc.periodic.iterate("
  MATCH (n:Article) 
  WITH collect(n) as total
  // Create a batch of 25 articles
  CALL apoc.coll.partition(total,25) 
  YIELD value as nodes RETURN nodes
  ","
  CALL apoc.nlp.aws.entities.graph(nodes, { 
     key: apoc.static.get('aws.apiKey'), 
     secret: apoc.static.get('aws.apiSecret'), 
     nodeProperty: 'content', 
     relationshipType: 'AWS_ENTITY', 
     // store the results to Neo4j 
     write:true }) YIELD graph RETURN distinct 'done'", 
     {batchSize:1})

"""

read_query(ner_batch_process_query)

If you hurry, you have time for another great yoga pose before the NER process finishes. Let me know how it goes. Alright, we can now check the results of the NER process. We will begin by investigating the most mentioned people in the articles.

In [7]:
# Ten :PERSON entities with the most incoming AWS_ENTITY relationships,
# i.e. the persons mentioned most often.
ner_results_query = """

MATCH (n:PERSON) 
RETURN n.text as person, 
       size((n)<-[:AWS_ENTITY]-()) as mentions 
ORDER BY mentions DESC 
LIMIT 10

"""

read_query(ner_results_query)

Unnamed: 0,person,mentions
0,PM,560
1,Narendra Modi,488
2,President,355
3,CM,342
4,CEO,276
5,Prime Minister,248
6,Rahul Gandhi,215
7,Virat Kohli,212
8,Modi,163
9,Union Minister,157


We chose AWS as it did not include words like "orphan" or "genealogist" under person type. I guess no model is perfect as we can see that "PM", "CEO", and "President" appear as person entities. One way to solve this problem would be to add coreference resolution as a step of the NER pipeline, but as we are dealing with third-party NLP pipelines, we don't have that luxury. Let's now look at the most mentioned persons in sports articles.

In [8]:
# Same ranking as the previous query, restricted to mentions coming from nodes
# that are two CATEGORY hops away from a node named 'Sports'.
# The 'Sports' node is matched by its name property only (no label in the pattern).
sports_person_query = """

MATCH (n:PERSON) 
RETURN n.text as person, 
       size((n)<-[:AWS_ENTITY]-()-[:CATEGORY*2..2]->({name:'Sports'})) as mentions 
ORDER BY mentions 
DESC LIMIT 10

"""

read_query(sports_person_query)

Unnamed: 0,person,mentions
0,Virat Kohli,197
1,Kohli,110
2,MS Dhoni,65
3,Sachin Tendulkar,64
4,Rishabh Pant,49
5,Rohit Sharma,45
6,Cheteshwar Pujara,45
7,Hardik Pandya,42
8,Dhoni,42
9,Gautam Gambhir,41


All top ten mentioned persons in sports are Indian cricket players. Virat Kohli really stands out, so I guess he is the captain of the team. One weak point of such NER extraction is that Virat Kohli and Kohli are treated as two separate entities. This can be explained by the fact that he is sometimes mentioned by his full name and other times only by his last name. Another interesting use-case would be to look at which locations and events show up in the same article the most.

In [11]:
# Count how often each (location, event) pair is linked to the same article
# and return the ten most frequent pairs.
location_event_query = """

MATCH (a:Article)-[:AWS_ENTITY]->(event:EVENT),
      (a)-[:AWS_ENTITY]->(location:LOCATION)
RETURN location.text as location,
       event.text as event,
       count(*) as mentions
ORDER BY mentions DESC
LIMIT 10

"""

read_query(location_event_query)

Unnamed: 0,location,event,mentions
0,Las Vegas,CES 2019,11
1,Britain,Brexit,10
2,UK,Brexit,9
3,EU,Brexit,6
4,Kolkata,United India Rally,6
5,US,Christmas,6
6,Rajasthan,Assembly elections,5
7,Argentina,G20,5
8,Bhubaneswar,Hockey World Cup,5
9,Davos,World Economic Forum,4


The results look quite nice. It looks like CES 2019 happened in Las Vegas, United India Rally in Kolkata, G20 forum in Argentina, and World Economic Forum in Davos. On top of that, there is talk about Brexit between the UK and the EU.

### Sentiment analysis

AWS also offers sentiment analysis in its Comprehend API bundle. It categorizes text into four distinct sentiment groups:

- Positive
- Neutral
- Negative
- Mixed

We will run the sentiment analysis on all of our articles. I would imagine that most news should be more or less neutral, except for maybe sports. There is a particular APOC procedure <code>apoc.nlp.aws.sentiment.*</code> that handles AWS sentiment API requests with the same two modes (stream & graph) as the procedure for NER processing.

In [None]:
# Run AWS sentiment analysis over every :Article, using the same batching
# scheme as the NER cell: articles are partitioned into lists of 25 and
# apoc.periodic.iterate processes one list per transaction (batchSize:1).
# write:true stores the outcome in the graph; a later cell reads it back
# from the `sentiment` property of the article.
sentiment_analysis_query = """

CALL apoc.periodic.iterate("
    MATCH (n:Article) 
    WITH collect(n) as total 
    CALL apoc.coll.partition(total,25) 
    YIELD value as nodes RETURN nodes 
    "," 
    CALL apoc.nlp.aws.sentiment.graph(nodes, { 
        key: apoc.static.get('aws.apiKey'),
        secret: apoc.static.get('aws.apiSecret'),
        nodeProperty: 'content', write:true }) 
    YIELD graph RETURN distinct 'done'", 
    {batchSize:1})

"""

read_query(sentiment_analysis_query)

We will look at co-mentions of persons and events in sports articles that AWS Comprehend deemed as positive.

In [12]:
# Person/event co-mentions within articles whose sentiment is 'Positive' and
# that are two CATEGORY hops away from a node named 'Sports'; top ten pairs by
# number of co-mentions.
sentiment_analysis_results = """

MATCH (a:Article) 
WHERE a.sentiment = 'Positive' AND 
      (a)-[:CATEGORY*2..2]->({name:'Sports'}) 
MATCH (a)-[:AWS_ENTITY]->(person:PERSON), 
      (a)-[:AWS_ENTITY]->(event:EVENT) 
RETURN person.text as person,
       event.text as event,
       count(*) as mentions 
       ORDER BY mentions DESC
LIMIT 10

"""

read_query(sentiment_analysis_results)

Unnamed: 0,person,event,mentions
0,Serena Williams,Australian Open,3
1,Roger Federer,Australian Open,3
2,Virat Kohli,Melbourne Test,3
3,Gautam Gambhir,IPL,2
4,Serena Williams,Grand Slam,2
5,Federer,Australian Open,2
6,Naomi Osaka,Australian Open,2
7,Roger Federer,Grand Slam,2
8,Gambhir,IPL,2
9,Virat Kohli,2011 World Cup,2


It looks like tennis players are winning tennis tournaments. The Indian cricket team with Virat Kohli as their captain has won the 2011 World Cup and the Melbourne Test. Or so I might imagine given the results.

## Graph data science library

If you have read any of my blog posts, you know that I like to write about the Graph Data Science library. This blog is no different. First, we will project a bipartite network to a unipartite one with the help of the Node similarity algorithm. In the next step, we will search for communities within the projected unipartite graph with the Louvain algorithm.

### Unipartite projection of a bipartite network

A bipartite network is just a fancy way of saying that the graph contains two distinct sets of nodes, and likewise, a unipartite network contains only a single set of nodes. In our example, we will start with a bipartite network that includes both articles and NER entities. In the next step, we will project it to a unipartite network with the help of similarity algorithms in GDS.

We can think of unipartite projection as a process of translating indirect relationships to direct ones. The difference between similarity algorithms is just the metric being used to calculate similarity score or weight. For example, the Node similarity algorithm uses the Jaccard similarity score. It is defined as the size of the intersection divided by the size of the union. If we wanted to, we could also project a unipartite network of articles instead of persons and inspect how similar articles are based on the entities mentioned in them.

Before we run any algorithms, let's quickly refresh how the GDS library works.

The graph analytics pipeline consists of three main parts. In the first part, the graph loader reads the stored graph from Neo4j and loads it as an in-memory projected graph. We can use either native projection or cypher projection to load the projected graph. In the second step, we execute the graph algorithms in sequence. We can use the results of one graph algorithm as an input to another one. Last but not least, we store or stream the results back to Neo4j.

We will use the cypher projection to load the in-memory graph. If you need a quick refresher on how it works, I suggest you take a look at the official documentation. In the node statement, we will describe all articles that are in the top-level category of news and also all person entities. In the relationship statement, we will describe all the links between news articles and person entities.

In [13]:
# Build the in-memory graph "person_similarity" with a Cypher projection.
# Node statement: articles under the 'News' top-level category plus all
# :PERSON entities. Relationship statement: the AWS_ENTITY links between them.
# Note that the relationship is projected with the person as `source` and the
# article as `target`, i.e. reversed relative to the stored
# (Article)-[:AWS_ENTITY]->(PERSON) direction.
# The graph stays in the catalog until the drop_graph call at the end of the notebook.
load_projected_graph_query = """

CALL gds.graph.create.cypher("person_similarity", 
    // match articles that are in the 'News' category 
    "MATCH (a:Article) 
     WHERE (a)-[:CATEGORY*2..2]->({name:'News'}) 
     RETURN id(a) as id, labels(a) as labels 
     UNION 
     // match all person entities
     MATCH (p:PERSON) 
     RETURN id(p) as id, labels(p) as labels 
     "," 
     // match all links between news articles and person entities 
     MATCH (a:Article)-[r:AWS_ENTITY]->(p:PERSON) 
     WHERE (a)-[:CATEGORY*2..2]->({name:'News'}) 
     RETURN id(p) as source, id(a) as target, type(r) as type")

"""

read_query(load_projected_graph_query)

Unnamed: 0,nodeQuery,relationshipQuery,graphName,nodeCount,relationshipCount,createMillis
0,MATCH (a:Article) \n WHERE (a)-[:CATEGORY*...,// match all links between news articles and p...,person_similarity,11888,13847,620


In [14]:
# Run Node Similarity in `stats` mode on the projected graph to inspect the
# distribution of similarity scores (rounded to two decimals) before choosing
# a cutoff for the mutate step.
node_similarity_stats_query = """

CALL gds.nodeSimilarity.stats('person_similarity') 
YIELD nodesCompared, similarityDistribution 
RETURN nodesCompared as nodes, 
       apoc.math.round(similarityDistribution.min,2) as min,
       apoc.math.round(similarityDistribution.p25,2) as p25,
       apoc.math.round(similarityDistribution.p50,2) as p50,
       apoc.math.round(similarityDistribution.p75,2) as p75,
       apoc.math.round(similarityDistribution.p90,2) as p90,
       apoc.math.round(similarityDistribution.mean,2) as mean

"""

read_query(node_similarity_stats_query)

Unnamed: 0,nodes,min,p25,p50,p75,p90,mean
0,3999,0.0,0.06,0.17,0.5,1.0,0.32


At first glance, the nodes don't seem to be very similar. This is fine. We want to infer a sparse network as community detection algorithms fare poorly on very dense graphs. Determining the optimal similarity parameter values requires a combination of art and science, but with some experience, you will get good at it. We will set the <code>similarityCutoff</code> to be 0.6 while leaving the <code>degreeCutoff</code> and <code>topK</code> parameters at their default values. Using the <code>mutate</code> mode, we store the results of the algorithm back to the in-memory projected graph.

In [16]:
# Run Node Similarity in `mutate` mode: pairs scoring at least 0.6 are added to
# the in-memory graph as SIMILAR relationships carrying a `score` property.
# The community detection cells below restrict themselves to these SIMILAR
# relationships.
node_similarity_mutate_query = """

CALL gds.nodeSimilarity.mutate('person_similarity', 
    {degreeCutoff:1, similarityCutoff:0.6, 
     topK:10, mutateRelationshipType: 'SIMILAR', 
     mutateProperty: 'score'})

"""

read_query(node_similarity_mutate_query)

Unnamed: 0,createMillis,computeMillis,mutateMillis,postProcessingMillis,nodesCompared,relationshipsWritten,similarityDistribution,configuration
0,0,629,19,-1,3999,3778,"{'p1': 0.6190452575683594, 'max': 1.0000038146...","{'topK': 10, 'degreeCutoff': 1, 'bottomK': 10,..."


### Community detection

Community detection algorithms are designed to help us understand the structure of complex networks. The most apparent application is finding groups of friends in a social network. We think of a community as a densely connected group of nodes, similar to how a group of friends is highly interconnected.

In this blog post, we will use the Louvain algorithm to inspect the community structure of our inferred similarity network.

In [17]:
# Stream Louvain communities computed on PERSON nodes and SIMILAR relationships
# of the in-memory graph, group the member names per community and show the
# five largest communities. Nothing is written back to Neo4j in this mode.
louvain_stream_query = """

CALL gds.louvain.stream('person_similarity',
    {nodeLabels:['PERSON'], relationshipTypes:['SIMILAR']})
YIELD nodeId, communityId
RETURN communityId,
       collect(gds.util.asNode(nodeId).text) as members 
ORDER BY size(members) DESC LIMIT 5

"""

read_query(louvain_stream_query)

Unnamed: 0,communityId,members
0,1891,"[Leander Paes, Miguel Ángel Reyes-Varela, Aust..."
1,4515,"[MLA Sujit Bose, Uluberia, MLA Dr Nirmal Maji,..."
2,5523,"[Army General, Mark Milley, Chairman of the Jo..."
3,3293,"[Halle Berry, Harrison Ford, Chadwick Boseman,..."
4,5217,"[Vanisha Mittal, Kylie Minogue, Sanjay Hinduja..."


The communities found are relatively small. This is a direct consequence of the <code>similarityCutoff</code> parameter value. If we chose a lower threshold, more nodes would be deemed similar, and we would likely get bigger communities. Likewise, we would also get bigger communities if we increased the size of our dataset and processed a couple thousand more articles. That being said, it is time to move to the data enrichment chapter and add some context to our entities.

### Data Enrichment

We found a couple of small communities in our graph. Some persons are quite famous, and it is not hard to find a common denominator between the members of communities. For example, I know that Harrison Ford, Halle Berry, and Jim Parsons are all actors, so I would assume the rest of the group are also actors. We could go and google them. A better idea would be to programmatically enrich the entities in our graph using external data providers like Google Knowledge Graph or WikiData. That is precisely what we are going to do next.

We are going to enrich only the persons that were returned in the previous query (members of the top five largest communities). To avoid running the community detection multiple times, we will first store the results of the Louvain algorithm back to Neo4j.

In [18]:
# Same Louvain configuration as the stream cell, but in `write` mode: the
# community id of each PERSON node is stored in its `louvain` property so the
# enrichment queries below can group by it without re-running the algorithm.
louvain_write_query = """

CALL gds.louvain.write('person_similarity', 
    {nodeLabels:['PERSON'], relationshipTypes:['SIMILAR'], 
    writeProperty:'louvain'})

"""

read_query(louvain_write_query)

Unnamed: 0,nodePropertiesWritten,createMillis,computeMillis,writeMillis,postProcessingMillis,ranLevels,communityCount,modularity,modularities,communityDistribution,configuration
0,7851,1,423,46,6,1,6561,0.997357,[0.9973570202904951],"{'p99': 4, 'min': 1, 'max': 8, 'mean': 1.19661...","{'maxIterations': 10, 'writeConcurrency': 4, '..."


### Google Knowledge Graph

I already used the Google Knowledge Graph API in one of my blog posts a long time ago. One of the reasons I write is that I then have a repository of useful cypher queries that I can copy-paste from and use in later analyses.

We will use the Knowledge Graph API to enrich the members of the largest communities with description and detailed description properties.

In [21]:
# Enrich the members of the five largest Louvain communities with data from the
# Google Knowledge Graph Search API.
# For each member, the URL-encoded `text` is sent as the search query
# (limit=1), and the first result's `description` and
# `detailedDescription.articleBody` are stored on the node as `kg_description`
# and `kg_detailed_description`.
# The API key is read with apoc.static.get('gcp.apiKey') and becomes part of
# the request URL.
enrich_with_gkg = """

// get the members of the top five communities 
MATCH (p:PERSON) 
WITH p.louvain as communityId, 
     collect(p) as members ORDER BY size(members) 
     DESC LIMIT 5 
     UNWIND members as member 
     WITH member, apoc.text.urlencode(member.text) as name, apoc.static.get('gcp.apiKey') as key 
     // send a request to KG API 
     CALL apoc.load.json("https://kgsearch.googleapis.com/v1/entities:search?query=" 
          + name + "&key=" + key + "&limit=1&indent=True") YIELD value 
     WITH member, value['itemListElement'][0]['result'] as results 
     // store results back to Neo4j 
     SET member.kg_description = results.description, 
         member.kg_detailed_description = results.detailedDescription.articleBody

"""

read_query(enrich_with_gkg)

Let's now look at the largest five communities of persons and include their Knowledge Graph descriptions.

In [19]:
# Five largest communities (grouped by the stored `louvain` property) with
# their member names and the distinct Knowledge Graph descriptions found for
# those members.
top_five_with_kg_query = """

MATCH (p:PERSON) 
RETURN p.louvain as communityId, 
       collect(p.text) as members, 
       collect(DISTINCT p.kg_description) as description 
ORDER BY size(members) DESC LIMIT 5

"""

read_query(top_five_with_kg_query)

Unnamed: 0,communityId,members,description
0,1891,"[Leander Paes, Miguel Ángel Reyes-Varela, Aust...","[Indian tennis player, Tennis player, American..."
1,4515,"[MLA Sujit Bose, Uluberia, MLA Dr Nirmal Maji,...",[City in India]
2,5523,"[Army General, Mark Milley, Chairman of the Jo...",[General]
3,3293,"[Halle Berry, Harrison Ford, Chadwick Boseman,...","[American actress, American actor, Kenyan-Mexi..."
4,5217,"[Vanisha Mittal, Kylie Minogue, Sanjay Hinduja...","[Indian businessperson, Singer-songwriter, Bus..."


With the addition of the enriched descriptions, we get a better sense of who the members of the communities are. We can observe that the largest cluster consists of tennis players. The knowledge graph doesn't identify any members of the second-largest group. It does, however, know that "Uluberia" is actually a city in India and not a person.

### Wikidata enrichment

The main reason why I am subscribed to the weekly twin4j developer newsletter is that I can copy-paste cypher queries that other people have written. Mark Needham did a whole series on how to query WikiData with cypher. He did all the hard work, so we can enjoy the fruits of his labor by copy-pasting his cypher queries. He also developed the APOC NLP procedures we just used, so kudos to Mark!

For all of you who have never heard of Wikidata before, they describe themselves as a free and open knowledge base that can be read and edited by both humans and machines. After inspecting it for a bit, I can say that it is quite rich in information and definitely worth checking out. We will add the date of birth, occupation, and nationality information to the members of the largest communities. Check out the Wikidata query service for more details about data enrichment options.

In [26]:
# Enrich the members of the five largest communities with Wikidata facts,
# processed by apoc.periodic.iterate in batches of 20 members.
# For every member a SPARQL query is built that looks up an entity whose
# English label equals the member's text and asks for date of birth (P569),
# occupation (P106) and country (P27). Only the first binding returned is
# stored: dateOfBirth on the node, plus MERGEd :Country / :Occupation nodes
# linked with NATIONALITY / HAS_OCCUPATION. Members without any binding are
# only flagged with wikidataImportDone = true.
# Escaping: this is a regular (non-raw) Python string, so each \\\" below
# reaches Cypher as \" - an escaped double quote inside the double-quoted
# Cypher string passed to apoc.periodic.iterate.
# NOTE(review): member.text is concatenated into the SPARQL text unescaped; a
# name containing a double quote would presumably break the request - confirm.
wikidata_enrich_query = """

CALL apoc.periodic.iterate( 
    // get all persons from the biggest five communities 
    "MATCH (p:PERSON) 
     WITH p.louvain as communityId, collect(p) as members 
     ORDER BY size(members) DESC LIMIT 5 
     UNWIND members as member 
     RETURN member 
     "," 
     // prepare a sparql query 
     WITH 'SELECT * WHERE { ?person rdfs:label \\\"' + member.text + '\\\"@en ; 
                            wdt:P569 ?dateOfBirth ; 
                            wdt:P106 [ rdfs:label ?occupation ] ; 
                            wdt:P27 [ rdfs:label ?countryName ] . 
                            filter(lang(?countryName) = \\\"en\\\" && lang(?occupation) = \\\"en\\\") }' AS sparql, member 
     // make a request to wikidata 
     CALL apoc.load.jsonParams( \\\"https://query.wikidata.org/sparql?query=\\\" + apoc.text.urlencode(sparql), 
         { Accept: \\\"application/sparql-results+json\\\"}, null ) 
     YIELD value 
     CALL apoc.do.when( 
         // if there are any results 
         size(value.results.bindings) > 0, 
         // store results 
         'WITH value.results.bindings[0] AS result, member 
          SET member.dateOfBirth = date(datetime(result.dateOfBirth.value)), 
              member.wikidataImportDone = true 
          // store nationality 
          MERGE (c:Country {name: result.countryName.value }) 
          MERGE (member)-[:NATIONALITY]->(c) 
          // store occupation 
          MERGE (o:Occupation {name: result.occupation.value}) 
          MERGE (member)-[:HAS_OCCUPATION]->(o) RETURN member', 
          // else if no results 
          'SET member.wikidataImportDone = true RETURN member', 
          {value: value, member: member}) YIELD value AS result RETURN count(*)", 
          {batchSize: 20})

"""

read_query(wikidata_enrich_query)

Unnamed: 0,batches,total,timeTaken,committedOperations,failedOperations,failedBatches,retries,errorMessages,batch,operations,wasTerminated,failedParams
0,2,34,46,34,0,0,0,{},"{'total': 2, 'committed': 2, 'failed': 0, 'err...","{'total': 34, 'committed': 34, 'failed': 0, 'e...",False,{}


Let's now inspect the largest communities and include the additional information we acquired from Wikidata.

In [27]:
# Five largest communities with their member names, the average age of the
# members (whole years between dateOfBirth and today, averaged and rounded to
# two decimals) and the distinct nationalities and occupations.
# Only the first nationality and the first occupation of each person are
# considered (the [0] on the pattern comprehensions).
top_five_with_wiki_query = """

MATCH (p:PERSON) 
RETURN p.louvain as communityId, 
       collect(p.text) as members,
       apoc.math.round(avg(duration.inMonths(p.dateOfBirth, date()).years),2) as average_age,
       collect(distinct [(p)-[:NATIONALITY]->(n) | n.name][0]) as nationalities,
       collect(distinct [(p)-[:HAS_OCCUPATION]->(o) | o.name][0]) as occupations 
ORDER BY size(members) DESC 
LIMIT 5

"""

read_query(top_five_with_wiki_query)

Unnamed: 0,communityId,members,average_age,nationalities,occupations
0,1891,"[Leander Paes, Miguel Ángel Reyes-Varela, Aust...",35.83,"[India, Mexico, United States of America, Russia]",[tennis player]
1,4515,"[MLA Sujit Bose, Uluberia, MLA Dr Nirmal Maji,...",,[],[]
2,5523,"[Army General, Mark Milley, Chairman of the Jo...",,[],[]
3,3293,"[Halle Berry, Harrison Ford, Chadwick Boseman,...",58.17,"[United States of America, Mexico]","[actor, stage actor, film actor, television ac..."
4,5217,"[Vanisha Mittal, Kylie Minogue, Sanjay Hinduja...",45.0,"[United Kingdom, Puerto Rico, United States of...","[actor, singer-songwriter, composer]"


The results look very impressive. I bet you didn't know that there are at least two people named Jim Parsons. One of them is an actor, and the other one is a race car driver. And we are only scratching the surface of the information that is available through Wikidata API.

In [28]:
# Clean up: remove the in-memory 'person_similarity' graph created by the projection cell.
drop_graph('person_similarity')