In [1]:
!pip3 install py2neo
from py2neo import Graph

Collecting py2neo
  Downloading py2neo-3.1.2.tar.gz (100kB)
[K    100% |████████████████████████████████| 102kB 2.4MB/s 
[?25hBuilding wheels for collected packages: py2neo
  Running setup.py bdist_wheel for py2neo ... [?25l- \ done
[?25h  Stored in directory: /home/jovyan/.cache/pip/wheels/72/76/d9/1dffb7c4c4161e2c0282b86d3e26df26ac442f17397c7bd44f
Successfully built py2neo
Installing collected packages: py2neo
Successfully installed py2neo-3.1.2


In [6]:
import json
import codecs

In [36]:
import os

# Connect to the Neo4j server over its HTTP endpoint.
# The connection settings can be overridden through environment variables so
# that credentials do not have to live in the notebook; the fallbacks are the
# values that were previously hard-coded here, so default behaviour is unchanged.
# NOTE(review): the fallback password is still visible in the notebook - drop it
# once NEO4J_PASSWORD is set in the environment this notebook runs in.
graph = Graph(
    os.environ.get("NEO4J_URL", "http://neo4j:7474/db/data/"),
    user=os.environ.get("NEO4J_USER", "neo4j"),
    password=os.environ.get("NEO4J_PASSWORD", "neo4jch"),
)

In [57]:
# Display the set of node labels reported by the graph
graph.node_labels

frozenset()

In [89]:
# Reset the database: match every node and delete it together with
# any relationships attached to it.
wipe_query = "MATCH (n) DETACH DELETE n"
graph.run(wipe_query)

<py2neo.database.Cursor at 0x7ff5bae37358>

### Experiment Loading in Beneficial Ownership Data

This data is provided as JSON records; one big question is how to model this data appropriately as a `neo4j` graph.

One approach would be to define node types as follows:

- person records;
- person_address records;
- company records.

Ideally, we'd have already created company record nodes from Companies House data, keyed by company number; we would then identify the matching node and connect to it whenever we have a beneficial owner to link to it.

The `natures_of_control` list then suggests several possible edge types between a person and a company?

We need to work out what constraints are required and how to trap violations of them as part of a data quality check.

In [20]:
# Print the first line of the persons-with-significant-control snapshot to see what one record looks like
!head -n 1 persons-with-significant-control-snapshot-2016-10-25.txt

{"company_number":"09145694","data":{"address":{"address_line_1":"Reading Road","locality":"Henley-On-Thames","postal_code":"RG9 1DP","premises":"161","region":"Oxfordshire"},"country_of_residence":"England","date_of_birth":{"month":2,"year":1977},"etag":"26281d9bedb2d102359f6afc3cb8cf62bb4a7f01","kind":"individual-person-with-significant-control","links":{"self":"/company/09145694/persons-with-significant-control/individual/bIhuKnMFctSnjrDjUG8n3NgOrlU"},"name":"Mrs Nga Thanh Wildman","name_elements":{"forename":"Nga","middle_name":"Thanh","surname":"Wildman","title":"Mrs"},"nationality":"Vietnamese","natures_of_control":["ownership-of-shares-50-to-75-percent"],"notified_on":"2016-04-06"}}


In [21]:
# Write the first 100 lines of the snapshot to snapshot_small.txt as a small sample file
!head -n 100 persons-with-significant-control-snapshot-2016-10-25.txt > snapshot_small.txt

In [90]:
#Minimal import of some sample data
#The natures_of_control list suggests several possible typed edges connecting person with company
#
#Each line of the sample file is one JSON record. For every record the query:
# - MERGEs a BeneficialOwner node keyed on the person's name (nationality and
#   country_of_residence are only set when the node is first created);
# - MERGEs a Company node keyed on the company number;
# - MERGEs a BENEFICIALOWNEROF edge with no properties, and then one
#   BENEFICIALOWNEROF edge per natures_of_control entry, with that entry stored
#   as the edge's `kind` property.
#NOTE(review): owners are keyed on name alone, so two records with the same name
#end up on the same node - confirm whether a stronger key is needed.
#
#The query text is identical for every record, so it is built once, outside the loop.
query = """
WITH {jdata} AS jd
MERGE (beneficialowner:BeneficialOwner {name: jd.data.name}) ON CREATE
  SET beneficialowner.nationality = jd.data.nationality, beneficialowner.country_of_residence = jd.data.country_of_residence
MERGE (company:Company {companynumber: jd.company_number})
MERGE (beneficialowner)-[:BENEFICIALOWNEROF]->(company)
FOREACH (noc IN jd.data.natures_of_control | MERGE (beneficialowner)-[:BENEFICIALOWNEROF {kind:noc}]->(company))
"""
#Read the sample file produced by the earlier `head -n 100 ... > snapshot_small.txt` cell
#(the previous filename, snapshot_beneficialsmall.txt, is never created in this notebook).
with codecs.open('snapshot_small.txt', 'r', 'utf-8-sig') as f:
    for line in f:
        #Skip blank lines rather than letting json.loads fail on them
        if not line.strip():
            continue
        jdata = json.loads(line)
        graph.run(query, jdata = jdata)
#?how would we link to pre-existing company nodes with the correct company number?
#Do we MATCH them before the edge creating MERGE? MATCH (company:Company {companynumber:jd.company_number})

In [91]:
# Display the set of node labels reported by the graph after the import
graph.node_labels

frozenset({'BeneficialOwner', 'Company'})

In [92]:
# Show up to five (person, company) pairs connected by a BENEFICIALOWNEROF edge.
owner_company_query = '''
MATCH (person)-[:BENEFICIALOWNEROF]->(co)
RETURN person, co LIMIT 5
'''
owner_company_rows = graph.run(owner_company_query)
for record in owner_company_rows:
    print(record)

('person': (eaacbee:BeneficialOwner {country_of_residence:"England",name:"Mrs Nga Thanh Wildman",nationality:"Vietnamese"}), 'co': (f9369c4:Company {companynumber:"09145694"}))
('person': (eaacbee:BeneficialOwner {country_of_residence:"England",name:"Mrs Nga Thanh Wildman",nationality:"Vietnamese"}), 'co': (f9369c4:Company {companynumber:"09145694"}))
('person': (e8ddcb3:BeneficialOwner {country_of_residence:"England",name:"Mr Stephen Robert Charles Davies",nationality:"British"}), 'co': (f10a526:Company {companynumber:"08581893"}))
('person': (e8ddcb3:BeneficialOwner {country_of_residence:"England",name:"Mr Stephen Robert Charles Davies",nationality:"British"}), 'co': (f10a526:Company {companynumber:"08581893"}))
('person': (e8ddcb3:BeneficialOwner {country_of_residence:"England",name:"Mr Stephen Robert Charles Davies",nationality:"British"}), 'co': (f10a526:Company {companynumber:"08581893"}))


In [93]:
# Show up to five connected node pairs (matched in either direction)
# together with the `kind` property of the edge between them.
edge_sample_query = '''
MATCH (n)-[r]-(m)
RETURN n,m,r.kind LIMIT 5
'''
edge_sample_rows = graph.run(edge_sample_query)
for record in edge_sample_rows:
    print(record)

('n': (eaacbee:BeneficialOwner {country_of_residence:"England",name:"Mrs Nga Thanh Wildman",nationality:"Vietnamese"}), 'm': (f9369c4:Company {companynumber:"09145694"}), 'r.kind': 'ownership-of-shares-50-to-75-percent')
('n': (eaacbee:BeneficialOwner {country_of_residence:"England",name:"Mrs Nga Thanh Wildman",nationality:"Vietnamese"}), 'm': (f9369c4:Company {companynumber:"09145694"}), 'r.kind': None)
('n': (f9369c4:Company {companynumber:"09145694"}), 'm': (eaacbee:BeneficialOwner {country_of_residence:"England",name:"Mrs Nga Thanh Wildman",nationality:"Vietnamese"}), 'r.kind': 'ownership-of-shares-50-to-75-percent')
('n': (f9369c4:Company {companynumber:"09145694"}), 'm': (eaacbee:BeneficialOwner {country_of_residence:"England",name:"Mrs Nga Thanh Wildman",nationality:"Vietnamese"}), 'r.kind': None)
('n': (e8ddcb3:BeneficialOwner {country_of_residence:"England",name:"Mr Stephen Robert Charles Davies",nationality:"British"}), 'm': (f10a526:Company {companynumber:"08581893"}), 'r.ki

In [95]:
# List each distinct combination of relationship type and `kind` property.
distinct_kinds_query = '''
MATCH ()-[r]-()
RETURN DISTINCT type(r), r.kind
'''
distinct_kinds_rows = graph.run(distinct_kinds_query)
for record in distinct_kinds_rows:
    print(record)

('type(r)': 'BENEFICIALOWNEROF', 'r.kind': 'ownership-of-shares-50-to-75-percent')
('type(r)': 'BENEFICIALOWNEROF', 'r.kind': None)
('type(r)': 'BENEFICIALOWNEROF', 'r.kind': 'ownership-of-shares-25-to-50-percent')
('type(r)': 'BENEFICIALOWNEROF', 'r.kind': 'ownership-of-shares-25-to-50-percent-as-firm')
('type(r)': 'BENEFICIALOWNEROF', 'r.kind': 'voting-rights-25-to-50-percent')
('type(r)': 'BENEFICIALOWNEROF', 'r.kind': 'significant-influence-or-control')
('type(r)': 'BENEFICIALOWNEROF', 'r.kind': 'significant-influence-or-control-as-firm')
('type(r)': 'BENEFICIALOWNEROF', 'r.kind': 'right-to-appoint-and-remove-directors-as-firm')
('type(r)': 'BENEFICIALOWNEROF', 'r.kind': 'significant-influence-or-control-as-trust')
('type(r)': 'BENEFICIALOWNEROF', 'r.kind': 'ownership-of-shares-75-to-100-percent')
('type(r)': 'BENEFICIALOWNEROF', 'r.kind': 'voting-rights-75-to-100-percent')
('type(r)': 'BENEFICIALOWNEROF', 'r.kind': 'right-to-appoint-and-remove-directors')
('type(r)': 'BENEFICIALOW

### Loading in Company Information

We can also load in CSV data about companies from simple CSV files. Again, we need to model the data appropriately, perhaps as:
- company record;
- company_address record.

Should company and person addresses be the same sort of record? Should they be linked by some relationship?

In [64]:
# Write the first 100 lines of the company CSV (header row included) to a small sample file,
# then print the first 5 lines of that sample
!head -n 100 BasicCompanyData-2016-10-01-part1_5.csv > snapshotcompanydata.csv
!head -n 5 snapshotcompanydata.csv

CompanyName, CompanyNumber,RegAddress.CareOf,RegAddress.POBox,RegAddress.AddressLine1, RegAddress.AddressLine2,RegAddress.PostTown,RegAddress.County,RegAddress.Country,RegAddress.PostCode,CompanyCategory,CompanyStatus,CountryOfOrigin,DissolutionDate,IncorporationDate,Accounts.AccountRefDay,Accounts.AccountRefMonth,Accounts.NextDueDate,Accounts.LastMadeUpDate,Accounts.AccountCategory,Returns.NextDueDate,Returns.LastMadeUpDate,Mortgages.NumMortCharges,Mortgages.NumMortOutstanding,Mortgages.NumMortPartSatisfied,Mortgages.NumMortSatisfied,SICCode.SicText_1,SICCode.SicText_2,SICCode.SicText_3,SICCode.SicText_4,LimitedPartnerships.NumGenPartners,LimitedPartnerships.NumLimPartners,URI,PreviousName_1.CONDATE, PreviousName_1.CompanyName, PreviousName_2.CONDATE, PreviousName_2.CompanyName,PreviousName_3.CONDATE, PreviousName_3.CompanyName,PreviousName_4.CONDATE, PreviousName_4.CompanyName,PreviousName_5.CONDATE, PreviousName_5.CompanyName,PreviousName_6.CONDATE, PreviousName_6.CompanyName,Previo

In [103]:
import csv
#Ideally, we create the company:Company node (keyed by company number) here
#and then link to it from the beneficial ownership data?
#
#For each CSV row the query:
# - MERGEs a Company node keyed on the company number (name set on creation only);
# - MERGEs an Address node keyed on the postcode (the other address fields are
#   set on creation only) and connects the company to it with a LOCATION edge;
# - MERGEs a SICCode node for the first SIC code column and connects the
#   company to it with an ACTIVITY edge.
#NOTE(review): addresses are keyed on postcode alone, so every company with the
#same postcode shares one Address node and that node keeps the address lines of
#whichever row created it - confirm this is the intended model.
#
#The query text is identical for every row, so it is built once, outside the loop.
query="""
        WITH {row} AS row
        MERGE (company:Company {companynumber: row.CompanyNumber}) ON CREATE
  SET company.name = row.CompanyName
  
        MERGE (address:Address {postcode : row["RegAddress.PostCode"]}) ON CREATE 
        SET address.line1=row['RegAddress.AddressLine1'], address.line2=row['RegAddress.AddressLine2'],
        address.posttown=row['RegAddress.PostTown'],
        address.county=row['RegAddress.County'],address.country=row['RegAddress.Country']
        MERGE (company)-[:LOCATION]->(address)
        
        MERGE (companyactivity:SICCode {siccode:row['SICCode.SicText_1']})
        MERGE (company)-[:ACTIVITY]->(companyactivity)
        """
#The csv module documentation asks for files to be opened with newline='' so
#that newlines embedded inside quoted fields are interpreted correctly.
with open('snapshotcompanydata.csv','r',newline='') as csvfile:
    #skipinitialspace=True drops the space that follows some of the commas in
    #the header row, which cleans column names such as " CompanyNumber"
    reader = csv.DictReader(csvfile,skipinitialspace=True)
    for row in reader:
        graph.run(query,row=row)

#For each SICCode field, need to:
# - check that that SICCode node exists;
# - if it doesn't create it
# - connect company to it, ?perhaps with a weight corresponding to SICCode 1, 2 etc?


In [105]:
# Show up to five node pairs connected by a LOCATION edge (matched in either direction).
location_query = '''
MATCH (n)-[:LOCATION]-(m)
RETURN n,m LIMIT 5
'''
location_rows = graph.run(location_query)
for record in location_rows:
    print(record)

('n': (b61fc10:Company {companynumber:"08209948",name:"! LTD",postcode:"LS10 2RU"}), 'm': (e123478:Address {country:"",county:"YORKSHIRE",line1:"METROHOUSE 57 PEPPER ROAD",line2:"HUNSLET",postcode:"LS10 2RU",posttown:"LEEDS"}))
('n': (f10f56f:Company {companynumber:"07382019",name:"!BIG IMPACT GRAPHICS LIMITED",postcode:"EC1V 9AV"}), 'm': (f1355ca:Address {country:"",county:"",line1:"335 ROSDEN HOUSE",line2:"372 OLD STREET",postcode:"EC1V 9AV",posttown:"LONDON"}))
('n': (e354a0a:Company {companynumber:"04753368",name:"!NFERNO LTD.",postcode:"EC1N 2HA"}), 'm': (a690d02:Address {country:"",county:"",line1:"FIRST FLOOR THAVIES INN HOUSE 3-4",line2:"HOLBORN CIRCUS",postcode:"EC1N 2HA",posttown:"LONDON"}))
('n': (c24e58c:Company {companynumber:"SC421617",name:"!NSPIRED LTD",postcode:"AB11 6DJ"}), 'm': (b16f17f:Address {country:"",county:"",line1:"12 BON ACCORD SQUARE",line2:"",postcode:"AB11 6DJ",posttown:"ABERDEEN"}))
('n': (d7d9cd2:Company {companynumber:"09152972",name:"!NVERTD DESIGNS L