In [None]:
#Stuff to initalise this notebook...
%load_ext sql
#This is how we connect to a sql database
#Monolithic VM addressing style
%sql postgresql://postgres:PGPass@postgres:5432/postgres

#COnnection details for connecting via pandas
from sqlalchemy import create_engine
engine = create_engine("postgresql://postgres:PGPass@postgres:5432/postgres")

import pandas as pd

#Somwtimes it can be handly to create out own network/graph structures on the fly - networkx is good for that
import networkx as nx

## Orientation Data Exercises

To get you started with the data, here are some opening questions, with queries to solve them.

In [None]:
### Preview a couple of rows of data
%sql SELECT * FROM sigcontrol LIMIT 2

In [None]:
#the underscore character accesses the contents of the *previously executed* cell
_

The SQL magic calls don't return a *pandas* dataframe, even if it looks as though they do. We *can* get a dataframe version though:

In [None]:
df = _.DataFrame()
df

The SQL magic also lets us run multiple SQL commands - watch your semi-colons though...

In [None]:
%%sql
DROP TABLE IF EXISTS tmp;
CREATE TEMPORARY TABLE tmp AS SELECT * FROM sigcontrol LIMIT 2;
-- Add comments too
SELECT * FROM tmp;

In [None]:
%sql DROP TABLE IF EXISTS tmp;

### What columns are available?
Get them as a python list from a *pandas* dataframe.

In [None]:
tmp = %sql SELECT * FROM sigcontrol LIMIT 1
tmp.DataFrame().columns.tolist()

#### Who controls G4S? Who controls Whitbread?

To start, we can just look up companies that seem to be appropriately named:

In [None]:
# Who controls G4S?
g4s=%sql SELECT * FROM sigcontrol WHERE company_name LIKE '%G4S%'
g4s=g4s.DataFrame()
g4s.head()

In [None]:
# Who controls Whitbread?
whitbread=%sql SELECT * FROM sigcontrol WHERE company_name LIKE '%WHITBREAD%'
whitbread=whitbread.DataFrame()
whitbread.head()

Hmm...

What we want is to see who connects to whom; that is, we want to see the network, or graph.

So let's create a graph that allows us to see how these companies are related to each other. The `networkx` library is a handy tool for this.

If we create a *directed graph* we can show arrows *from* companies *to* companies that control them.

In [None]:
import networkx as nx

DG=nx.DiGraph()

In [None]:
#Each row has details for a company and the entity controlling it
def companyInterestsGrapher(row,DG):
    """Record one company / controlling-entity relationship in a graph.

    row: a mapping (e.g. one dataframe row) providing 'company_number',
         'company_name', 'controlling_entity_company_number' and
         'Controlling Entity Name'.
    DG:  the graph to update in place; it must provide add_node() and add_edge().

    Returns None - the function is called for its side effect on DG.
    """
    #Node for the company itself, labelled with its name
    company_id = row['company_number']
    DG.add_node(company_id, Label=row['company_name'])

    #Node for its controlling entity, labelled with that entity's name
    controller_id = row['controlling_entity_company_number']
    DG.add_node(controller_id, Label=row['Controlling Entity Name'])

    #Directed edge from the company to its immediate controlling entity
    DG.add_edge(company_id, controller_id)
    return None

In [None]:
tmp=g4s.apply(lambda x: companyInterestsGrapher(x,DG), axis=1)

In [None]:
DG.number_of_nodes()

In [None]:
node_labels = nx.get_node_attributes(DG,'Label')
node_labels

In [None]:
nx.draw(DG,with_labels=True,labels=node_labels,pos=nx.spring_layout(DG)) #nx.spring_layout(DG)

The default plot is a bit horrible, so you may prefer to export the graph data and import it into something like Gephi.

In [None]:
#If we need to write out the graph file, eg so we can visualise it in Gephi
nx.write_gexf(DG, "g4s.gexf")

By chance, I also spotted this package - [visjs2jupyter](http://www.ccbb.bio/bringing-interactivity-network-visualization-jupyter-notebooks-visjs2jupyter/) - for embedding some prettier interactive vis.js network charts in a notebook - you should be able to click and drag the nodes around.

(Note that it seems as if only one of these diagrams can be embedded in any single notebook.)

In [None]:
#Install the package 
!pip install --quiet visJS2jupyter

#Need to ensure the inline graphics imported else we get an error
%matplotlib inline
from visJS2jupyter.visJS_module import visjs_network

def prettygraph(DG):
    """Render a networkx graph as an interactive visJS2jupyter network.

    DG: the graph to draw. Nodes are passed through by their graph id only
        (node attributes such as labels are not forwarded), and every edge
        is given the same hover title.

    Returns whatever visjs_network() returns, so that it displays when it is
    the last expression in a cell.
    """
    #Materialise to lists: newer networkx releases return view objects from
    #nodes()/edges(), and an edge view cannot be indexed by integer position
    nodes = list(DG.nodes())
    edges = list(DG.edges())

    nodes_dict = [{"id": n} for n in nodes]
    #Edges refer to their endpoints by position in nodes_dict rather than by node id
    node_map = {n: i for i, n in enumerate(nodes)}
    edges_dict = [{"source": node_map[u], "target": node_map[v],
                   "title": 'Simple company network'} for u, v in edges]
    return visjs_network(nodes_dict, edges_dict, edge_arrow_to=True)

prettygraph(DG)

We can also print out the details for the subgraphs although we lose the directionality.

In [None]:
#http://stackoverflow.com/a/21751571/454773
#Connected components are only defined on undirected graphs, hence the conversion
UG = DG.to_undirected()
#nx.connected_component_subgraphs() no longer exists in current networkx releases;
#building each subgraph from its component's node set works on old and new versions alike
sub_graphs = (UG.subgraph(component) for component in nx.connected_components(UG))

for i, sg in enumerate(sub_graphs):
    print("subgraph {} has {} nodes".format(i, sg.number_of_nodes()))
    print("\tNodes:", sg.nodes(data=True))
    print("\tEdges:", sg.edges())

In [None]:
#We can look for nodes with high in-degree - eg entities that control a lot of other companies
#(edges point from a company to its controlling entity, so in-degree counts controlled companies)
from operator import itemgetter

#DG.node and DG.in_degree_iter() no longer exist in current networkx releases;
#get_node_attributes() and dict(DG.in_degree()) behave the same on old and new versions
node_labels = nx.get_node_attributes(DG,'Label')

#Show the most popular named entity nodes
for node,count in sorted(dict(DG.in_degree()).items(),key=itemgetter(1),reverse=True)[:10]:
    #Print the in_degree, the label and the node id, highest in_degree first
    print(count,node_labels[node],node)


### Exploring nationalities (see also the *Country Match* Notebook)
We can do some simple counting on nationalities, and then pose a question about how dirty this data may be and how we might be able to clean it.

In [None]:
# How many different nationalities are beneficial owners?
%sql SELECT COUNT(DISTINCT(nationality)) FROM sigcontrol

In [None]:
#Get the list of unique nationalities, with a count for each
nationalities = %sql SELECT nationality, COUNT(nationality) AS cnt FROM sigcontrol WHERE nationality IS NOT NULL GROUP BY nationality ORDER BY cnt ASC
nationalities = nationalities.DataFrame()

In [None]:
#Alternatively...
nationalities_df = %sql SELECT nationality FROM sigcontrol WHERE nationality IS NOT NULL 
nationalities_df=nationalities_df.DataFrame()
nationalities_df['nationality'].value_counts()[:10]

In [None]:
nationalities_df['nationality'].value_counts()[-5:]

Let's try to explore the data a bit more and try to find things that look as if they may be similar...

In [None]:
#Start to think about how we might find things that maybe look the same
from difflib import get_close_matches

In [None]:
#Get the list of unique nationalities
unique_nationalities = nationalities_df['nationality'].unique().tolist()

If we go down the list of unique countries, we can look for other countries that look the same that we haven't already matched.

In [None]:
#Quick'n scruffy grouping of similar-looking nationality strings... other approaches are available
#fuzzyn: every nationality processed -> the close matches found for it (possibly an empty list)
#synn:   only those nationalities for which at least one close match was found
fuzzyn, synn = {}, {}
#Candidates that have not yet been claimed by an earlier group
todo_nationalities = list(unique_nationalities)
for n in unique_nationalities:
    #NOTE(review): fuzzyn is keyed by the nationalities already processed, so on a list of
    #unique values this guard never skips anything - confirm whether the intent was to skip
    #nationalities already claimed as a match
    if n in fuzzyn:
        continue
    fuzzyn[n] = get_close_matches(n, todo_nationalities)
    if fuzzyn[n]:
        synn[n] = fuzzyn[n]
    #Each matched candidate can only belong to one group
    for fuzz in fuzzyn[n]:
        todo_nationalities.remove(fuzz)

In [None]:
#Look for guesstimates of things that may be the same
synn

In [None]:
#Also display the nationalities that didn't get a partial match/grouping
fuzzyn

__See also the *Country Match* notebook for another example of trying to reconcile names to the list of nationalities in the UK FCO Official Country Register.__

### What is the age of the youngest and oldest person who controls a company?

In [None]:
%sql SELECT MIN(dob_year), MAX(dob_year) FROM sigcontrol

In [None]:
#Any under eighteens?
nextyear=2017
df=%sql SELECT dob_year, COUNT(*) AS cnt FROM sigcontrol \
        WHERE dob_year::numeric < 2016 AND dob_year::numeric > $nextyear-18 \
        GROUP BY dob_year
df=df.DataFrame()
df.head()

In [None]:
#Plot the count for each birth year. The query has no ORDER BY, so sort by year first,
#and name the columns explicitly so the x-axis shows the year rather than the row index
df.sort_values('dob_year').plot(kind='bar', x='dob_year', y='cnt')

In [None]:
%%sql 
DROP TABLE IF EXISTS tmp;
CREATE TEMPORARY TABLE tmp AS SELECT given_name || family_name || dob_month::text ||dob_year::text AS person FROM sigcontrol

In [None]:
%sql SELECT person, COUNT(person) AS cnt FROM tmp GROUP BY person ORDER BY cnt DESC LIMIT 5

### Postcode

In [None]:
postcodes = %sql SELECT address_postal_code, COUNT(address_postal_code) AS cnt FROM sigcontrol WHERE address_postal_code IS NOT NULL GROUP BY address_postal_code ORDER BY cnt DESC
postcodes = postcodes.DataFrame()
postcodes.head()

### Are common postal codes associated with common beneficial owners?

In [None]:
%%sql
DROP TABLE IF EXISTS tmp;
CREATE TEMPORARY TABLE tmp AS SELECT given_name || family_name || dob_month::text ||dob_year::text AS person, address_postal_code FROM sigcontrol

In [None]:
#Postcodes associated with a person
gabpc=%sql SELECT address_postal_code, COUNT(*) AS cnt FROM tmp WHERE person= 'FORNAMESURNAMEDOB' GROUP BY address_postal_code ORDER BY cnt DESC
gabpc=gabpc.DataFrame()
gabpc.head()

In [None]:
#Rows associated with a person grouped by address
def companiesByNamedPerson(given_name, family_name):
    """Count the sigcontrol rows for a named person, grouped by full address.

    given_name, family_name: matched exactly against the columns of the same name.

    Returns a pandas dataframe with one row per distinct address (care of, PO box,
    street, locality, region, postal code, country) and a `cnt` column, largest
    count first.
    """
    #Local import keeps this cell self-contained; sqlalchemy is already used by the notebook
    from sqlalchemy import text

    #Pass the names as bound parameters rather than splicing them into the SQL text,
    #so a name containing an apostrophe (eg O'NEILL) can neither break nor alter the query
    query = text("""
        SELECT address_care_of, po_box, address_street, address_locality, address_region,
               address_postal_code, address_country, COUNT(*) AS cnt
        FROM sigcontrol
        WHERE given_name = :given_name AND family_name = :family_name
        GROUP BY address_care_of, po_box, address_street, address_locality, address_region,
                 address_postal_code, address_country
        ORDER BY cnt DESC
    """)
    return pd.read_sql_query(query, engine,
                             params={"given_name": given_name, "family_name": family_name})

companiesByNamedPerson('FORENAME', 'SURNAME' )

In [None]:
def companiesPostcodeByNamedPerson(given_name, family_name):
    """Count the sigcontrol rows for a named person, grouped by postal code.

    given_name, family_name: matched exactly against the columns of the same name.

    Returns a pandas dataframe with columns `address_postal_code` and `cnt`,
    largest count first.
    """
    #Local import keeps this cell self-contained; sqlalchemy is already used by the notebook
    from sqlalchemy import text

    #Pass the names as bound parameters rather than splicing them into the SQL text,
    #so a name containing an apostrophe (eg O'NEILL) can neither break nor alter the query
    query = text("""
        SELECT address_postal_code, COUNT(*) AS cnt
        FROM sigcontrol
        WHERE given_name = :given_name AND family_name = :family_name
        GROUP BY address_postal_code
        ORDER BY cnt DESC
    """)
    return pd.read_sql_query(query, engine,
                             params={"given_name": given_name, "family_name": family_name})
companiesPostcodeByNamedPerson('FORENAME', 'SURNAME' ).head()

In [None]:
# People associated with a postcode by count
def peopleWithPostcode(pc):
    """Count the sigcontrol rows at a postal code, grouped by person name.

    pc: the postal code, matched exactly against `address_postal_code`.

    Returns a pandas dataframe with columns `given_name`, `family_name` and `cnt`,
    largest count first.
    """
    #Local import keeps this cell self-contained; sqlalchemy is already used by the notebook
    from sqlalchemy import text

    #Pass the postcode as a bound parameter rather than splicing it into the SQL text,
    #so a value containing a quote character can neither break nor alter the query
    query = text("""
        SELECT given_name, family_name, COUNT(*) AS cnt
        FROM sigcontrol
        WHERE address_postal_code = :pc
        GROUP BY given_name, family_name
        ORDER BY cnt DESC
    """)
    return pd.read_sql_query(query, engine, params={"pc": pc})
peopleWithPostcode('POST CODE')

In [None]:
#Count of common person/address
df_pcname=%sql SELECT person, address_postal_code, COUNT(*) AS cnt FROM tmp GROUP BY person, address_postal_code ORDER BY cnt DESC
df_pcname=df_pcname.DataFrame()
df_pcname.head(10)

In [None]:
peopleWithPostcode('POST CODE').head()

In [None]:
df=%sql SELECT "Controlling Entity Name", COUNT(*) AS cnt FROM sigcontrol \
                WHERE family_name IS NULL AND address_postal_code='POST CODE' GROUP BY "Controlling Entity Name" \
                ORDER BY cnt DESC
df.DataFrame()