** 02805 Social graphs and interactions **

# Network Construction

In [1]:
# IPython global cell magic
%reset
%matplotlib inline

# import all necessary packages
import bs4 # HTML parser
from collections import Counter, OrderedDict # counting elements and ordering keys in dictionaries
import community # python-louvain package
from __future__ import division # all numbers are float
import geopy # get geo location according to addresses
import datetime # handle date objects
import dateparser # parse any (also foreign) date format to object: https://pypi.python.org/pypi/dateparser
import itertools # iterators for efficient looping
import json # JSON parser
import math # math operations
from matplotlib import pyplot as plt # plotting figures
import mwparserfromhell # parse MediaWiki syntax: https://github.com/earwig/mwparserfromhell
from nameparser import HumanName # parse a human name
import networkx as nx # networks creation library
import nltk # natural language processing
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import operator # efficient operator functions
import os # operating system operations, e.g.: with files and folders
import pandas as pd # use easy-to-use data frames for data analysis
import pickle # python data structures as files
from pprint import pprint # print data structures prettier
import re # regex
import requests # request URL content
import time # sleep timer
from tqdm import tqdm_notebook # make a nice progressbar
import urllib # handle special URL chars

# make working directory
directory = os.getcwd() + '/companies'
if not os.path.exists(directory):
    os.makedirs(directory)

# files
ex1_fdat = directory + '/extraction1_data.pkl'
ex2_fdat = directory + '/extraction2_data.pkl'
ex3_tmp_fdat = directory + '/tmp_extraction3_data.pkl'
ex3_fdat = directory + '/extraction3_data.pkl'
merged = directory + '/merged_data.pkl'
extraction_csv = directory + '/company_data.csv'

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [3]:
# load data from all extractions
com_dat = dict()
if os.path.isfile(merged):
    with open(merged, 'rb') as f:
        com_dat = pickle.load(f)

In [32]:
def parse_employee_nr(input_val, c_name):
    """
    Parse a proper number out of the various inputs
    Case possibilities tested:
    
    s_list = [
        '15-50',
        '~300+',
        'circa 40',
        '9,985(Dec 2011)',
        'over 10,000 in 10 countries',
        'Five',
        'over 1 million',
        '10.000',
        'Part of Popular, Inc., which has 8,000 employees'
    ]

    for s in s_list:
        print parse_employee_nr(s, 'test')
    """
    
    # match the first number, dot or comma separation optional
    m = re.search(r'[0-9]+([,\.][0-9]+)?', unicode(input_val))
    if m:
        try:
            # replace , and conert to int
            return int(m.group().replace(',', '').replace('.', ''))
        except ValueError:
            print "WARN: Failed conversion of:{0} (company: {1})".format(
                input_val, c_name)
    return None

In [37]:
# specify the type for each field
types = {
    # when first link is crawled
    'wiki_name': unicode,
    'wiki_url': str,
    'name_url_quoted': str,
    
    # when WIKI API is crawled
    'wiki_page_id': float,
    'wiki_api_url': str,
    'all_links': list,
    'links': set,
    'is_company': bool,
    'wiki_raw': unicode,
    
    # from extraction2
    'oc_api_url': str,
    'oc_api_search_url': str,
    'oc_api_network_url': str,
    
    # added only when Infobox company exists or fields from OpenCorporates
    # not all fields always exist, they are NaN in the resulting DataFrame
    'name': unicode, 
    'type': unicode, 
    'founded': datetime.datetime, 
    'defunct': datetime.datetime, 
    'location': unicode,
    'location_city': unicode,
    'location_country': unicode,
    'location_geopy': unicode,
    'location_gps': tuple,
    # following not in OC
    'countries': set, # added with extraction 3
    'logo': dict,
    'key_people': list, # additonally processed with nameparser.HumanName (dict)
    'industry': list, 
    'subsid': list,
    'products': list, 
    'num_employees': float, 
    'parent': unicode, 
    'homepage': unicode
}

for c, c_dict in com_dat.iteritems():
    for k, val in c_dict.iteritems():
        if val and not isinstance(types[k], list) and isinstance(val, list):
            if k == 'num_employees':
                com_dat[c][k] = parse_employee_nr(val[0], c)
            else:
                com_dat[c][k] = val[0]

In [54]:
# convert from dict into dataframe
comp_df = pd.DataFrame.from_dict(com_dat, orient='index', dtype=set)

In [55]:
# save as CSV file
comp_df.to_csv(extraction_csv, encoding='utf-8', index_label='wiki_title')

In [56]:
c_dat = pd.read_csv(extraction_csv, index_col=0, dtype=types)

In [57]:
for c, row in c_dat.iterrows():
    print type(row['links'])
    break

<type 'str'>


In [42]:
cnt = Counter()
# iterate every row that represents a company
for index, row in c_dat.iterrows():
    # empty link list
    if isinstance(row['links'], float):
        continue
    cnt[row['name']] = len(row['links'])

print "Companies with most links:"
pprint(cnt.most_common(10))

Companies with most links:
[('China State Construction Engineering Corporation ', 2540),
 ('China Communications Construction', 2538),
 ('Metallurgical Corporation of China Limited', 2518),
 ('Dexia Crediop', 2490),
 ('Banca CRS', 2489),
 ('IW BANK S.p.A.', 2484),
 ('Banca IFIS S.p.A.', 2480),
 ('Banca Carim', 2479),
 ('ChiantiBanca', 2478),
 ('Fidi Toscana', 2478)]


In [43]:
cnt = Counter()
for index, row in c_dat.iterrows():
    cnt[row['name']] = row['num_employees']

print "Most companies by country:"
pprint(cnt.most_common(10))

Most companies by country:
[('United Bank Limited (UBL)', 15000.0),
 ('Taiwan Sugar Corporation', nan),
 (nan, 7000.0),
 ("Valle's Steak House", 3600.0),
 ('Nammo AS', 1900.0),
 ('Framo-Werke GmbH', 1000.0),
 ('Irish Bank Resolution Corporation', 850.0),
 ('Blue Nile Inc.', 301.0),
 ('Nordcurrent', 80.0),
 ('Serellan', nan)]


## 1. Construct the Company Network

In [45]:
# create one node per company name (keys of data)
c_graph = nx.DiGraph()

In [47]:
# create nodes and edges
for c, row in c_dat.iterrows():

    # don't add non companies
    if 'is_company' not in row or not row['is_company']:
        continue

    # add a node for the company
    c_graph.add_node(
        c,
        name=row['name'], 
        type=row['type'],
        key_people=row['key_people'],
        industry=row['industry'],
        founded=row['founded'],
        #location_geopy=row['location_geopy'],
        location_city=row['location_city'],
        location_country=row['location_country'],
        #location_gps=row['location_gps'],
        defunct=row['defunct'],
        subsid=row['subsid'],
        products=row['products'],
        num_employees=row['num_employees'],
        parent=row['parent']
    )

    # show example of related company
    if c == 'Apple Inc.':
        print row['links']    
    
    # check if edge list exists
    if 'links' not in row:
        print 'No link list for company:', c
        continue

    # add an edge for the company
    for e in row['links']:
        print c, e
        c_graph.add_edge(c, e)
        
    break

&pizza {
&pizza R
&pizza u
&pizza b
&pizza y
&pizza  
&pizza T
&pizza u
&pizza e
&pizza s
&pizza d
&pizza a
&pizza y
&pizza  
&pizza (
&pizza r
&pizza e
&pizza s
&pizza t
&pizza a
&pizza u
&pizza r
&pizza a
&pizza n
&pizza t
&pizza )
&pizza ,
&pizza  
&pizza C
&pizza i
&pizza t
&pizza y
&pizza  
&pizza S
&pizza p
&pizza o
&pizza r
&pizza t
&pizza s
&pizza }


In [25]:
print "There are {0} nodes in the network.".format(len(p_graph.nodes()))
print "There are {0} edges in the network.".format(len(p_graph.edges()))

There are 0 nodes in the network.
There are 0 edges in the network.


## 2. Analysis

Weakly Connected Component is maximal subgraph of a directed graph such that for every pair of vertices $u$, $v$ in the subgraph, there is an undirected path from $u$ to $v$ and a directed path from $v$ to $u$.

In [None]:
p_weak = sorted(nx.weakly_connected_component_subgraphs(p_graph), key=len, reverse=True)[0]
print 'The size (number of nodes) of subgraph with largest weakly connected component is:', p_weak.size()

In [None]:
### degree centrality
# in-edges
node_in_degree = nx.in_degree_centrality(p_weak)
# out-edges
node_out_degree = nx.out_degree_centrality(p_weak)

print "5 most central companies according to in-edges degree centrality:"
pprint.pprint(Counter(node_in_degree).most_common(5))

print "\n5 most central companies according to out-edges degree centrality:"
pprint.pprint(Counter(node_out_degree).most_common(5))

In [None]:
# calculate betweenness centrality
node_dict = nx.betweenness_centrality(p_weak)

# use to print with betweenness_centrality
print "Top 5 most central company according to betweenness centrality:"
pprint.pprint(Counter(node_dict).most_common(5))

In [None]:
### eigenvector centrality
# in-edges
node_in_eigen = nx.eigenvector_centrality(p_weak)
# For out-edges eigenvector centrality first reverse the graph with G.reverse().
node_out_eigen = nx.eigenvector_centrality(p_weak.reverse())

print "5 most central companies according to in-edges eigenvector centrality:"
pprint.pprint(Counter(node_in_eigen).most_common(5))

print "\n5 most central companies according to out-edges eigenvector centrality:"
pprint.pprint(Counter(node_out_eigen).most_common(5))

## What if?

What if google disappears and all nodes connecting directly to google with it.