**02805 Social graphs and interactions**

# Network Construction

In [1]:
# IPython global cell magic
%reset
%matplotlib inline

# import all necessary packages
import bs4 # HTML parser
from collections import Counter, OrderedDict # counting elements and ordering keys in dictionaries
import community # python-louvain package
from __future__ import division # all numbers are float
import datetime # handle date objects
import dateparser # parse any (also foreign) date format to object: https://pypi.python.org/pypi/dateparser
import itertools # iterators for efficient looping
import json # JSON parser
import math # math operations
from matplotlib import pyplot as plt # plotting figures
import mwparserfromhell # parse MediaWiki syntax: https://github.com/earwig/mwparserfromhell
from nameparser import HumanName # parse a human name
import networkx as nx # networks creation library
import nltk # natural language processing
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import operator # efficient operator functions
import os # operating system operations, e.g.: with files and folders
import pandas as pd # use easy-to-use data frames for data analysis
import pickle # python data structures as files
from pprint import pprint # print data structures prettier
import re # regex
import requests # request URL content
import urllib # handle special URL chars

# make working directory
# os.path.join uses the platform's path separator instead of a hard-coded '/'
directory = os.path.join(os.getcwd(), 'companies')
if not os.path.exists(directory):
    os.makedirs(directory)

# files
# pickled results of the individual extraction steps (step 3 also has a
# temporary file) and the CSV export that is loaded below
ex1_fdat = os.path.join(directory, 'extraction1_data.pkl')
ex2_fdat = os.path.join(directory, 'extraction2_data.pkl')
ex3_tmp_fdat = os.path.join(directory, 'tmp_extraction3_data.pkl')
ex3_fdat = os.path.join(directory, 'extraction3_data.pkl')
extraction_csv = os.path.join(directory, 'all_data.csv')

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


## 1. Construct the Company Network

In [3]:
# read the exported company table from disk and preview its first five rows
c_dat = pd.read_csv(extraction_csv)
c_dat.head(5)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0.1,Unnamed: 0,location_city,links,wiki_link,page_id,location_country,homepage,is_company,name,num_employees,industry,products,type,founded,logo,key_people,location,defunct,parent,subsid
0,&pizza,,"{Ruby Tuesday (restaurant), City Sports}",https://en.wikipedia.org/wiki/%26pizza,47858476.0,,http://www.andpizza.com/,True,&pizza,,,,,2012-11-24 00:00:00,,,,,,
1,+Beryll,,,https://en.wikipedia.org/wiki/%2BBeryll,,,,,,,,,,,,,,,,
2,...instore,,,https://en.wikipedia.org/wiki/...instore,,,,,,,,,,,,,,,,
3,01 Communique,,{},https://en.wikipedia.org/wiki/01_Communique,15244876.0,,http://www.01com.com,True,01 Communique,15.0,Software,"Remote Access Software, Online Help Desk Suppo...",Public Listed Company (),1992,{u'wiki_raw_code': u'[[File:01 Communique Logo...,"Andrew Cheung, President","Mississauga, Ontario, Canada",,,
4,01 Distribution,,,https://en.wikipedia.org/wiki/01_Distribution,,,,,,,,,,,,,,,,


In [3]:
# start from an empty directed graph; the company nodes and their link edges
# are added to it in the following cell
p_graph = nx.DiGraph()

In [None]:
# build the company graph: one attributed node per company, one edge per link
# NOTE(review): `com_dat` is not defined in any cell shown here (only the
# DataFrame `c_dat` is loaded) -- presumably a dict of per-company records
# keyed by page name; confirm it is loaded before this cell runs.
node_fields = (
    'name', 'type', 'key_people', 'industry', 'founded', 'location',
    'location_city', 'location_country', 'location_gps', 'defunct',
    'subsid', 'products', 'num_employees', 'parent',
)

for p in com_dat.keys():
    record = com_dat[p]

    # add a node for the company under the same key `p` that is used for the
    # edges below, so attributes and edges end up on the same node; fields a
    # record does not contain are stored as None instead of raising KeyError
    p_graph.add_node(p, **dict((field, record.get(field)) for field in node_fields))

    # check if edge list exists (before anything reads it)
    if 'links' not in record:
        print('No link list for company: %s' % p)
        continue
    links = record['links']

    # show example of related company
    if p == 'Apple_Inc.':
        print(links)

    # add an edge from the company to every linked company
    for e in links:
        p_graph.add_edge(p, e)

In [None]:
# report how many nodes and edges the constructed graph contains
print("There are {0} nodes in the network.".format(p_graph.number_of_nodes()))
print("There are {0} edges in the network.".format(p_graph.number_of_edges()))

## 2. Analysis

A weakly connected component is a maximal subgraph of a directed graph such that for every pair of vertices $u$, $v$ in the subgraph there is an undirected path between $u$ and $v$, i.e. a path that ignores the direction of the edges.

In [None]:
# extract the largest weakly connected component as its own graph:
# pick the biggest node set, then copy the subgraph induced by it
p_weak = p_graph.subgraph(max(nx.weakly_connected_components(p_graph), key=len)).copy()
# number_of_nodes() is reported here; Graph.size() would return the edge count
print('The size (number of nodes) of subgraph with largest weakly connected component is: %d' % p_weak.number_of_nodes())

In [None]:
### degree centrality
# in-edges
node_in_degree = nx.in_degree_centrality(p_weak)
# out-edges
node_out_degree = nx.out_degree_centrality(p_weak)

# `pprint` is the function itself (imported via `from pprint import pprint`),
# so it is called directly; `pprint.pprint` would raise AttributeError
print("5 most central companies according to in-edges degree centrality:")
pprint(Counter(node_in_degree).most_common(5))

print("\n5 most central companies according to out-edges degree centrality:")
pprint(Counter(node_out_degree).most_common(5))

In [None]:
# calculate betweenness centrality for every node of the largest component
node_dict = nx.betweenness_centrality(p_weak)

# show the five nodes with the highest betweenness centrality
# `pprint` is the function itself (imported via `from pprint import pprint`),
# so it is called directly; `pprint.pprint` would raise AttributeError
print("Top 5 most central company according to betweenness centrality:")
pprint(Counter(node_dict).most_common(5))

In [None]:
### eigenvector centrality
# in-edges
node_in_eigen = nx.eigenvector_centrality(p_weak)
# For out-edges eigenvector centrality first reverse the graph with G.reverse().
node_out_eigen = nx.eigenvector_centrality(p_weak.reverse())

# `pprint` is the function itself (imported via `from pprint import pprint`),
# so it is called directly; `pprint.pprint` would raise AttributeError
print("5 most central companies according to in-edges eigenvector centrality:")
pprint(Counter(node_in_eigen).most_common(5))

print("\n5 most central companies according to out-edges eigenvector centrality:")
pprint(Counter(node_out_eigen).most_common(5))

## What if?

What if Google disappears, together with all nodes that are directly connected to it?