**02805 Social graphs and interactions**

# Network Construction

In [2]:
# IPython line magics: %reset clears the interactive namespace (it asks for
# confirmation first), %matplotlib inline renders figures inside the notebook
%reset
%matplotlib inline

# import all necessary packages
import bs4 # HTML parser
from collections import Counter, OrderedDict # counting elements and ordering keys in dictionaries
import community # python-louvain package
from __future__ import division # true division: "/" on two ints yields a float ("//" still floors)
import geoplotlib # plot points on maps
from geoplotlib.utils import BoundingBox
import geopy # get geo location according to addresses
from geopy.exc import GeocoderServiceError
import datetime # handle date objects
import dateparser # parse any (also foreign) date format to object: https://pypi.python.org/pypi/dateparser
import itertools # iterators for efficient looping
import json # JSON parser
import math # math operations
from matplotlib import pyplot as plt # plotting figures
import mwparserfromhell # parse MediaWiki syntax: https://github.com/earwig/mwparserfromhell
from nameparser import HumanName # parse a human name
import networkx as nx # networks creation library
import nltk # natural language processing
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import operator # efficient operator functions
import os # operating system operations, e.g.: with files and folders
import pandas as pd # use easy-to-use data frames for data analysis
import pickle # python data structures as files
from pprint import pprint # print data structures prettier
import re # regex
import requests # request URL content
import time # sleep timer
from tqdm import tqdm_notebook # make a nice progressbar
import urllib # handle special URL chars

# working directory: a "companies" folder inside the current directory,
# created if it is missing
directory = ''.join((os.getcwd(), '/companies'))
if not os.path.exists(directory):
    os.makedirs(directory)

# files kept in the working directory
(ex1_fdat, ex2_fdat, ex3_tmp_fdat,
 ex3_fdat, merged, extraction_csv) = (
    directory + file_name
    for file_name in (
        '/extraction1_data.pkl',
        '/extraction2_data.pkl',
        '/tmp_extraction3_data.pkl',
        '/extraction3_data.pkl',
        '/merged_data.pkl',
        '/company_data.csv',
    )
)

# add the nltk_data folder next to the working directory to NLTK's search
# path, otherwise loading a corpus fails with a LookupError
nltk.data.path += [os.getcwd() + '/../nltk_data']
from nltk.corpus import names

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [3]:
# merged data of all extractions; stays an empty dict when the pickle file
# does not exist
com_dat = {}
if os.path.isfile(merged):
    with open(merged, 'rb') as merged_file:
        com_dat = pickle.load(merged_file)

## 1. Construct the Company Network

In [7]:
# empty directed graph for the company network; the nodes (one per company
# name, i.e. per key of the data) and the edges are added in the next cell
c_graph = nx.DiGraph()

In [9]:
# attributes copied one-to-one from a company record onto its node
node_attributes = (
    'name', 'type', 'key_people', 'industry', 'founded',
    'location_geopy', 'location_city', 'location_country', 'location_gps',
    'defunct', 'subsid', 'products', 'num_employees', 'parent',
)


def is_company(record):
    """Tell whether a data record carries a truthy 'is_company' flag."""
    return 'is_company' in record and record['is_company']


# create nodes and edges
for c_name, comp in com_dat.iteritems():

    # records that are not companies get neither a node nor outgoing edges
    if not is_company(comp):
        continue

    # one node per company, identified by its key in the data
    c_graph.add_node(c_name, **dict((key, comp[key]) for key in node_attributes))

    # show example of related company
    if c_name == 'Apple Inc.':
        print(comp['links'])

    # connect the company to every linked entry that is itself a company
    for target in comp['links']:
        if target in com_dat and is_company(com_dat[target]):
            c_graph.add_edge(c_name, target)

set([u'T-Systems', u'American Airlines Group', u'Bharat Sanchar Nigam Limited', u'Tata Teleservices', u'FingerWorks', u'Amgen', u'Yves Saint Laurent (brand)', u'Huawei', u'Reuters', u'Applied Materials', u'Comcast', u'IBM Global Services', u'Deloitte', u'Biogen', u'Cond\xe9 Nast', u'Sanmina Corporation', u'Vodafone', u'Statista', u'Samsung Electronics', u'Sony', u'Olympus Corporation', u'HP Inc.', u'Sony Mobile', u'Nokia', u'Xerox', u'Rakuten', u'UL (safety organization)', u'American Express', u'Booz Allen Hamilton', u'Alphabet Inc.', u'MediaTek', u'The Priceline Group', u'CA Technologies', u'NTT DATA', u'Burberry', u'China Unicom', u'Interbrand', u'PARC (company)', u'KDDI', u'Cerner', u'Jabil Circuit', u'Silicon Power', u'Konica Minolta', u'IBM', u'Boston Consulting Group', u'Leidos', u'Canon Inc.', u'Asus', u'KPMG', u'SanDisk', u'NeXT', u'Bharti Airtel', u'Adobe Systems', u'Plextor', u'Whole Foods Market', u'Fiserv', u'SAP SE', u'Costco', u'Am\xe9rica M\xf3vil', u'China Mobile', u'Li

In [11]:
# report the size of the network
node_count = len(c_graph.nodes())
edge_count = len(c_graph.edges())
print("There are {0} nodes in the network.".format(node_count))
print("There are {0} edges in the network.".format(edge_count))

There are 56871 nodes in the network.
There are 1051701 edges in the network.


## 2. Analysis

A weakly connected component is a maximal subgraph of a directed graph such that for every pair of vertices $u$, $v$ in the subgraph there is a path between $u$ and $v$ when the edge directions are ignored; a directed path from $u$ to $v$ or from $v$ to $u$ is not required.

In [13]:
# order the weakly connected components by their number of nodes, largest
# first, and keep the biggest one
components = list(nx.weakly_connected_component_subgraphs(c_graph))
components.sort(key=len, reverse=True)
c_weak = components[0]
print('%s %s' % ('The size (number of edges) of subgraph with largest weakly connected component is:', c_weak.size()))

The size (number of edges) of subgraph with largest weakly connected component is: 1050597


In [15]:
### degree centrality
# in-edges: in-degree centrality for every node of the largest weakly
# connected component
node_in_degree = nx.in_degree_centrality(c_weak)
# out-edges: out-degree centrality for the same nodes
node_out_degree = nx.out_degree_centrality(c_weak)

5 most central companies according to in-edges degree centrality:
[(u'Microsoft', 0.02822179555391057),
 (u'Reuters', 0.027273358162344722),
 (u'Sony', 0.0203335723703995),
 (u'Google', 0.01924633926299475),
 (u'IBM', 0.01799717782044461)]

5 most central companies according to out-edges degree centrality:
[(u'Telia Company', 0.012237155613130076),
 (u'Hitachi', 0.012167757755210623),
 (u'Vodafone', 0.012075227277984686),
 (u'Sony', 0.011843901084919846),
 (u'Panasonic', 0.01135811607948368)]


In [20]:
# ten highest-scoring companies for each direction of degree centrality
degree_rankings = (
    ("10 most central companies according to in-edges degree centrality:", node_in_degree),
    ("\n10 most central companies according to out-edges degree centrality:", node_out_degree),
)
for heading, scores in degree_rankings:
    print(heading)
    pprint(Counter(scores).most_common(10))

10 most central companies according to in-edges degree centrality:
[(u'Microsoft', 0.02822179555391057),
 (u'Reuters', 0.027273358162344722),
 (u'Sony', 0.0203335723703995),
 (u'Google', 0.01924633926299475),
 (u'IBM', 0.01799717782044461),
 (u'HSBC', 0.017233801383330635),
 (u'Vodafone', 0.01674801637789447),
 (u'Apple Inc.', 0.01663235328136205),
 (u'Nintendo', 0.016331629230377755),
 (u'Deutsche Bank', 0.016169700895232366)]

10 most central companies according to out-edges degree centrality:
[(u'Telia Company', 0.012237155613130076),
 (u'Hitachi', 0.012167757755210623),
 (u'Vodafone', 0.012075227277984686),
 (u'Sony', 0.011843901084919846),
 (u'Panasonic', 0.01135811607948368),
 (u'Toshiba', 0.011311850840870712),
 (u'Kyocera', 0.011288718221564228),
 (u'Mitsubishi Electric', 0.010802933216128062),
 (u'Comcast', 0.010687270119595643),
 (u'Fujitsu', 0.010664137500289158)]


Technology companies with most in/out connections.

In [None]:
### betweenness centrality
# The exact computation runs a shortest-path search from every node. On this
# network (tens of thousands of nodes, about a million edges) the cell was
# never completed, which left node_dict undefined for the printing cell.
# The scores are therefore estimated from a sample of source nodes; the fixed
# seed keeps the sample, and with it the ranking, reproducible between runs.
betweenness_samples = 500  # more samples: better estimate, longer runtime
node_dict = nx.betweenness_centrality(
    c_weak,
    k=min(betweenness_samples, len(c_weak)),  # k must not exceed the node count
    seed=42
)

In [19]:
# show the ten nodes with the highest betweenness scores, best first
print("Top 10 most central company according to betweenness centrality:")
top_betweenness = Counter(node_dict).most_common(10)
pprint(top_betweenness)

Top 10 most central company according to betweenness centrality:


NameError: name 'node_dict' is not defined

In [16]:
### eigenvector centrality
# in-edges: eigenvector centrality computed on the component as it is
node_in_eigen = nx.eigenvector_centrality(c_weak)
# out-edges: the same measure computed on the graph returned by reverse(),
# i.e. with the direction of the edges flipped
node_out_eigen = nx.eigenvector_centrality(c_weak.reverse())

5 most central companies according to in-edges eigenvector centrality:


AttributeError: 'function' object has no attribute 'pprint'

In [18]:
# ten highest-scoring companies for each direction of eigenvector centrality
eigen_rankings = (
    ("10 most central companies according to in-edges eigenvector centrality:", node_in_eigen),
    ("\n10 most central companies according to out-edges eigenvector centrality:", node_out_eigen),
)
for heading, scores in eigen_rankings:
    print(heading)
    pprint(Counter(scores).most_common(10))

10 most central companies according to in-edges eigenvector centrality:
[(u'Whole Foods Market', 0.06513246683947559),
 (u'Costco', 0.06468616518807528),
 (u'Lidl', 0.06460187590810752),
 (u'Aldi', 0.06458624112930075),
 (u'Albertsons', 0.06457294638825081),
 (u'Ahold Delhaize', 0.06455722688742424),
 (u'Giant-Carlisle', 0.06450301020239359),
 (u'Walmart', 0.0644512274072413),
 (u'Target Corporation', 0.0643983220707179),
 (u'Safeway Inc.', 0.06433794401085813)]

10 most central companies according to out-edges eigenvector centrality:
[(u'Albertsons', 0.06462208777787326),
 (u'Delhaize Group', 0.06456307776485336),
 (u'Kroger', 0.06450215740126888),
 (u'SuperValu (United States)', 0.06433151533748266),
 (u'Costco', 0.06429693816311595),
 (u'Whole Foods Market', 0.0642940979253253),
 (u'Star Market', 0.06400422092269757),
 (u'Acme Markets', 0.06396728921941867),
 (u'Aldi', 0.0639353172275687),
 (u'Stop & Shop', 0.06392336303972086)]


Delhaize Group has Stop & Shop as subsidiary.

## Rebuilding the Network

The links are rebuilt without those found in the sections *See also*, *References* and *External links*, for example:

https://en.wikipedia.org/wiki/Delhaize_Group#See_also

In [21]:
# Level-2 headings that open the sections whose links are ignored. Blanks
# inside the heading ("== See also ==") and letter case are tolerated; the
# earlier exact comparison with u'==See also==' missed such spellings.
excluded_heading = re.compile(
    r'^==[ \t]*(?:See also|References|External links)[ \t]*==[ \t\r]*$',
    re.IGNORECASE | re.MULTILINE | re.UNICODE
)


def strip_excluded_sections(wiki_raw):
    """Return the wikitext that precedes the first excluded section heading.

    The text is returned unchanged when none of the headings occurs in it.
    """
    found = excluded_heading.search(wiki_raw)
    if found is None:
        return wiki_raw
    return wiki_raw[:found.start()]


def extract_link_titles(wiki_raw):
    """Return the set of page titles targeted by the wikilinks in ``wiki_raw``."""
    titles = set()
    wiki_code = mwparserfromhell.parse(wiki_raw, skip_style_tags=True)
    for link in wiki_code.filter_wikilinks():
        # [[Hertz|MHz]] -> Hertz: only the link target counts, so a colon in
        # the display text no longer discards the link
        title = unicode(link.title)
        # skip targets containing a colon, e.g. [[File:...]] or [[Category:...]]
        if u':' in title:
            continue
        # [[Foo#History]] -> Foo: drop the section anchor and stray blanks
        title = title.split(u'#')[0].strip()
        if title:
            titles.add(title)
    return titles


for c_name, comp in com_dat.iteritems():

    # don't add non companies
    if 'is_company' not in comp or not comp['is_company']:
        continue

    # links of the article text in front of the excluded sections
    links = extract_link_titles(strip_excluded_sections(comp['wiki_raw']))

    # list of links from specific sections intersected with old link list
    comp['links'] = comp['links'].intersection(links)

    # show example: all links found in the text, then the links that remain
    # (the remaining links were printed twice before)
    if c_name == 'Delhaize Group':
        print("{0} \n".format(links))
        print("{0} \n".format(comp['links']))


set([u'Naamloze vennootschap', u'Alfa-Beta Vassilopoulos', u'Ahold Delhaize', u'Mega Image', u'Anderlecht', u'Ahold', u'Dick Boer', u'Aldi', u'Hannaford Brothers Company', u'Food Lion LLC', u'Louis Delhaize Group', u'REWE Group', u'Cub Foods', u'BI-LO (United States)', u'Mats Jansson', u'De Standaard', u'Belgium', u'Brussels', u'Indonesia', u'Frans Muller', u'Zellik', u"Kash n' Karry", u'Charleroi', u'Billa (supermarket)', u'Sweetbay Supermarket', u'wholesale', u'Maxi (Serbian supermarket)', u'convenience store', u'Sint-Jans-Molenbeek', u'hypermarket', u'S.A. (corporation)', u'Food Lion']) 

set([u'Strack & Van Til', u'Acme Markets', u'Inditex', u'UCB (company)', u'Price Chopper Supermarkets', u'Cash & Carry', u'Jer\xf3nimo Martins', u'Quality Dairy Company', u'Cofinimmo', u'Quality Markets', u'Xpect Discounts', u'Omni Superstore', u'H&M', u'Uwajimaya', u'Food City (K-VA-T)', u'Target Corporation', u'Harris Teeter', u'Ampm', u'Rouses', u'OXXO', u'Ackermans & van Haaren', u'Carrefour Ci

In [None]:
# Targets of all [[...]] wikilinks in the text, e.g. u'[[IBM]]' -> u'IBM'.
# Both opening brackets have to be escaped: the earlier pattern began with a
# bare "[", which opened a character class that was never closed, so
# re.findall rejected the pattern.
# NOTE(review): wiki_text is not defined in the visible part of the notebook.
matches = re.findall(r'\[\[(.+?)\]\]', wiki_text)

## What if?

What if Google disappears, together with all nodes that are directly connected to it?