** 02805 Social graphs and interactions **

# Basic Stats

In [1]:
# IPython global cell magic
%reset
%matplotlib inline

# import all necessary packages
import bs4 # HTML parser
from collections import Counter, OrderedDict # counting elements and ordering keys in dictionaries
import community # python-louvain package
from __future__ import division # all numbers are float
import geoplotlib # plot points on maps
from geoplotlib.utils import BoundingBox
import geopy # get geo location according to addresses
from geopy.exc import GeocoderServiceError
import datetime # handle date objects
import dateparser # parse any (also foreign) date format to object: https://pypi.python.org/pypi/dateparser
import itertools # iterators for efficient looping
import json # JSON parser
import math # math operations
from matplotlib import pyplot as plt # plotting figures
import mwparserfromhell # parse MediaWiki syntax: https://github.com/earwig/mwparserfromhell
from nameparser import HumanName # parse a human name
import networkx as nx # networks creation library
import nltk # natural language processing
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import operator # efficient operator functions
import os # operating system operations, e.g.: with files and folders
import pandas as pd # use easy-to-use data frames for data analysis
import pickle # python data structures as files
from pprint import pprint # print data structures prettier
import re # regex
import requests # request URL content
import time # sleep timer
from tqdm import tqdm_notebook # make a nice progressbar
import urllib # handle special URL chars

# make working directory
directory = os.getcwd() + '/companies'
if not os.path.exists(directory):
    os.makedirs(directory)

# files
ex1_fdat = directory + '/extraction1_data.pkl'
ex2_fdat = directory + '/extraction2_data.pkl'
ex3_tmp_fdat = directory + '/tmp_extraction3_data.pkl'
ex3_fdat = directory + '/extraction3_data.pkl'
merged = directory + '/merged_data.pkl'
extraction_csv = directory + '/company_data.csv'

# specify nltk data dir, otherwise LookupError error
nltk.data.path.append(os.getcwd() + '/../nltk_data')
from nltk.corpus import names

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


## Load Dataset

In [32]:
# load data from all extractions
companies = dict()
if os.path.isfile(merged):
    with open(merged, 'rb') as f:
        companies = pickle.load(f)

In [33]:
# convert from dict into dataframe
comp_df = pd.DataFrame.from_dict(companies, orient='index')
comp_df.index.name = 'wiki_title'
comp_df

Unnamed: 0_level_0,location_city,location_gps,name,links,wiki_name,name_url_quoted,wiki_url,is_company,wiki_api_url,location_geopy,...,type,industry,key_people,founded,products,location,logo,defunct,parent,subsid
wiki_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&pizza,,,&pizza,"{Ruby Tuesday (restaurant), City Sports}",&pizza,%26pizza,https://en.wikipedia.org/w/index.php?title=%26...,True,https://en.wikipedia.org/w/api.php?&action=par...,,...,,,,2012-11-27 00:00:00,,,,,,
+Beryll,,,+Beryll,"{Henri Bendel, Fred Segal, Los Angeles Times}",+Beryll,%2BBeryll,https://en.wikipedia.org/w/index.php?title=%2B...,True,https://en.wikipedia.org/w/api.php?&action=par...,,...,private,[Fashion accessories],[Sigmar Berg – Chief Designer],2006-11-28 00:00:00,"[luxury goods, designer sunglasses]",,"{u'wiki_raw_code': u'Beryll logo.jpg', u'wiki_...",,,
...instore,,"(53.6467031, -1.7832076)",...instore,"{Home Bargains, Heron Foods, Tesco, Poundstret...",...instore,...instore,https://en.wikipedia.org/w/index.php?title=......,True,https://en.wikipedia.org/w/api.php?&action=par...,"Huddersfield, Yorkshire and the Humber, Englan...",...,Retail,,,2003-11-28 00:00:00,,"Huddersfield, England, United Kingdom",{u'wiki_raw_code': u'[[File:Instore-logo.png]]...,2009-11-28 00:00:00,,
01 Communique,,"(43.5892854, -79.6441645)",01 Communique,{},01 Communique,01+Communique,https://en.wikipedia.org/w/index.php?title=01+...,True,https://en.wikipedia.org/w/api.php?&action=par...,"Ont., Peel Region, Ontario, Canada",...,Public Listed Company (),[Software],"[Andrew Cheung, President]",1992-11-28 00:00:00,"[Remote Access Software, Online Help Desk Supp...","Mississauga, Ontario, Canada","{u'wiki_raw_code': u'01 Communique Logo.svg', ...",,,
01 Distribution,Rome,"(6.130419, 1.215829)",'''01 Distribution''',{RAI},01 Distribution,01+Distribution,https://en.wikipedia.org/w/index.php?title=01+...,True,https://en.wikipedia.org/w/api.php?&action=par...,"Lomé, Togo",...,public company,"[films, animation]","[{u'last': u'Del Brocco', u'suffix': u'', u'ti...",2001-06-21 00:00:00,,,,,,
07th Expansion,,"(36.5748441, 139.2394179)",07th Expansion,{Alchemist (company)},07th Expansion,07th+Expansion,https://en.wikipedia.org/w/index.php?title=07t...,True,https://en.wikipedia.org/w/api.php?&action=par...,日本,...,,"[Sound novels, Video game industry, Interactiv...",,,"[Higurashi no Naku Koro ni, Umineko no Naku Ko...",Japan,,,,
0verflow,"Kanda, Chiyoda , Tokyo","(35.6917842, 139.770917)",0verflow,{},0verflow,0verflow,https://en.wikipedia.org/w/index.php?title=0ve...,True,https://en.wikipedia.org/w/api.php?&action=par...,"神田, 神田ふれあい通り, 鍛冶町2, 鍛冶町, 東京, 千代田区, 東京都, 101-00...",...,Private subsidiary,"[Interactive entertainment, Brand novelties]","[{u'last': u'Ōnuma', u'suffix': u'', u'title':...",1997-11-28 00:00:00,"[School Days, Summer Days, Cross Days]",,"{u'wiki_raw_code': u'0verflowlogo.png', u'wiki...",,Stack Ltd.,
1-2-3 (fuel station),,,1-2-3,{Statoil Fuel & Retail},1-2-3 (fuel station),1-2-3+%28fuel+station%29,https://en.wikipedia.org/w/index.php?title=1-2...,True,https://en.wikipedia.org/w/api.php?&action=par...,,...,Fuel station,,,2000-12-05 00:00:00,,,,,,
1-800 Contacts,,"(40.5246711, -111.8638225)",1-800 Contacts,"{Johnson & Johnson, DITTO, CooperVision, AEA I...",1-800 Contacts,1-800+Contacts,https://en.wikipedia.org/w/index.php?title=1-8...,True,https://en.wikipedia.org/w/api.php?&action=par...,"Draper, Utah, United States of America",...,Subsidiary,[Contact lens retail],"[{u'last': u'Jonathan C. Coon', u'suffix': u''...",1995-11-28 00:00:00,[Contact lenses],"Draper, Utah",,,,
1-800-FREE-411,,"(47.66003045, -122.290454247)",Marchex,"{Google, Liberty Media, Tellme Networks, March...",1-800-FREE-411,1-800-FREE-411,https://en.wikipedia.org/w/index.php?title=1-8...,True,https://en.wikipedia.org/w/api.php?&action=par...,"U, 4200, Mary Gates Memorial Drive Northeast, ...",...,Public,[Telecommunications],"[Russell C. Horowitz, CEO]",2005-11-28 00:00:00,[1-800-FREE411 directory service],"Seattle, WA, U.S.","{u'wiki_raw_code': u'800free411.gif', u'wiki_f...",,Liberty Media (9%),


## Basic Cleaning

### Name Conversion from List

For example Chinese companies the name is a list and also contains the Chinese name.

In [34]:
convert_names = comp_df['name'].tolist()
for idx, ele in enumerate(convert_names):
    if isinstance(ele, list):
        # some lists are empty, then replace name with wiki_name
        if not ele[0]:
            convert_names[idx] = comp_df.index[idx]
        else:
            convert_names[idx] = ele[0]
    else:
        convert_names[idx] = ele

comp_df['name'] = convert_names

### Companies with missing name

Name misses should not occur because the wikipedia link name is taken if it is missing in the company infobox. But the companies where `'is_company'` is `None` are filtered.

In [35]:
c_w_name = comp_df.dropna(subset=['name'])
c_w_name

Unnamed: 0_level_0,location_city,location_gps,name,links,wiki_name,name_url_quoted,wiki_url,is_company,wiki_api_url,location_geopy,...,type,industry,key_people,founded,products,location,logo,defunct,parent,subsid
wiki_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&pizza,,,&pizza,"{Ruby Tuesday (restaurant), City Sports}",&pizza,%26pizza,https://en.wikipedia.org/w/index.php?title=%26...,True,https://en.wikipedia.org/w/api.php?&action=par...,,...,,,,2012-11-27 00:00:00,,,,,,
+Beryll,,,+Beryll,"{Henri Bendel, Fred Segal, Los Angeles Times}",+Beryll,%2BBeryll,https://en.wikipedia.org/w/index.php?title=%2B...,True,https://en.wikipedia.org/w/api.php?&action=par...,,...,private,[Fashion accessories],[Sigmar Berg – Chief Designer],2006-11-28 00:00:00,"[luxury goods, designer sunglasses]",,"{u'wiki_raw_code': u'Beryll logo.jpg', u'wiki_...",,,
...instore,,"(53.6467031, -1.7832076)",...instore,"{Home Bargains, Heron Foods, Tesco, Poundstret...",...instore,...instore,https://en.wikipedia.org/w/index.php?title=......,True,https://en.wikipedia.org/w/api.php?&action=par...,"Huddersfield, Yorkshire and the Humber, Englan...",...,Retail,,,2003-11-28 00:00:00,,"Huddersfield, England, United Kingdom",{u'wiki_raw_code': u'[[File:Instore-logo.png]]...,2009-11-28 00:00:00,,
01 Communique,,"(43.5892854, -79.6441645)",01 Communique,{},01 Communique,01+Communique,https://en.wikipedia.org/w/index.php?title=01+...,True,https://en.wikipedia.org/w/api.php?&action=par...,"Ont., Peel Region, Ontario, Canada",...,Public Listed Company (),[Software],"[Andrew Cheung, President]",1992-11-28 00:00:00,"[Remote Access Software, Online Help Desk Supp...","Mississauga, Ontario, Canada","{u'wiki_raw_code': u'01 Communique Logo.svg', ...",,,
01 Distribution,Rome,"(6.130419, 1.215829)",'''01 Distribution''',{RAI},01 Distribution,01+Distribution,https://en.wikipedia.org/w/index.php?title=01+...,True,https://en.wikipedia.org/w/api.php?&action=par...,"Lomé, Togo",...,public company,"[films, animation]","[{u'last': u'Del Brocco', u'suffix': u'', u'ti...",2001-06-21 00:00:00,,,,,,
07th Expansion,,"(36.5748441, 139.2394179)",07th Expansion,{Alchemist (company)},07th Expansion,07th+Expansion,https://en.wikipedia.org/w/index.php?title=07t...,True,https://en.wikipedia.org/w/api.php?&action=par...,日本,...,,"[Sound novels, Video game industry, Interactiv...",,,"[Higurashi no Naku Koro ni, Umineko no Naku Ko...",Japan,,,,
0verflow,"Kanda, Chiyoda , Tokyo","(35.6917842, 139.770917)",0verflow,{},0verflow,0verflow,https://en.wikipedia.org/w/index.php?title=0ve...,True,https://en.wikipedia.org/w/api.php?&action=par...,"神田, 神田ふれあい通り, 鍛冶町2, 鍛冶町, 東京, 千代田区, 東京都, 101-00...",...,Private subsidiary,"[Interactive entertainment, Brand novelties]","[{u'last': u'Ōnuma', u'suffix': u'', u'title':...",1997-11-28 00:00:00,"[School Days, Summer Days, Cross Days]",,"{u'wiki_raw_code': u'0verflowlogo.png', u'wiki...",,Stack Ltd.,
1-2-3 (fuel station),,,1-2-3,{Statoil Fuel & Retail},1-2-3 (fuel station),1-2-3+%28fuel+station%29,https://en.wikipedia.org/w/index.php?title=1-2...,True,https://en.wikipedia.org/w/api.php?&action=par...,,...,Fuel station,,,2000-12-05 00:00:00,,,,,,
1-800 Contacts,,"(40.5246711, -111.8638225)",1-800 Contacts,"{Johnson & Johnson, DITTO, CooperVision, AEA I...",1-800 Contacts,1-800+Contacts,https://en.wikipedia.org/w/index.php?title=1-8...,True,https://en.wikipedia.org/w/api.php?&action=par...,"Draper, Utah, United States of America",...,Subsidiary,[Contact lens retail],"[{u'last': u'Jonathan C. Coon', u'suffix': u''...",1995-11-28 00:00:00,[Contact lenses],"Draper, Utah",,,,
1-800-FREE-411,,"(47.66003045, -122.290454247)",Marchex,"{Google, Liberty Media, Tellme Networks, March...",1-800-FREE-411,1-800-FREE-411,https://en.wikipedia.org/w/index.php?title=1-8...,True,https://en.wikipedia.org/w/api.php?&action=par...,"U, 4200, Mary Gates Memorial Drive Northeast, ...",...,Public,[Telecommunications],"[Russell C. Horowitz, CEO]",2005-11-28 00:00:00,[1-800-FREE411 directory service],"Seattle, WA, U.S.","{u'wiki_raw_code': u'800free411.gif', u'wiki_f...",,Liberty Media (9%),


### Companies with duplicate names

Can come from users that put company profiles on their pages, compare:
* https://en.wikipedia.org/wiki/Amgen
* https://en.wikipedia.org/wiki/User:Gjanko/sandbox

In [36]:
# find duplicate company names
c_dupl = pd.concat(g for _, g in comp_df.groupby('name') if len(g) > 1)
print "Found", len(c_dupl['name']), "duplicates:"
print c_dupl['name']

Found 395 duplicates:
wiki_title
C-Station                                                                       
Etisalat                                                                        
Fukuoka Broadcasting Corporation                                                
Hot (Israel)                                                                    
Nepal Telecom                                                                   
Tanzania Agricultural Development Bank                                          
Zain Group                                                                      
Big Bazaar                                                      '''Big Bazaar'''
Big Bazaar (Rourkela)                                           '''Big Bazaar'''
ASICS                                                          ASICS Corporation
Haglöfs                                                        ASICS Corporation
Abrazo Scottsdale Campus                                Abrazo Scottsdale Ca

In [37]:
# add extra column adding the length of the link set
for i, row in c_dupl.iterrows():
  c_dupl.set_value(i, 'link_amount', len(row['links']))

In [38]:
s_list = [ 'Wendy\'s',
          'Wendy\'s Supa Sundaes' ]

for s in s_list:
    print s, c_dupl.loc[s, 'link_amount']

Wendy's 164.0
Wendy's Supa Sundaes 48.0


In [39]:
# sort after multiple values, sorts the columns with least links and employees
sort_cols = ['name', 'link_amount', 'num_employees']
c_dupl_sort = c_dupl.sort_values(by=sort_cols, ascending=[True,True,True])
c_dupl_sort[['link_amount', 'name', 'num_employees']]

Unnamed: 0_level_0,link_amount,name,num_employees
wiki_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Tanzania Agricultural Development Bank,0.0,,
Zain Group,1.0,,7000.0
Fukuoka Broadcasting Corporation,9.0,,
Etisalat,17.0,,
Nepal Telecom,60.0,,5471.0
C-Station,92.0,,
Hot (Israel),338.0,,
Big Bazaar (Rourkela),3.0,'''Big Bazaar''',
Big Bazaar,4.0,'''Big Bazaar''',36000.0
Haglöfs,1.0,ASICS Corporation,200.0


In [40]:
# now drop the one's with duplicate names, will take the second duplicate with less links
c_dupl_sort.drop_duplicates('name', inplace=True)
c_dupl_sort

Unnamed: 0_level_0,location_city,location_gps,name,links,wiki_name,name_url_quoted,wiki_url,is_company,wiki_api_url,location_geopy,...,industry,key_people,founded,products,location,logo,defunct,parent,subsid,link_amount
wiki_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Tanzania Agricultural Development Bank,Dar es Salaam,,,{},Tanzania Agricultural Development Bank,Tanzania+Agricultural+Development+Bank,https://en.wikipedia.org/w/index.php?title=Tan...,True,https://en.wikipedia.org/w/api.php?&action=par...,,...,[Development finance institution],"[{u'last': u'Kurwijila', u'suffix': u'', u'tit...",2015-08-07 00:00:00,,,,,,,0.0
Big Bazaar (Rourkela),,,'''Big Bazaar''',"{Future Group, Rourkela Steel Plant, Big Bazaar}",Big Bazaar (Rourkela),Big+Bazaar+%28Rourkela%29,https://en.wikipedia.org/w/index.php?title=Big...,True,https://en.wikipedia.org/w/api.php?&action=par...,,...,[Retailing],[Mr. Sadashiv Nayak],2014-12-20 00:00:00,[Hypermarket],,"{u'wiki_raw_code': u'Big Bazaar Logo.jpg', u'w...",,Future Group,,3.0
Haglöfs,,"(59.6749712, 14.5208584)",ASICS Corporation,{ASICS},Haglöfs,Hagl%C3%B6fs,https://en.wikipedia.org/w/index.php?title=Hag...,True,https://en.wikipedia.org/w/api.php?&action=par...,Sverige,...,[outdoor equipment],,1914-11-27 00:00:00,"[hardware, clothing, footwear]",Sweden,"{u'wiki_raw_code': u'Image:Logo Haglofs.png', ...",,ASICS,,1.0
Abrazo Scottsdale Campus,,"(33.4485866, -112.0773455)",Abrazo Scottsdale Campus,"{Abrazo Community Health Network, Tenet Health...",Abrazo Scottsdale Campus,Abrazo+Scottsdale+Campus,https://en.wikipedia.org/w/index.php?title=Abr...,True,https://en.wikipedia.org/w/api.php?&action=par...,"Phoenix, Maricopa County, Arizona, United Stat...",...,[Health Care],,1983-11-28 00:00:00,"[Health care Services, Emergency room services...","Phoenix, Arizona",,,,,8.0
Al Marjan Island LLC,Ras Al Khaimah,"(25.7737705, 55.938232)",Al Marjan Island,{},Al Marjan Island LLC,Al+Marjan+Island+LLC,https://en.wikipedia.org/w/index.php?title=Al+...,True,https://en.wikipedia.org/w/api.php?&action=par...,"Ras Al Khaimah, ‏رأس الخيمة‎, الإمارات العربيّ...",...,[Real estate],,2013-11-28 00:00:00,,,,,,,0.0
Alchemy Boulders,,"(7.2930922, 80.6350768)",Alchemy,{},Alchemy Boulders,Alchemy+Boulders,https://en.wikipedia.org/w/index.php?title=Alc...,True,https://en.wikipedia.org/w/api.php?&action=par...,"Kandy, මහනුවර දිස්ත්‍රික්කය, Central Province,...",...,[Mining & Mineral Processing],,1996-11-28 00:00:00,,"Kandy, Sri Lanka",{u'wiki_raw_code': u'[[File:Alchemy Boulders l...,,,,0.0
Amplify (company),,,Amplify,"{Asus, News Corp}",Amplify (company),Amplify+%28company%29,https://en.wikipedia.org/w/index.php?title=Amp...,True,https://en.wikipedia.org/w/api.php?&action=par...,,...,[Education],"[{u'last': u'Klein', u'suffix': u'', u'title':...",2000-11-27 00:00:00,"[Amplify Tablet, digital curriculum, assessmen...","55 Washington Street\nSuite 900\nBrooklyn, NY ...",,,,,2.0
Andersen Tax,,"(37.7792808, -122.4192362)",Andersen Tax LLC,"{HSBC, Arthur Andersen}",Andersen Tax,Andersen+Tax,https://en.wikipedia.org/w/index.php?title=And...,True,https://en.wikipedia.org/w/api.php?&action=par...,"SF, California, United States of America",...,[Professional Services],"[Mark Vorsatz, CEO]",2002-11-28 00:00:00,,"San Francisco, California",,,,,2.0
Anki (Finnish company),,"(60.1674086, 24.9425683)",Anki,{},Anki (Finnish company),Anki+%28Finnish+company%29,https://en.wikipedia.org/w/index.php?title=Ank...,True,https://en.wikipedia.org/w/api.php?&action=par...,"Helsinki, Helsingin seutukunta, Uusimaa, Etelä...",...,[Rug manufacture],"[{u'last': u'Björnberg', u'suffix': u'', u'tit...",1968-11-28 00:00:00,[Handmade rugs],"Helsinki, Finland",,,,,0.0
Applause (digital testing company),"Framingham, Massachusetts","(42.2792625, -71.4161719)","Applause, Inc.","{Google, Concur Technologies}",Applause (digital testing company),Applause+%28digital+testing+company%29,https://en.wikipedia.org/w/index.php?title=App...,True,https://en.wikipedia.org/w/api.php?&action=par...,"Framingham, Middlesex County, Massachusetts, U...",...,"[Software Testing, Crowdsourcing, Software Qua...","[{u'last': u'', u'suffix': u'', u'title': u'CE...",2007-08-28 00:00:00,,,"{u'wiki_raw_code': u'[[File:Applause,_Inc.jpg|...",,,,2.0


In [41]:
# now drop the rows of our main dataframe which are in the dataframe with the unwanted duplicates
dupl_companies = list(c_dupl_sort.index.values)
c_w_name.drop(dupl_companies)

Unnamed: 0_level_0,location_city,location_gps,name,links,wiki_name,name_url_quoted,wiki_url,is_company,wiki_api_url,location_geopy,...,type,industry,key_people,founded,products,location,logo,defunct,parent,subsid
wiki_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&pizza,,,&pizza,"{Ruby Tuesday (restaurant), City Sports}",&pizza,%26pizza,https://en.wikipedia.org/w/index.php?title=%26...,True,https://en.wikipedia.org/w/api.php?&action=par...,,...,,,,2012-11-27 00:00:00,,,,,,
+Beryll,,,+Beryll,"{Henri Bendel, Fred Segal, Los Angeles Times}",+Beryll,%2BBeryll,https://en.wikipedia.org/w/index.php?title=%2B...,True,https://en.wikipedia.org/w/api.php?&action=par...,,...,private,[Fashion accessories],[Sigmar Berg – Chief Designer],2006-11-28 00:00:00,"[luxury goods, designer sunglasses]",,"{u'wiki_raw_code': u'Beryll logo.jpg', u'wiki_...",,,
...instore,,"(53.6467031, -1.7832076)",...instore,"{Home Bargains, Heron Foods, Tesco, Poundstret...",...instore,...instore,https://en.wikipedia.org/w/index.php?title=......,True,https://en.wikipedia.org/w/api.php?&action=par...,"Huddersfield, Yorkshire and the Humber, Englan...",...,Retail,,,2003-11-28 00:00:00,,"Huddersfield, England, United Kingdom",{u'wiki_raw_code': u'[[File:Instore-logo.png]]...,2009-11-28 00:00:00,,
01 Communique,,"(43.5892854, -79.6441645)",01 Communique,{},01 Communique,01+Communique,https://en.wikipedia.org/w/index.php?title=01+...,True,https://en.wikipedia.org/w/api.php?&action=par...,"Ont., Peel Region, Ontario, Canada",...,Public Listed Company (),[Software],"[Andrew Cheung, President]",1992-11-28 00:00:00,"[Remote Access Software, Online Help Desk Supp...","Mississauga, Ontario, Canada","{u'wiki_raw_code': u'01 Communique Logo.svg', ...",,,
01 Distribution,Rome,"(6.130419, 1.215829)",'''01 Distribution''',{RAI},01 Distribution,01+Distribution,https://en.wikipedia.org/w/index.php?title=01+...,True,https://en.wikipedia.org/w/api.php?&action=par...,"Lomé, Togo",...,public company,"[films, animation]","[{u'last': u'Del Brocco', u'suffix': u'', u'ti...",2001-06-21 00:00:00,,,,,,
07th Expansion,,"(36.5748441, 139.2394179)",07th Expansion,{Alchemist (company)},07th Expansion,07th+Expansion,https://en.wikipedia.org/w/index.php?title=07t...,True,https://en.wikipedia.org/w/api.php?&action=par...,日本,...,,"[Sound novels, Video game industry, Interactiv...",,,"[Higurashi no Naku Koro ni, Umineko no Naku Ko...",Japan,,,,
0verflow,"Kanda, Chiyoda , Tokyo","(35.6917842, 139.770917)",0verflow,{},0verflow,0verflow,https://en.wikipedia.org/w/index.php?title=0ve...,True,https://en.wikipedia.org/w/api.php?&action=par...,"神田, 神田ふれあい通り, 鍛冶町2, 鍛冶町, 東京, 千代田区, 東京都, 101-00...",...,Private subsidiary,"[Interactive entertainment, Brand novelties]","[{u'last': u'Ōnuma', u'suffix': u'', u'title':...",1997-11-28 00:00:00,"[School Days, Summer Days, Cross Days]",,"{u'wiki_raw_code': u'0verflowlogo.png', u'wiki...",,Stack Ltd.,
1-2-3 (fuel station),,,1-2-3,{Statoil Fuel & Retail},1-2-3 (fuel station),1-2-3+%28fuel+station%29,https://en.wikipedia.org/w/index.php?title=1-2...,True,https://en.wikipedia.org/w/api.php?&action=par...,,...,Fuel station,,,2000-12-05 00:00:00,,,,,,
1-800 Contacts,,"(40.5246711, -111.8638225)",1-800 Contacts,"{Johnson & Johnson, DITTO, CooperVision, AEA I...",1-800 Contacts,1-800+Contacts,https://en.wikipedia.org/w/index.php?title=1-8...,True,https://en.wikipedia.org/w/api.php?&action=par...,"Draper, Utah, United States of America",...,Subsidiary,[Contact lens retail],"[{u'last': u'Jonathan C. Coon', u'suffix': u''...",1995-11-28 00:00:00,[Contact lenses],"Draper, Utah",,,,
1-800-FREE-411,,"(47.66003045, -122.290454247)",Marchex,"{Google, Liberty Media, Tellme Networks, March...",1-800-FREE-411,1-800-FREE-411,https://en.wikipedia.org/w/index.php?title=1-8...,True,https://en.wikipedia.org/w/api.php?&action=par...,"U, 4200, Mary Gates Memorial Drive Northeast, ...",...,Public,[Telecommunications],"[Russell C. Horowitz, CEO]",2005-11-28 00:00:00,[1-800-FREE411 directory service],"Seattle, WA, U.S.","{u'wiki_raw_code': u'800free411.gif', u'wiki_f...",,Liberty Media (9%),


## General Statistics in the Data

Top 10:
* Company with most links.
* Company with most employees.
* Most influential people's first name

Graph:
* Companies by country.
* Companies by age of foundation.
(Company listed in most countries.)

NetworkX:
* What is the most central company in Denmark and Europe?

In [12]:
cnt = Counter()
# iterate every row that represents a company
for index, row in c_w_name.iterrows():
    # empty link list
    if isinstance(row['links'], float):
        continue
    cnt[row['name']] = len(row['links'])

print "Companies with most links:"
pprint(cnt.most_common(10))

Companies with most links:
[(u'Telia Company AB', 529),
 (u'Hitachi, Ltd.', 526),
 (u'Vodafone Group', 522),
 (u'Sony Corporation', 512),
 (u'Panasonic Corporation', 491),
 (u'Toshiba Corporation', 489),
 (u'Kyocera Corporation', 488),
 (u'Mitsubishi Electric Corporation', 467),
 (u'Comcast Corporation', 462),
 (u'Fujitsu Ltd.', 461)]


In [23]:
cnt = Counter()
for index, row in comp_df.iterrows():
    if pd.isnull(row['num_employees']):
        continue
    cnt[row['name']] = row['num_employees']

print "Highest employee rate by company:"
pprint(cnt.most_common(10))

Most companies by country:
[(u'UMW Holdings Berhad', 1100000.0),
 (u'JSC Russian Railways', 942808.0),
 (u'Rostec', 900000.0),
 (u'China Post Group Corporation', 860200.0),
 (u'Tata Group', 660800.0),
 (u'G4S plc', 618000.0),
 (u'Volkswagen AG', 610076.0),
 (u'Volkswagen', 610000.0),
 (u'People Ready', 600000.0),
 (u'Tesco PLC', 597784.0)]


In [58]:
# the human parser is not as accurate, so we need to sort out some false names
def check_if_real(p):
    if p and p.isalpha() and \
        not re.search(r'President|Chairman|Manag|Founder|VP|Officer|CTO|CEO|CFO|COO|Director', p, re.IGNORECASE):
            return True
    return False


females = set(names.words('female.txt'))
cnt_first, cnt_last, cnt_female = Counter(), Counter(), Counter()
for index, comp in companies.iteritems():
    if 'key_people' not in comp or not comp['key_people']:
        continue
    for person in comp['key_people']:
        if isinstance(person, dict):
            if check_if_real(person['first']):
                cnt_first[person['first']] += 1
                if person['first'] in females:
                    cnt_female[person['first']] += 1
            if check_if_real(person['last']):
                cnt_last[person['last']] += 1
            
print "Most common male first names by company key people:"
pprint(cnt_first.most_common(10))

print "Most common female first names by company key people:"
pprint(cnt_female.most_common(10))

print "Most common last names by company key people:"
pprint(cnt_last.most_common(10))

Most common male first names by company key people:
[(u'John', 554),
 (u'David', 454),
 (u'Michael', 340),
 (u'Mark', 243),
 (u'Robert', 238),
 (u'Peter', 233),
 (u'James', 210),
 (u'Richard', 204),
 (u'Paul', 196),
 (u'William', 161)]
Most common female first names by company key people:
[(u'Chris', 139),
 (u'George', 95),
 (u'Daniel', 86),
 (u'Tim', 83),
 (u'Bill', 79),
 (u'Frank', 76),
 (u'Tony', 67),
 (u'Andy', 65),
 (u'Alex', 59),
 (u'Lee', 49)]
Most common last names by company key people:
[(u'Smith', 88),
 (u'Lee', 54),
 (u'Jones', 43),
 (u'Miller', 43),
 (u'Brown', 41),
 (u'Taylor', 41),
 (u'Wilson', 40),
 (u'Williams', 33),
 (u'Gupta', 33),
 (u'Davis', 33)]


John Smith seems to be a very successful business man.

In [None]:
def geo_plot(geodata):
    """
    Plot given coordinate input
    """

    # bounding box on the minima and maxima of the data
    geoplotlib.set_bbox(
        BoundingBox(
            max(geodata['lat']), 
            max(geodata['lon']), 
            min(geodata['lat']), 
            min(geodata['lon'])
        ));
    
    # kernel density estimation visualization
    geoplotlib.kde(geodata, bw=5, cut_below=1e-3, cmap='hot', alpha=170)
    # google tiles with lyrs=y ... hybrid
    geoplotlib.tiles_provider({
        #'url': lambda zoom, xtile, ytile: 'https://mt1.google.com/vt/lyrs=y&hl=en&x=%d&y=%d&z=%d' % (xtile, ytile, zoom),
        'url': lambda zoom, xtile, ytile: 'https://maps.wikimedia.org/osm-intl/%d/%d/%d.png' % (zoom, xtile, ytile),
        'tiles_dir': 'DTU-social_graphs',
        'attribution': 'DTU - 02805 Social graphs and interactions'
    })
    
    geoplotlib.inline();

In [59]:
def get_all_geodata(dataset, bounds=None):

    # filter bad rows
    dataset = dataset[dataset.location_gps.notnull()]
    
    # ignore the warning about chained assignments
    pd.options.mode.chained_assignment = None 
    # make 2 extra columns for split longitude and latitude
    dataset['lat'], dataset['lon'] = zip(*dataset.location_gps)    
    
    # only activity in the boundaries
    if not bounds:
        include = dataset.location_gps.notnull()
    else:
        include = (dataset.lat > bounds[0]) & \
        (dataset.lat < bounds[1]) & \
        (dataset.lon > bounds[2]) & \
        (dataset.lon < bounds[3])
        
    # get data in the format geoplotlib requires. We put the geodata in a dictionary structured as follows
    geo_coords = dataset.loc[include].location_gps.tolist()
    geo_data = {
        "lat": [float(x[0]) for x in geo_coords], 
        "lon": [float(x[1]) for x in geo_coords]
    }
    return geo_data

In [60]:
# create the dictionary with lat and lon
geodat_all_comp = get_all_geodata(comp_df)
geodat_us_comp = get_all_geodata(
    c_w_name,
    # [min lat, max lat, min lon, max lon)]
    [24.9493, 49.5904, -125.0011, -66.9326]
)
geodat_eu_comp = get_all_geodata(
    c_w_name,
    # [min lat, max lat, min lon, max lon)]
    [27.6363, 70, -25, 40]
)
geodat_asia_comp = get_all_geodata(
    c_w_name,
    # [min lat, max lat, min lon, max lon)]
    [0, 43, 70, 160]
)

In [None]:
print 'South-East Asia companies'
geo_plot(geodat_asia_comp)

South-East Asia companies


In [144]:
print 'EU companies'
geo_plot(geodat_eu_comp)

EU companies
('smallest non-zero count', 7.1647865443840454e-10)
('max count:', 12.54885409693652)


In [128]:
print 'US companies'
geo_plot(geodat_us_comp)

US companies
('smallest non-zero count', 1.4329573088768091e-09)
('max count:', 9.3308931345834427)


In [145]:
print 'all companies'
geo_plot(geodat_all_comp)

all companies
('smallest non-zero count', 1.4329573088768091e-09)
('max count:', 18.971452283211814)
