** 02805 Social graphs and interactions **

# Basic Stats

In [149]:
# IPython global cell magic
%reset
%matplotlib inline

# import all necessary packages
import bs4 # HTML parser
from collections import Counter, OrderedDict # counting elements and ordering keys in dictionaries
import community # python-louvain package
from __future__ import division # all numbers are float
import geopy # get geo location according to addresses
from geopy.exc import GeocoderServiceError # exception
import datetime # handle date objects
import dateparser # parse any (also foreign) date format to object: https://pypi.python.org/pypi/dateparser
import itertools # iterators for efficient looping
import json # JSON parser
import math # math operations
from matplotlib import pyplot as plt # plotting figures
import mwparserfromhell # parse MediaWiki syntax: https://github.com/earwig/mwparserfromhell
from nameparser import HumanName # parse a human name
import networkx as nx # networks creation library
import nltk # natural language processing
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import operator # efficient operator functions
import os # operating system operations, e.g.: with files and folders
import pandas as pd # use easy-to-use data frames for data analysis
import pickle # python data structures as files
from pprint import pprint # print data structures prettier
import re # regex
import requests # request URL content
import time # sleep timer
from tqdm import tqdm_notebook # make a nice progressbar
import urllib # handle special URL chars

# make working directory
directory = os.getcwd() + '/companies'
if not os.path.exists(directory):
    os.makedirs(directory)

# files
ex1_fdat = directory + '/extraction1_data.pkl'
ex2_fdat = directory + '/extraction2_data.pkl'
ex3_tmp_fdat = directory + '/tmp_extraction3_data.pkl'
ex3_fdat = directory + '/extraction3_data.pkl'
merged = directory + '/merged_data.pkl'
extraction_csv = directory + '/company_data.csv'

Once deleted, variables cannot be recovered. Proceed (y/[n])? n
Nothing done.


## Load Dataset

In [4]:
# load data from all extractions
companies = dict()
if os.path.isfile(merged):
    with open(merged, 'rb') as f:
        companies = pickle.load(f)

In [5]:
# convert from dict into dataframe
comp_df = pd.DataFrame.from_dict(companies, orient='index')
comp_df

Unnamed: 0,location_city,wiki_url,links,wiki_raw,wiki_name,name_url_quoted,wiki_api_url,all_links,wiki_page_id,location_country,...,type,key_people,founded,location,logo,defunct,country,page_id,parent,subsid
&pizza,,https://en.wikipedia.org/w/index.php?title=%26...,"{Ruby Tuesday (restaurant), City Sports}",{{Infobox company\n| name = &pizza...,&pizza,%26pizza,https://en.wikipedia.org/w/api.php?&action=par...,"[City Sports, DC Central Kitchen, Fast casual ...",47858476.0,,...,,,2012-11-25 00:00:00,,,,,,,
+Beryll,,https://en.wikipedia.org/w/index.php?title=%2B...,"{Henri Bendel, Fred Segal, Los Angeles Times}",{{for|related terms|Beryl (disambiguation)}}\n...,+Beryll,%2BBeryll,https://en.wikipedia.org/w/api.php?&action=par...,"[Angelina Jolie, Anna Pacquin, Austria, Bauhau...",13860681.0,,...,private,Sigmar Berg – Chief Designer,"Santa Monica, California (2006)",,"{u'wiki_raw_code': u'Beryll logo.jpg', u'wiki_...",,,,,
...instore,,https://en.wikipedia.org/w/index.php?title=......,"{Home Bargains, Heron Foods, Tesco, Poundstret...",{{Use dmy dates|date=July 2015}}\n{{Use Britis...,...instore,...instore,https://en.wikipedia.org/w/api.php?&action=par...,"[Aldi, Asda, BBC News Online, B & M, Bargain B...",9291975.0,,...,Retail,,2003,"Huddersfield, England, United Kingdom",{u'wiki_raw_code': u'[[File:Instore-logo.png]]...,2009,,,,
01 Communique,,https://en.wikipedia.org/w/index.php?title=01+...,{},{{Infobox company\n| name = 01 Communique\n...,01 Communique,01+Communique,https://en.wikipedia.org/w/api.php?&action=par...,"[Arlington, Virginia, Citrix, I'm InTouch, Mis...",15244876.0,,...,Public Listed Company (),"Andrew Cheung, President",1992,"Mississauga, Ontario, Canada",{u'wiki_raw_code': u'[[File:01 Communique Logo...,,,,,
01 Distribution,Rome,https://en.wikipedia.org/w/index.php?title=01+...,{RAI},{{Company\n| name = '''01 Distribution'''\n| l...,01 Distribution,01+Distribution,https://en.wikipedia.org/w/api.php?&action=par...,"[Animation, Film distribution, Film industry, ...",46724099.0,Italy,...,public company,"[{u'last': u'Del Brocco', u'suffix': u'', u'ti...",21 June 2001,,,,,,,
07th Expansion,,https://en.wikipedia.org/w/index.php?title=07t...,{Alchemist (company)},{{Expand Japanese|07th Expansion|date=January ...,07th Expansion,07th+Expansion,https://en.wikipedia.org/w/api.php?&action=par...,"[Alchemist (company), Comiket, Crunchyroll, Dō...",5936289.0,,...,,,,Japan,,,,,,
0verflow,"Kanda, Chiyoda , Tokyo",https://en.wikipedia.org/w/index.php?title=0ve...,{},{{Other uses|Overflow (disambiguation)}}\n{{In...,0verflow,0verflow,https://en.wikipedia.org/w/api.php?&action=par...,"[Anime, Anime News Network, CEO, Chiyoda, Toky...",10387049.0,Japan,...,Private subsidiary,"[{u'last': u'Ōnuma', u'suffix': u'', u'title':...",1997,,"{u'wiki_raw_code': u'0verflowlogo.png', u'wiki...",,,,Stack Ltd.,
1-2-3 (fuel station),,https://en.wikipedia.org/w/index.php?title=1-2...,{Statoil Fuel & Retail},{{Infobox company\n| name = 1-2-3\n...,1-2-3 (fuel station),1-2-3+%28fuel+station%29,https://en.wikipedia.org/w/api.php?&action=par...,"[Baltic states, Denmark, Fuel station, Kaunas,...",14018485.0,,...,Fuel station,,5 December 2000,,,,,,,
1-800 Contacts,,https://en.wikipedia.org/w/index.php?title=1-8...,"{Johnson & Johnson, DITTO, CooperVision, AEA I...",{{Refimprove|date=October 2009}}\n{{Infobox co...,1-800 Contacts,1-800+Contacts,https://en.wikipedia.org/w/api.php?&action=par...,"[1-800 Contacts, Inc. v. WhenU.com, Inc., AEA ...",4613366.0,,...,Subsidiary,"[{u'last': u'Jonathan C. Coon', u'suffix': u''...",1995,"Draper, Utah",,,,,,
1-800-FREE-411,,https://en.wikipedia.org/w/index.php?title=1-8...,"{Google, Liberty Media, Tellme Networks, March...",{{Infobox company |\n name = Marchex |\n logo ...,1-800-FREE-411,1-800-FREE-411,https://en.wikipedia.org/w/api.php?&action=par...,"[4-1-1, 800-The-Info, Android (operating syste...",18480351.0,,...,Public,"Russell C. Horowitz, CEO","2005 in Burlington, MA","Seattle, WA, U.S.","{u'wiki_raw_code': u'800free411.gif', u'wiki_f...",,,,Liberty Media (9%),


## Basic Cleaning

### Name Conversion from List

For example Chinese companies the name is a list and also contains the Chinese name.

In [48]:
convert_names = comp_df['name'].tolist()
for idx, ele in enumerate(convert_names):
    if isinstance(ele, list):
        # some lists are empty, then replace name with wiki_name
        if not ele[0]:
            convert_names[idx] = comp_df.index[idx]
        else:
            convert_names[idx] = ele[0]
    else:
        convert_names[idx] = ele

comp_df['name'] = convert_names

### Companies with missing name

Name misses should not occur because the wikipedia link name is taken if it is missing in the company infobox.

In [68]:
c_w_name = comp_df.dropna(subset=['name'])
c_w_name

Unnamed: 0,location_city,wiki_url,links,wiki_raw,wiki_name,name_url_quoted,wiki_api_url,all_links,wiki_page_id,location_country,...,type,key_people,founded,location,logo,defunct,country,page_id,parent,subsid
&pizza,,https://en.wikipedia.org/w/index.php?title=%26...,"{Ruby Tuesday (restaurant), City Sports}",{{Infobox company\n| name = &pizza...,&pizza,%26pizza,https://en.wikipedia.org/w/api.php?&action=par...,"[City Sports, DC Central Kitchen, Fast casual ...",47858476.0,,...,,,2012-11-25 00:00:00,,,,,,,
+Beryll,,https://en.wikipedia.org/w/index.php?title=%2B...,"{Henri Bendel, Fred Segal, Los Angeles Times}",{{for|related terms|Beryl (disambiguation)}}\n...,+Beryll,%2BBeryll,https://en.wikipedia.org/w/api.php?&action=par...,"[Angelina Jolie, Anna Pacquin, Austria, Bauhau...",13860681.0,,...,private,Sigmar Berg – Chief Designer,"Santa Monica, California (2006)",,"{u'wiki_raw_code': u'Beryll logo.jpg', u'wiki_...",,,,,
...instore,,https://en.wikipedia.org/w/index.php?title=......,"{Home Bargains, Heron Foods, Tesco, Poundstret...",{{Use dmy dates|date=July 2015}}\n{{Use Britis...,...instore,...instore,https://en.wikipedia.org/w/api.php?&action=par...,"[Aldi, Asda, BBC News Online, B & M, Bargain B...",9291975.0,,...,Retail,,2003,"Huddersfield, England, United Kingdom",{u'wiki_raw_code': u'[[File:Instore-logo.png]]...,2009,,,,
01 Communique,,https://en.wikipedia.org/w/index.php?title=01+...,{},{{Infobox company\n| name = 01 Communique\n...,01 Communique,01+Communique,https://en.wikipedia.org/w/api.php?&action=par...,"[Arlington, Virginia, Citrix, I'm InTouch, Mis...",15244876.0,,...,Public Listed Company (),"Andrew Cheung, President",1992,"Mississauga, Ontario, Canada",{u'wiki_raw_code': u'[[File:01 Communique Logo...,,,,,
01 Distribution,Rome,https://en.wikipedia.org/w/index.php?title=01+...,{RAI},{{Company\n| name = '''01 Distribution'''\n| l...,01 Distribution,01+Distribution,https://en.wikipedia.org/w/api.php?&action=par...,"[Animation, Film distribution, Film industry, ...",46724099.0,Italy,...,public company,"[{u'last': u'Del Brocco', u'suffix': u'', u'ti...",21 June 2001,,,,,,,
07th Expansion,,https://en.wikipedia.org/w/index.php?title=07t...,{Alchemist (company)},{{Expand Japanese|07th Expansion|date=January ...,07th Expansion,07th+Expansion,https://en.wikipedia.org/w/api.php?&action=par...,"[Alchemist (company), Comiket, Crunchyroll, Dō...",5936289.0,,...,,,,Japan,,,,,,
0verflow,"Kanda, Chiyoda , Tokyo",https://en.wikipedia.org/w/index.php?title=0ve...,{},{{Other uses|Overflow (disambiguation)}}\n{{In...,0verflow,0verflow,https://en.wikipedia.org/w/api.php?&action=par...,"[Anime, Anime News Network, CEO, Chiyoda, Toky...",10387049.0,Japan,...,Private subsidiary,"[{u'last': u'Ōnuma', u'suffix': u'', u'title':...",1997,,"{u'wiki_raw_code': u'0verflowlogo.png', u'wiki...",,,,Stack Ltd.,
1-2-3 (fuel station),,https://en.wikipedia.org/w/index.php?title=1-2...,{Statoil Fuel & Retail},{{Infobox company\n| name = 1-2-3\n...,1-2-3 (fuel station),1-2-3+%28fuel+station%29,https://en.wikipedia.org/w/api.php?&action=par...,"[Baltic states, Denmark, Fuel station, Kaunas,...",14018485.0,,...,Fuel station,,5 December 2000,,,,,,,
1-800 Contacts,,https://en.wikipedia.org/w/index.php?title=1-8...,"{Johnson & Johnson, DITTO, CooperVision, AEA I...",{{Refimprove|date=October 2009}}\n{{Infobox co...,1-800 Contacts,1-800+Contacts,https://en.wikipedia.org/w/api.php?&action=par...,"[1-800 Contacts, Inc. v. WhenU.com, Inc., AEA ...",4613366.0,,...,Subsidiary,"[{u'last': u'Jonathan C. Coon', u'suffix': u''...",1995,"Draper, Utah",,,,,,
1-800-FREE-411,,https://en.wikipedia.org/w/index.php?title=1-8...,"{Google, Liberty Media, Tellme Networks, March...",{{Infobox company |\n name = Marchex |\n logo ...,1-800-FREE-411,1-800-FREE-411,https://en.wikipedia.org/w/api.php?&action=par...,"[4-1-1, 800-The-Info, Android (operating syste...",18480351.0,,...,Public,"Russell C. Horowitz, CEO","2005 in Burlington, MA","Seattle, WA, U.S.","{u'wiki_raw_code': u'800free411.gif', u'wiki_f...",,,,Liberty Media (9%),


### Companies with duplicate names

Can come from users that put company profiles on their pages, compare:
* https://en.wikipedia.org/wiki/Amgen
* https://en.wikipedia.org/wiki/User:Gjanko/sandbox

In [50]:
# find duplicate company names
c_dupl = pd.concat(g for _, g in comp_df.groupby('name') if len(g) > 1)
print "Found", len(c_dupl['name']), "duplicates:"
print c_dupl['name']

Found 297 duplicates:
Big Bazaar                                                      '''Big Bazaar'''
Big Bazaar (Rourkela)                                           '''Big Bazaar'''
Abrazo Scottsdale Campus                                Abrazo Scottsdale Campus
Abrazo Scottsdale Campus Arizona                        Abrazo Scottsdale Campus
Al Marjan Island                                                Al Marjan Island
Al Marjan Island LLC                                            Al Marjan Island
Amplify (company)                                                        Amplify
Amplify (distributor)                                                    Amplify
Andersen Tax                                                    Andersen Tax LLC
Andersen Tax LLC                                                Andersen Tax LLC
Anki (Finnish company)                                                      Anki
Anki (company)                                                              Anki
Argex 

In [51]:
# add extra column adding the length of the link set
for i, row in c_dupl.iterrows():
  c_dupl.set_value(i, 'link_amount', len(row['links']))

In [52]:
s_list = [ 'Wendy\'s',
          'Wendy\'s Supa Sundaes' ]

for s in s_list:
    print s, c_dupl.loc[s, 'link_amount']

Wendy's 163.0
Wendy's Supa Sundaes 48.0


In [53]:
# sort after multiple values, sorts the columns with least links and employees
sort_cols = ['name', 'link_amount', 'num_employees']
c_dupl_sort = c_dupl.sort_values(by=sort_cols, ascending=[True,True,True])
c_dupl_sort[['link_amount', 'name', 'num_employees']]

Unnamed: 0,link_amount,name,num_employees
Big Bazaar (Rourkela),3.0,'''Big Bazaar''',
Big Bazaar,4.0,'''Big Bazaar''',36000
Abrazo Scottsdale Campus,8.0,Abrazo Scottsdale Campus,650
Abrazo Scottsdale Campus Arizona,8.0,Abrazo Scottsdale Campus,650
Al Marjan Island LLC,0.0,Al Marjan Island,
Al Marjan Island,1.0,Al Marjan Island,
Amplify (company),2.0,Amplify,
Amplify (distributor),2.0,Amplify,
Andersen Tax,2.0,Andersen Tax LLC,500
Andersen Tax LLC,2.0,Andersen Tax LLC,800


In [55]:
# now drop the one's with duplicate names, will take the second duplicate with less links
c_dupl_sort.drop_duplicates('name', inplace=True)
c_dupl_sort

Unnamed: 0,location_city,wiki_url,links,wiki_raw,wiki_name,name_url_quoted,wiki_api_url,all_links,wiki_page_id,location_country,...,key_people,founded,location,logo,defunct,country,page_id,parent,subsid,link_amount
Big Bazaar (Rourkela),,https://en.wikipedia.org/w/index.php?title=Big...,"{Future Group, Rourkela Steel Plant, Big Bazaar}",{{Infobox company\n| name = '''Big Bazaar'''\n...,Big Bazaar (Rourkela),Big+Bazaar+%28Rourkela%29,https://en.wikipedia.org/w/api.php?&action=par...,"[Akshaya Patra Foundation, Asian Workers Devel...",46454193.0,,...,Mr. Sadashiv Nayak,"December 20th, 2014",,"{u'wiki_raw_code': u'Big Bazaar Logo.jpg', u'w...",,,,Future Group,,3.0
Abrazo Scottsdale Campus,,https://en.wikipedia.org/w/index.php?title=Abr...,"{Abrazo Community Health Network, Tenet Health...",{{refimprove|date=December 2013}}\n{{Infobox c...,Abrazo Scottsdale Campus,Abrazo+Scottsdale+Campus,https://en.wikipedia.org/w/api.php?&action=par...,"[Abrazo Community Health Network, Abrazo Healt...",48338967.0,,...,,1983,"Phoenix, Arizona",,,,,,,8.0
Al Marjan Island LLC,Ras Al Khaimah,https://en.wikipedia.org/w/index.php?title=Al+...,{},{{refimprove|date=July 2015}}\n\n{{Infobox com...,Al Marjan Island LLC,Al+Marjan+Island+LLC,https://en.wikipedia.org/w/api.php?&action=par...,"[Ras Al Khaimah, The National (Abu Dhabi), Uni...",47113267.0,United Arab Emirates,...,,2013,,,,,,,,0.0
Amplify (company),,https://en.wikipedia.org/w/index.php?title=Amp...,"{Asus, News Corp}",{{Infobox company|name = Amplify|native_name_l...,Amplify (company),Amplify+%28company%29,https://en.wikipedia.org/w/api.php?&action=par...,"[Amplify Tablet, Android (operating system), A...",38755162.0,,...,"[{u'last': u'Klein', u'suffix': u'', u'title':...",2000-11-25 00:00:00,"55 Washington Street\nSuite 900\nBrooklyn, NY ...",,,,,,,2.0
Andersen Tax,,https://en.wikipedia.org/w/index.php?title=And...,"{HSBC, Arthur Andersen}",{{Infobox company\n| name = Anders...,Andersen Tax,Andersen+Tax,https://en.wikipedia.org/w/api.php?&action=par...,"[Arthur Andersen, Family office, HSBC, Hedge f...",29840306.0,,...,"Mark Vorsatz, CEO",2002,"San Francisco, California",,,,,,,2.0
Anki (Finnish company),,https://en.wikipedia.org/w/index.php?title=Ank...,{},{{Infobox company\n| name = Anki\n| logo = ...,Anki (Finnish company),Anki+%28Finnish+company%29,https://en.wikipedia.org/w/api.php?&action=par...,"[Carpet, Closeout (sale), Cotton, Finland, Hel...",48954654.0,,...,"[{u'last': u'Björnberg', u'suffix': u'', u'tit...",1968,"[Helsinki, Finland , Headquarters and Store ...",,,,,,,0.0
Argex Titanium Inc.,Montreal,https://en.wikipedia.org/w/index.php?title=Arg...,{},{{Use mdy dates|date=September 2014}}\n{{Infob...,Argex Titanium Inc.,Argex+Titanium+Inc.,https://en.wikipedia.org/w/api.php?&action=par...,"[Montreal, Ticker symbol, Titanium Dioxide, Ti...",40615891.0,Canada,...,"[{u'last': u'Roy Bonnell', u'suffix': u'CEO, D...",2009,,,,,,,,0.0
Army and Air Force Motion Picture Service,,https://en.wikipedia.org/w/index.php?title=Arm...,{Army and Air Force Exchange Service},{{multiple issues|\n{{unreferenced|date=May 20...,Army and Air Force Motion Picture Service,Army+and+Air+Force+Motion+Picture+Service,https://en.wikipedia.org/w/api.php?&action=par...,"[Army and Air Force Exchange Service, Dallas, ...",38587842.0,,...,,1920,"Dallas, Texas, United States",,1975,,,United States Department of Defense,,1.0
"Atari, Inc.",,https://en.wikipedia.org/w/index.php?title=Ata...,"{Commodore International, Warner Communication...",{{about||information on the Atari brand and it...,"Atari, Inc.",Atari%2C+Inc.,https://en.wikipedia.org/w/api.php?&action=par...,"[Al Alcorn, Anti-Aircraft (video game), Apple ...",16462490.0,,...,,1972-07-26 00:00:00,"Sunnyvale, California, United States","{u'wiki_raw_code': u'Atari logo alt.svg', u'wi...",1984-03-12 00:00:00,,,Warner Communications (1976–1984),,11.0
Athena (company),,https://en.wikipedia.org/w/index.php?title=Ath...,{},{{Refimprove|date=July 2012}}\n{{Infobox compa...,Athena (company),Athena+%28company%29,https://en.wikipedia.org/w/api.php?&action=par...,"[Arcade Game, BioMetal (video game), Champions...",5445173.0,,...,,,,,,,,,,0.0


In [69]:
# now drop the rows of our main dataframe which are in the dataframe with the unwanted duplicates
dupl_companies = list(c_dupl_sort.index.values)
c_w_name.drop(dupl_companies)

Unnamed: 0,location_city,wiki_url,links,wiki_raw,wiki_name,name_url_quoted,wiki_api_url,all_links,wiki_page_id,location_country,...,type,key_people,founded,location,logo,defunct,country,page_id,parent,subsid
&pizza,,https://en.wikipedia.org/w/index.php?title=%26...,"{Ruby Tuesday (restaurant), City Sports}",{{Infobox company\n| name = &pizza...,&pizza,%26pizza,https://en.wikipedia.org/w/api.php?&action=par...,"[City Sports, DC Central Kitchen, Fast casual ...",47858476.0,,...,,,2012-11-25 00:00:00,,,,,,,
+Beryll,,https://en.wikipedia.org/w/index.php?title=%2B...,"{Henri Bendel, Fred Segal, Los Angeles Times}",{{for|related terms|Beryl (disambiguation)}}\n...,+Beryll,%2BBeryll,https://en.wikipedia.org/w/api.php?&action=par...,"[Angelina Jolie, Anna Pacquin, Austria, Bauhau...",13860681.0,,...,private,Sigmar Berg – Chief Designer,"Santa Monica, California (2006)",,"{u'wiki_raw_code': u'Beryll logo.jpg', u'wiki_...",,,,,
...instore,,https://en.wikipedia.org/w/index.php?title=......,"{Home Bargains, Heron Foods, Tesco, Poundstret...",{{Use dmy dates|date=July 2015}}\n{{Use Britis...,...instore,...instore,https://en.wikipedia.org/w/api.php?&action=par...,"[Aldi, Asda, BBC News Online, B & M, Bargain B...",9291975.0,,...,Retail,,2003,"Huddersfield, England, United Kingdom",{u'wiki_raw_code': u'[[File:Instore-logo.png]]...,2009,,,,
01 Communique,,https://en.wikipedia.org/w/index.php?title=01+...,{},{{Infobox company\n| name = 01 Communique\n...,01 Communique,01+Communique,https://en.wikipedia.org/w/api.php?&action=par...,"[Arlington, Virginia, Citrix, I'm InTouch, Mis...",15244876.0,,...,Public Listed Company (),"Andrew Cheung, President",1992,"Mississauga, Ontario, Canada",{u'wiki_raw_code': u'[[File:01 Communique Logo...,,,,,
01 Distribution,Rome,https://en.wikipedia.org/w/index.php?title=01+...,{RAI},{{Company\n| name = '''01 Distribution'''\n| l...,01 Distribution,01+Distribution,https://en.wikipedia.org/w/api.php?&action=par...,"[Animation, Film distribution, Film industry, ...",46724099.0,Italy,...,public company,"[{u'last': u'Del Brocco', u'suffix': u'', u'ti...",21 June 2001,,,,,,,
07th Expansion,,https://en.wikipedia.org/w/index.php?title=07t...,{Alchemist (company)},{{Expand Japanese|07th Expansion|date=January ...,07th Expansion,07th+Expansion,https://en.wikipedia.org/w/api.php?&action=par...,"[Alchemist (company), Comiket, Crunchyroll, Dō...",5936289.0,,...,,,,Japan,,,,,,
0verflow,"Kanda, Chiyoda , Tokyo",https://en.wikipedia.org/w/index.php?title=0ve...,{},{{Other uses|Overflow (disambiguation)}}\n{{In...,0verflow,0verflow,https://en.wikipedia.org/w/api.php?&action=par...,"[Anime, Anime News Network, CEO, Chiyoda, Toky...",10387049.0,Japan,...,Private subsidiary,"[{u'last': u'Ōnuma', u'suffix': u'', u'title':...",1997,,"{u'wiki_raw_code': u'0verflowlogo.png', u'wiki...",,,,Stack Ltd.,
1-2-3 (fuel station),,https://en.wikipedia.org/w/index.php?title=1-2...,{Statoil Fuel & Retail},{{Infobox company\n| name = 1-2-3\n...,1-2-3 (fuel station),1-2-3+%28fuel+station%29,https://en.wikipedia.org/w/api.php?&action=par...,"[Baltic states, Denmark, Fuel station, Kaunas,...",14018485.0,,...,Fuel station,,5 December 2000,,,,,,,
1-800 Contacts,,https://en.wikipedia.org/w/index.php?title=1-8...,"{Johnson & Johnson, DITTO, CooperVision, AEA I...",{{Refimprove|date=October 2009}}\n{{Infobox co...,1-800 Contacts,1-800+Contacts,https://en.wikipedia.org/w/api.php?&action=par...,"[1-800 Contacts, Inc. v. WhenU.com, Inc., AEA ...",4613366.0,,...,Subsidiary,"[{u'last': u'Jonathan C. Coon', u'suffix': u''...",1995,"Draper, Utah",,,,,,
1-800-FREE-411,,https://en.wikipedia.org/w/index.php?title=1-8...,"{Google, Liberty Media, Tellme Networks, March...",{{Infobox company |\n name = Marchex |\n logo ...,1-800-FREE-411,1-800-FREE-411,https://en.wikipedia.org/w/api.php?&action=par...,"[4-1-1, 800-The-Info, Android (operating syste...",18480351.0,,...,Public,"Russell C. Horowitz, CEO","2005 in Burlington, MA","Seattle, WA, U.S.","{u'wiki_raw_code': u'800free411.gif', u'wiki_f...",,,,Liberty Media (9%),


## Add geolocation Data

[OpenStreetMap Nominatim](https://wiki.openstreetmap.org/wiki/Nominatim) or [Google Geocoding API (V3)](https://developers.google.com/maps/documentation/geocoding/start) is used to resolve GPS coordinates to places. Uses the Python library [geopy](https://pypi.python.org/pypi/geopy/1.11.0).

In [145]:
# show how many companies have an address
def get_location(row, geolocators, _type=None):
    
    # check from top to bottom for location
    loc_str = None
    loc_series = pd.Series({
                    'location_geopy': loc_str,
                    'location_gps': (None, None)})
    if 'location_geopy' in row and 'location_gps' in row:
        return pd.Series({
                    'location_geopy': row['location_geopy'],
                    'location_gps': row['location_gps']})
    
    row_to_check = ['location', 'location_city', 'location_country']
    for r in row_to_check:
        val = row[r]
        if isinstance(val, list):
            val = " ".join(val)
        if not pd.isnull(val):
            loc_str = val
    if not loc_str:
        return loc_series

    for g in geolocators:
        try:
            loc = g.geocode(loc_str, timeout=10)
        except GeocoderServiceError:
            return loc_series
        if loc:
            # max 1 request per second, http://wiki.openstreetmap.org/wiki/Nominatim_usage_policy
            time.sleep(1)
            return pd.Series({
                    'location_geopy': loc.address,
                    'location_gps': (loc.latitude, loc.longitude)})
    return loc_series

In [148]:
# initialize the different geolocator APIs
g_osm = geopy.geocoders.Nominatim()
g_google = geopy.geocoders.GoogleV3() # can also be with g_api_key
#geolocators = [g_google, g_osm] # GeocoderQuotaExceeded
geolocators = [g_osm]

# applies gets location function for each row, test on subset
df_loc = c_w_name[0:5].merge(
    c_w_name[0:5].apply(lambda row: get_location(row, geolocators), axis=1),
    left_index=True, 
    right_index=True)
df_loc[['location_geopy', 'location_gps']]

Unnamed: 0,location_geopy,location_gps
&pizza,,"(None, None)"
+Beryll,,"(None, None)"
...instore,"Huddersfield, Yorkshire and the Humber, Englan...","(53.6467031, -1.7832076)"
01 Communique,"Ont., Peel Region, Ontario, Canada","(43.5892854, -79.6441645)"
01 Distribution,Italia,"(42.6384261, 12.674297)"


In [151]:
# progress bar of apply
tqdm_notebook().pandas(desc="Resolving location")
c_dat = c_w_name.merge(
    c_w_name.progress_apply(lambda row: get_location(row, geolocators), axis=1),
    left_index=True, 
    right_index=True)

Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"
Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"





GeocoderServiceError: ('HTTP Error 500: Internal Server Error', u'occurred at index BioTek')

## Clean Datafields when List

In [155]:
# specify the type for each field
types = {
    # when first link is crawled
    'wiki_name': unicode,
    'wiki_url': str,
    'name_url_quoted': str,
    
    # when WIKI API is crawled
    'wiki_page_id': int,
    'wiki_api_url': str,
    'all_links': list,
    'links': set,
    'is_company': bool,
    'wiki_raw': unicode,
    
    # from extraction2
    'oc_api_url': str,
    'oc_api_search_url': str,
    'oc_api_network_url': str,
    
    # added only when Infobox company exists or fields from OpenCorporates
    # not all fields always exist, they are NaN in the resulting DataFrame
    'name': unicode, 
    'type': unicode, 
    'founded': datetime.datetime, 
    'defunct': datetime.datetime, 
    'location': unicode,
    'location_city': unicode, 
    'location_country': unicode,
    'location_geopy': unicode,
    'location_gps': tuple,
    # following not in OC
    'countries': set, # added with extraction 3
    'logo': dict, 
    'key_people': list, # additonally processed with nameparser.HumanName (dict)
    'industry': list, 
    'subsid': list,
    'products': list, 
    'num_employees': int, 
    'parent': unicode, 
    'homepage': unicode
}

c_dat[['links']] = c_dat[['links']].astype(set)

c_dat.dtypes

0                    object
all_links            object
country              object
defunct              object
founded              object
homepage             object
industry             object
is_company           object
key_people           object
links                object
location             object
location_city        object
location_country     object
logo                 object
name                 object
name_url_quoted      object
num_employees        object
page_id             float64
parent               object
products             object
subsid               object
type                 object
wiki_api_url         object
wiki_name            object
wiki_page_id        float64
wiki_raw             object
wiki_url             object
dtype: object

## Save Data

In [152]:
# save as CSV file
c_dat.to_csv(extraction_csv, encoding='utf-8', index_label='wiki_title')

In [None]:


c_dat = pd.read_csv(extraction_csv, index_col=0)

## General Statistics in the Data

* Company with most links.
* Company with most employees.
* Companies by country.
* Companies by age of foundation (graph)
* Most influential people's first name (top 10)
* What is the most central company in Denmark and Europe?
* Company listed in most countries.

In [175]:
cnt = Counter()
# iterate every row that represents a company
for index, row in c_dat.iterrows():
    # empty link list
    if isinstance(row['links'], float):
        continue
    cnt[row['name']] = len(row['links'])

print "Companies with most links:"
pprint(cnt.most_common(10))

# Italian banks have a category section: Banking in Italy (e.g.: https://en.wikipedia.org/wiki/Crediop)

Companies with most links:
[('Dexia Crediop', 2457),
 ('Banca Centropadana', 2450),
 ('Fidi Toscana', 2445),
 ('Banca Finnat', 2445),
 ('ChiantiBanca', 2445),
 ('Caript', 2441),
 ('Banca Popolare Pugliese', 2426),
 ('CR Veneto', 2425),
 ("Banca d'Alba", 2421),
 ('Banca Esperia', 2420)]


In [None]:
cnt = Counter()
for index, row in c_dat.iterrows():
    if isinstance(row['num_employees'], int):
        cnt[row['name']] = int(row['num_employees'])

print "Most companies by country:"
pprint(cnt.most_common(10))