**02805 Social graphs and interactions**

# Data Extraction Part 2

In [31]:
# IPython global cell magic
%reset
%matplotlib inline

# import all necessary packages
import bs4 # HTML parser
from collections import Counter, OrderedDict # counting elements and ordering keys in dictionaries
import community # python-louvain package
from __future__ import division # all numbers are float
import geopy # get geo location according to addresses
import datetime # handle date objects
import dateparser # parse any (also foreign) date format to object: https://pypi.python.org/pypi/dateparser
import itertools # iterators for efficient looping
import json # JSON parser
import math # math operations
from matplotlib import pyplot as plt # plotting figures
import mwparserfromhell # parse MediaWiki syntax: https://github.com/earwig/mwparserfromhell
from nameparser import HumanName # parse a human name
import networkx as nx # networks creation library
import nltk # natural language processing
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import operator # efficient operator functions
import os # operating system operations, e.g.: with files and folders
import pandas as pd # use easy-to-use data frames for data analysis
import pickle # python data structures as files
from pprint import pprint # print data structures prettier
import re # regex
import requests # request URL content
import time # sleep timer
from tqdm import tqdm_notebook # make a nice progressbar
import urllib # handle special URL chars

# working directory that holds every file written by the extraction steps
directory = os.getcwd() + '/companies'
if not os.path.exists(directory):
    os.makedirs(directory)

# data files: each name is the working directory plus a fixed suffix
ex1_fdat, ex2_fdat, ex3_tmp_fdat, ex3_fdat, merged, extraction_csv = (
    directory + suffix for suffix in (
        '/extraction1_data.pkl',
        '/extraction2_data.pkl',
        '/tmp_extraction3_data.pkl',
        '/extraction3_data.pkl',
        '/merged_data.pkl',
        '/company_data.csv'))

Once deleted, variables cannot be recovered. Proceed (y/[n])? n
Nothing done.


In [2]:
def get_company_pages(url, companies):
    """Collect the articles listed on a paginated article-list page.

    Every page is expected to hold a ``wikitable`` whose rows carry the
    article link in their second cell. Result pages are followed through the
    "Next 1000 entries" link until a page offers no such link.

    Args:
        url: address of the first result page.
        companies: dict that is filled in place, keyed by the link text of
            each article.

    Returns:
        The same ``companies`` dict. Each value holds 'wiki_name' (stripped
        link text), 'wiki_url' (link target) and 'name_url_quoted'
        (URL-quoted, UTF-8 encoded link text).
    """
    # iterate over the result pages instead of recursing once per page
    while True:
        r = requests.get(url)
        soup = bs4.BeautifulSoup(r.text, 'lxml')

        # a page without the expected table simply contributes no rows
        table = soup.find("table", attrs={"class": "wikitable"})
        rows = table.find_all("tr") if table is not None else []

        for row in rows:
            cells = row.find_all("td")
            # header rows have no td cells; skip rows too short to hold a link
            if len(cells) < 2:
                continue
            a = cells[1].find('a', href=True)
            if a is None:
                continue
            wiki_name = a.get_text()
            companies[wiki_name] = {
                'wiki_name': wiki_name.strip(),
                'wiki_url': a['href'],
                'name_url_quoted': urllib.quote_plus(wiki_name.encode('utf-8'))}

        # follow the "next 1000 entries" link if the page offers one
        content = soup.find("div", attrs={"id": "content"})
        next_link = None
        if content is not None:
            next_link = content.find('a', href=True, text='Next 1000 entries')
        if next_link is None:
            break
        url = next_link['href']

    return companies

In [3]:
# parse HTML
wiki_company_articles = u'https://tools.wmflabs.org/enwp10/cgi-bin/list2.fcgi?run=yes&projecta=Company&limit=1000'

companies = get_company_pages(wiki_company_articles, dict())
print len(companies), "companies were extracted."

47858 companies were extracted.


In [4]:
# data of the first extraction; stays an empty dict when its pickle file is missing
extract1 = {}
if os.path.isfile(ex1_fdat):
    with open(ex1_fdat, 'rb') as ex1_file:
        extract1 = pickle.load(ex1_file)

In [6]:
company = 'AT&T'
print "data from wmflabs.org:", companies[company]

if extract1 and company in extract1:
    print "data with infobox company:", extract1[company]['wiki_url']

# some lists are included that are definetly not companies
print companies.keys()[0:20]

data from wmflabs.org: {'name_url_quoted': 'AT%26T', 'wiki_url': 'https://en.wikipedia.org/w/index.php?title=AT%26T', 'wiki_name': u'AT&T'}
data with infobox company: https://en.wikipedia.org/w/index.php?title=AT%26T
[u'Category:Companies established in the 1640s', u'Nordisk Mobiltelefon (Sweden)', u'Candover Investments', u'J.O.B. Records', u'\xc7al\u0131k Holding', u'Hansabank', u'Gunvor (company)', u'Inditex', u'Memphis Running Tours', u'Handheld Group', u'Taiwan Sugar Corporation', u'Trelleborg (company)', u'Other World Computing', u'Idlebrain.com', u'Category:Railway companies established in 1980', u'Nails Inc.', u'Duckworth Overlook', u'Washington Gas', u'Eveve', u'List of stuffed toy manufacturers']


In [7]:
def get_json_from_url(url):
    r = requests.get(url)
    
    # on HTML error codes
    if r.status_code != 200:
        return None

    # try converting into JSON
    try:
        sec = r.json()
        return sec
    except ValueError:  # includes simplejson.decoder.JSONDecodeError
        print 'WARN: Decoding JSON has failed on:', url
    return None

In [8]:
def parse_date(date):
    """Parse a date string with ``dateparser``.

    Args:
        date: date text to parse; may be None or empty.

    Returns:
        The value produced by ``dateparser.parse`` (ambiguous dates are read
        in year-month-day order and resolved towards the past). Falsy input
        is returned unchanged, and so is text that could not be parsed, so no
        information is lost.
    """
    if not date:
        return date

    # the parsed result was previously thrown away; hand it back to the caller
    parsed = dateparser.parse(date, settings={
        'PREFER_DATES_FROM': 'past',
        'DATE_ORDER': 'YMD'})
    return parsed if parsed is not None else date

In [30]:
def get_c_info(company):
    """Get wiki links, verify company existance and get additional data"""
    
    link_url = u'{0}{1}?{2}&{3}&{4}&page={5}'.format(
        wiki_base, wiki_api, action, dat_format, properties, company['name_url_quoted'])

    c_content = get_json_from_url(link_url)
    if not c_content or 'parse' not in c_content:
        print 'No parsable content on:', link_url
        company['is_company'] = False
        return company

    # add certain fields
    company['wiki_page_id'] = c_content['parse']['pageid']
    company['wiki_api_url'] = link_url
    company['all_links'] = [x['*'] for x in c_content['parse']['links'] if x['ns'] == 0]
    company['wiki_raw'] = c_content['parse']['wikitext']['*']
    # name can not be found in an infobox
    company['name'] = c_content['parse']['title']
    
    # check company on OpenCorporates to verify existance
    # see: https://api.opencorporates.com/documentation/API-Reference
    # example: https://api.opencorporates.com/v0.4/companies/search?q=Anglo-Persian%20Oil%20Company
    oc_api_base = 'https://api.opencorporates.com/v0.4/companies'
    # normalise_company_name=true - 
    # order=score - sort after score not alphabetic
    oc_properties = '&normalise_company_name=true&order=score'
    company['oc_api_search_url'] = '{0}/search?q={1}{2}'.format(oc_api_base, company['name_url_quoted'], oc_properties)
        
    # get data
    oc_resp = get_json_from_url(company['oc_api_search_url'])  
    # return if there are no results for the company
    if not oc_resp or not oc_resp['results']:
        company['is_company'] = False
        return company
    
    # take the first company with highest score if results
    results = oc_resp['results']['companies']
    if not results:
        company['is_company'] = False
        return company
    comp = results[0]['company']
    # set all the fields that match with what can be gathered from Infobox company template
    company['name'] = comp['name']
    company['type'] = comp['company_type']   
    company['defunct'] = parse_date(comp['dissolution_date'])
    company['founded'] = parse_date(comp['incorporation_date'])
    if comp['registered_address']:
        company['location_country'] = comp['registered_address']['country']
        company['location_city'] = comp['registered_address']['locality']
        company['location'] = comp['registered_address']['street_address']

    # look if there is any network data on OC
    # difficult because search for BP does not necessarily return the right company
    # also OC has network data only for a very small subset
    company['oc_api_url'] = '{0}/{1}/{2}'.format(
        oc_api_base, comp['jurisdiction_code'], comp['company_number'])
    company['oc_api_network_url'] = company['oc_api_url'] + '/network'
    c_network = get_json_from_url(company['oc_api_network_url'])
    if c_network and c_network['results']:
        print "Company network found for:", company['wiki_name'], company['oc_api_network_url']

    # otherwise save it as new company
    company['is_company'] = True
    return company

In [10]:
# pieces of a MediaWiki parse request, e.g.:
# https://en.wikipedia.org/w/api.php?action=parse&page=Audi&prop=links|wikitext
wiki_base, wiki_api = u'https://en.wikipedia.org', u'/w/api.php'
action, dat_format, properties = (
    u'action=parse', u'format=json', u'prop=links|wikitext')

In [11]:
# Demo: fetch the additional information for a single company.
# other: 'Audi', 'Apple Inc.', 'Microsoft'
company = 'Anglo-Persian Oil Company'

# extract1 holds the data loaded from the first extraction's pickle file
if company not in extract1:
    print "Company", company, "does not exist in extraction 1."

# get_c_info modifies the record stored in `companies` in place and sets its
# 'is_company' flag, so scrape_companies will not fetch this entry again
print "Excerpt of", company, "dict structure:"
pprint(get_c_info(companies[company]))

Company Anglo-Persian Oil Company does not exist in extraction 1.
Excerpt of Anglo-Persian Oil Company dict structure:
{'all_links': [u"1953 Iranian coup d'\xe9tat",
               u'2007 Gasoline Rationing Plan in Iran',
               u'2009 Bond Helicopters Eurocopter AS332 crash',
               u'ARCO',
               u'ARCO Arena',
               u'ARCO Tower',
               u'Abadan, Iran',
               u'Abadan Crisis',
               u'Abadan Crisis timeline',
               u'Abadan Refinery',
               u'Abdication',
               u'Abdolhossein Teymourtash',
               u'Acropolis Rally',
               u'Ahmadabad-e Mosaddeq',
               u'Air BP',
               u'Akaki Khoshtaria',
               u'Ali Razmara',
               u"All the Shah's Men",
               u'Allies of World War II',
               u'Alyeska Pipeline Service Company',
               u'Amoco',
               u'Amoco Building (New Orleans)',
               u'Amoco Milford Haven',
  

In [27]:
def scrape_companies(companies, com_dat):
    """Fetch additional information for every company not handled so far.

    Args:
        companies: dict of company records keyed by name; updated in place.
        com_dat: dict of records from an earlier extraction (may be empty).

    Returns:
        The ``companies`` dict. Entries flagged as a company in ``com_dat``
        get 'is_company' set to False here, entries that already carry an
        'is_company' flag are left alone, and all others are replaced by the
        result of ``get_c_info``.
    """
    # walk over all company names, showing a progress bar
    for name in tqdm_notebook(companies, desc='Fetching companies'):
        record = companies[name]

        # already a valid company in the earlier extraction: don't repeat it
        if com_dat and name in com_dat:
            earlier = com_dat[name]
            if 'is_company' in earlier and earlier['is_company']:
                record['is_company'] = False
                continue

        # records that were parsed before keep their flag
        if 'is_company' in record:
            continue

        # extract any company information
        companies[name] = get_c_info(record)

    return companies

In [32]:
# records stored by an earlier run of extraction 2 (empty when no file exists)
com_dat_pickle = {}
if os.path.isfile(ex2_fdat):
    with open(ex2_fdat, 'rb') as ex2_file:
        com_dat_pickle = pickle.load(ex2_file)
    # overwrite freshly crawled entries with the previously fetched data of the same keys
    companies.update(com_dat_pickle)

In [33]:
%%time
# crawl all companies that were not processed yet, i.e. whose record has no 'is_company' flag;
# companies that are already valid in extraction 1 are flagged False here and are not fetched again
companies = scrape_companies(companies, extract1)

### OpenCorporates (OC) limitation: 500 requests per month... API key sign up didn't succeed
# https://api.opencorporates.com/documentation/API-Reference#usage_limits

Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"


No parsable content on: https://en.wikipedia.org/w/api.php?action=parse&format=json&prop=links|wikitext&page=3+Tier+Logic
No parsable content on: https://en.wikipedia.org/w/api.php?action=parse&format=json&prop=links|wikitext&page=HTC+Global+Services
No parsable content on: https://en.wikipedia.org/w/api.php?action=parse&format=json&prop=links|wikitext&page=NeonMob
No parsable content on: https://en.wikipedia.org/w/api.php?action=parse&format=json&prop=links|wikitext&page=ModeAudio
No parsable content on: https://en.wikipedia.org/w/api.php?action=parse&format=json&prop=links|wikitext&page=Verisart
No parsable content on: https://en.wikipedia.org/w/api.php?action=parse&format=json&prop=links|wikitext&page=File%3AHudsons+Bay+Company+Flag.svg
No parsable content on: https://en.wikipedia.org/w/api.php?action=parse&format=json&prop=links|wikitext&page=Confidential+Global+Investigations
No parsable content on: https://en.wikipedia.org/w/api.php?action=parse&format=json&prop=links|wikitext&pa

In [34]:
cnt, cnt_extr1 = 0, 0
for c in companies:
    if 'is_company' in companies[c] and not companies[c]['is_company']:
        cnt += 1
        if c in extract1 and extract1[c]['is_company']:
            cnt_extr1 += 1
print cnt, "companies were marked as no company.", cnt_extr1, "of them because already procesed in extraction 1."

47858 companies were marked as no company. 21676 of them because already procesed in extraction 1.


In [35]:
# get links of all the valid companies
def create_link_set(extr_companies):
    """Return the set of keys whose record carries a truthy 'is_company' flag.

    Args:
        extr_companies: dict mapping a company name to its record.

    Returns:
        A set with the names of all records flagged as valid companies;
        records without the flag are left out.
    """
    return set(
        name for name in extr_companies
        if 'is_company' in extr_companies[name]
        and extr_companies[name]['is_company'])

In [36]:
# full set of companies is from extract1 and companies combined
ex1_links = create_link_set(extract1)
ex2_links = create_link_set(companies)
all_companies = ex1_links | ex2_links
print "In total there are {0} companies ({1} from extraction 1 and {2} from extraction 2).".format(
    len(all_companies), len(extract1), len(ex2_links))

In total there are 56837 companies (56853 from extraction 1 and 0 from extraction 2).


In [37]:
# persist the company data of this extraction in one binary pickle file
with open(ex2_fdat, 'wb') as ex2_file:
    pickle.dump(companies, ex2_file)