**02805 Social graphs and interactions**

# Data Extraction Part 2

In [1]:
# IPython global cell magic
%reset
%matplotlib inline

# import all necessary packages
import bs4 # HTML parser
from collections import Counter, OrderedDict # counting elements and ordering keys in dictionaries
import community # python-louvain package
from __future__ import division # all numbers are float
import datetime # handle date objects
import dateparser # parse any (also foreign) date format to object: https://pypi.python.org/pypi/dateparser
import itertools # iterators for efficient looping
import json # JSON parser
import math # math operations
from matplotlib import pyplot as plt # plotting figures
import mwparserfromhell # parse MediaWiki syntax: https://github.com/earwig/mwparserfromhell
from nameparser import HumanName # parse a human name
import networkx as nx # networks creation library
import nltk # natural language processing
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import operator # efficient operator functions
import os # operating system operations, e.g.: with files and folders
import pandas as pd # use easy-to-use data frames for data analysis
import pickle # python data structures as files
from pprint import pprint # print data structures prettier
import re # regex
import requests # request URL content
from tqdm import tqdm_notebook # make a nice progressbar
import urllib # handle special URL chars

# make working directory (paths are joined with os.path so they stay valid on every OS)
directory = os.path.join(os.getcwd(), 'companies')
# isdir instead of exists: a plain file named 'companies' must not pass for the folder
if not os.path.isdir(directory):
    os.makedirs(directory)

# files (all live inside the working directory)
ex1_fdat = os.path.join(directory, 'extraction1_data.pkl')
ex2_fdat = os.path.join(directory, 'extraction2_data.pkl')
ex3_tmp_fdat = os.path.join(directory, 'tmp_extraction3_data.pkl')
ex3_fdat = os.path.join(directory, 'extraction3_data.pkl')
merged = os.path.join(directory, 'merged_data.pkl')
extraction_csv = os.path.join(directory, 'company_data.csv')

Once deleted, variables cannot be recovered. Proceed (y/[n])? n
Nothing done.


In [81]:
def get_company_pages(url, companies):
    """Collect company entries from a paginated listing that starts at `url`.

    Every page is fetched and parsed; for each table row the link found in
    the second cell is stored. Paging follows the "Next 1000 entries" link
    until a page no longer offers one.

    Args:
        url: address of the first listing page.
        companies: dict that is filled in place, keyed by the link text.

    Returns:
        The same `companies` dict. Each value is a dict with the keys
        'wiki_name', 'wiki_url' and 'name_url_quoted'.
    """
    visited = set()
    # iterate instead of recursing; the visited set stops a page that links to itself
    while url and url not in visited:
        visited.add(url)
        r = requests.get(url)
        soup = bs4.BeautifulSoup(r.text, 'lxml')

        # the entries live in the rows of the first table with class "wikitable"
        table = soup.find("table", attrs={"class": "wikitable"})
        rows = table.find_all("tr") if table is not None else []

        for row in rows:
            cells = row.find_all("td")
            # header rows have no <td>; rows with a single cell have no link column
            if len(cells) < 2:
                continue
            a = cells[1].find('a', href=True)
            if a is None:
                continue
            wiki_name = a.get_text()
            companies[wiki_name] = {
                'wiki_name': wiki_name,
                'wiki_url': a['href'],
                'name_url_quoted': urllib.quote_plus(wiki_name.encode('utf-8'))}

        # continue with the page behind the "Next 1000 entries" link, if any
        content = soup.find("div", attrs={"id": "content"})
        next_link = None
        if content is not None:
            next_link = content.find('a', href=True, text='Next 1000 entries')
        url = next_link['href'] if next_link is not None else None

    return companies

In [82]:
# parse HTML
wiki_company_articles = u'https://tools.wmflabs.org/enwp10/cgi-bin/list2.fcgi?run=yes&projecta=Company&limit=1000'

companies = get_company_pages(wiki_company_articles, dict())
print len(companies), "companies were extracted."

47858 companies were extracted.


In [83]:
# data of the first extraction; stays an empty dict when no pickle file exists yet
# NOTE: pickle.load can run arbitrary code - only load files written by this project
extract1 = {}
if os.path.isfile(ex1_fdat):
    with open(ex1_fdat, 'rb') as pickle_file:
        extract1 = pickle.load(pickle_file)

In [84]:
company = 'AT&T'
print companies[company]['wiki_url']

if extract1 and company in extract1:
    print extract1[company]['wiki_url']

https://en.wikipedia.org/w/index.php?title=AT%26T
https://en.wikipedia.org/w/index.php?title=AT%26T


In [92]:
def get_json_from_url(url):
    r = requests.get(url)
    
    # on HTML error codes
    if r.status_code != 200:
        return None

    # try converting into JSON
    try:
        sec = r.json()
        return sec
    except ValueError:  # includes simplejson.decoder.JSONDecodeError
        print 'WARN: Decoding JSON has failed on:', url
    return None

In [97]:
def parse_date(date):
    """Parse a date string with dateparser.

    Args:
        date: date text in any format dateparser understands. Falsy values
            (None, '') are returned unchanged without calling the parser.

    Returns:
        The result of ``dateparser.parse`` for `date`, or `date` itself when
        it is falsy.
    """
    if not date:
        return date
    # the parsed result must be returned; previously it was discarded and the
    # raw input string was handed back
    return dateparser.parse(date, settings={
        'PREFER_DATES_FROM': 'past',
        'DATE_ORDER': 'YMD'})

In [102]:
def get_c_info(link_url, all_companies, company):
    """Get wiki links, verify company existance and get additional data"""
    
    c_content = get_json_from_url(link_url)

    if not c_content and 'parse' not in c_content:
        print 'No parsable content on:', link_url
        company['is_company'] = False
        return company

    # add certain fields
    company['wiki_page_id'] = c_content['parse']['pageid']
    company['wiki_name'] = c_content['parse']['title']
    company['wiki_api_url'] = link_url
    # list of links intersected with list containing all companies
    links = [x['*'] for x in c_content['parse']['links'] if x['ns'] == 0]
    company['all_links'] = links
    company['links'] = all_companies.intersection(links)
    company['wiki_raw'] = c_content['parse']['wikitext']['*']
    
    # check company on OpenCorporates to verify existance
    # see: https://api.opencorporates.com/documentation/API-Reference
    # example: https://api.opencorporates.com/v0.4/companies/search?q=Anglo-Persian%20Oil%20Company
    oc_api_base = 'https://api.opencorporates.com/v0.4/companies'
    # normalise_company_name=true - 
    # order=score - sort after score not alphabetic
    oc_properties = '&normalise_company_name=true&order=score'
    company['oc_api_url'] = '{0}/search?q={1}{2}'.format(oc_api_base, company['name_url_quoted'], oc_properties)
        
    # get data
    oc_resp = get_json_from_url(company['oc_api_url'])  
    # take the first company with highest score
    company_data = oc_resp['results']['companies']
    # return if there are no results for the company
    if not company_data:
        company['is_company'] = False
        return company
    
    comp = company_data[0]['company']
    # set all the fields that match with what can be gathered from Infobox company template
    company['name'] = comp['name']
    company['type'] = comp['company_type']   
    company['defunct'] = parse_date(comp['dissolution_date'])
    company['founded'] = parse_date(comp['incorporation_date'])
    company['location_country'] = comp['registered_address']['country']
    company['location_city'] = comp['registered_address']['locality']
    company['location'] = comp['registered_address']['street_address']

    # look if there is any network data on OC
    # difficult because search for BP does not necessarily return the right company
    # also OC has network data only for a very small subset
    company['oc_api_network_url'] = '{0}/{1}/{2}/network'.format(
        oc_api_base, comp['jurisdiction_code'], comp['company_number'])    
    c_network = get_json_from_url(company['oc_api_network_url'])
    if c_network['results']:
        print "Company network found for:", company['wiki_name'], company['oc_api_network_url']

    # discard if this company was already processed
    if extract1 and \
        company['wiki_name'] in extract1 and \
        'is_company' in extract1[company['wiki_name']] and \
        extract1[company['wiki_name']]['is_company']:
            company['is_company'] = False
            return company
    # otherwise save it as new company
    company['is_company'] = True
    return company

In [103]:
# https://en.wikipedia.org/w/api.php?action=parse&page=Audi&prop=links|wikitext
wiki_base = u'https://en.wikipedia.org'
wiki_api = u'/w/api.php'
action = u'action=parse'
dat_format = u'format=json'
properties = u'prop=links|wikitext'

# other: 'Audi', 'Apple Inc.', 'Microsoft'
company = 'AT&T'
link_url = u'{0}{1}?{2}&{3}&{4}&page={5}'.format(
    wiki_base, wiki_api, action, dat_format, properties, urllib.quote_plus(company.encode('utf-8')))

# full set of companies is from extract1 and companies combined
all_companies = set(companies.keys() + extract1.keys())
print "In total there are {0} companies ({1} processed and {2} unprocessed).".format(
    len(all_companies), len(companies.keys), len(extract1.keys()))

In total there are 82988 unprocessed companies.


In [104]:
print "Excerpt of", company, "dict structure:"
pprint(get_c_info(link_url, all_companies, companies[company]))

Excerpt of AT&T dict structure:


KeyError: 'is_company'

In [None]:
### now check if new links must be added to companies from extraction 1

# get all companies where 'is_company' is true

# merge extraction1 and extraction2 (from extraction2 oc fields can be added)