**02805 Social graphs and interactions**

# Data Extraction Part 1

In [1]:
# IPython global cell magic
%reset
%matplotlib inline

# import all necessary packages
import bs4 # HTML parser
from collections import Counter, OrderedDict # counting elements and ordering keys in dictionaries
import community # python-louvain package
from __future__ import division # all numbers are float
import datetime # handle date objects
import dateparser # parse any (also foreign) date format to object: https://pypi.python.org/pypi/dateparser
import itertools # iterators for efficient looping
import json # JSON parser
import math # math operations
from matplotlib import pyplot as plt # plotting figures
import mwparserfromhell # parse MediaWiki syntax: https://github.com/earwig/mwparserfromhell
from nameparser import HumanName # parse a human name
import networkx as nx # networks creation library
import nltk # natural language processing
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import operator # efficient operator functions
import os # operating system operations, e.g.: with files and folders
import pandas as pd # use easy-to-use data frames for data analysis
import pickle # python data structures as files
from pprint import pprint # print data structures prettier
import re # regex
import requests # request URL content
from tqdm import tqdm_notebook # make a nice progressbar
import urllib # handle special URL chars

# make working directory
# (a 'companies' folder below the current working directory of the kernel;
# it is created on first run and holds all files listed below)
directory = os.getcwd() + '/companies'
if not os.path.exists(directory):
    os.makedirs(directory)

# files
# ex1_fdat is the pickle this part writes at the end and reads back as a
# cache of already crawled companies; the other paths are only defined
# here and are not referenced in this part
ex1_fdat = directory + '/extraction1_data.pkl'
ex2_fdat = directory + '/extraction2_data.pkl'
ex3_tmp_fdat = directory + '/tmp_extraction3_data.pkl'
ex3_fdat = directory + '/extraction3_data.pkl'
merged = directory + '/merged_data.pkl'
extraction_csv = directory + '/company_data.csv'

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [2]:
def get_json_from_url(url):
    r = requests.get(url)
    
    # on HTML error codes
    if r.status_code != 200:
        return None

    # try converting into JSON
    try:
        sec = r.json()
        return sec
    except ValueError:  # includes simplejson.decoder.JSONDecodeError
        print 'WARN: Decoding JSON has failed on:', url
    return None

In [3]:
def get_comp_pages(api_url, extension=None):
    
    ext_api_url = api_url
    if extension:
        ext_api_url += extension
    
    # retrieve JSON
    sites_w_template = get_json_from_url(ext_api_url)

    # check for valid data
    if not sites_w_template or 'query' not in sites_w_template:
        print "WARN: query returned no data:", infobox_url + properties
    else:
        # iterate over list of companies that embed infobox and save to dict
        for page in sites_w_template['query']['embeddedin']:
            # unicode to str
            quoted_name = urllib.quote_plus(page['title'].encode('utf-8'))
            companies.update({
                page['title']: {
                    'wiki_page_id': page['pageid'],
                    'wiki_name': page['title']),
                    'wiki_url': u'{0}{1}?title={2}'.format(wiki_base, wiki_index, quoted_name),
                    'name_url_quoted': quoted_name
                }
            })

        # run the function recursively, when company found
        if 'continue' in sites_w_template:
            extension = '&eicontinue=' + sites_w_template['continue']['eicontinue']
            return get_comp_pages(api_url, extension)
    
    return companies

Example query for extraction of pages with "Infobox company" template:
* https://en.wikipedia.org/w/api.php?&action=query&list=embeddedin&eititle=Template:Infobox+company&einamespace=0&eilimit=max
* https://en.wikipedia.org/w/api.php?&action=query&list=embeddedin&eititle=Template:Infobox+company&einamespace=0&eilimit=max&eicontinue=0|167638


In [17]:
# specify wikipedia API URL parameters
# NOTE: `action` and `properties` are module-level names that a later cell
# reassigns for the parse API; get_c_info reads whatever values are current,
# so the execution order of these cells matters
wiki_base = u'https://en.wikipedia.org'
wiki_api = u'/w/api.php'
wiki_index = u'/w/index.php'
action = u'action=query'
dat_format = u'format=json'
# eititle=Template:Infobox+company - embedded pages with Template {{Infobox Company}}
# einamespace=0 only articles
# eilimit=max - maximum number before continuing to minimize requests
properties = u'list=embeddedin&eititle=Template:Infobox+company&einamespace=0&eilimit=max'

# concatenate URL (ends with '&' so that `properties` can be appended directly)
infobox_url = '{0}{1}?&{2}&{3}&'.format(
    wiki_base, wiki_api, action, dat_format)

# init the result dict; get_comp_pages fills this module-level dict in place,
# which is why its return value is not needed here
companies = dict()
get_comp_pages(infobox_url + properties)

print len(companies), "companies were extracted."

0 companies were extracted.


In [15]:
# show the stored record of one company; the 'links' key seen in the saved
# output is only added by get_c_info, i.e. that output stems from a run
# after the company pages were fetched
print companies['AT&T']

{'links': set([u'Frontier Florida', u'T-Systems', u'AT&T Corporation', u'Cisco Systems', u'Verizon New England', u'Tata Teleservices', u'KDDI', u'Indiana Bell', u'Dell EMC', u'Northern New England Spinco', u'Huawei', u'BendBroadband', u'Applied Materials', u'Comcast', u'IBM Global Services', u'Deloitte', u'AT&T Broadband', u'Citigroup', u'EPB', u'Sanmina Corporation', u'Hughes Communications', u'NTT Docomo USA', u'Samsung Electronics', u'CenturyLink', u'Olympus Corporation', u'HP Inc.', u'Sony Mobile', u'Qwest', u'AOL', u'Xerox', u'AT&T Communications', u'Houston Cellular', u'AT&T Mobility', u'Hewlett Packard Enterprise', u'AT&T Wireless Services', u'Booz Allen Hamilton', u'Frontier Southwest', u'SK Hynix', u'Prodigy (online service)', u'BellSouth Long Distance', u'MediaTek', u'Verizon Virginia', u'Southern New England Telecommunications', u'CA Technologies', u'NTT DATA', u'Verizon New Jersey', u'China Unicom', u'Advanced Mobile Phone Service', u'Gogo Inflight Internet', u'Silicon Powe

In [4]:
def get_temp_val(temp, k):
    """
    Look up a parameter in the infobox template, trying alternative keys.

    Infoboxes are not consistent, so 'founded' falls back to 'foundation'
    and every 'location...' key falls back to its 'hq_' prefixed variant.
    Returns the parameter, or None if no candidate key exists.
    """

    candidate = k
    while candidate is not None:
        try:
            return temp.get(candidate)
        except ValueError:
            # key missing in the template: pick the alternative spelling
            if candidate == 'founded':
                candidate = 'foundation'
            elif 'location' in candidate and 'hq_' not in candidate:
                candidate = 'hq_' + candidate
            else:
                # no further alternative to try
                candidate = None
    # key not found in template
    return None

In [5]:
def parse_employee_nr(input_val, c_name):
    """
    Parse an employee count out of the free-text infobox value.

    The first number in the text is used. Thousands groups separated by
    comma or dot are joined, also when there is more than one group:

        '15-50'                                             -> 15
        '~300+'                                             -> 300
        'circa 40'                                          -> 40
        '9,985(Dec 2011)'                                   -> 9985
        'over 10,000 in 10 countries'                       -> 10000
        'Five'                                              -> None
        'over 1 million'                                    -> 1
        '10.000'                                            -> 10000
        'Part of Popular, Inc., which has 8,000 employees'  -> 8000
        '2,300,000'                                         -> 2300000

    Number words ('Five') and multipliers ('million') are not interpreted.

    Parameters:
        input_val: raw value, converted to text before matching
        c_name: company name; kept for interface compatibility, it is no
            longer needed because the conversion below cannot fail

    Returns the number as int, or None if the text contains no digit.
    """

    # formatting into a unicode template converts to text like unicode()
    # does, without depending on that Python 2 only builtin
    text = u'{0}'.format(input_val)

    # first run of digits, followed by any number of 3-digit groups that are
    # separated by comma or dot (a single optional group truncated
    # '2,300,000' to 2300)
    m = re.search(r'[0-9]+(?:[,\.][0-9]{3}(?![0-9]))*', text)
    if m is None:
        return None
    # the match holds only ASCII digits and separators, so int() cannot fail
    return int(m.group().replace(',', '').replace('.', ''))

In [6]:
def parse_people_name(names):
    """
    Split each person name into its components with nameparser.

    Example: 'Tim Cook (CEO)' becomes
    {u'last': u'Cook', u'suffix': u'', u'title': u'', u'middle': u'', u'nickname': u'CEO', u'first': u'Tim'}

    Empty entries of `names` are skipped. Returns a list of dicts.
    """

    parsed = []
    for raw_name in names:
        # skip empty values, they carry no name
        if not raw_name:
            continue
        parsed.append(HumanName(raw_name).as_dict())
    return parsed

In [7]:
def parse_wiki_raw(param, k, c_name):
    """
    Extract the information on the raw value without notable wiki markup

    Parameters:
        param: infobox parameter; its `value` is read and, when it contains
            HTML breaks, overwritten with a newline separated version
        k: infobox key the parameter belongs to
        c_name: company name, only passed on to parse_employee_nr

    Returns, depending on the key and the content:
        - 'logo': dict with 'wiki_commons_link', 'wiki_file_link' and
          'wiki_raw_code' when an image name could be extracted
        - value with '<br': list of the separated items; for 'key_people'
          the result of parse_people_name, for 'num_employees' the result
          of parse_employee_nr on the last item
        - 'num_employees': result of parse_employee_nr
        - otherwise the stripped text, or None when nothing is left
    """

    # strip_code does not work properly for [[File:: https://github.com/earwig/mwparserfromhell/issues/136
    if k == 'logo':
        split_val = unicode(param.value).strip().split('|')
        # first value always image link
        if split_val[0]:
            # avoid bs4 warning
            # NOTE(review): only 'http://' is excluded from the bs4 pass,
            # 'https://' values still go through it - confirm this is intended
            val = split_val[0]
            if not re.match(r'http://.*', val):
                val = bs4.BeautifulSoup(val, 'lxml').text
            val = val.replace('[[', '').replace(' ', '_')
            if val:
                # can be written without [[File: or [[Image:
                if 'File:' in val or 'Image:' in val:
                    pass
                else:
                    val = 'File:' + val
                # submit raw version too, also links can differ
                return {
                    'wiki_commons_link': 'https://commons.wikimedia.org/wiki/' + val,
                    'wiki_file_link': wiki_base + '/wiki/' + val,
                    'wiki_raw_code': param.value.strip()}
        # a logo value without usable image name falls through to the
        # generic handling below
    
    # fields can contain break separations, e.g. Microsoft: 
    # [[John W. Thompson]] <small> ([[Chairman]]) </small> <br /> [[Satya Nadella]]
    if '<br' in param.value:
        # replace the HTML breaks with real newline
        # (this assigns back to param.value, i.e. the caller's parameter is changed)
        param.value = re.sub(r'<br>|<br ?/>', '\n', unicode(param.value))
        # get rid of media wiki markup and split into parts
        val = unicode(param.value.strip_code())
        # strip code does not always remove HTML tags
        val = bs4.BeautifulSoup(val, 'lxml').text
        items = val.split('\n')
        if k == 'key_people':
            return parse_people_name(items)
        if k == 'num_employees':
            # only the last item is used for the employee number
            return parse_employee_nr(items[-1], c_name)
        return items

    # only assuming one value when no breaks, too vague to use other separators than <br>
    val = param.value.strip_code()

    # avoid bs4 warning when parsing HTML link
    if k != 'homepage':
        # properly remove HTML remains
        val = bs4.BeautifulSoup(val, 'lxml').text
    if k == 'homepage':
        # there are cases like: [http://www.absn.tv/ ABS] for homepage/website that don't match URL template
        match_link = re.search(r'\[(.+)\]', unicode(param.value))
        if match_link:
            val = match_link.group(1)
        # if whitespaces, then still no valid link
        if ' ' in val:
            # HTML needs to be ignored, e.g.: <!-- {{URL|www.example.com}} --> (also strips text)
            rem_html = bs4.BeautifulSoup(val, 'lxml').text
            if rem_html:
                # first space separated token of the raw value, opening bracket removed
                return param.value.strip().split(' ')[0].replace('[', '')
            else:
                return None
    if k == 'num_employees':
        return parse_employee_nr(val, c_name)

    # make sure string didn't just contain whitespaces
    if val:
        return unicode(val).strip()
    # it can be that the field exists but is empty
    return None

In [27]:
def parse_wiki_template(param, k, c_name):
    """Extract further information if templates in key value"""

    for tem in param.value.filter_templates():
        # handle dates, URL and list
        if k == 'founded' or k == 'defunct':
            if tem.name.matches(('start date and age', 'start date', 'end date')):
                # valid date: {{Start date and age|2003|January|5|df=1}}
                date = [unicode(p) for p in tem.params if '=' not in unicode(p)]
                # concatenate the date and give to dateparser
                if not date:
                    return datetime.time()
                # if only the year is given returns todays date in that year
                return dateparser.parse("/".join(date), settings={
                        'PREFER_DATES_FROM': 'past',
                        'DATE_ORDER': 'YMD'})
            else:
                return datetime.time()
        elif k == 'homepage' and tem.name.matches(('url', 'URL')):
            if tem.params:
                try:
                    url = tem.get(1)
                except ValueError:
                    print "WARN: Could not get first element in params:", tem.params
                    return unicode(tem.params[0]).strip()
                return unicode(url).strip()
        elif tem.name.matches('unbulleted list'):
            # replace wiki link markup and remove extra citations etc.
            items = [p.value.strip_code().strip() for p in tem.params]
            if k == 'key_people':
                return parse_people_name(items)
            if k == 'num_employees':
                # e.g.: {{unbulleted list|{{loss}}0 (7 February 2013)|850 (6 February 2013)}}
                # usually last value contains current number
                return parse_employee_nr(items[-1], c_name)
            return items
        elif tem.name.matches(('plainlist', 'flatlist')):
            if tem.params:
                items = tem.params[0].value.strip_code().strip().split('\n')
                if k == 'key_people':
                    # use nameparse to exclude titles, etc.
                    return parse_people_name(items)
                return items
        elif 'formatnum' in tem.name and k == 'num_employees':
            # {{formatnum:1234}}, colon not detected by mwparser
            m = re.search(r'formatnum:(.*)', unicode(tem.name))
            if m:
                return parse_employee_nr(m.group(1), c_name)

    # if the template didn't match anything parse raw text
    return parse_wiki_raw(param, k, c_name)

In [9]:
def parse_wiki_text(company):
    """Parse MediaWiki markup to extract detailed information"""

    # remove ref tags, bs4 adds unwanted body tags, thus regex better
    wiki_raw = re.sub(r'<ref.*?</ref>|<ref>.*?</ref>', '', company['wiki_raw'])

    comp_infobox = dict()
    # parse the wikimedia syntax
    # style can be ignored: https://github.com/earwig/mwparserfromhell/issues/115
    code = mwparserfromhell.parse(wiki_raw, skip_style_tags=True)
    # filter for the infobox
    # If matches is a regex, the flags passed to re.search() are re.IGNORECASE ...
    c_template = code.filter_templates(
        matches=r'infobox company|infobox_company|company infobox|company|infobox dot-com company')
    if not c_template:
        # try regex approach for any infobox, if nothing found above
        match = re.search(r'({{infobox.*\n(?:\|.*\n|\*.*\n)+}})', wiki_raw, re.IGNORECASE)
        if match:
            infobox_temp = mwparserfromhell.parse(
                match.group(1), skip_style_tags=True).filter_templates()
            if infobox_temp:
                infobox_temp = infobox_temp[0]
            else:
                print "WARN: No parsable company infobox template for:", company['wiki_name']
                return comp_infobox
        else:
            print "WARN: No company infobox found for:", company['wiki_name']
            return comp_infobox
    else:
        infobox_temp = c_template[0]

    # find values for each key
    key_list = ['name', 'logo', 'type', 'key_people', 'industry', 'founded', 'location', \
                'location_city', 'location_country', 'defunct', 'subsid', \
                'products', 'num_employees', 'parent', 'homepage']
    for k in key_list:
        param = get_temp_val(infobox_temp, k)
        if param:
            val = parse_wiki_template(param, k, company['wiki_name'])
            if val:
                comp_infobox[k] = val
    # if there was no name given, take name of company node
    if 'name' not in comp_infobox:
        comp_infobox['name'] = company['wiki_name']
    return comp_infobox

In [10]:
def get_c_info(all_companies, company):
    """Get links and the wikitext which is used to extract key information about the company"""

    # concatenate URL from previously set values
    link_url = '{0}{1}?&{2}&{3}&{4}&pageid={5}'.format(
        wiki_base, wiki_api, action, properties, dat_format, company['wiki_page_id'])

    # get JSON and check for validity
    c_content = get_json_from_url(link_url)
    if not c_content or 'parse' not in c_content:
        print 'WARN: No parsable content on: {0} under page {1} (id: {2})'.format(
            link_url, company['wiki_name'], company['wiki_page_id'])
        company['is_company'] = False
        return company

    # save new fields
    company['wiki_api_url'] = link_url
    # list of links intersected with list containing all companies
    links = [x['*'] for x in c_content['parse']['links'] if x['ns'] == 0]
    company['all_links'] = links
    company['links'] = all_companies.intersection(links)

    # original wikitext to parse {{Infobox company
    company['wiki_raw'] = c_content['parse']['wikitext']['*']
    
    # extract company info from box, only on wiki text not HTML
    company_infobox = parse_wiki_text(company)
    company.update(company_infobox)
    company['is_company'] = True
    
    # keep non-company nodes but mark
    if not company_infobox:
        company['is_company'] = False
    return company

Example query to extract wikitext and links on a wiki page:
* https://en.wikipedia.org/w/api.php?action=parse&page=Audi&prop=links|wikitext
* https://en.wikipedia.org/w/api.php?action=parse&page=AT%26T&prop=links|wikitext
* https://en.wikipedia.org/w/api.php?&action=parse&format=json&prop=links|wikitext&pageid=17555269

In [19]:
# switch the API parameters to the parse API; this overwrites the
# module-level values used for the embeddedin query, get_c_info reads them
action = u'action=parse'
dat_format = u'format=json'
properties = u'prop=links|wikitext'

# other: 'Audi', 'Apple Inc.', 'Microsoft'
company = 'Groupe Casino'

# create unique set of all companies
# (scrape_companies reads this module-level name as well)
all_companies = set(companies.keys())

print "Excerpt of", company, "dict structure:"
# get_c_info updates the stored record in place and returns it
pprint(get_c_info(all_companies, companies[company]))

Excerpt of Groupe Casino dict structure:
{'all_links': [u'France',
               u'Air France-KLM',
               u'Aldi',
               u'Argentina',
               u'Arkema',
               u'Atac',
               u'Auchan',
               u'Brazil',
               u'Bricomarch\xe9',
               u'Bureau Veritas',
               u'CAC Next 20',
               u'CGG (company)',
               u'Carrefour',
               u'Carrefour City',
               u'Carrefour Express',
               u'Carrefour Market',
               u'Cdiscount',
               u'Champion (supermarket)',
               u'Coccinelle',
               u'Colombia',
               u'Cora (hypermarket)',
               u'Dassault Syst\xe8mes',
               u'Dia (supermarket chain)',
               u'Docks de France',
               u'E.Leclerc',
               u'Ed (supermarket)',
               u'Edenred',
               u'Equity (finance)',
               u'Euromarch\xe9',
               u'Euronext',
  

In [23]:
def scrape_companies(companies, com_dat):
    """
    Fetch links and infobox data for all companies not yet in the cache.

    Parameters:
        companies: dict of company records keyed by page title; it is
            updated in place
        com_dat: dict of previously scraped records (may be empty); a
            record with a true 'is_company' flag counts as done

    The set of all company titles is read from the module-level name
    `all_companies`.

    Returns `companies`, where every record is either the cached one or
    the result of get_c_info.
    """

    # scrape links and extra information of all companies, call with progress-bar
    for company in tqdm_notebook(companies, desc='Fetching companies'):
        cached = com_dat.get(company) if com_dat else None
        # don't repeat if pickle file already contains data
        if cached and cached.get('is_company'):
            # take over the stored record; skipping without copying would
            # leave the bare stub here, and saving `companies` afterwards
            # would overwrite the cached data with that stub
            companies[company] = cached
            continue
        # extract any company information
        companies[company] = get_c_info(all_companies, companies[company])
    return companies

In [28]:
# get company file from extraction 1, check for changes
# (stays an empty dict when no file was written by an earlier run)
com_dat_pickle = dict()
if os.path.isfile(ex1_fdat):
    # NOTE: unpickling can execute arbitrary code; only load files that
    # were written by this notebook
    with open(ex1_fdat, 'rb') as f:
        com_dat_pickle = pickle.load(f)

In [None]:
%%time
# 500 companies take about 2-3 mins
# for 64k that is approx. 128 cycles which should total maximum 5 hours crawl time
# ((64000 / 500) * 2) / 60

# crawl all companies that were not processed yet, i.e. whose cached record
# is missing or contains no or a false 'is_company' flag
companies = scrape_companies(companies, com_dat_pickle)

WARN: No company infobox found for: McDonald's legal cases
WARN: No company infobox found for: History of Burger King
WARN: No company infobox found for: KFC advertising
WARN: No company infobox found for: Albemarle Corporation
WARN: No company infobox found for: UXC
WARN: No company infobox found for: Burger King products
WARN: No company infobox found for: Fondo Común
WARN: Could not get first element in params: [u'sor.cz/site/index.php?lang=en']


In [25]:
# spot check: print the complete record of one scraped company
test_excerpt = 'Microsoft'
print "Excerpt of", test_excerpt, "dict structure:", companies[test_excerpt]

Excerpt of Microsoft dict structure: {'location_city': u'Microsoft Redmond campus, Redmond, Washington', 'links': set([u'T-Systems', u'Toshiba', u'Tellme Networks', u'Norwegian Cruise Line', u'American Airlines Group', u'Big Fish Games', u'Visio Corporation', u'Rare (company)', u'Tata Teleservices', u'KDDI', u'Amgen', u'Huawei', u'Reuters', u'Applied Materials', u'Comcast', u'IBM Global Services', u'Pando Networks', u'Deloitte', u'Seagate Technology', u'Biogen', u'CBS Interactive', u'Powerset (company)', u'Vodafone', u'Thomson Reuters', u'Mylan', u'Sony', u'Olympus Corporation', u'HP Inc.', u'Sony Mobile', u'Quanta Computer', u'Revolution Analytics', u'AOL', u'Xerox', u'Agence France-Presse', u'American Express', u'Sysinternals', u'Booz Allen Hamilton', u'Time Inc.', u'SK Telecom', u'Alphabet Inc.', u'QFC', u'MediaTek', u'The Priceline Group', u'CA Technologies', u'NTT DATA', u'Samsung Electronics', u'Ctrip', u'Pfizer', u'Rackspace', u'Tata Consultancy Services', u'Jabil Circuit', u'Si

In [26]:
# store company data in one binary file
# (the same file is read back as cache at the start of the crawl, so this
# overwrites the cache with the current content of `companies`)
with open(ex1_fdat, 'wb') as f:
    pickle.dump(companies, f)