**02805 Social graphs and interactions**

# Data Extraction Part 1

In [1]:
# IPython global cell magic
%reset
%matplotlib inline

# import all necessary packages
import bs4 # HTML parser
from collections import Counter, OrderedDict # counting elements and ordering keys in dictionaries
import community # python-louvain package
from __future__ import division # all numbers are float
import geopy # get geo location according to addresses
from geopy.exc import GeocoderServiceError
import datetime # handle date objects
import dateparser # parse any (also foreign) date format to object: https://pypi.python.org/pypi/dateparser
import itertools # iterators for efficient looping
import json # JSON parser
import math # math operations
from matplotlib import pyplot as plt # plotting figures
import mwparserfromhell # parse MediaWiki syntax: https://github.com/earwig/mwparserfromhell
from nameparser import HumanName # parse a human name
import networkx as nx # networks creation library
import nltk # natural language processing
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import operator # efficient operator functions
import os # operating system operations, e.g.: with files and folders
import pandas as pd # use easy-to-use data frames for data analysis
import pickle # python data structures as files
from pprint import pprint # print data structures prettier
import re # regex
import requests # request URL content
import time # sleep timer
from tqdm import tqdm_notebook # make a nice progressbar
import urllib # handle special URL chars

# make working directory: everything this notebook writes lives in
# <current working directory>/companies, created on first run
directory = os.getcwd() + '/companies'
if not os.path.exists(directory):
    os.makedirs(directory)

# files
# pickles written by the individual extraction steps
ex1_fdat = directory + '/extraction1_data.pkl'
ex2_fdat = directory + '/extraction2_data.pkl'
ex3_tmp_fdat = directory + '/tmp_extraction3_data.pkl'
ex3_fdat = directory + '/extraction3_data.pkl'
# combined outputs (pickle and CSV)
merged = directory + '/merged_data.pkl'
extraction_csv = directory + '/company_data.csv'

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [2]:
def get_json_from_url(url):
    r = requests.get(url)
    
    # on HTML error codes
    if r.status_code != 200:
        return None

    # try converting into JSON
    try:
        sec = r.json()
        return sec
    except ValueError:  # includes simplejson.decoder.JSONDecodeError
        print 'WARN: Decoding JSON has failed on:', url
    return None

In [3]:
def get_comp_pages(api_url, extension=None):
    
    ext_api_url = api_url
    if extension:
        ext_api_url += extension
    
    # retrieve JSON
    sites_w_template = get_json_from_url(ext_api_url)

    # check for valid data
    if not sites_w_template or 'query' not in sites_w_template:
        print "WARN: query returned no data:", infobox_url + properties
    else:
        # iterate over list of companies that embed infobox and save to dict
        for page in sites_w_template['query']['embeddedin']:
            # unicode to str
            quoted_name = urllib.quote_plus(page['title'].encode('utf-8'))
            companies.update({
                page['title']: {
                    'wiki_page_id': int(page['pageid']),
                    'wiki_name': unicode(page['title']),
                    'wiki_url': u'{0}{1}?title={2}'.format(wiki_base, wiki_index, quoted_name),
                    'name_url_quoted': quoted_name
                }
            })

        # run the function recursively, when company found
        if 'continue' in sites_w_template:
            extension = '&eicontinue=' + sites_w_template['continue']['eicontinue']
            return get_comp_pages(api_url, extension)
    
    return companies

Example queries for extracting pages that embed the "Infobox company" template (first batch and a continued batch):
* https://en.wikipedia.org/w/api.php?&action=query&list=embeddedin&eititle=Template:Infobox+company&einamespace=0&eilimit=max
* https://en.wikipedia.org/w/api.php?&action=query&list=embeddedin&eititle=Template:Infobox+company&einamespace=0&eilimit=max&eicontinue=0|167638


In [4]:
# specify wikipedia API URL parameters
wiki_base = u'https://en.wikipedia.org'
wiki_api = u'/w/api.php'
wiki_index = u'/w/index.php'
action = u'action=query'
dat_format = u'format=json'
# eititle=Template:Infobox+company - embedded pages with Template {{Infobox Company}}
# einamespace=0 only articles
# eilimit=max - maximum number before continuing to minimize requests
properties = u'list=embeddedin&eititle=Template:Infobox+company&einamespace=0&eilimit=max'

# concatenate URL
# (base URL without the list properties; they are appended in the call below)
infobox_url = '{0}{1}?&{2}&{3}&'.format(
    wiki_base, wiki_api, action, dat_format)

# init and update with recursive function
# get_comp_pages fills the module-level `companies` dict; its return value is not needed here
companies = dict()
get_comp_pages(infobox_url + properties)

print len(companies), "companies were extracted."

56843 companies were extracted.


In [5]:
# sanity check: show the stored entry of one company whose title needs URL quoting
print companies['AT&T']

{'name_url_quoted': 'AT%26T', 'wiki_page_id': 17555269, 'wiki_url': u'https://en.wikipedia.org/w/index.php?title=AT%26T', 'wiki_name': u'AT&T'}


In [6]:
def get_temp_val(temp, k):
    """Look up parameter `k` in the infobox template, trying known aliases.

    `temp.get` signals a missing parameter with ValueError. In that case
    'founded' is retried as 'foundation', and any location key without the
    'hq_' prefix is retried with that prefix. Returns the parameter, or None
    when neither the key nor its alias exists.
    """
    key = k
    while key is not None:
        try:
            return temp.get(key)
        except ValueError:
            # template not always consistent: pick the alternative key, if any
            if key == 'founded':
                key = 'foundation'
            elif 'location' in key and 'hq_' not in key:
                key = 'hq_' + key
            else:
                key = None
    # key not found in template
    return None

In [7]:
def parse_employee_nr(input_val, c_name):
    """
    Parse a proper number out of the various inputs.

    The first run of digits is taken, including any number of dot- or
    comma-separated digit groups (thousands separators), and returned as int.
    Returns None when the input contains no digit at all.

    `c_name` (company name) is not used for parsing; it is kept so existing
    callers do not have to change.

    Case possibilities tested:

    s_list = [
        '15-50',
        '~300+',
        'circa 40',
        '9,985(Dec 2011)',
        'over 10,000 in 10 countries',
        'Five',
        'over 1 million',
        '10.000',
        '1,234,567',
        'Part of Popular, Inc., which has 8,000 employees'
    ]

    for s in s_list:
        print parse_employee_nr(s, 'test')
    """

    # match the first number; every further ",ddd" / ".ddd" group belongs to it
    # (a single optional group would cut '1,234,567' down to 1234)
    m = re.search(r'[0-9]+(?:[,\.][0-9]+)*', unicode(input_val))
    if m is None:
        return None
    # the match holds only digits and separators, so int() cannot fail
    return int(m.group().replace(',', '').replace('.', ''))

In [8]:
def parse_people_name(names):
    """
    Turn a list of name strings into a list of HumanName dictionaries.

    Tim Cook (CEO) becomes 
    {u'last': u'Cook', u'suffix': u'', u'title': u'', u'middle': u'', u'nickname': u'CEO', u'first': u'Tim'}

    Empty entries of `names` are skipped.
    """

    parsed = []
    for raw_name in names:
        # ignore empty list values
        if not raw_name:
            continue
        parsed.append(HumanName(raw_name).as_dict())
    return parsed

In [9]:
def parse_date(date):
    """Parse a free-form date string into a datetime object.

    The string is first handed to `dateparser` (dates preferred from the
    past, year-month-day order). If that fails, the first four-digit year
    between 1800 and 2099 is extracted and parsed on its own.

    Returns the parsed datetime, or the null time `datetime.time()`
    (00:00:00) when `date` is empty or nothing could be parsed.
    """
    null_time = datetime.time()
    if not date:
        return null_time

    # if only the year is given returns todays date in that year
    date_parsed = dateparser.parse(date, settings={
        'PREFER_DATES_FROM': 'past',
        'DATE_ORDER': 'YMD'})
    if date_parsed:
        return date_parsed

    # try to extract the first number with 4 digits from 18XX-20XX;
    # the century alternatives must be grouped, otherwise '1985' matches only '19'
    m = re.search(r'(?:18|19|20)\d{2}', date)
    if m:
        year_parsed = dateparser.parse(m.group())
        if year_parsed:
            return year_parsed
    # return null time: 00:00:00
    return null_time

In [10]:
def parse_wiki_raw(param, k, c_name):
    """Extract the information on the raw value without notable wiki markup

    Parameters:
        param: infobox parameter whose `value` is read, stripped of markup via
            `strip_code()` and, for values containing `<br`, reassigned.
            Presumably a mwparserfromhell Parameter -- verify against caller.
        k: infobox key being parsed, e.g. 'logo', 'key_people', 'homepage'.
        c_name: company name, only forwarded to `parse_employee_nr`.

    Returns:
        - for 'logo' with a usable file name: dict with the Commons link, the
          Wikipedia file link and the raw value;
        - for values containing `<br`: list of the separated items (parsed
          names for 'key_people', a single number for 'num_employees');
        - for 'num_employees': result of `parse_employee_nr`;
        - otherwise the stripped text, or None when nothing usable is left.
    """

    # strip_code does not work properly for [[File:: https://github.com/earwig/mwparserfromhell/issues/136
    if k == 'logo':
        split_val = unicode(param.value).strip().split('|')
        # first value always image link
        if split_val[0]:
            # avoid bs4 warning
            # NOTE(review): only values starting with http:// skip the bs4 pass;
            # https:// values are still run through BeautifulSoup
            val = split_val[0]
            if not re.match(r'http://.*', val):
                val = bs4.BeautifulSoup(val, 'lxml').text
            # drop the link opening and use underscores as in wiki page names
            val = val.replace('[[', '').replace(' ', '_')
            if val:
                # can be written without [[File: or [[Image:
                if 'File:' in val or 'Image:' in val:
                    pass
                else:
                    val = 'File:' + val
                # submit raw version too, also links can differ
                return {
                    'wiki_commons_link': 'https://commons.wikimedia.org/wiki/' + val,
                    'wiki_file_link': wiki_base + '/wiki/' + val,
                    'wiki_raw_code': param.value.strip()}
        # an empty logo value falls through to the generic handling below

    # fields can contain break separations, e.g. Microsoft: 
    # [[John W. Thompson]] <small> ([[Chairman]]) </small> <br /> [[Satya Nadella]]
    if '<br' in param.value:
        # replace the HTML breaks with real newline
        # (this overwrites param.value, so the caller's parameter is modified)
        param.value = re.sub(r'<br>|<br ?/>', '\n', unicode(param.value))
        # get rid of media wiki markup and split into parts
        val = unicode(param.value.strip_code())
        # strip code does not always remove HTML tags
        val = bs4.BeautifulSoup(val, 'lxml').text
        items = val.split('\n')
        if k == 'key_people':
            return parse_people_name(items)
        if k == 'num_employees':
            # only the last item is parsed as the employee number
            return parse_employee_nr(items[-1], c_name)
        return items

    # only assuming one value when no breaks, too vague to use other seperators than <br>
    val = param.value.strip_code()

    # avoid bs4 warning when parsing HTML link
    if k != 'homepage':
        # properly remove HTML remains
        val = bs4.BeautifulSoup(val, 'lxml').text
    if k == 'homepage':
        # there are cases like: [http://www.absn.tv/ ABS] for homepage/website that don't match URL template
        match_link = re.search(r'\[(.+)\]', unicode(param.value))
        if match_link:
            val = match_link.group(1)
        # if whitespaces, then still no valid link
        if ' ' in val:
            # HTML needs to be ignored, e.g.: <!-- {{URL|www.example.com}} --> (also strips text)
            rem_html = bs4.BeautifulSoup(val, 'lxml').text
            if rem_html:
                # first whitespace-separated token of the raw value, without the opening bracket
                return param.value.strip().split(' ')[0].replace('[', '')
            else:
                return None
    if k == 'num_employees':
        return parse_employee_nr(val, c_name)

    # make sure string didn't just contain whitespaces
    if val:
        return unicode(val).strip()
    # it can be that the field exists but is empty
    return None

In [11]:
def parse_wiki_template(param, k, c_name):
    """Extract further information if templates in key value

    Walks over the templates found in `param.value` and returns as soon as one
    of them is handled; if none is, the value is parsed by `parse_wiki_raw`.

    Parameters:
        param: infobox parameter; `param.value.filter_templates()` is iterated.
            Presumably a mwparserfromhell Parameter -- verify against caller.
        k: infobox key being parsed ('founded', 'homepage', 'key_people', ...).
        c_name: company name, forwarded to the employee-number parser and to
            `parse_wiki_raw`.

    Returns:
        - 'founded' / 'defunct': result of `parse_date` for a start/end date
          template, `datetime.time()` for any other first template;
        - 'homepage' with a URL template: its first parameter as text;
        - list templates: list of items (parsed names for 'key_people', a
          single number for 'num_employees' in an unbulleted list);
        - 'num_employees' with formatnum: the parsed number;
        - otherwise whatever `parse_wiki_raw` returns.
    """

    for tem in param.value.filter_templates():
        # handle dates, URL and list
        if k == 'founded' or k == 'defunct':
            # NOTE(review): both branches return, so only the first template of a
            # date field is ever looked at and raw parsing is never reached for it
            if tem.name.matches(('start date and age', 'start date', 'end date')):
                # valid date: {{Start date and age|2003|January|5|df=1}}
                # keep positional parameters only (named ones like df=1 contain '=')
                date = [unicode(p) for p in tem.params if '=' not in unicode(p)]
                # concatenate the date and give to dateparser
                if not date:
                    return datetime.time()
                return parse_date("/".join(date))
            else:
                return datetime.time()
        elif k == 'homepage' and tem.name.matches(('url', 'URL')):
            if tem.params:
                try:
                    url = tem.get(1)
                except ValueError:
                    # no parameter named 1: fall back to the first parameter as written
                    print "WARN: Could not get first element in params:", tem.params
                    return unicode(tem.params[0]).strip()
                return unicode(url).strip()
        elif tem.name.matches('unbulleted list'):
            # replace wiki link markup and remove extra citations etc.
            items = [p.value.strip_code().strip() for p in tem.params]
            if k == 'key_people':
                return parse_people_name(items)
            if k == 'num_employees':
                # e.g.: {{unbulleted list|{{loss}}0 (7 February 2013)|850 (6 February 2013)}}
                # usually last value contains current number
                return parse_employee_nr(items[-1], c_name)
            return items
        elif tem.name.matches(('plainlist', 'flatlist')):
            if tem.params:
                # the list items are the lines of the first parameter
                items = tem.params[0].value.strip_code().strip().split('\n')
                if k == 'key_people':
                    # use nameparse to exclude titles, etc.
                    return parse_people_name(items)
                return items
        elif 'formatnum' in tem.name and k == 'num_employees':
            # {{formatnum:1234}}, colon not detected by mwparser
            m = re.search(r'formatnum:(.*)', unicode(tem.name))
            if m:
                return parse_employee_nr(m.group(1), c_name)

    # if the template didn't match anything parse raw text
    return parse_wiki_raw(param, k, c_name)

In [12]:
def parse_wiki_text(company):
    """Parse MediaWiki markup to extract detailed information"""

    # remove ref tags, bs4 adds unwanted body tags, thus regex better
    wiki_raw = re.sub(r'<ref.*?</ref>|<ref>.*?</ref>', '', company['wiki_raw'])

    comp_infobox = dict()
    # parse the wikimedia syntax
    # style can be ignored: https://github.com/earwig/mwparserfromhell/issues/115
    code = mwparserfromhell.parse(wiki_raw, skip_style_tags=True)
    # filter for the infobox
    # If matches is a regex, the flags passed to re.search() are re.IGNORECASE ...
    c_template = code.filter_templates(
        matches=r'infobox company|infobox_company|company infobox|company|infobox dot-com company')
    if not c_template:
        # try regex approach for any infobox, if nothing found above
        match = re.search(r'({{infobox.*\n(?:\|.*\n|\*.*\n)+}})', wiki_raw, re.IGNORECASE)
        if match:
            infobox_temp = mwparserfromhell.parse(
                match.group(1), skip_style_tags=True).filter_templates()
            if infobox_temp:
                infobox_temp = infobox_temp[0]
            else:
                print "WARN: No parsable company infobox template for:", company['wiki_name']
                return comp_infobox
        else:
            print "WARN: No company infobox found for:", company['wiki_name']
            return comp_infobox
    else:
        infobox_temp = c_template[0]

    # find values for each key
    key_list = ['name', 'logo', 'type', 'key_people', 'industry', 'founded', 'location', \
                'location_city', 'location_country', 'defunct', 'subsid', \
                'products', 'num_employees', 'parent', 'homepage']
    for k in key_list:
        param = get_temp_val(infobox_temp, k)
        if param:
            val = parse_wiki_template(param, k, company['wiki_name'])
            if val:
                comp_infobox[k] = val
    # if there was no name given, take name of company node
    if 'name' not in comp_infobox:
        comp_infobox['name'] = company['wiki_name']
    return comp_infobox

In [13]:
def get_c_info(company):
    """Get links and the wikitext which is used to extract key information about the company"""

    # reparse old text if text and links already there
    if 'wiki_raw' not in company or 'all_links' not in company:   
        # concatenate URL from previously set values
        link_url = '{0}{1}?&{2}&{3}&{4}&pageid={5}'.format(
            wiki_base, wiki_api, action, properties, dat_format, company['wiki_page_id'])

        # get JSON and check for validity
        c_content = get_json_from_url(link_url)
        if not c_content or 'parse' not in c_content:
            print 'WARN: No parsable content on: {0} under page {1} (id: {2})'.format(
                link_url, company['wiki_name'], company['wiki_page_id'])
            company['is_company'] = False
            return company

        # save new fields
        company['wiki_api_url'] = unicode(link_url)
        company['all_links'] = [x['*'] for x in c_content['parse']['links'] if x['ns'] == 0]

        # original wikitext to parse {{Infobox company
        company['wiki_raw'] = c_content['parse']['wikitext']['*']
    
    # extract company info from box, only on wiki text not HTML
    company_infobox = parse_wiki_text(company)
    company.update(company_infobox)
    company['is_company'] = True
    
    # keep non-company nodes but mark
    if not company_infobox:
        company['is_company'] = False
    return company

Example queries to extract the wikitext and links of a wiki page:
* https://en.wikipedia.org/w/api.php?action=parse&page=Audi&prop=links|wikitext
* https://en.wikipedia.org/w/api.php?action=parse&page=AT%26T&prop=links|wikitext
* https://en.wikipedia.org/w/api.php?&action=parse&format=json&prop=links|wikitext&pageid=17555269

In [19]:
# API parameters of the parse endpoint: wikitext and links of a single page
action = u'action=parse'
dat_format = u'format=json'
properties = u'prop=links|wikitext'

# other: 'Audi', 'Apple Inc.', 'Microsoft'
company = 'Audi'

# fetch and parse one company as a demonstration of the resulting structure
print "Excerpt of", company, "dict structure:"
pprint(
    get_c_info(companies[company]))

Excerpt of Audi dict structure:
{'all_links': [u'Germany',
               u'United States',
               u'United Kingdom',
               u'Japan',
               u'France',
               u'Belgium',
               u'Brazil',
               u'Bahrain',
               u'China',
               u'Malaysia',
               u'Uruguay',
               u'Argentina',
               u'Monaco',
               u'Audi',
               u'1,000,000,000 (number)',
               u'1. FC Nuremberg',
               u'1936 Summer Olympics',
               u'1971 24 Hours of Le Mans',
               u'1982 World Rally Championship season',
               u'1983 World Rally Championship season',
               u'1984 World Rally Championship season',
               u'1985 World Rally Championship season',
               u'1986 World Rally Championship season',
               u'1999 24 Hours of Le Mans',
               u'2000 24 Hours of Le Mans',
               u'2000 American Le Mans Series season',


In [14]:
def scrape_companies(companies):
    """Fetch and parse every entry that is not yet marked as a company.

    Entries whose 'is_company' flag is already true (e.g. loaded from an
    earlier pickle) are skipped; all others are replaced by the result of
    `get_c_info`. The dict is updated in place and returned.
    """

    # scrape links and extra information of all companies, call with progress-bar
    for key in tqdm_notebook(companies, desc='Companies'):
        entry = companies[key]
        already_done = 'is_company' in entry and entry['is_company']
        if not already_done:
            # extract any company information
            companies[key] = get_c_info(entry)
    return companies

In [15]:
# get company file from extraction 1
# results of an earlier run of this extraction, if its pickle exists
# (pickle must only be loaded from files written by this notebook)
com_dat_pickle = {}
if os.path.isfile(ex1_fdat):
    with open(ex1_fdat, 'rb') as f:
        com_dat_pickle = pickle.load(f)
    # update previously fetched data to same keys
    companies.update(com_dat_pickle)

In [20]:
%%time
# 500 companies take about 2-3 mins
# for 64k that is approx. 128 cycles which should total maximum 5 hours crawl time
# ((64000 / 500) * 2) / 60

# crawl all companies that were not processed yet, i.e. contain no or a false 'is_company' flag
companies = scrape_companies(companies)

WARN: No company infobox found for: McDonald's legal cases
WARN: No company infobox found for: History of Burger King
WARN: No company infobox found for: KFC advertising
WARN: No company infobox found for: Burger King products
WARN: No company infobox found for: Protandim
WARN: No company infobox found for: Burger King legal issues
WARN: No company infobox found for: History of McDonald's
WARN: No company infobox found for: History of KFC
WARN: No company infobox found for: McDonald's advertising
WARN: No company infobox found for: Burger King advertising
WARN: No company infobox found for: Fondo Común
WARN: No company infobox found for: List of Burger King ad programs
WARN: No company infobox found for: List of McDonald's ad programs
WARN: No company infobox found for: List of McDonald's products

CPU times: user 1.36 s, sys: 112 ms, total: 1.48 s
Wall time: 4.94 s


## Add Geolocation Data

In [21]:
# resolve the address fields of a company to a validated address and GPS coordinates
def get_location(comp, geolocators):
    """Geocode the most specific location field of a company.

    Parameters:
        comp: company dict; the fields 'location', 'location_city' and
            'location_country' are checked in that order and the first
            non-empty one is used (list values are joined with spaces).
        geolocators: geocoder objects that are tried one after another until
            one of them returns a result.

    Returns:
        dict with 'location_geopy' (address as returned by the geocoder) and
        'location_gps' ((latitude, longitude)); both are None / (None, None)
        when no location is known or none could be resolved.

    Results are remembered in the module-level `all_locations` dict so that
    the same location string is requested only once.
    """
    # result for "nothing known"
    loc_dict = {
        'location_geopy': None,
        'location_gps': (None, None)}

    # keep a location that was already resolved for this company
    if comp.get('location_geopy') and comp.get('location_gps'):
        return {
            'location_geopy': comp['location_geopy'],
            'location_gps': comp['location_gps']}

    # check from top to bottom for location; fields are ordered, highest with value wins
    loc_str = None
    for f in ('location', 'location_city', 'location_country'):
        val = comp.get(f)
        if val:
            # value is either unicode or a list of strings
            loc_str = " ".join(val) if isinstance(val, list) else val
            break
    # return if none of the fields set
    if not loc_str:
        return loc_dict

    # avoid request by looking into already processed
    if loc_str in all_locations:
        return all_locations[loc_str]

    # try to validate and get coordinates with geopy
    asked = False
    had_error = False
    for g in geolocators:
        asked = True
        try:
            loc = g.geocode(loc_str, timeout=10)
        except GeocoderServiceError:
            print("HTML or API error on " + comp['wiki_name'])
            # a failing service must not stop the remaining geocoders
            had_error = True
            loc = None
        # max 1 request per second, http://wiki.openstreetmap.org/wiki/Nominatim_usage_policy
        # (applies to every request, also to those without a result)
        time.sleep(1)
        if loc:
            loc_dict = {
                'location_geopy': loc.address,
                'location_gps': (loc.latitude, loc.longitude)}
            all_locations[loc_str] = loc_dict
            return loc_dict

    # remember strings no geocoder knows; after a service error a later retry
    # may still succeed, so nothing is cached in that case
    if asked and not had_error:
        all_locations[loc_str] = loc_dict
    return loc_dict

In [23]:
# initialize the different geolocator APIs
g_osm = geopy.geocoders.Nominatim()
# NOTE(review): g_google is created but not part of `geolocators` below
g_google = geopy.geocoders.GoogleV3() # can also be with g_api_key
#geolocators = [g_google, g_osm] # gives GeocoderQuotaExceeded without API key
geolocators = [g_osm]

# save looked up locations to not make API request
all_locations = dict()
for c in tqdm_notebook(companies, desc='Geolocation'):
    # only entries marked as company carry infobox location fields
    if not companies[c]['is_company']:
        continue
    # adds / overwrites 'location_geopy' and 'location_gps' on the entry
    processed_loc = get_location(companies[c], geolocators)
    companies[c].update(processed_loc)

HTML or API error on Ufone
HTML or API error on Equity Group Holdings Limited
HTML or API error on Grafotechna
HTML or API error on First North Luzon Transit
HTML or API error on Horex
HTML or API error on Empire Today
HTML or API error on United Copper
HTML or API error on Kaplan Hawksmere
HTML or API error on Trans-Asia Shipping Lines
HTML or API error on MySQL AB
HTML or API error on Marvelous USA
HTML or API error on Miss Globe International
HTML or API error on Madison Marquette
HTML or API error on National Small Industries Corporation
HTML or API error on Ectaco
HTML or API error on Arvand Free Zone
HTML or API error on Nortec Software
HTML or API error on Jowett Cars
HTML or API error on Nichicon
HTML or API error on Petrocaribe
HTML or API error on Bank Rakyat
HTML or API error on Norwegian Cruise Line
HTML or API error on Nippon Sheet Glass
HTML or API error on Hesteel Group
HTML or API error on Argus Media
HTML or API error on Head (company)
HTML or API error on Lixil Group


KeyboardInterrupt: 

## Basic Data Cleaning

In [25]:
# specify the type for each field
# (maps every key a company entry may hold to the Python type its value should have)
types = {
    # when first link is crawled
    'wiki_name': unicode,
    'wiki_url': unicode,
    'name_url_quoted': unicode,
    
    # when WIKI API is crawled
    'wiki_page_id': int,
    'wiki_api_url': unicode,
    'all_links': list,
    'links': set,
    'is_company': bool,
    'wiki_raw': unicode,
    
    # from extraction2
    'wb_api_url': unicode,
    'wb_api_search_url': unicode,
    
    # added only when Infobox company exists or fields from OpenCorporates
    # not all fields always exist, they are NaN in the resulting DataFrame
    'name': unicode, 
    'type': unicode, 
    'founded': datetime.datetime, 
    'defunct': datetime.datetime, 
    'location': unicode,
    'location_city': unicode,
    'location_country': unicode,
    'location_geopy': unicode,
    'location_gps': tuple,
    # following not in OC
    'countries': set, # added with extraction 3
    'logo': dict,
    'key_people': list, # additionally processed with nameparser.HumanName (dict)
    'industry': list, 
    'subsid': list,
    'products': list, 
    'num_employees': int, 
    'parent': unicode, 
    'homepage': unicode
}

In [26]:
# bring every stored value to the type declared for its key in `types`
for c, comp in companies.iteritems():
    for k, val in comp.iteritems():
        # empty values are left as they are
        if val:
            if types[k] != list and isinstance(val, list):
                # a list where a single value is expected: keep the first element only
                if k == 'num_employees':
                    comp[k] = parse_employee_nr(val[0], c)
                elif types[k] == datetime.datetime:
                    comp[k] = parse_date(val[0])
                else:
                    comp[k] = val[0]
            elif types[k] == unicode and isinstance(val, str):
                # byte strings become unicode
                comp[k] = unicode(val)
            elif types[k] == list and not isinstance(val, list):
                # a lone value (e.g. one dict in key_people) is wrapped into a list
                comp[k] = [val]
            elif types[k] == datetime.datetime and isinstance(val, unicode):
                # date given as text becomes a datetime object
                comp[k] = parse_date(val)
In [27]:
# reiterate to check conversion result
failed_types = dict()
for c, comp in companies.iteritems():
    for k, val in comp.iteritems():
        # show values that do not fit specified type
        if val and not isinstance(val, types[k]):
            # non extracted logos just deleted
            if k == 'logo':
                print val
                companies[c][k] = None
            if k not in failed_types:
                failed_types.update({
                    k: {
                        'is_type': type(val),
                        'should_be_type': types[k]}})

print failed_types

{}


In [28]:
# drop entries whose value is falsy (None or an empty dict)
companies = { k: v for k, v in companies.iteritems() if v}

In [29]:
cnt = 0
for c in companies:
    # are there any unprocessed entries
    if 'is_company' not in companies[c]:
        print c
        continue
    if not companies[c]['is_company']:
        cnt += 1
print cnt, "companies were marked as no company.\nOthers look like:\n"

test_excerpt = 'Microsoft'
print "Excerpt of", test_excerpt, "dict structure:", companies[test_excerpt]

14 companies were marked as no company.
Others look like:

Excerpt of Microsoft dict structure: {'num_employees': 114000, 'founded': datetime.datetime(1975, 4, 4, 0, 0), 'name_url_quoted': u'Microsoft', 'wiki_api_url': u'https://en.wikipedia.org/w/api.php?&action=parse&prop=links|wikitext&format=json&pageid=19001', 'location_geopy': u'Building 99 Parking Garage, Northeast 36th Street, Microsoft West Campus, Redmond, King County, Washington, 98052, United States of America', 'logo': {'wiki_raw_code': u'Microsoft logo and wordmark.svg', 'wiki_file_link': u'https://en.wikipedia.org/wiki/File:Microsoft_logo_and_wordmark.svg', 'wiki_commons_link': u'https://commons.wikimedia.org/wiki/File:Microsoft_logo_and_wordmark.svg'}, 'wiki_name': u'Microsoft', 'subsid': [u'List of Microsoft subsidiaries'], 'location_city': u'Microsoft Redmond campus, Redmond, Washington', 'name': u'Microsoft Corporation', 'wiki_url': u'https://en.wikipedia.org/w/index.php?title=Microsoft', 'industry': [u'Computer soft

In [30]:
# store company data in one binary file
# (the same file is read back at the start of the crawl to skip finished entries)
with open(ex1_fdat, 'wb') as f:
    pickle.dump(companies, f)