This notebook is the data-munging step behind a visualization of the interconnectedness of the 50 largest United States companies. It uses the `watsongraph` and `pandas` libraries to write the required data to `model.json`, which is then read by the `d3` visualization.

You can see the visualization itself [here](http://bl.ocks.org/ResidentMario/793fe40d743b98d05dea).

In [85]:
from pandas import DataFrame
import pandas as pd

# Load the Fortune 500 table and keep only the two columns used below.
# NOTE: `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.loc` performs the same label-based selection.
frame = pd.read_csv("fortune500.csv")
frame = frame.loc[:, ['company', 'industry']]
# Plain Python list of company names, in file order, for the conceptualization step.
companies = frame['company'].tolist()

In [40]:
from watsongraph.node import conceptualize

# Resolve every Fortune 500 company name to its Wikipedia article title with watsongraph.node.conceptualize.
nodes = list(map(conceptualize, companies))

In [86]:
# Add the conceptualized nodes as a new column, placed between the company name and its industry.
frame = frame.assign(node=nodes)[['company', 'node', 'industry']]

In [87]:
# Unfortunately a few of the data points are lost during conceptualization and are left with a null node.
# This is principally because the underlying Concept Insights graph uses an "image" of Wikipedia from 2011,
# which lacks some articles that exist as of 2015. It's easiest to simply filter these rows out.
frame = frame[frame['node'].notnull()]

In [103]:
from watsongraph.conceptmodel import ConceptModel

# Build a ConceptModel from the company nodes.
# Only the first 50 are used for now; cf. https://github.com/ResidentMario/watsongraph/issues/8
top_nodes = frame['node'].head(50).tolist()
model = ConceptModel(top_nodes)
model.explode_edges(prune=True)

In [109]:
# Manual inspection showed that Watson resolved Anthem (the health insurance company) to Anthem (the band).
# Dropping that node is again the easiest fix; all of the other output looks good.
misidentified_node = "Anthem (band)"
model.remove(misidentified_node)

In [198]:
# Augment the model nodes with industry and rank, using the DataFrame elements.

def get_industry(company):
    """Return the industry of the row in `frame` whose `node` equals `company`.

    `company` is a node value (as stored in the `node` column), looked up in the
    module-level `frame`. If several rows share the same node, the first one is used.

    Raises:
        IndexError: if no row of `frame` has that node.
    """
    # `.ix` was removed in pandas 1.0; `.loc` does the same boolean-mask/label selection.
    return frame.loc[frame['node'] == company, 'industry'].iloc[0]

def get_rank(company):
    """Return the 1-based rank of the row in `frame` whose `node` equals `company`.

    The rank is the matching row's index label in the module-level `frame`, plus one.
    NOTE(review): this assumes `frame` still carries the default integer index from
    the CSV and that the CSV rows are in rank order -- confirm against the data file.
    If several rows share the same node, the first (lowest-positioned) one is used.

    Raises:
        IndexError: if no row of `frame` has that node.
    """
    # The original wrapped a whole Index in int(), which fails on current pandas and
    # whenever more than one row matches; take the first matching label explicitly.
    matching_labels = frame.index[(frame['node'] == company).values]
    return int(matching_labels[0]) + 1

# Attach each company's industry and rank to the model, using the lookup helpers directly as the mappers.
model.map_property("industry", get_industry)
model.map_property("rank", get_rank)

In [320]:
import json

# Persist the model to disk so the burdensome queries above need not be rerun each time the notebook is reloaded.
with open('model.json', 'w') as out_file:
    serialized = json.dumps(model.to_json(), indent=4)
    out_file.write(serialized)