## Data Munging

In [85]:
from pandas import DataFrame
import pandas as pd

# Load the Fortune 500 table and keep only the two columns this notebook uses.
# `.loc` is used for label-based selection: the older `.ix` indexer was deprecated
# in pandas 0.20 and removed in pandas 1.0, so it fails on current installations.
frame = pd.read_csv("fortune500.csv")
frame = frame.loc[:, ['company', 'industry']]
# industries = list(set(frame.loc[:, 'industry']))
# Plain Python list of company names, in file order, for the conceptualization step.
companies = list(frame.loc[:, 'company'])

In [40]:
from watsongraph.node import conceptualize

# Resolve every Fortune 500 company name to its Wikipedia article title by passing
# each name through watsongraph.node.conceptualize (one call per company, in order).
nodes = list(map(conceptualize, companies))

In [86]:
# Add the conceptualized titles as a `node` column, then order the columns
# as company / node / industry.
frame.loc[:, 'node'] = nodes
frame = frame.loc[:, ['company', 'node', 'industry']]

In [87]:
# A few companies are lost during conceptualization and end up with a null `node`.
# This is principally because the underlying Concept Insights graph uses an "image"
# of Wikipedia from 2011, which is missing some articles that exist as of 2015.
# It's easiest to simply filter those rows out. The original index labels are kept
# (no reset), so they still reflect each company's row position in the CSV.
frame = frame[frame['node'].notnull()]

In [103]:
from watsongraph.conceptmodel import ConceptModel

# Build a ConceptModel from the first 50 resolved companies only, then expand its edges.
# The cap of 50 is temporary; cf. https://github.com/ResidentMario/watsongraph/issues/8
model = ConceptModel(list(frame['node'].iloc[:50]))
model.explode_edges(prune=True)

In [109]:
# Manual inspection showed that Watson resolved "Anthem" (the health insurance company)
# to the article for Anthem the band. As with the unresolved companies above, the simplest
# fix is to drop that node from the model; all of the other output looked correct.
model.remove("Anthem (band)")

In [159]:
# Preview the first rows only rather than dumping the whole table into the notebook output.
frame.head(10)

Unnamed: 0,company,node,industry
0,Walmart,Walmart,General Merchandisers
1,Exxon Mobil,ExxonMobil,Petroleum Refining
2,Chevron,Chevron Corporation,Petroleum Refining
3,Berkshire Hathaway,Berkshire Hathaway,Insurance: Property and Casualty (Stock)
4,Apple,Apple Inc.,"Computers, Office Equipment"
5,General Motors,General Motors,Motor Vehicles and Parts
6,Phillips 66,Phillips 66,Petroleum Refining
7,General Electric,General Electric,Diversified Financials
8,Ford Motor,Ford Motor Company,Motor Vehicles and Parts
10,McKesson,McKesson Corporation,Wholesalers: Health Care


In [198]:
# Augment the model nodes with industry and rank, using the DataFrame elements.

def get_industry(company, table=None):
    """Return the industry of the row whose ``node`` equals ``company``.

    Parameters
    ----------
    company : str
        Value to look up in the ``node`` column.
    table : pandas.DataFrame, optional
        Frame with ``node`` and ``industry`` columns. When omitted, the
        notebook-level ``frame`` is used, as before.

    Returns
    -------
    The ``industry`` value of the first matching row.

    Raises
    ------
    IndexError
        If no row has ``node == company``.
    """
    source = frame if table is None else table
    # `.loc` replaces the `.ix` indexer, which was removed in pandas 1.0.
    matches = source.loc[source['node'] == company, 'industry']
    if len(matches) == 0:
        raise IndexError("no row with node == {!r}".format(company))
    return matches.iloc[0]

def get_rank(company, table=None):
    """Return the 1-based rank of the row whose ``node`` equals ``company``.

    The rank is the row's index label plus one. This relies on the frame still
    carrying the default 0-based index assigned when the CSV was read (the
    null-node filter drops rows without resetting the index).

    Parameters
    ----------
    company : str
        Value to look up in the ``node`` column.
    table : pandas.DataFrame, optional
        Frame with a ``node`` column and an integer index. When omitted, the
        notebook-level ``frame`` is used, as before.

    Returns
    -------
    int
        Index label of the first matching row, plus one.

    Raises
    ------
    IndexError
        If no row has ``node == company``.
    """
    source = frame if table is None else table
    # Select on the boolean mask directly; no column is needed to reach the index.
    labels = source[source['node'] == company].index
    if len(labels) == 0:
        raise IndexError("no row with node == {!r}".format(company))
    # Take the first match explicitly: calling int() on the whole Index would
    # fail whenever two companies were resolved to the same node.
    return int(labels[0]) + 1

# Annotate every node in the model with its industry and rank, computed by the helpers above.
model.map_property("industry", get_industry)
model.map_property("rank", get_rank)

In [201]:
import json

# Persist the model as JSON so the expensive queries above don't have to be
# rerun every time the notebook is reloaded.
with open('model.json', 'w') as handle:
    payload = json.dumps(model.to_json())
    handle.write(payload)

## Visualization

In [202]:
# # Load from disk: skip to here when reloading the notebook.

# with open('model.json') as data_file:    
#     dataset = json.load(data_file)

In [213]:
# from IPython.display import Javascript

# # Pass to JS.
# Javascript("""window.dataset={};""".format(dataset))
# Javascript("""window.industries={};""".format(industries))

In [216]:
# Display the dictionary returned by model.to_json() -- the same structure that is
# serialized to model.json and later read by the D3 chart cell.
model.to_json()

{'directed': False,
 'graph': [('name', 'compose( ,  )')],
 'links': [{'source': 0, 'target': 15, 'weight': 0.83818614},
  {'source': 0, 'target': 1, 'weight': 0.8383687},
  {'source': 0, 'target': 42, 'weight': 0.57141024},
  {'source': 0, 'target': 28, 'weight': 0.94706297},
  {'source': 0, 'target': 29, 'weight': 0.75099915},
  {'source': 0, 'target': 30, 'weight': 0.643691},
  {'source': 0, 'target': 5, 'weight': 0.5871605},
  {'source': 0, 'target': 31, 'weight': 0.9746715},
  {'source': 0, 'target': 20, 'weight': 0.87822616},
  {'source': 0, 'target': 33, 'weight': 0.94444966},
  {'source': 0, 'target': 22, 'weight': 0.79003906},
  {'source': 0, 'target': 35, 'weight': 0.99605465},
  {'source': 1, 'target': 28, 'weight': 0.6183772},
  {'source': 1, 'target': 45, 'weight': 0.55965847},
  {'source': 1, 'target': 48, 'weight': 0.5597641},
  {'source': 1, 'target': 5, 'weight': 0.780927},
  {'source': 1, 'target': 32, 'weight': 0.7807296},
  {'source': 1, 'target': 30, 'weight': 0.60

In [214]:
%%javascript

// Register the module id `d3` with RequireJS, pointing at D3 version 3.4.8 on the
// cdnjs CDN. The chart cell loads it with require(['d3'], ...).
require.config({
    paths: {
        d3: '//cdnjs.cloudflare.com/ajax/libs/d3/3.4.8/d3'
    }
});

<IPython.core.display.Javascript object>

In [225]:
%%html
<style>

/* Applied to the <circle> elements the chart cell creates with class "node":
   a thin white outline around each one. */
.node {
  stroke: #fff;
  stroke-width: 1.5px;
}

/* Applied to the <line> elements the chart cell creates with class "link":
   gray and semi-transparent. The chart cell sets stroke-width per link. */
.link {
  stroke: #999;
  stroke-opacity: .6;
}

</style>

In [226]:
%%javascript
require(['d3'], function (d3) {
    /*
        Jupyter notebooks keep their state between runs, so a chart left
        over from an earlier execution is removed explicitly and a fresh,
        empty container is appended before drawing.
    */
    $("#chart").remove();
    element.append("<div id='chart' style='text-align:center;'></div>");

    // Size of the SVG drawing surface, in pixels.
    var width = 960;
    var height = 500;

    // Colouring nodes by group with d3.scale.category20() is disabled for
    // now; every node is filled gray below.

    // Force-directed layout (D3 v3 API), configured one setting at a time.
    var layout = d3.layout.force();
    layout.charge(-120);
    layout.linkDistance(30);
    layout.size([width, height]);

    var canvas = d3.select("#chart").append("svg");
    canvas.attr("width", width);
    canvas.attr("height", height);

    // Coordinate accessors used by the tick handler.
    function sourceX(d) { return d.source.x; }
    function sourceY(d) { return d.source.y; }
    function targetX(d) { return d.target.x; }
    function targetY(d) { return d.target.y; }
    function ownX(d) { return d.x; }
    function ownY(d) { return d.y; }

    // Fetch the graph written to model.json and draw it.
    d3.json("model.json", function (error, graph) {
        if (error) {
            throw error;
        }

        // Hand nodes and links to the layout and start the simulation.
        layout.nodes(graph.nodes);
        layout.links(graph.links);
        layout.start();

        // One <line> per link; its stroke width is the square root of the link weight.
        var edges = canvas.selectAll(".link")
            .data(graph.links)
            .enter()
            .append("line");
        edges.attr("class", "link");
        edges.style("stroke-width", function (d) {
            return Math.sqrt(d.weight);
        });

        // One draggable gray <circle> of radius 5 per node.
        var dots = canvas.selectAll(".node")
            .data(graph.nodes)
            .enter()
            .append("circle");
        dots.attr("class", "node");
        dots.attr("r", 5);
        dots.style("fill", "gray");
        dots.call(layout.drag);

        // Give each circle a <title> child holding the node's id.
        var labels = dots.append("title");
        labels.text(function (d) {
            return d.id;
        });

        // On every simulation tick, move lines and circles to the current positions.
        layout.on("tick", function () {
            edges.attr("x1", sourceX);
            edges.attr("y1", sourceY);
            edges.attr("x2", targetX);
            edges.attr("y2", targetY);

            dots.attr("cx", ownX);
            dots.attr("cy", ownY);
        });
    });
});

<IPython.core.display.Javascript object>