# Visualizing Wikipedia clickstream datasets with D3

In [66]:
import json
import networkx as nx

from collections import defaultdict
from IPython.display import display, display_pretty, Javascript, HTML

In [15]:
# Path to the clickstream dump; a later cell opens it and splits each line on
# tabs, so the file is expected to be tab-separated text.
clickstream_file = "../../data/clickstream-022015.tsv"

In [38]:
# Load the entire clickstream file into memory as a list of raw lines
# (each line keeps its terminator).
with open(clickstream_file, "r") as f:
  tsv = list(f)

In [160]:
# Collect every row of `tsv` that involves the page named by `center`.
# Field usage, as inferred from how the columns are consumed here and in the
# following cell (TODO confirm against the dataset's documentation):
#   fields[2] -> click count (kept as the raw string; converted with int() later)
#   fields[3] -> title of the page the click came from
#   fields[4] -> title of the page the click went to
current = {
  "targets": defaultdict(int),  # title reached from `center` -> count
  "sources": defaultdict(int)   # title leading to `center`   -> count
}

center = "Pi"

# The first line is skipped (presumably the header row).
for line in tsv[1:]:
  # Strip the line terminator before splitting; otherwise it stays glued to
  # the last field and an equality test against `center` can never succeed
  # when that field is the final column.
  fields = line.rstrip("\r\n").split("\t")

  # Blank or truncated rows do not have the columns indexed below.
  if len(fields) < 5:
    continue

  if fields[3] == center:
    current["targets"][fields[4]] = fields[2]
  if fields[4] == center:
    current["sources"][fields[3]] = fields[2]

In [163]:
# Build the node/link structure consumed by the d3 sankey cell and write it to
# "sankey.json".
#
# Every node name carries a role prefix ("s:" source, "o:" the center page,
# "t:" target). Without the prefix the sankey layout misbehaves -- presumably
# because a page that is both a source and a target would collapse into a
# single node and introduce a cycle.
#
# Titles starting with "other-" are skipped. The test is a prefix match on the
# raw title: a substring match would also discard real pages whose title merely
# contains "other-" (e.g. "Mother-in-law").
# `.items()` is used because it exists on both Python 2 and Python 3.
links_in = [
  ["s:" + title, "o:" + center, count]
  for title, count in current["sources"].items()
  if not title.startswith("other-")
]
links_out = [
  ["o:" + center, "t:" + title, count]
  for title, count in current["targets"].items()
  if not title.startswith("other-")
]

# Number of links kept on each side of the center page.
nb = 10

# Keep the `nb` links with the highest counts (the sort is stable, so ties keep
# their original relative order).
links_in = sorted(links_in, key=lambda x: int(x[2]), reverse=True)[:nb]
links_out = sorted(links_out, key=lambda x: int(x[2]), reverse=True)[:nb]

# Center first, then sources and targets in ranked order. Names are unique
# (dict keys plus distinct prefixes), so no de-duplication is needed and the
# resulting order is deterministic.
nodes = ["o:" + center] + [x[0] for x in links_in] + [x[1] for x in links_out]
node_index = {name: position for position, name in enumerate(nodes)}

# Links reference nodes by their position in the "nodes" list.
data = {
  "nodes": [ {"name": n } for n in nodes ],
  "links": [
    {"source": node_index[x[0]], "target": node_index[x[1]], "value": int(x[2])}
    for x in links_in + links_out
  ]
}

with open("sankey.json", "w") as f:
  json.dump(data, f)

In [169]:
# Inject the stylesheet and the empty container that the JavaScript cell fills
# with the sankey diagram. The height rule targets "#sankey" so that it matches
# the id of the <div> emitted below, and the font fallback uses the valid
# generic family name "sans-serif".
display(HTML("""
<style>
#sankey {
  height: 500px;
}

.node rect {
  cursor: move;
  fill-opacity: .9;
  shape-rendering: crispEdges;
}

.node text {
  pointer-events: none;
  text-shadow: 0 1px 0 #fff;
  font-size: 10px;
  font-family: Tahoma, sans-serif;
}

.link {
  fill: none;
  stroke: #000;
  stroke-opacity: .2;
}

.link:hover {
  stroke-opacity: .5;
}
</style>
<div id="sankey"></div>
"""))

In [170]:
%%javascript
// Register d3 and the sankey plugin with RequireJS. The shim makes the plugin
// depend on "d3.global", which publishes d3 on `window` before the plugin
// loads (presumably because the plugin extends the global `d3` object).
require.config({
    paths: {
        d3: '//cdnjs.cloudflare.com/ajax/libs/d3/3.4.8/d3.min',
        sankey: "//cdn.rawgit.com/d3/d3-plugins/master/sankey/sankey"
    },
    shim: {
      "d3" : {
        exports : 'd3'
      },
      sankey: {
        deps: ['d3.global']
      }
    }
});

define("d3.global", ["d3"], function(_) {
  window.d3 = _;
});

// Draw the sankey diagram described by "sankey.json" inside the #sankey element.
require(["d3", "sankey"], function(d3){
  var w = 700;
  var h = 500;
  
  var color = d3.scale.category20();

  // Clear whatever a previous run of this cell rendered.
  d3.select("#sankey").selectAll("*").remove();
  
  var svg = d3.select("#sankey")
    .append("svg")
    .attr("width", w)
    .attr("height", h);
  
  var sankey = d3.sankey()
    .nodeWidth(15)
    .nodePadding(10)
    .size([w,h]);
  
  var path = sankey.link();
  
  // Two-argument callback: d3 passes the error first, so a failed load can be
  // reported instead of crashing on `data.nodes` of a null value.
  d3.json("sankey.json", function(error, data){
    if (error || !data) {
      console.error("Could not load sankey.json", error);
      return;
    }

    sankey
      .nodes(data.nodes)
      .links(data.links)
      .layout(32);
    
    var node = svg.append("g").selectAll(".node")
      .data(data.nodes)
      .enter().append("g")
      .attr("class", "node")
      .attr("transform", function(d) { return "translate(" + d.x + "," + d.y + ")"; })
      .call(d3.behavior.drag()
        .origin(function(d) { return d; })
        // Re-append the dragged node so it is drawn on top of its siblings.
        .on("dragstart", function() { this.parentNode.appendChild(this); })
        .on("drag", dragmove));

    node.append("rect")
      .attr("height", function(d) { return d.dy; })
      .attr("width", sankey.nodeWidth())
      .style("fill", function(d) { return d.color = color(d.name.replace(/ .*/, "")); })
      .style("stroke", function(d) { return d3.rgb(d.color).darker(2); })
      .append("title")
      .text(function(d) { return d.name });

    // Labels sit to the left of a node by default; nodes in the left half of
    // the chart get their label moved to the right-hand side instead.
    node.append("text")
      .attr("x", -6)
      .attr("y", function(d) { return d.dy / 2; })
      .attr("dy", ".35em")
      .attr("text-anchor", "end")
      .attr("transform", null)
      // Show everything after the first ":" (the role prefix). Using
      // split(":", 2)[1] would cut off titles that themselves contain ":".
      .text(function(d) { return d.name.slice(d.name.indexOf(":") + 1); })
      .filter(function(d) { return d.x < w / 2; })
      .attr("x", 6 + sankey.nodeWidth())
      .attr("text-anchor", "start");

    var link = svg.append("g").selectAll(".link")
      .data(data.links)
      .enter().append("path")
      .attr("class", "link")
      .attr("d", path)
      .style("stroke-width", function(d) { return Math.max(1, d.dy); })
      .sort(function(a, b) { return b.dy - a.dy; });
    
    // Move the dragged node vertically, clamped to the chart height, then
    // recompute the link paths attached to it.
    function dragmove(d) {
      d3.select(this).attr("transform", "translate(" + d.x + "," + (d.y = Math.max(0, Math.min(h - d.dy, d3.event.y))) + ")");
      sankey.relayout();
      link.attr("d", path);
    }
    
  });
});

<IPython.core.display.Javascript object>