TWFwKGJhc2VtYXA9eyd1cmwnOiAnaHR0cDovL3tzfS50aWxlLm9wZW5zdHJlZXRtYXAuc2UvaHlkZGEvZnVsbC97en0ve3h9L3t5fS5wbmcnLCAnYXR0cmlidXRpb24nOiAnVGlsZXMgY291cnTigKY=


# Activity #1: MarketMap
* another way to visualize mappable data

## 1.a : explore the dataset

In [None]:
# our usual stuff
%matplotlib inline
import pandas as pd
import numpy as np

In [None]:
# if the read below complains about a missing Excel reader, run:
#!pip install xlrd

# note: the workbook is queried straight from the web, so loading
#  all of the data might take a while
surgery_url = 'https://query.data.world/s/ivl45pdpubos6jpsii3djsjwm2pcjv'
df = pd.read_excel(surgery_url, skiprows=5)  # the first 5 rows of the sheet are skipped

In [None]:
# what is in this dataframe? peek at its first five rows
df.head(n=5)
# this dataset is called "Surgery Charges Across the U.S."
#  and it shows how much different procedures
#  cost at different hospitals

In [None]:
# what kinds of data are we working with?
# (shows the data type stored in each column of df)
df.dtypes

In [None]:
# let's look at some summary statistics
# recall: this is like R's "summary" function
df.describe()
# note: a statistic such as the mean zip code or the
#  mean provider ID is not meaningful, but
#  looking at the average total payments or
#  discharges might
#  be useful

In [None]:
# how many separate types of surgery are
#  represented in this dataset?
len(df["DRG Definition"].unique())

In [None]:
# and how many distinct provider (hospital) names?
len(df["Provider Name"].unique())

In [None]:
# how many distinct states are represented?
len(df["Provider State"].unique())

In [None]:
# list the distinct state codes themselves
pd.unique(df["Provider State"])

In [None]:
# which surgeries are the most common? we measure this by how
#  many folks are discharged after each type of surgery
# (1) total discharges per surgery type
discharges_by_surgery = df.groupby("DRG Definition")["Total Discharges"].sum()
most_common = discharges_by_surgery
most_common

# (2) same totals, but with the largest on top
most_common = discharges_by_surgery.sort_values(ascending=False)
most_common

# (3) only the top 5, for fun
most_common.iloc[:5]

# (4) or only the names of the top 5:
most_common.index[:5].values

## 1.b: formatting data for MarketMap
* here we are going to practice doing some fancy things to clean this data
* this will be good practice for when you run into other datasets "in the wild"

In [None]:
# (1) a little table of total discharges for
#  each combination of surgery type & state
discharges_by_surgery_state = df.groupby(
    ["DRG Definition", "Provider State"]
)["Total Discharges"].sum()
total_discharges = discharges_by_surgery_state
total_discharges

# (2) the long form above is not intuitive, so pivot the states
#  into columns: one row per surgery, one column per state
total_discharges = discharges_by_surgery_state.unstack("Provider State")
total_discharges

### Aside: let's quickly check which surgeries are the most frequent

In [None]:
# to rank the surgeries we want, for each type of
# surgery, the total number of discharges summed
# across all states
#  let's add this to our total_discharges DF as a "Total" column
# note: an already existing "Total" column is left out of the sum, so
#  re-running this cell does not count the previous total a second time
state_columns = total_discharges.drop(columns="Total", errors="ignore")
total_discharges["Total"] = state_columns.sum(axis = 1)
total_discharges["Total"].head() # just look at the first few

In [None]:
# finally, which surgery is performed most often
#  across all states?

# sorting the DF (in place, largest first) by the total we
# just calculated answers that:
total_discharges.sort_values("Total", inplace=True, ascending=False)

# the first few rows of the sorted table
total_discharges.head(5)

# so, from this we see that joint replacement
#  or reattachment of a lower extremity is
#  the most likely surgery (in number of discharges),
# followed by surgeries for sepsis and then heart failure

In [None]:
# neat.  The total column is not needed for plotting, so drop
# it again (in place)
total_discharges.drop(columns="Total", inplace=True)
total_discharges.head(5)
# we are back to just states & surgeries,
#  *but* the rows keep the order given by the total that we
# previously calculated.
# spiffy!

## 1.c: plot data with bqplot

In [None]:
import bqplot
# by default bqplot does not import 
#  all packages, we have to 
#  explicitely import market_map
import bqplot.market_map # for access to market_map

In [None]:
# the usual recipe (scales, axes, mark), but with a market map
#  instead of a heat map

# scales: two ordinal ones and a colour scale
x_sc = bqplot.OrdinalScale()
y_sc = bqplot.OrdinalScale()
c_sc = bqplot.ColorScale(scheme="Blues")

# just a colour axis for now:
c_ax = bqplot.ColorAxis(scale=c_sc, orientation='vertical')

# the market map itself:

# (1) what should we plot for our colour? take a look at the first row
#  of total_discharges (its values) and at the column labels:
total_discharges.iloc[0].values, total_discharges.columns.values
# the first row holds the total discharges for the most
# popular surgical procedure;
# the columns are the states

# (2) put these into a map
mmap = bqplot.market_map.MarketMap(
    color=total_discharges.iloc[0].values,
    names=total_discharges.columns.values,
    scales={'color': c_sc},
    axes=[c_ax],
)

# (3) just clicking on things doesn't tell us too much, so
# add a little label to print out the total of the selection
import ipywidgets
label = ipywidgets.Label()
# link to market map
def get_data(change):
    """Show the summed discharges of the selected names in ``label``.

    Observer callback for the ``selected`` trait.  The numbers come from
    the first row of the module-level ``total_discharges`` table, and the
    text is written to the module-level ``label``.

    Parameters
    ----------
    change : dict
        Change notification; only ``change['owner'].selected`` is read,
        as a collection of column names of ``total_discharges`` (or None).

    Returns
    -------
    None
    """
    top_surgery = total_discharges.iloc[0]

    selected = change['owner'].selected
    if selected is None:
        selected = []

    # ignore names that are not columns of the table so that an unexpected
    # name cannot raise inside the widget callback
    known = [s for s in selected if s in top_surgery.index]

    # Series.sum skips missing values, so a single entry without data does
    # not wipe out the whole total; float() keeps the "123.0" formatting
    v = float(top_surgery.loc[known].sum())

    if v > 0:
        label.value = 'Total discharges of ' + \
            top_surgery.name + \
            ' = ' + str(v)
    else:
        # nothing (useful) selected: clear the label rather than leave
        # the total of the previous selection on screen
        label.value = ''
    
# run get_data whenever the market map's selection changes
mmap.observe(get_data, names='selected')

#mmap

# (3) show the label stacked on top of the map
ipywidgets.VBox(children=[label, mmap])

## Discussion:
* think back to the map we had last week: we can certainly plot this information with a more geo-realistic map
* what are the pros & cons of each style of map?  What does each highlight?  How is each biased?

## IF we have time: Re-do with other mapping system:

In [None]:
from us_state_abbrev import us_state_abbrev

sc_geo = bqplot.AlbersUSA()
state_data = bqplot.topo_load('map_data/USStatesMap.json')

#(1) a plain map first: the ids/names lookup and the colour loop below
#  both read from it, so it has to exist before they run
states_map = bqplot.Map(map_data=state_data, scales={'projection':sc_geo})

#(2)
# library from last time
from states_utils import get_ids_and_names
ids, state_names = get_ids_and_names(states_map)

# color maps
import matplotlib.cm as cm
cmap = cm.Blues

# most popular surgery (first row of the sorted table)
popSurg = total_discharges.iloc[0]

# here, we go through the process of getting colors to plot
#  each state with a color similar to the one in the marketmap above:

#!pip install webcolors
from webcolors import rgb_to_hex

# the range used for the rescaling is the same for every state, so it is
#  computed once; Series.min/max skip missing values
vmin, vmax = popSurg.min(), popSurg.max()

d = {} # maps each geometry id to its hex color
for s in states_map.map_data['objects']['subunits']['geometries']:
    if s['properties'] is None:
        continue # geometry without properties: nothing to look up
    # match states to abbreviations
    state_abbrev = us_state_abbrev[s['properties']['name']]
    v = popSurg[popSurg.index == state_abbrev].values[0]
    # rescale v to 0-1 for the color map
    v = (v - vmin)/(vmax - vmin)
    # look the color up once, then convert from 0-1 to 0-255 rgb
    rgba = cmap(v)
    c = [int(rgba[i]*255) for i in range(3)]
    d[s['id']] = rgb_to_hex(c)


def_tt = bqplot.Tooltip(fields=['name'])

# rebuild the map, this time with the per-state colors and a tooltip
states_map = bqplot.Map(map_data=state_data, scales={'projection':sc_geo}, colors = d, tooltip=def_tt)
# add interactions
states_map.interactions = {'click': 'select', 'hover': 'tooltip'}

# (3)
label = ipywidgets.Label()
# link to the states map
def get_data(change):
    """Write the summed discharges of the selected states to ``label``.

    Observer callback for the ``selected`` trait.  Each selected id is
    translated to a state name through the module-level ``ids`` /
    ``state_names`` arrays, then to an abbreviation through
    ``us_state_abbrev``, and that state's entry of ``popSurg`` is added to
    the running total.  The label is only touched when the total is
    positive.

    Parameters
    ----------
    change : dict
        Change notification; only ``change['owner'].selected`` is read.
    """
    picked = change['owner'].selected
    if picked is None:
        return

    total = 0.0
    for state_id in picked:
        full_name = state_names[state_id == ids][0]
        abbrev = us_state_abbrev[full_name]
        total += popSurg[popSurg.index == abbrev].values[0]

    if not total > 0: # nothing selected (or nothing to report)
        return

    label.value = 'Total discharges of ' + popSurg.name + ' = ' + str(total)
    
# run get_data whenever the selection on the map changes
states_map.observe(get_data, names='selected')

# no margins around the map -- try w/o first and see
no_margin = {'top': 0, 'bottom': 0, 'left': 0, 'right': 0}
fig = bqplot.Figure(
    marks=[states_map],
    title='US States Map Example',
    fig_margin=no_margin,
)
#fig
# (3) label stacked on top of the figure
ipywidgets.VBox(children=[label, fig])

# Activity #2: Real quick ipyleaflet
* since cartopy wasn't working for folks, we'll quickly look at another option: ipyleaflet

In [4]:
#!pip install ipyleaflet
from ipyleaflet import *
# note: you might have to close and reopen your notebook
# to see the map

# (1) base map, centered on (52, 10)
m = Map(
    basemap=basemaps.Hydda.Full,
    center=(52, 10),
    zoom=8,
)

#(2) add the Strava "All" tiles as an extra layer on top
strata_all = basemap_to_tiles(basemaps.Strava.All)
m.add_layer(strata_all)
m

TWFwKGJhc2VtYXA9eyd1cmwnOiAnaHR0cDovL3tzfS50aWxlLm9wZW5zdHJlZXRtYXAuc2UvaHlkZGEvZnVsbC97en0ve3h9L3t5fS5wbmcnLCAnYXR0cmlidXRpb24nOiAnVGlsZXMgY291cnTigKY=


# Activity #3: Networked data - Simple example


In [None]:
# let's start with some very basic node data
# **copy paste into chat **
node_data = [
    {"label": "Luke Skywalker", "media": "Star Wars", "shape": "rect"},
    {"label": "Jean-Luc Picard", "media": "Star Trek", "shape": "rect"},
    {"label": "Doctor Who", "media": "Doctor Who", "shape": "rect"},
    {"label": "Pikachu", "media": "Detective Pikachu", "shape": "circle"},
]

# bqplot.Graph plots these; every node gets the same color
graph = bqplot.Graph(
    node_data=node_data,
    colors=["red"] * len(node_data),
)

fig = bqplot.Figure(marks=[graph])
fig

# note that the nodes can be picked up and moved around, but they aren't connected in any way
# let's make some connections

In [None]:
# same four nodes as before
node_data = [
    {"label": "Luke Skywalker", "media": "Star Wars", "shape": "rect"},
    {"label": "Jean-Luc Picard", "media": "Star Trek", "shape": "rect"},
    {"label": "Doctor Who", "media": "Doctor Who", "shape": "rect"},
    {"label": "Pikachu", "media": "Detective Pikachu", "shape": "circle"},
]

# link entry 0 (Luke Skywalker) to both
#  entry 1 (Jean-Luc Picard) and entry 3 (Pikachu)
link_data = [{'source': 0, 'target': target} for target in (1, 3)]

graph = bqplot.Graph(
    node_data=node_data,
    link_data=link_data,
    colors=["red"] * len(node_data),
)

#(2) we can also play with the springiness of our links:
# -300 is the default; setting it to a positive value makes the nodes
#  want to overlap and is, in general, a lot of fun
graph.charge = -300

# (3) we can also change the link type:
# options: arc (the default), line, slant_line
graph.link_type = 'line'

# (4) highlight link direction, or not
graph.directed = False

fig = bqplot.Figure(marks=[graph])
fig

In [None]:
# everything we did with our previous map plots
# works here too:
# for example, (1) a tooltip showing each node's "media" entry
tooltip = bqplot.Tooltip(fields=["media"])
graph = bqplot.Graph(
    node_data=node_data,
    link_data=link_data,
    colors=["red"] * 4,
    tooltip=tooltip,
)

# we can also do interactive things with labels
label = ipywidgets.Label()

# note that the calling sequence of this callback
# is a little different - instead
# of "change" it receives "obj" and
# "element"
def printstuff(obj, element):
    """Write the clicked node's "media" entry to ``label``.

    ``obj`` is not used; ``element['data']['media']`` supplies the text.
    """
    # (1.1) uncomment to inspect what the callback receives
    #print(obj)
    #print(element)
    clicked = element['data']
    label.value = 'Media = ' + clicked['media']

graph.on_element_click(printstuff)


fig = bqplot.Figure(marks=[graph])
ipywidgets.VBox(children=[label, fig])

# Activity #4: Network data - subset of facebook friends dataset
* from: https://snap.stanford.edu/data/egonets-Facebook.html
* dataset of friends lists

#### Info about this dataset:
* the original file you can read in has about 80,000 different connections
* it is ordered by the most connected person (person 0) at the top
* because this network would be computationally slow and just a hairball - we're going to be working with downsampled data
* for example, a file tagged "000090_000010" starts with the 10th most connected person, and only includes connections up to the 90th most connected person
* It's worth noting that this dataset (linked here and on the webpage) also includes feature data like gender, last name, school, etc. - however, it is too sparse to be of visualization use to us

Check out the other social network links at the SNAP data webpage!

In [None]:
# from 10 to 150 connections, a few large nodes
#filename = 'facebook_combined_sm000150_000010.txt'

# this might be too large: one large node, up to 100 connections
#filename='facebook_combined_sm000100.txt'

# start here
filename = 'facebook_combined_sm000090_000010.txt'

# then this one
#filename = 'facebook_combined_sm000030_000000.txt'
# note how different the topologies are

# each line of the file is one link: two space-separated indices
link_columns = ['ind1', 'ind2']
network = pd.read_csv(
    '/Users/jillnaiman1/Downloads/'+filename,
    names=link_columns,
    sep=' ',
)
network

In [None]:
# build the network: node_data, link_data and color_data for bqplot.Graph

# add nodes: one small circle for every index from 0 up to the largest
#  index that appears in either column
maxNet = max([network['ind1'].max(),network['ind2'].max()])
node_data = [{"label": str(i), 'shape_attrs': {'r': 8}} for i in range(maxNet+1)]

# now, make links: each row links object ind1 to object ind2.
#  Iterating the two columns together avoids a slow per-row .iloc lookup,
#  and int() stores plain Python integers rather than NumPy scalars
link_data = [{'source': int(source_id), 'target': int(target_id)}
             for source_id, target_id in zip(network['ind1'], network['ind2'])]

# all same color (one entry per link, as before)
color_data = ['blue'] * len(link_data)

#link_data,node_data
#color_data

In [None]:
# plot the network

graph = bqplot.Graph(
    node_data=node_data,
    link_data=link_data,
    colors=color_data,
)

# there is no direction to links
graph.directed = False
# play with these for different graphs
graph.link_type = 'line'
graph.charge = -100
graph.link_distance = 50

fig = bqplot.Figure(marks=[graph])
# note: I think the layout has to be set like this for the figure to look right
fig.layout.min_height = '900px'
fig.layout.min_width = '1000px'
fig

# in theory, we could color this network by what school folks are in, or some such
#  but while the dataset does contain some of these features, the
#  answer rate is too sparse for our subset here

# Note: the below is just prep if you want to make your own subset datasets

In [None]:
# prep fb data by downsampling: keep only the links whose two ends
#  both lie between minCon and maxCon (inclusive)
minCon = 0
maxCon = 30
G = pd.read_csv('/Users/jillnaiman1/Downloads/facebook_combined.txt',sep=' ', names=['ind1', 'ind2'])

# as before, only rows from the first link whose source is minCon onwards
#  are considered (raises IndexError if there is no such link)
first_row = G.index[G['ind1'] == minCon][0]
candidates = G.loc[first_row:]

# one vectorised mask instead of a row-by-row loop with np.vstack; this
#  also avoids writing the first matching link twice
# (Series.between is inclusive on both ends by default)
in_range = candidates['ind1'].between(minCon, maxCon) & \
           candidates['ind2'].between(minCon, maxCon)
Gnew = candidates.loc[in_range].values

# the output name encodes maxCon, then minCon, each zero-padded to 6 digits
np.savetxt('/Users/jillnaiman1/spring2019online/week09/data/facebook_combined_sm' + \
           str(maxCon).zfill(6) + '_' + str(minCon).zfill(6) + '.txt', Gnew,fmt='%i')

In [None]:
# show the link distance currently set on the graph
graph.link_distance