# Interactive Histogram

By:
 - Aditya Agrawal (adityaagrawa@cs.umass.edu) 
 - Sanjay Reddy S (ssatti@umass.edu)

#### Works on Python 3

This notebook presents the code and the plot for visualizing the top 'N' entities found in our text for a chosen entity type. Both the number of entities and the entity type can be selected interactively and the plot updates accordingly. NER_dict.pkl should be in the same directory as this IPython notebook.


### Loading Libraries

In [5]:
%load_ext autoreload
%autoreload 2

#Importing Libraries
import xml.etree.ElementTree as ET
from wordcloud import WordCloud
import pickle

#NLTK
import nltk
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

#Bokeh
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.layouts import column, row, widgetbox
from bokeh.models import HoverTool, Plot
from bokeh.palettes import Spectral6
from bokeh.io import *
from bokeh.models.glyphs import ImageURL
from bokeh.models.widgets import Slider
from bokeh.application import Application
from bokeh.application.handlers import FunctionHandler
from bokeh.transform import factor_cmap   
from bokeh.models import Select

output_notebook()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Loading Saved Data

In [6]:
# Load the pre-computed named-entity counts from the pickle file that sits
# next to this notebook. The file is opened in binary mode, as pickle requires.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file, so
# only load a NER_dict.pkl that you generated yourself or otherwise trust.
with open(r"NER_dict.pkl", "rb") as input_file:
    NER_dict = pickle.load(input_file)


Our NER_dict (the Named Entity Recognition dictionary) was previously generated in the Interative_WordCloud.ipynb file, which takes a really long time to run, so we stored it locally using the pickle module for faster loading. NER_dict is a dictionary keyed by entity type; each value is itself a dictionary mapping a specific named entity to the count of its occurrences. All of the above has been coded in the aforementioned file. Using this visualization we can view the top 'N' entities of different types, e.g. the top people mentioned, the top places mentioned in the text, etc.

### Plotting Interactive Histogram

In [16]:
# Shared module-level buffers: entity names and, index for index, their counts.
top5, counts = [], []

def get_data(n, label = "GPE"):
    """Refill the shared ``top5``/``counts`` lists with the ``n`` most frequent entities.

    Entities are read from ``NER_dict[label]`` (entity name -> count) and
    ranked by count, highest first; equal counts are ordered by name in
    descending order. The module-level lists are emptied and refilled in
    place, and the module-level ``source`` is rebound to a new
    ColumnDataSource built from them.

    Args:
        n: maximum number of entities to keep.
        label: entity type to look up in ``NER_dict``.

    Raises:
        KeyError: if ``label`` is not a key of ``NER_dict``.
    """
    global top5, counts, source

    # Empty the existing list objects rather than rebinding the names, so the
    # same list objects keep being reused across calls.
    del top5[:]
    del counts[:]

    # Every sort key contains the (unique) entity name, so no two keys tie and
    # a descending sort yields one well-defined ranking.
    ranked = sorted(
        NER_dict[label].items(),
        key=lambda pair: (pair[1], pair[0]),
        reverse=True,
    )
    for rank, (entity, occurrences) in enumerate(ranked):
        if rank >= n:
            break
        top5.append(entity)
        counts.append(occurrences)

    source = ColumnDataSource(data={"top5": top5, "counts": counts})
    
# Populate the shared lists with the default view: the top 5 entities of the
# default entity type ("GPE"). get_data() also binds the module-level
# ``source`` to a ColumnDataSource built from those lists, so constructing an
# identical one again here would be redundant.
get_data(5)

def modify_doc2(doc):
    """Build the interactive top-N entity bar chart and add it to ``doc``.

    Two drop-downs select how many entities to show and which entity type to
    read. Changing either one rebuilds the figure from scratch and swaps it
    into the layout in place of the previous one.

    Args:
        doc: the Bokeh document that receives the layout as a root.
    """
    def create_figure():
        """Return a new bar chart for the current widget selections."""
        # get_data() refills the module-level ``top5`` list and rebinds the
        # module-level ``source``; both are only read here, so no ``global``
        # declaration is needed.
        get_data(int(ticker1.value), ticker2.value)

        # Width grows with the number of bars so the labels stay readable.
        p = figure(x_range=top5, plot_height=550, plot_width=150*len(top5),
                   title=" Top N Occurrences of Entities after NER Tagging")

        # Spectral6 is repeated so the palette has enough colours (60) for the
        # largest selectable number of entities (50).
        p.vbar(x='top5', top='counts', width=0.7, source=source, legend="top5",
               line_color='white',
               fill_color=factor_cmap('top5', palette=Spectral6*10, factors=top5))
        p.xgrid.grid_line_color = None
        p.y_range.start = 0
        p.legend.orientation = "horizontal"
        p.legend.location = "top_center"
        return p

    # Setting up widgets
    ticker1 = Select(value='10', options=['5', '10', '15', '20', '25', '50'],
                     title="Number of Entities")
    ticker2 = Select(value='PERSON',
                     options=['GPE', 'ORGANIZATION', 'PERSON', 'FACILITY', 'GSP', 'LOCATION'],
                     title="Entity Type")

    # Set up layout: widgets in the first row, the figure below them
    layout = column(row(ticker1, ticker2), create_figure())

    # Set up callbacks: both widgets trigger the same full redraw, so a single
    # handler is shared between them.
    def refresh_plot(attrname, old, new):
        layout.children[1] = create_figure()

    ticker1.on_change('value', refresh_plot)
    ticker2.on_change('value', refresh_plot)
    doc.add_root(layout)

# Wrap modify_doc2 in a Bokeh application so it is called with a document to
# populate, then display the application inline in the notebook.
handler1 = FunctionHandler(modify_doc2)
app1 = Application(handler1)
# NOTE(review): notebook_url is hard-coded; presumably it must match the
# address this notebook is actually served from — confirm before running
# on another host or port.
show(app1, notebook_url = "localhost:8888")