In [13]:
#Importing Libraries
import xml.etree.ElementTree as ET
import pandas as pd
from wordcloud import WordCloud
import pickle
from collections import defaultdict
import networkx as nx
import datetime

#NLTK
import nltk
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

#Bokeh
from bokeh.plotting import figure, curdoc
from bokeh.models import ColumnDataSource
from bokeh.layouts import column, row, widgetbox
from bokeh.models import HoverTool, Plot
from bokeh.palettes import Spectral8
from bokeh.io import *
from bokeh.models.glyphs import ImageURL
from bokeh.models.widgets import Slider
from bokeh.application import Application
from bokeh.application.handlers import FunctionHandler
from bokeh.models import Select
from bokeh.transform import factor_cmap 
from bokeh.io import show, output_file
from bokeh.models import Plot, Range1d, MultiLine, Circle, HoverTool, TapTool, BoxSelectTool, Text
from bokeh.models.graphs import from_networkx, NodesAndLinkedEdges, EdgesAndLinkedNodes
from bokeh.palettes import Spectral4

import glob
output_notebook()

In [14]:
cd MC3\ Data

[Errno 2] No such file or directory: 'MC3 Data'
/Users/aditya/Google Drive/UMass/First Semester/690V/Final Project/HW8/MC3 Data


In [16]:
df = pd.read_csv('csv-1700-1830Final.csv', encoding = "ISO-8859-1")

In [17]:
#Reformatting data stream for separating out dates and times:
for index, value in enumerate( list( df['date(yyyyMMddHHmmss)'])):
    dt = datetime.datetime.strptime(str(value), "%Y%m%d%H%M%S")
    df.set_value(index, 'Date', "{}/{}/{}".format(dt.month, dt.day, dt.year))
    df.set_value(index, 'Hour', dt.hour)
    df.set_value(index, 'Minute', dt.minute)
    df.set_value(index, 'Seconds', dt.second)

In [18]:
#Ensuring correct ordering of incoming Data
df.sort_values(['Date','Hour','Minute','Seconds'])

#Segregating CCdata and Microblog data:

typegroups = df.groupby('type')
ccdata = typegroups.get_group(name = 'ccdata')
ccdata = pd.DataFrame(ccdata)

mbdata = typegroups.get_group(name = 'mbdata')
mbdata = pd.DataFrame(mbdata)

In [22]:
# Sampling 5 mins of data- take a bow and represent top words as a relation web with counts defining size of blobs.
word_counts = defaultdict(float)  
corpus = ''
corpus = ' '.join(df[df.Minute < 6.0]['message'])
corpus_lower = corpus.lower()
tokenizer = RegexpTokenizer(r'\w+')
corpus_word_distribution = FreqDist(tokenizer.tokenize(corpus_lower))
stopWords = set(stopwords.words('english'))

#Removing punctuation and stopwords
for stopword in stopWords:
    if stopword in corpus_word_distribution:
        del corpus_word_distribution[stopword]

def modify_doc1(doc):
    #Plotting in Bokeh
    global sentences
    global most_common
    sentences = list(df[df.Minute < 6.0]['message'])
    most_common = corpus_word_distribution.most_common(20)
    
    def dataSample(a,b):
        global sentences
        global most_common
        word_counts = defaultdict(float)  
        corpus = ''
        if a < 60:
            start = df['Minute'] >= a
            end = df['Minute'] < b+1
            sentences = list(df[start & end]['message'])
        else:
            a = a - 60
            b = b - 60
            start = df['Minute'] >= a
            end = df['Minute'] < b+1
            mid = df['Hour'] == 18.0
            sentences = list(df[start & mid & end]['message'])
        corpus = ' '.join(sentences)
        corpus_lower = corpus.lower()
        tokenizer = RegexpTokenizer(r'\w+')
        corpus_distribution = FreqDist(tokenizer.tokenize(corpus_lower))
        stopWords = set(stopwords.words('english'))
        
        #Removing punctuation and stopwords
        for stopword in stopWords:
            if stopword in corpus_distribution:
                del corpus_distribution[stopword]
        
        return [sentences, corpus_distribution.most_common(20)]
    
    def create_figure(sentences, most_common):
        G=nx.Graph(width=2, with_labels=True)
        plot = Plot(plot_width=600, plot_height=600,
                    x_range=Range1d(-1.1,1.1), y_range=Range1d(-1.1,1.1))
        plot.title.text = "Graph Interaction Demonstration"
        edges = defaultdict(int)
        for word, count in most_common:
            G.add_node(word)
            for word2, count2 in most_common:
                if word != word2:
                    for sentence in sentences:
                        if word in sentence and word2 in sentence:
                            edges[(word,word2)] += 1        
        for words, count in edges.items():
              G.add_edge(words[0], words[1])

        node_size = {k:5*v for k,v in G.degree().items()} 
        nx.set_node_attributes(G, 'node_size', node_size)
        source = ColumnDataSource(pd.DataFrame.from_dict({k:v for k,v in G.nodes(data=True)},orient='index'))

        graph_renderer = from_networkx(G, nx.circular_layout, scale=1, center=(0,0))
        graph_renderer.node_renderer.data_source = source

        ## Add tools
        hover = HoverTool(
            tooltips=[("Word", "$index")],
            renderers=[graph_renderer]
        )

        hover_no_tooltips = HoverTool(
            tooltips=None, renderers=[graph_renderer]
        )

        plot.add_tools(hover, hover_no_tooltips, TapTool(), BoxSelectTool())
        graph_renderer.node_renderer.glyph = Text(text= 'index')
        graph_renderer.node_renderer.level = 'glyph'
        graph_renderer.node_renderer.level = 'underlay'
        graph_renderer.node_renderer.selection_glyph = Circle(size=15, fill_color=Spectral4[2])
        graph_renderer.edge_renderer.glyph = MultiLine(line_color="#CCCCCC", line_alpha=0.8, line_width=5)
        graph_renderer.edge_renderer.selection_glyph = MultiLine(line_color=Spectral4[2], line_width=5)
        graph_renderer.edge_renderer.hover_glyph = MultiLine(line_color=Spectral4[1], line_width=5)
        graph_renderer.selection_policy = NodesAndLinkedEdges()
        graph_renderer.inspection_policy = EdgesAndLinkedNodes()
        plot.renderers.append(graph_renderer)
        return plot

    #Setting Up Widgets
    num_most_freq = Slider(title="Select time in mins: (Sampled at 5 mins)", value=5 , start=5, end=90 , step=5)
    inputs = widgetbox(num_most_freq)
    
    # Set up callbacks
    def update_data(attrname, old, new):
        #global count
        # Get the current slider values
        global sentences
        global most_common
        n = num_most_freq.value
        #count += 1
        myData = dataSample(n-5, n+1)
        sentences = myData[0]
        most_common = myData[1]      
        layout.children[1] = create_figure(sentences, most_common)
        
    num_most_freq.on_change('value', update_data)
    layout = column(inputs, create_figure(sentences, most_common), width=800)
    doc.add_root(layout)

handler1 = FunctionHandler(modify_doc1)
app1 = Application(handler1)
app1.create_document()
show(app1, notebook_url= 'localhost:8888')

ERROR:/Users/aditya/anaconda/lib/python3.6/site-packages/bokeh/core/validation/check.py:E-1010 (CDSVIEW_SOURCE_DOESNT_MATCH): CDSView used by Glyph renderer must have a source that matches the Glyph renderer's data source: GlyphRenderer(id='b59e87d7-c0cc-420c-9563-c6c4b300ac10', ...)


In [21]:
from bokeh.layouts import widgetbox
from bokeh.models.widgets import TextInput

from datetime import date
from random import randint
from bokeh.layouts import widgetbox
from bokeh.models import ColumnDataSource
from bokeh.models.widgets import DataTable, DateFormatter, TableColumn
from bokeh.models import HoverTool

def get_data(text):
    text = text.replace(',','|')
    tempdf = df[df['message'].str.contains(text, case = False)]
    return tempdf

def modify_doc2(doc):
    def create_figure():
        global source
        global columns
        text = text_input.value
        tempdf = get_data(text)
        counts = tempdf.groupby('date(yyyyMMddHHmmss)').count()['message']
        tempdf['date'] = pd.to_datetime(tempdf['date(yyyyMMddHHmmss)'], format = '%Y%m%d%H%M%S')
        source = ColumnDataSource(tempdf[['author','message', 'date']])
        
        # create a new plot with a datetime axis type
        hover = HoverTool(tooltips=[
                ("index", "$index"),
                #("data (using $) (x,y)", "($x, $y)"),
                #("data (using @) (x,y)", "(@x, @y)")
                ("Author ", "@author"),
                ("Tweets ", "@message"),
            ])
        
        p = figure(plot_width=1000, plot_height=350, tools = [hover], x_axis_type="datetime")
        #p.line(tempdf['date'], counts, color='navy', alpha=0.5)
        p.line('date', counts, color='navy', alpha=0.5, source = source)
        p.xgrid.grid_line_color = None
        p.y_range.start = 0
        p.legend.orientation = "horizontal"
        p.legend.location = "top_center"
        return p

    #Setting Up Widgets
    text_input = TextInput(value="Kronos", title="Keywords to search:")
    
    #data_table = DataTable(source=source, columns=columns, width=400, height=280)
    
    # Set up layouts and add to document
    layout = column(text_input, create_figure())
    
    # Set up callbacks
    def text_input_change(attrname, old, new):
        layout.children[1] = create_figure()

    text_input.on_change('value', text_input_change)
    doc.add_root(layout)

handler1 = FunctionHandler(modify_doc2)
app1 = Application(handler1)
show(app1, notebook_url = "localhost:8888")