##### Run the application on the Bokeh server using the following command:
bokeh serve --show SMDM_Project.ipynb

##### Importing all the required packages

In [5]:
from bokeh.io import curdoc, output_file
from bokeh.layouts import row, column, widgetbox
from bokeh.models import ColumnDataSource
from bokeh.models.widgets import Slider, TextInput, DataTable, TableColumn
from bokeh.plotting import figure
from bokeh.models.glyphs import Text
from bokeh.models.tools import HoverTool
from bokeh.models.annotations import LabelSet
import bokeh.palettes
import pandas as pd
import numpy as np

from ipywidgets import widgets
from IPython.display import display
from IPython.display import clear_output

from collections import Counter
from datetime import datetime, date
from numpy import pi
import csv
import re

##### Module to split the words in the tweets and count the number of occurrences

In [14]:
def word_counter(comments, review):
    """Count the words appearing in all comments labelled *review*.

    Parameters:
        comments: iterable of rows where item 0 is the comment text and
            item 1 is its sentiment label.
        review: the sentiment label whose comments should be counted.

    Returns:
        A ``Counter`` mapping each lower-cased, whitespace-delimited token
        to its number of occurrences. When no comment matches, the joined
        text is empty and the result holds a single empty-string token.
    """
    # Lower-case so that counting is case-insensitive; a generator avoids
    # building a throwaway list just to join it.
    extracted_comments = " ".join(c[0].lower() for c in comments if c[1] == review)
    # Raw string: "\s" inside a normal literal is an invalid escape sequence.
    words = re.split(r"\s+", extracted_comments)
    return Counter(words)

##### Module to count the number of elements in each class in the training data set, to calculate the prior probability

In [6]:
def review_count(review,train_data):
    """Return how many rows of *train_data* carry the label *review*.

    Each row is expected to expose its sentiment label at index 1.
    """
    matches = 0
    for entry in train_data:
        if entry[1] == review:
            matches += 1
    return matches

##### Module to perform the actual prediction by calculating the conditional probability and then multiplying it by the prior probability

In [7]:
def predict(review, counts_num, probability, classCount):
    """Score *review* against one sentiment class.

    Parameters:
        review: the text to score; it is split on runs of whitespace.
        counts_num: mapping of word -> occurrences of that word in the
            training comments of the class.
        probability: the prior probability of the class.
        classCount: value added to the total word count in the smoothing
            denominator (callers in this file pass the number of training
            reviews of the class).

    Returns:
        The product of the per-word terms multiplied by *probability*.

    Raises:
        ZeroDivisionError: if the class has no counted words and
            *classCount* is 0.
    """
    # Named token_counts so the sibling function review_count is not shadowed.
    token_counts = Counter(re.split(r"\s+", review))
    # The denominator does not depend on the word, so compute it once
    # instead of re-summing the whole vocabulary for every token.
    denominator = sum(counts_num.values()) + classCount

    prediction = 1
    for word, occurrences in token_counts.items():
        # Add-one smoothed likelihood of the word in this class, weighted by
        # how many times the word occurs in the review.
        # NOTE(review): textbook multinomial Naive Bayes raises the likelihood
        # to the power of the count; the weighting is kept as-is so existing
        # scores do not change.
        prediction *= occurrences * ((counts_num.get(word, 0) + 1) / denominator)

    # Fold in the prior probability of the class.
    return prediction * probability

##### Module to compute the probability score of a tweet for one class (this cell repeats the prediction module defined above)

In [8]:
def predict(review, counts_num, probability, classCount):
    """Return the smoothed class score of *review*.

    An earlier cell in this file also defines ``predict``; because this
    definition comes later, it is the one in effect.

    Parameters:
        review: text to score, tokenised on runs of whitespace.
        counts_num: word -> occurrence count for the class being scored.
        probability: prior probability of that class.
        classCount: term added to the total word count when smoothing.

    Returns:
        ``probability`` multiplied by one factor per distinct token, where
        each factor is ``count_in_review * (count_in_class + 1) / (total + classCount)``.

    Raises:
        ZeroDivisionError: when the total word count plus *classCount* is 0.
    """
    # Raw-string pattern; "\s" in a plain literal is an invalid escape.
    tokens = Counter(re.split(r"\s+", review))
    # Hoisted out of the loop: the vocabulary total is the same for every token.
    smoothing_total = sum(counts_num.values()) + classCount

    prediction = 1
    for token in tokens:
        likelihood = (counts_num.get(token, 0) + 1) / smoothing_total
        prediction *= tokens[token] * likelihood

    # Multiply by the a-priori probability of the class.
    return prediction * probability

##### Module to classify the tweets into classes based on the probabilities calculated

In [9]:
def decide(review, predict, negative_counts, negative_probability, negative_review_count, positive_counts, positive_probability, positive_review_count):
    """Label *review* as 'Negative' or 'Positive'.

    *predict* is called once per class (negative first, then positive) with
    the review, that class's word counts, prior probability and review count.
    The class with the strictly larger score wins; a tie is reported as
    'Positive'.
    """
    score_negative = predict(review, negative_counts, negative_probability, negative_review_count)
    score_positive = predict(review, positive_counts, positive_probability, positive_review_count)

    # Only a strictly higher negative score yields 'Negative'.
    return 'Negative' if score_negative > score_positive else 'Positive'

##### Module to concatenate all the tweets to print in the output screen

In [11]:
def update_text(neg_comments,pos_comments):
    """Show the classified tweets in the two text panes.

    Parameters:
        neg_comments: tweets predicted as negative, written to ``datan``.
        pos_comments: tweets predicted as positive, written to ``datap``.

    Each tweet is wrapped in a leading and a trailing newline. An empty
    collection clears the corresponding pane.
    """
    # Build each string once and assign it once: appending tweet by tweet
    # copies the growing string every time and reassigns the widget's
    # ``text`` property for every single tweet.
    datan.text = "".join("\n" + comment + "\n" for comment in neg_comments)
    datap.text = "".join("\n" + comment + "\n" for comment in pos_comments)

##### Module to perform Naive Bayes. This module calls other modules to calculate the probabilities and make the predictions

In [12]:
def _drop_named_renderer(fig, name):
    """Remove the first renderer called *name* from *fig*, if there is one."""
    matches = fig.select(name=name)
    if len(matches) > 0:
        fig.renderers.remove(matches[0])


def _drop_named_tools(fig, name):
    """Detach every tool called *name* from the toolbar of *fig*."""
    fig.toolbar.tools = [tool for tool in fig.toolbar.tools if tool.name != name]


def _load_training_data(path):
    """Read the labelled CSV at *path* into a two-column (comment, sentiment) array."""
    comments = []
    sentiments = []
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(path, encoding='latin1', newline='') as csvfile:
        for record in csv.reader(csvfile, delimiter=',', quotechar='"'):
            comments.append(record[0])
            sentiments.append(record[1])
    return np.column_stack([comments, sentiments])


def _load_topic_tweets(path, topic):
    """Return (tweets, dates) for the rows of *path* whose tweet mentions *topic*."""
    needle = topic.lower()
    tweets = []
    dates = []
    with open(path, encoding='latin1', newline='') as csvfile:
        for record in csv.reader(csvfile, delimiter=',', quotechar='"'):
            # The tweet text is the value in column 6 of the dataset
            tweet = record[5]
            # Filter while streaming so that only matching rows are kept in
            # memory and only their timestamps are parsed.
            if needle not in tweet.lower():
                continue
            # The timestamp is in column 3; rebuild it as "Mon DD YYYY HH:MM:SS"
            # NOTE(review): the fixed slices assume timestamps shaped like
            # "Mon Apr 06 22:19:45 PDT 2009" - confirm against the dataset.
            stamp = record[2]
            dt = stamp[4:11]+stamp[24:28]+" "+stamp[11:19].strip()
            dates.append(datetime.strptime(dt, '%b %d %Y %H:%M:%S').date())
            tweets.append(tweet)
    return tweets, dates


def naive_bayes(topic):
    """Classify every tweet mentioning *topic* and redraw all charts.

    Steps:
        1. Remove the glyphs and hover tools drawn for the previous topic.
        2. Train on "data/train.csv" (word counts and prior per class).
        3. Collect the tweets containing *topic* (case-insensitive) from the
           Twitter dataset and label each one 'Positive' or 'Negative'.
        4. Update the text panes, the dot plot, the pie chart and the two
           per-date line charts, or set a "no tweets" title when nothing
           matched.

    Parameters:
        topic: the text to search for inside the tweets.

    Side effects:
        Mutates the module-level figures ``dot``, ``p``, ``lp`` and ``ln``,
        rewrites the ``datan``/``datap`` panes through ``update_text``,
        rebinds the globals ``hover`` and ``hover1`` and prints a progress
        message.
    """
    global p, dot, hover, hover1
    clear_output()

    # Clearing the charts that were plotted for the previous tweet topic
    _drop_named_renderer(dot, "first")
    _drop_named_renderer(dot, "second")
    _drop_named_renderer(p, "pie")
    _drop_named_renderer(lp, "pl")
    _drop_named_renderer(ln, "nl")
    # The hover tooltips carry hard-coded numbers for one topic, so the tools
    # of the previous topic must go as well; otherwise they pile up and keep
    # showing stale values.
    _drop_named_tools(dot, "h1")
    _drop_named_tools(p, "h")

    # Read in the training data set: each row is (comment, sentiment)
    train_data = _load_training_data("data/train.csv")

    # Calculate the word frequencies of the negative and positive training comments
    negative_counts = word_counter(train_data, 'Negative')
    positive_counts = word_counter(train_data, 'Positive')

    # Calculate the total number of training comments available for each class
    positive_review_count = review_count('Positive',train_data)
    negative_review_count = review_count('Negative',train_data)

    # Calculating the apriori probability of each class
    # apriori <- (number of documents in a class) / (total number of documents)
    positive_probability = positive_review_count / len(train_data)
    negative_probability = negative_review_count / len(train_data)

    # Load the tweets (and their dates) that contain the entered tweet topic
    topic_tweets, dt_tm_sub = _load_topic_tweets(
        "sentiment140/training.1600000.processed.noemoticon.csv", topic)

    # Calculate the number of tweets that contain the entered tweet topic
    tweets_cnt = len(topic_tweets)

    # Classify each tweet. The whole tweet text is scored (indexing the text
    # would score a single character), lower-cased because the training
    # vocabulary built by word_counter is lower-cased too.
    predictions = [
        decide(tweet.lower(), predict, negative_counts, negative_probability, negative_review_count,
               positive_counts, positive_probability, positive_review_count)
        for tweet in topic_tweets
    ]

    pos_count = 0
    neg_count = 0

    neg_comments = []
    pos_comments = []

    neg_dt = []
    pos_dt = []

    # From the predictions made, count the number of positive and negative comments
    # and separate the positive and negative comments into 2 separate lists
    for label, tweet, tweet_date in zip(predictions, topic_tweets, dt_tm_sub):
        if label == 'Negative':
            neg_count += 1
            neg_comments.append(tweet)
            neg_dt.append(tweet_date)
        else:
            pos_count += 1
            pos_comments.append(tweet)
            pos_dt.append(tweet_date)

    # Find the number of negative and positive tweets for each date (line charts)
    dfn = pd.DataFrame({'Date': neg_dt})
    dfp = pd.DataFrame({'Date': pos_dt})
    dfn = dfn.groupby(['Date'], sort=True).size().reset_index(name='Count')
    dfp = dfp.groupby(['Date'], sort=True).size().reset_index(name='Count')

    # Show the tweets predicted to be negative and positive in their text panes
    update_text(neg_comments,pos_comments)

    print("Calculations completed for the topic",topic,"...")

    # The charts need to be plotted only when the entered topic has any tweets
    has_tweets = tweets_cnt > 0
    if has_tweets:
        pos_pct = round(pos_count*100/tweets_cnt,2)
        neg_pct = round(neg_count*100/tweets_cnt,2)
    else:
        pos_pct = 0
        neg_pct = 0

    p.title.text = "Entered Tweet Topic: " + topic

    # Cumulative fractions of the circle: 0 -> positive share -> whole circle
    fractions = [0, round(pos_pct/100,2), round((pos_pct+neg_pct)/100,2)]

    # Wedge start/end angles. The loop variable is deliberately not called
    # "p", which is the name of the global pie-chart figure.
    starts = [fraction*2*pi for fraction in fractions[:-1]]
    ends = [fraction*2*pi for fraction in fractions[1:]]

    # A color for each piece of the pie chart:
    # Green represents positive comments, Red represents negative comments
    colors = ['Green','Red']

    if has_tweets:
        # Dot plot with the absolute number of comments per class
        dot.segment(0, ["Positive", "Negative"], [pos_count,neg_count], ["Positive", "Negative"], name="first", line_width=2, line_color="green", )
        dot.circle([pos_count,neg_count], ["Positive", "Negative"], name="second", size=15, fill_color="orange", line_color="green", line_width=3, )
        hover1 = HoverTool(tooltips = [('Positive Comments', str(pos_count)), ('Negative Comments', str(neg_count))], name="h1")
        dot.add_tools(hover1)

        # Pie chart with the share of each class
        p.wedge(x=0, y=0, radius=0.5, name="pie", alpha=0.3, start_angle=starts, end_angle=ends, color=colors)

        hover = HoverTool(tooltips = [('Positive Comments', str(pos_pct)+'%'), ('Negative Comments', str(neg_pct)+'%')], name="h")
        p.add_tools(hover)

        # Number of positive / negative comments per date
        lp.line(dfp['Date'], dfp['Count'], line_width=2, color='Green', name="pl")
        ln.line(dfn['Date'], dfn['Count'], line_width=2, color='Red', name="nl")
    else:
        # No charts are drawn; just change the title of the chart to state
        # that there are no related tweets available
        p.title.text = "There are no tweets available for the topic " + topic

##### Main module to plot the graphs and invoke all the other modules

In [13]:
import pandas as pd

from bokeh.io import curdoc
from bokeh.layouts import row, column, gridplot
from bokeh.models import ColumnDataSource
from bokeh.models.widgets import PreText, Select
from bokeh.plotting import figure

# Text panes: negative tweets, positive tweets and an empty spacer between them
datan = PreText(width=500, text='')
datap = PreText(width=450, text='')
blank = PreText(width=50, text='')


def _make_trend_figure(title):
    """Create an empty, toolbar-less date/count line chart titled *title*."""
    trend = figure(x_axis_type="datetime", title=title, tools="", toolbar_location=None, width=500, height=250)
    trend.grid.grid_line_alpha = 0.3
    trend.xaxis.axis_label = 'Date'
    trend.yaxis.axis_label = 'Number of Comments'
    return trend


# Dot plot: one categorical row per sentiment class
dot = figure(tools="", toolbar_location=None, y_range=["Positive", "Negative"], width=500, height=200)
dot.xaxis.axis_label = 'Number of Comments'

# Pie chart canvas: axes and grid are hidden
p = figure(x_range=(-1,1), y_range=(-1,1), tools="", toolbar_location=None, width=500, height=400)
p.axis.visible = False
p.grid.visible = False

# Per-date line charts for the positive (lp) and negative (ln) tweets
lp = _make_trend_figure("Positive Comments")
ln = _make_trend_figure("Negative Comments")

def update_title(attrname, old, new):
    """Change callback of the topic text box.

    The callback arguments are not used; the current value is read from the
    module-level ``text`` widget, stripped of surrounding whitespace and
    passed to ``naive_bayes``.
    """
    naive_bayes(text.value.strip())
    
# Topic text box; every change of its value re-runs the analysis
text = TextInput(value='', title="Enter a tweet topic: ")
text.on_change('value', update_title)

inputs = widgetbox(text)

# Right-hand side: topic input above the pie chart and the dot plot
main = column(inputs,p,dot)
# Left-hand side: each line chart above its tweets, separated by the spacer
# NOTE: this rebinds the name "widgets" imported from ipywidgets earlier in the file
widgets = row(column(lp,datap),blank,column(ln,datan))
layout = row(widgets,main)

# Register the page with the Bokeh document served by "bokeh serve"
curdoc().title = "Sentiment Analysis using Naive Bayes"
curdoc().add_root(layout)