# 1. Package Imports

In [2]:
%reset
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import re
import copy

import pandas as pd
import seaborn as sns

from bokeh.charts import Histogram, TimeSeries, show
from bokeh.io import output_notebook

# Configure bokeh so that subsequent show() calls render inside the notebook's
# output cells.
output_notebook()

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


# 2. Load Twitter datasets

In [3]:
def load_data(name):
    """Read a JSON file of tweets into a chronologically ordered DataFrame.

    Parameters
    ----------
    name : path (or anything else ``pd.read_json`` accepts)
        Location of the JSON file to load.

    Returns
    -------
    pandas.DataFrame
        The file's contents without the 'fullname', 'id', 'url' and 'user'
        columns, sorted by 'timestamp' (oldest first) and re-indexed 0..n-1.
    """
    unused_columns = ['fullname', 'id', 'url', 'user']

    tweets = pd.read_json(name)
    trimmed = tweets.drop(unused_columns, axis=1)

    # Oldest tweet first; discard the pre-sort index so rows are numbered in time order.
    return trimmed.sort_values('timestamp').reset_index(drop=True)

In [4]:
# Pair each short event label with the JSON file that holds its tweets.
mappings = zip(
    ['parkland', 'pulse', 'sanbernardino', 'sutherland', 'vegas'],
    [
        'parklandshooting.json',
        'pulsenightclub.json',
        'sanbernardino.json',
        'sutherlandsprings.json',
        'vegasshooting.json',
    ],
)

# Load every file from the datasets/ directory into a {label: DataFrame} dictionary.
datasets = {label: load_data('datasets/' + filename) for label, filename in mappings}

# 3. Timeseries data (Volume of tweets) 

In [5]:
def plot_timeseries(datasets, freq='30min', span='8D'):
    """Bin each event's tweets into fixed-width intervals on a shared time axis.

    Every event is shifted so that its earliest tweet falls at "time 0"
    (``pd.to_datetime(0)``), which lets events that happened on different
    dates be overlaid on one chart. Despite its name, this function does not
    draw anything; it only prepares the data for a chart.

    Parameters
    ----------
    datasets : dict
        Maps an event label to a DataFrame with a 'timestamp' column and a
        'text' column. The input frames are neither copied nor modified.
    freq : str, optional
        Width of each bin as a pandas frequency string (default '30min').
    span : str, optional
        Total length of the shared axis, parsed by ``pd.to_timedelta``
        (default '8D'). Tweets later than ``span`` after the event's first
        tweet are not counted.

    Returns
    -------
    dict
        One entry per event label holding a NumPy array of tweet counts per
        bin, plus a 'date' entry holding the list of bin timestamps
        (the x-axis labels). An event labelled 'date' would be overwritten.
    """
    # Shared origin for all events and the bin edges that every series is aligned to.
    st = pd.to_datetime(0)
    date_range = pd.date_range(start=st, end=st + pd.to_timedelta(span), freq=freq)

    series = {}
    for key, db in datasets.items():
        # to_datetime is a no-op for datetime columns and keeps empty/object
        # columns from breaking the .dt accessor below.
        timestamps = pd.to_datetime(db['timestamp'])

        # Time elapsed since the event's first tweet, rounded to the nearest bin.
        elapsed = (timestamps - timestamps.min()).dt.round(freq)

        # Count tweets (rows with non-null text) per bin. Only the 'text'
        # column is copied, so the caller's frame is left untouched.
        num_tweets = (
            db[['text']]
            .assign(timestamp=st + elapsed)
            .groupby('timestamp')['text']
            .count()
        )

        # Align to the shared axis: empty bins become 0, bins past `span` are dropped.
        series[key] = num_tweets.reindex(date_range, fill_value=0).values

    series['date'] = date_range.tolist()
    return series

In [31]:
# Overlay the binned tweet counts of every event on one chart, one line per event label.
timeseries_chart = TimeSeries(
    plot_timeseries(datasets),
    x='date',
    y=list(datasets),
    title='Timeseries data of mass shootings',
    ylabel='Tweet volume (30 min)',
    plot_width=400,
    plot_height=400,
)

show(timeseries_chart)

# 4. Histogram of tag density within tweets

In [78]:
def tag_nontag_ratio(datasets):
    """Compute, for every tweet, the fraction of its words that are tags.

    A word is a whitespace-separated token; a tag is a word that starts with
    '@' or '#'.

    Parameters
    ----------
    datasets : dict
        Maps an event label to a DataFrame with a 'text' column of strings.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with two columns: 'ratio' (tags / words; NaN for a
        tweet with no words) and 'key' (the event label the tweet belongs to).
        Each event's rows keep the index of their source frame, so index
        values repeat across events. An empty frame with the same two columns
        is returned when ``datasets`` is empty.
    """
    frames = []

    for key, db in datasets.items():
        # Tokenise once and reuse the tokens for both counts.
        words = db['text'].str.split()

        # Word count of tweet
        word_c = words.apply(len)

        # Number of words in the tweet body which start with '@' or '#'
        tag_c = words.apply(
            lambda tokens: sum(token.startswith(('@', '#')) for token in tokens)
        )

        frames.append(pd.DataFrame({'ratio': tag_c / word_c, 'key': key}))

    if not frames:
        # pd.concat raises on an empty list; keep the documented column layout instead.
        return pd.DataFrame(columns=['ratio', 'key'])

    # Concatenate once instead of growing a frame inside the loop
    # (DataFrame.append is quadratic and no longer exists in pandas 2.x).
    return pd.concat(frames)

In [79]:
# Compute the per-tweet tag ratios in this cell: no other cell assigns `output`,
# so on a fresh kernel the chart below would otherwise fail with a NameError.
output = tag_nontag_ratio(datasets)

# One histogram of the 'ratio' column per event label, coloured by label.
show(
    Histogram(
        output,
        title="Histogram of % tags within tweet",
        values="ratio",
        label="key",
        color="key", legend="top_right", bins=10, plot_width=400, plot_height=400
    )
)