# Libraries

In [1]:
import requests
import os
import yaml
os.chdir('../')

In [2]:
import datetime

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Shared matplotlib defaults: a wide figure and one font size for legend,
# axis labels, titles and tick labels.
params = {
    'legend.fontsize': 14,
    'figure.figsize': (15, 8),
    'axes.labelsize': 14,
    'axes.titlesize': 14,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
}
plt.rcParams.update(params)

# DataFrame display settings for the notebook output.
pd.set_option('display.max_columns', 50)
pd.set_option('display.precision', 2)

In [5]:
from cloudant.view import View
from cloudant.client import Cloudant
from cloudant.document import Document

In [6]:
def unfold_keys(df):
    """Expand the sequence-valued 'key' column into one column per element.

    The first row's key determines how many elements are unpacked. Element
    ``i`` of every key is placed in a new column named ``level_{i+1}``,
    inserted at position ``i``; the 'key' column itself is then removed.

    Args:
        df: DataFrame with a 'key' column whose values are indexable
            sequences (e.g. lists) of equal length.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    key_column = 'key'
    df = df.copy()

    # Nothing to unpack: with no rows there is no first key to measure.
    if df.empty:
        del df[key_column]
        return df

    # Positional access so the frame does not need an index label equal to 0
    # (e.g. after filtering or re-indexing).
    n_levels = len(df[key_column].iloc[0])
    for i in range(n_levels):
        # Bind i as a default so the lambda does not depend on the loop variable.
        df.insert(i, 'level_{}'.format(i + 1),
                  df[key_column].apply(lambda key, i=i: key[i]))
    del df[key_column]
    return df

In [7]:
# safe_load only constructs plain Python objects, so a tampered config file
# cannot trigger arbitrary object construction; it also avoids the
# Loader-less yaml.load() call, which newer PyYAML releases reject.
with open("config.yaml", 'r') as ymlfile:
    cfg = yaml.safe_load(ymlfile)

In [8]:
# Keep only the COUCHDB section of the configuration. `cfg` is rebound here,
# so the other sections are no longer reachable through this name.
cfg = cfg['COUCHDB']
# Connect using the configured user, password and host URL.
client = Cloudant(cfg['user'], cfg['password'], url=cfg['host'])
client.connect()
# Handles to the two databases used below; their names come from the config.
tweets_db = client[cfg['tweets_db']]
aurin_db = client[cfg['aurin_db']]

# Scan over tweets_db that would print every document whose JSON text does not
# start with '{', plus a progress counter every 10000 documents.
# NOTE(review): the unconditional `break` is the first statement of the loop
# body, so the loop exits immediately and nothing after it ever runs —
# presumably the scan was switched off on purpose; confirm before removing it.
i = 0
for document in tweets_db:
    break
    if document.json()[0] != '{':
        print(document)
    i += 1
    if divmod(i, 10000)[1] == 0:
        print(i)

# Load data

In [9]:
def preprocess_view_data(df):
    """Turn raw view rows into a tweet-count table with a five-level index.

    The 'key' column is unpacked with ``unfold_keys`` and the resulting six
    columns are named sa4_area, gccsa_area, state, sentiment, date and
    tweets_count. Dates are parsed from '%m/%d/%Y' and stored back as strings,
    rows are ordered by descending tweets_count, rows whose state is
    'Other Territories' and rows containing missing values are removed, and
    every column except tweets_count becomes part of the index.

    Args:
        df: DataFrame of view rows as accepted by ``unfold_keys``.

    Returns:
        DataFrame with the single column 'tweets_count'.
    """
    column_names = ['sa4_area', 'gccsa_area', 'state', 'sentiment', 'date', 'tweets_count']
    index_names = column_names[:-1]

    table = unfold_keys(df)
    table.columns = column_names

    # Parse first, then stringify, so the date ends up in the datetime
    # string form rather than the original month/day/year text.
    parsed_dates = table['date'].apply(lambda x: datetime.datetime.strptime(x, '%m/%d/%Y'))
    table['date'] = parsed_dates.astype(str)

    table = table.sort_values('tweets_count', ascending=False)
    table = table[table.state != 'Other Territories']
    table = table.dropna()
    return table.set_index(index_names)

In [10]:
def get_sentiment_df(data, level):
    """Aggregate tweet counts into one row per `level` value and one column per sentiment.

    Args:
        data: DataFrame whose index has a 'sentiment' level and the level
            named by `level`.
        level: name of the index level to group the rows by.

    Returns:
        DataFrame indexed by `level`, with the sentiment columns ordered from
        the largest overall total to the smallest. Missing combinations are 0.
    """
    # Spread sentiments into columns, then collapse every other index level.
    wide = data.unstack(level='sentiment', fill_value=0).groupby(level=level).sum()
    # Drop the outer column level left over from the unstacked value column.
    wide.columns = wide.columns.droplevel(0)

    totals = wide.sum().sort_values(ascending=False)
    ordered_columns = list(totals.index.values)
    return wide[ordered_columns]

## Load statistics for all tweets

In [11]:
# Query the 'geo_sentiment_counts' view of the '_design/sentiment_analysis'
# design document in tweets_db.
ddoc = Document(tweets_db, '_design/sentiment_analysis')
view = View(ddoc, 'geo_sentiment_counts', )
# group=True and stale='ok' are passed as query options (presumably one reduced
# row per distinct key, served from the existing index — confirm against the
# view definition). All returned rows are loaded into a DataFrame.
with view.custom_result(group=True, stale='ok') as rslt:
    df = pd.DataFrame.from_records(rslt.all())

In [12]:
# Sum of the view's 'value' column over all returned rows (before preprocessing).
df['value'].sum()

202807

In [13]:
# Unpack the keys, drop 'Other Territories' and incomplete rows, and index by
# area / state / sentiment / date.
data_df = preprocess_view_data(df)

In [14]:
# Total tweet count that remains after preprocessing filtered some rows out.
data_df['tweets_count'].sum()

197174

In [15]:
# Sentiment counts aggregated per GCCSA area and per SA4 area.
gccsa_sentiment_df = get_sentiment_df(data_df, 'gccsa_area')
sa4_sentiment_df = get_sentiment_df(data_df, 'sa4_area')

## Load statistics for topic tweets

In [86]:
# Query a topic-specific view of the '_design/topics' design document.
# The active view is 'blockchain-all-stat'; the commented-out lines are
# alternative views / design documents that can be swapped in instead.
ddoc = Document(tweets_db, '_design/topics')
view = View(ddoc, 'blockchain-all-stat', )
#view = View(ddoc, 'ML-all-stat', )

#ddoc = Document(tweets_db, '_design/topic_testing')
#view = View(ddoc, 'view-all-Cryptos')
# NOTE: this rebinds `df`, replacing the all-tweets rows loaded earlier.
with view.custom_result(group=True, stale='ok') as rslt:
    df = pd.DataFrame.from_records(rslt.all())

In [87]:
# Sum of the topic view's 'value' column over all returned rows.
df['value'].sum()

1829

In [88]:
# Same preprocessing and per-area aggregation as for the all-tweets data,
# applied to the topic view rows.
topic_data_df = preprocess_view_data(df)
topic_gccsa_sentiment_df = get_sentiment_df(topic_data_df, 'gccsa_area')
topic_sa4_sentiment_df = get_sentiment_df(topic_data_df, 'sa4_area')

In [112]:
# Grand total over all GCCSA areas and sentiments for the topic tweets.
topic_gccsa_sentiment_df.sum().sum()

1799

## Load Aurin Data

In [89]:
# Build name -> title lookups from the 'selectedAttributes' list of two AURIN
# dataset documents; they are used below to rename data columns to readable titles.
income_names = aurin_db['dataset-AU_Govt_ABS-gccsa_g02_selected_medians_and_averages_census_2016-gccsa_2016']['selectedAttributes']
income_names = {item['name']:item['title']  for item in income_names}
    
labour_names = aurin_db['dataset-AU_Govt_ABS-sa4_g43b_lbr_frc_status_by_age_by_sex_census_2016-sa4_2016']['selectedAttributes']
labour_names = {item['name']:item['title']  for item in labour_names}

### SA4 Median Income

In [90]:
# One record per feature of the 'SA4_median_income' document, taken from each
# feature's 'properties'; indexed by 'sa4_name16'.
data = [item['properties'] for item in aurin_db['SA4_median_income']['features']]
sa4_median_income_df = pd.DataFrame.from_records(data).set_index('sa4_name16')
# The code column is not needed once the frame is indexed by name.
del sa4_median_income_df['sa4_code16']
# Replace attribute names with their titles.
sa4_median_income_df.rename(columns=income_names, inplace=True)
#sa4_median_income_df.head()

### SA4_Labour_Force

In [91]:
# One record per feature of the 'SA4_Labour_Force' document, indexed by 'sa4_name16'.
data = [item['properties'] for item in aurin_db['SA4_Labour_Force']['features']]
sa4_labour_force_df = pd.DataFrame.from_records(data).set_index('sa4_name16')
del sa4_labour_force_df['sa4_code16']
# Replace attribute names with their titles.
sa4_labour_force_df.rename(columns=labour_names, inplace=True)
# Unemployment rate in percent: unemployed persons divided by the labour force, times 100.
sa4_labour_force_df['Unemployment Rate'] = sa4_labour_force_df['Persons Total unemployed Total'] / \
                                           sa4_labour_force_df['Persons Total labour force Total'] *100
#sa4_labour_force_df.head()

### GCCSA Median Income

In [92]:
# One record per feature of the 'GCCSA_median_income' document, indexed by 'gcc_name16'.
data = [item['properties'] for item in aurin_db['GCCSA_median_income']['features']]
gccsa_median_income_df = pd.DataFrame.from_records(data).set_index('gcc_name16')
del gccsa_median_income_df['gcc_code16']
# Replace attribute names with their titles.
gccsa_median_income_df.rename(columns=income_names, inplace=True)
#gccsa_median_income_df

### GCCSA_Labour_Force

In [93]:
# One record per feature of the 'GCCSA_Labour_Force' document, indexed by 'gcc_name16'.
data = [item['properties'] for item in aurin_db['GCCSA_Labour_Force']['features']]
gccsa_labour_force_df = pd.DataFrame.from_records(data).set_index('gcc_name16')
del gccsa_labour_force_df['gcc_code16']
# Replace attribute names with their titles.
gccsa_labour_force_df.rename(columns=labour_names, inplace=True)
# Unemployment rate in percent: unemployed persons divided by the labour force, times 100.
gccsa_labour_force_df['Unemployment Rate'] = gccsa_labour_force_df['Persons Total unemployed Total'] / \
                                           gccsa_labour_force_df['Persons Total labour force Total'] *100
#gccsa_labour_force_df

## GCCSA Data

In [94]:
# Attach the AURIN income and labour-force columns to the per-GCCSA sentiment
# counts (joined on the index) and export the combined table.
gccsa_df = gccsa_sentiment_df.join(gccsa_median_income_df).join(gccsa_labour_force_df)
gccsa_df.to_csv('gccsa.csv')

In [95]:
# Same join and export for the topic-specific GCCSA sentiment counts.
topic_gccsa_df = topic_gccsa_sentiment_df.join(gccsa_median_income_df).join(gccsa_labour_force_df)
topic_gccsa_df.to_csv('topic_gccsa.csv')

## SA4 Data

In [96]:
# Attach the AURIN income and labour-force columns to the per-SA4 sentiment
# counts (joined on the index) and export the combined table.
sa4_df = sa4_sentiment_df.join(sa4_median_income_df).join(sa4_labour_force_df)
sa4_df.to_csv('sa4.csv')

In [97]:
# Same join and export for the topic-specific SA4 sentiment counts.
topic_sa4_df = topic_sa4_sentiment_df.join(sa4_median_income_df).join(sa4_labour_force_df)
topic_sa4_df.to_csv('topic_sa4.csv')

# Visualisations

In [98]:
import plotly
import plotly.graph_objs as go
from plotly.graph_objs import Scatter, Layout
plotly.offline.init_notebook_mode(connected=True)

## Tweets by States

In [62]:
# Total tweet count per state, largest first.
plot_df = data_df.groupby(level='state').sum().sort_values('tweets_count', ascending=False)

In [63]:
# Pie chart of tweet counts per state: state names as labels, the flattened
# count column as slice values.
labels = plot_df.index.values.tolist()
values = plot_df.values.ravel().tolist()
# Slices show value and label outside the pie; hovering shows label and percentage.
trace = go.Pie(labels=labels, values=values, 
               hoverinfo='label+percent', sort=True, showlegend=False,
               textinfo='value+label', textposition='outside')
layout = go.Layout(title='Number of Collected Tweets by States')
fig = go.Figure(data=[trace], layout=layout)
plotly.offline.iplot(fig, filename='basic_pie_chart')

In [64]:
# Total topic-tweet count per state, largest first (rebinds `plot_df`).
plot_df = topic_data_df.groupby(level='state').sum().sort_values('tweets_count', ascending=False)

In [65]:
# Pie chart of topic-tweet counts per state; same construction as the
# all-tweets pie chart, with a different title.
labels = plot_df.index.values.tolist()
values = plot_df.values.ravel().tolist()
trace = go.Pie(labels=labels, values=values, 
               hoverinfo='label+percent', sort=True, showlegend=False,
               textinfo='value+label', textposition='outside')
layout = go.Layout(title='Number of Collected Topic Tweets by States')
fig = go.Figure(data=[trace], layout=layout)
plotly.offline.iplot(fig, filename='basic_pie_chart')

In [66]:
def plot_area_chart(plot_df, graph_title):
    """Draw a stacked area chart of sentiment counts with a territory drop-down.

    For every territory (first column level of `plot_df`) one filled line per
    sentiment is added, each line carrying the running sum of the sentiments
    added so far, followed by an invisible placeholder trace whose name acts
    as the legend title. Only the first territory's traces start visible; each
    drop-down button shows one territory's traces and updates the title.

    Reads the module-level `sentiment_columns` and `sentiment_colors`.

    Args:
        plot_df: DataFrame with two-level columns (territory, sentiment); the
            index supplies the x values.
        graph_title: base title of the figure.

    Returns:
        None. The figure is rendered with ``plotly.offline.iplot``.
    """
    territories = plot_df.columns.get_level_values(0).unique()
    # Each territory owns a run of consecutive trace slots: one per sentiment
    # column plus one for the legend-title placeholder.
    slots_per_territory = len(sentiment_columns) + 1
    total_slots = territories.shape[0] * slots_per_territory

    traces = []
    visibility_masks = []
    for t_pos, territory in enumerate(territories):
        shown_initially = (t_pos == 0)
        first_slot = t_pos * slots_per_territory
        flags = np.zeros(total_slots)
        counts = plot_df[territory][sentiment_columns]

        stacked = []
        for s_pos, sentiment in enumerate(counts.columns):
            stacked.append(sentiment)
            colour = sentiment_colors[sentiment]
            traces.append(go.Scatter(
                x=counts.index.values,
                # Running total of the sentiments added so far gives the stacking.
                y=counts[stacked].sum(axis=1).values.ravel(),
                # Hover shows this sentiment's own count, not the running total.
                text=counts[sentiment],
                hoverinfo='x+text',
                fill='tonexty',
                visible=shown_initially,
                fillcolor=colour,
                line=dict(color=colour),
                name=sentiment,
            ))
            flags[first_slot + s_pos] = 1

        # Placeholder with no data: its name serves as a heading in the legend.
        traces.append(go.Scatter(
            x=[None],
            y=[None],
            name='<b>Tweet Sentiment</b>',
            visible=shown_initially,
            line={'color': 'rgba(0, 0, 0, 0)'},
        ))
        flags[first_slot + len(sentiment_columns)] = 1

        visibility_masks.append(list(flags == 1))

    buttons = [
        dict(
            label=territories[pos],
            method='update',
            args=[{'visible': visible},
                  {'title': '{} in {}'.format(graph_title, territories[pos])}],
        )
        for pos, visible in enumerate(visibility_masks)
    ]
    updatemenus = [dict(
        showactive=False,
        buttons=buttons,
        x=0.01,
        xanchor='left',
        y=1.0,
        yanchor='top',
    )]

    layout = go.Layout(
        title=graph_title,
        xaxis=dict(title='Tweet Creation Date'),
        yaxis=dict(title='Number of Collected Tweets'),
        updatemenus=updatemenus,
    )
    figure = go.Figure(data=traces, layout=layout)
    plotly.offline.iplot(figure, filename='basic-area')

## Tweets by Sentiment, GCCSA Area and Time

In [67]:
# Sentiment labels in the order they are stacked in the area charts, and the
# fill/line colour used for each label.
sentiment_columns = ['Positive', 'Neutral', 'Negative']
sentiment_colors = {'Positive': 'rgb(120,220,120)', 'Negative':'rgb(245,99,99)', 'Neutral': 'rgb(250,250,150)'}

In [68]:
def add_missed_columns(df):
    """Add a zero-filled column for every missing (level 0, level 1) pair.

    The column index must be a two-level MultiIndex. Every combination of a
    level-0 value and a level-1 value that has no column yet is added with the
    constant value 0.

    Args:
        df: DataFrame with two-level columns. New columns are added to this
            frame in place.

    Returns:
        A copy of the completed frame with its columns sorted.
    """
    columns = df.columns
    l0_values, l1_values = columns.levels

    # `MultiIndex.labels` was renamed to `codes`; fall back only for pandas
    # versions that predate the rename.
    codes = getattr(columns, 'codes', None)
    if codes is None:
        codes = columns.labels

    # Mark the combinations that already exist. The index must be a tuple of
    # arrays: a list of arrays is not interpreted as one index per axis by
    # current numpy.
    present = np.zeros((len(l0_values), len(l1_values)), dtype=bool)
    present[tuple(np.asarray(level_codes) for level_codes in codes)] = True

    for i, j in zip(*np.nonzero(~present)):
        df[l0_values[i], l1_values[j]] = 0
    return df.sort_index(axis=1, level=0)

In [69]:
def transform_df(df):
    """Reshape tweet counts into a date-by-(territory, sentiment) table for plotting.

    Counts are summed per date, GCCSA area and sentiment, spread into
    two-level columns (gccsa_area, sentiment), limited to dates from
    '2018-04-01' onwards, and completed with zero columns for missing
    combinations. A nation-wide total per sentiment is then inserted in front
    under the territory name 'All Australia'.

    Reads the module-level `sentiment_columns`.

    Args:
        df: DataFrame with index levels 'date', 'gccsa_area' and 'sentiment'
            and a single count column.

    Returns:
        DataFrame indexed by date with two-level columns.
    """
    group_columns = ['date', 'gccsa_area', 'sentiment']
    plot_df = df.groupby(level=group_columns).sum().unstack(level=['gccsa_area', 'sentiment'], fill_value=0)
    # Copy the slice: columns are added to it below, which must not act on a
    # view of the unstacked frame.
    plot_df = plot_df.loc['2018-04-01':].copy()
    plot_df.columns = plot_df.columns.droplevel(0)
    plot_df = add_missed_columns(plot_df)
    plot_df = plot_df.sort_index(level=0, axis=1)

    # Sum over territories per sentiment. Grouping the transposed frame avoids
    # the deprecated column-wise groupby(axis=1).
    temp_df = plot_df.T.groupby(level='sentiment').sum().T
    for col in sentiment_columns:
        plot_df.insert(0, ('All Australia', col), temp_df[col])
    return plot_df

### All Tweets

In [70]:
# Stacked area chart of all collected tweets per creation date, switchable by territory.
plot_area_chart(transform_df(data_df), graph_title = 'Number of Tweets by Creation Date')

The same chart, restricted to blockchain-related tweets:

In [71]:
# Same chart for the topic-specific tweets loaded from the topic view.
plot_area_chart(transform_df(topic_data_df), graph_title = 'Number of Blockchain-Related Tweets by Creation Date')

# Sentiment By States

## Plotly plots

In [100]:
# Mapbox token used by the map figure. It can be supplied through the
# MAPBOX_ACCESS_TOKEN environment variable so that it does not have to live in
# the notebook; the previously hard-coded token remains the fallback so
# existing runs keep working.
# NOTE(review): consider rotating the embedded token and dropping the fallback.
mapbox_access_token = os.environ.get(
    'MAPBOX_ACCESS_TOKEN',
    'pk.eyJ1Ijoidml0YWx5LXlha3V0ZW5rbyIsImEiOiJjamd3ZGJpMTQwcDA2MzNsNjg2dTJzeG1zIn0.shxkPXdNpbwF1UI2Gm24fg',
)

In [101]:
# Load one centre point per SA4 area, indexed by 'SA4_name'.
sa4_points_df = pd.read_json('notebooks/sa4_center.json')
sa4_points_df.set_index('SA4_name', inplace=True)
# 'Centre' holds a nested sequence; element [0][1] is used as latitude and
# element [0][0] as longitude.
sa4_points_df['lat'] = sa4_points_df.Centre.apply(lambda x: x[0][1])
sa4_points_df['lng'] = sa4_points_df.Centre.apply(lambda x: x[0][0])
del sa4_points_df['Centre']

In [102]:
# Add the lat/lng columns to the SA4 table (joined on the index).
# NOTE(review): rebinding sa4_df means re-running this cell joins again onto a
# frame that already has these columns — presumably that fails on the overlap; verify.
sa4_df = sa4_df.join(sa4_points_df)

In [103]:
# Replace the numeric rate with display text: one decimal place plus '%'.
# NOTE: the column holds strings afterwards, so this cell cannot be re-run on
# the same frame ('{:.1f}' does not accept a string).
sa4_df['Unemployment Rate'] = sa4_df['Unemployment Rate'].apply(lambda x: '{:.1f}%'.format(x))

In [113]:
# Columns shown in each map marker's hover text: the sentiment counts followed
# by the income and unemployment figures.
columns = sentiment_columns + ['Median total personal income weekly', 
                               'Median total family income weekly', 'Unemployment Rate']
def format_title(vector):
    """Render key/value pairs as multi-line hover text.

    Each pair becomes one ``key: value`` line. The result starts with a line
    break and uses ``<br>`` between lines.

    Args:
        vector: object exposing ``items()`` that yields (key, value) pairs,
            e.g. a row passed in by ``DataFrame.apply(..., axis=1)``.

    Returns:
        str: the formatted text; just ``'<br>'`` when `vector` is empty.
    """
    # items() rather than iteritems(), which current pandas no longer provides.
    lines = ['{}: {}'.format(key, value) for key, value in vector.items()]
    # '<br>' is the valid line-break tag; '</br>' is not.
    return '<br>' + '<br>'.join(lines)
# Build the hover text per SA4 row from the selected columns, converted to strings first.
sa4_df['text'] = sa4_df[columns].astype(str).apply(format_title, axis=1)

In [114]:
# Custom blue-to-grey colour scale.
# NOTE: not referenced by the trace below, which uses the 'Viridis' scale.
scl = [[0, "rgb(5, 10, 172)"], [0.35, "rgb(40, 60, 190)"], [0.5, "rgb(70, 100, 245)"],
       [0.6, "rgb(90, 120, 245)"], [0.7, "rgb(106, 137, 247)"], [1, "rgb(220, 220, 220)"]]

# One marker per SA4 area at its centre point, labelled with the area name and
# coloured by median weekly personal income; hovering shows the prepared text.
data = [dict(
    type='scattermapbox',
    lat=sa4_df.lat,
    lon=sa4_df.lng,
    text=list(sa4_df.index.values.ravel()),
    mode='markers+text',
    hoverinfo='text',
    hovertext=sa4_df.text,
    # Plain dict instead of the deprecated go.Marker wrapper; the figure is
    # passed to iplot as nested dicts with validate=False anyway.
    marker=dict(
        size=12,
        opacity=0.6,
        autocolorscale=False,
        colorscale='Viridis',
        cmin=0,
        color=sa4_df['Median total personal income weekly'],
        cmax=sa4_df['Median total personal income weekly'].max(),
        # The title names the column that actually drives the colour (it
        # previously read "Positive Sentiment").
        colorbar=dict(title='Median total personal income weekly'),
    ),
)]

layout = dict(
    title='Scatter Mapplot of AURIN and Twitter Data',
    # NOTE(review): 'colorbar' does not look like a layout-level attribute and
    # is presumably ignored (validate=False lets it through) — confirm and drop.
    colorbar=True,
    width=1000,
    height=850,
    # Plain dict instead of the deprecated go.Margin wrapper.
    margin=dict(l=10, r=10, b=10, t=35, pad=4),
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        center=dict(lat=-28, lon=134),
        pitch=0,
        zoom=3.8,
    ),
)

fig = dict(data=data, layout=layout)
plotly.offline.iplot(fig, validate=False, filename='')