In [123]:
import glob
import collections
from datetime import datetime


# Senators grouped by party. These names are compared with, and used to look up,
# the per-senator results directory names, so spelling and case must match.
independent = {'King'}
democrats = {
    'Stabenow', 'Feinstein', 'klobuchar', 'Menendez', 'Cantwell', 'Warren',
    'Gillibrand', 'Tester', 'Manchin', 'Murphy', 'Baldwin', 'hirono', 'Carper',
    'Sanders', 'kaine', 'Brown', 'Heinrich', 'Whitehouse', 'Casey',
}
republican = {'Wicker', 'Barrasso', 'Fischer', 'Cruz'}

# Only the first NUM_TAGS predicted tags of every line are kept.
NUM_TAGS = 5

# Module-level accumulators, filled in as a side effect of get_tags_by_name().
count_by_tags = collections.defaultdict(int)          # tag -> number of occurrences
tags_by_date = collections.defaultdict(list)          # date directory name -> flat list of tags
hashtags_min_date = {}                                # tag -> earliest datetime it was seen
hashtags_max_date = {}                                # tag -> latest datetime it was seen
date_tag = collections.defaultdict(list)              # tag -> [date string, 1] pairs, all senators
date_tag_republican = collections.defaultdict(list)   # same, senators listed in `republican`
date_tag_democrats = collections.defaultdict(list)    # same, every senator not in `republican`


def get_tags_by_name(name):
    """Load one senator's predicted hashtags and update the global tallies.

    For every entry under the senator's results directory, reads that entry's
    ``predicted_tags.txt``. Each line is a whitespace-separated list of tags,
    of which only the first ``NUM_TAGS`` are kept. Blank lines and lines whose
    first tag is ``'UNK'`` are skipped.

    Side effects: updates the module-level accumulators ``count_by_tags``,
    ``tags_by_date``, ``date_tag``, ``date_tag_republican`` /
    ``date_tag_democrats``, ``hashtags_min_date`` and ``hashtags_max_date``.
    Calling this twice for the same senator therefore counts the data twice.

    Args:
        name: Name of the senator's directory under the results folder.

    Returns:
        A ``defaultdict(list)`` mapping each date directory name to a list of
        tag lists, one per kept line.

    Raises:
        OSError: If an entry has no readable ``predicted_tags.txt``.
        ValueError / IndexError: If a date directory that contains kept lines
            is not named ``<year>-<month>-<day>``.
    """
    tags = collections.defaultdict(list)
    for path_date in glob.glob(f"/home/simi/projects/tweet2vec/results/{name}/*"):
        # The date is the last path component.
        date = path_date.rsplit('/', 1)[-1]
        path = f"{path_date}//predicted_tags.txt"

        with open(path, 'r') as f:
            for line in f:
                # split() with no separator drops the newline and collapses
                # runs of whitespace, so blank lines and stray spaces never
                # produce an empty-string "tag" that would be counted.
                str_tags = line.split()[:NUM_TAGS]
                if not str_tags or str_tags[0] == 'UNK':
                    continue

                for t in str_tags:
                    count_by_tags[t] += 1
                tags_by_date[date].extend(str_tags)
                tags[date].append(str_tags)

                date_parts = [int(x) for x in date.split('-')]
                date_in_type = datetime(date_parts[0], date_parts[1], date_parts[2])
                date_str = str(date_in_type.date())

                # NOTE(review): every senator not listed in `republican`,
                # including those in `independent`, is tallied with the
                # democrats. Confirm this is intended.
                if name in republican:
                    party_date_tag = date_tag_republican
                else:
                    party_date_tag = date_tag_democrats

                for tag in str_tags:
                    # Separate list objects per accumulator, so changing one
                    # entry can never affect the other.
                    date_tag[tag].append([date_str, 1])
                    party_date_tag[tag].append([date_str, 1])

                    # Track the first and last date each tag was seen.
                    if tag not in hashtags_min_date or date_in_type < hashtags_min_date[tag]:
                        hashtags_min_date[tag] = date_in_type
                    if tag not in hashtags_max_date or date_in_type > hashtags_max_date[tag]:
                        hashtags_max_date[tag] = date_in_type

    return tags
    
# Map every senator directory found under results/ to that senator's tags per date.
tags_by_name = {}
names = list(glob.glob("/home/simi/projects/tweet2vec/results/*"))
for path_name in names:
    # The senator's name is the final path component.
    name = path_name.rpartition('/')[2]
    tags_by_name[name] = get_tags_by_name(name)

In [124]:
# (count, tag) pairs, most frequent first; equal counts are ordered by tag, descending.
l_tags = sorted(((count, tag) for tag, count in count_by_tags.items()), reverse=True)

In [125]:
import plotly.offline as py
import plotly.figure_factory as ff

# Initialise plotly's offline notebook mode before any figure is drawn.
py.init_notebook_mode(connected=True)

TOP_K = 20  # number of most frequent hashtags shown in the Gantt chart
df = []     # Gantt rows (one dict per hashtag), appended to further down in this cell

# Highest count among the TOP_K most frequent tags. Slicing instead of indexing
# range(TOP_K) avoids an IndexError when fewer than TOP_K distinct tags exist,
# and default=0 covers an empty l_tags.
max_tag = max((count for count, _ in l_tags[:TOP_K]), default=0)

def get_bucket(max_tag, num_buckets, my_val):
    """Return the label of the equal-width bucket containing ``my_val``.

    The range from 0 to ``max_tag`` is cut into ``num_buckets`` buckets of
    width ``max_tag / num_buckets``. Each bucket excludes its lower edge and
    includes its upper edge.

    Args:
        max_tag: Upper end of the bucketed range.
        num_buckets: Number of buckets; must be non-zero.
        my_val: Value to classify.

    Returns:
        ``"<lower> < <upper>"`` with both edges truncated to ints, or
        ``"< <max_tag>"`` when ``my_val`` falls in none of the buckets.
    """
    width = max_tag / num_buckets
    for index in range(num_buckets):
        lower = index * width
        upper = (index + 1) * width
        if lower < my_val <= upper:
            return f"{int(lower)} < {int(upper)}"
    # No bucket matched (e.g. my_val is 0 or above max_tag).
    return f"< {max_tag}"

NUM_BUCKETS = 10  # number of count ranges the hashtags are grouped into

# One Gantt row per top hashtag, spanning the first to the last date it was seen.
# Slicing l_tags (rather than indexing over range(TOP_K)) keeps this working
# when fewer than TOP_K distinct tags were collected.
for count, tag in l_tags[:TOP_K]:
    s_date = hashtags_min_date[tag]
    e_date = hashtags_max_date[tag]
    s_date = f"{s_date.year}-{s_date.month}-{s_date.day}"
    e_date = f"{e_date.year}-{e_date.month}-{e_date.day}"
    # The bucket label is the row's Resource, which the chart uses as its index column.
    bucket = get_bucket(max_tag, NUM_BUCKETS, count)
    df.append(dict(Task=tag, Start=s_date, Finish=e_date, Resource=f'{bucket}'))

# NOTE(review): `colors` is assigned but never passed to create_gantt below.
colors = "Viridis" # ["#FCB711", "#F37021", "#CC004C", "#6460AA", "#0089D0", "#0DB14B"]

# Rows are passed in reverse order of how they were appended.
fig = ff.create_gantt(df[::-1], bar_width=0.25,  title='Hashtag Over Time',
                      showgrid_x=True, show_colorbar=True, index_col='Resource')
py.iplot(fig)

In [126]:
import plotly.graph_objs as go
import pandas as pd

def get_tagdf(tag_name, d_tags,  cumsum=True):
    """Build a scatter trace of how often ``tag_name`` occurs per date.

    Args:
        tag_name: Hashtag to plot; also used as the trace name.
        d_tags: Mapping from tag to a list of ``[date, count]`` pairs.
        cumsum: When true, plot the running total over the dates instead of
            the per-date totals.

    Returns:
        A ``go.Scatter`` trace, or ``None`` when ``d_tags`` holds no entries
        for ``tag_name``.
    """
    # Use .get() instead of indexing: d_tags is normally a defaultdict, and
    # indexing a missing tag would insert an empty entry into it.
    taggis = d_tags.get(tag_name)
    if not taggis:
        return None

    # sorted() works on a copy, so the caller's list is not reordered.
    df_date_tag = pd.DataFrame(sorted(taggis))
    # Column 0 is the date, column 1 the count; sum the counts per date.
    df_date_tag = df_date_tag.groupby(0).sum()
    if cumsum:
        df_date_tag = df_date_tag.cumsum()
    return go.Scatter(x=df_date_tag.index, y=df_date_tag[1], name=tag_name)

# Hashtags whose cumulative usage is plotted across all senators.
tag_names = ['obamacare',
             'trumpcare',
             'taxreform',
             'netneutrality',
             'WhatAreTheyHiding',
             'Kavanaugh',
             'Russia', 'Ohio']

dts = [get_tagdf(tn, date_tag) for tn in tag_names]

# get_tagdf returns None for tags without data. Test for None explicitly
# instead of relying on the truthiness of a trace object.
data = [dt for dt in dts if dt is not None]

# The title repeats the filename text so this figure can be told apart from
# the per-party figures, which are otherwise built the same way.
layout = dict(
    title = "Hashtags over time over all senators"
)

fig = dict(data=data, layout=layout)

py.iplot(fig, filename = 'Hashtags over time over all senators')


In [127]:
# Hashtags whose cumulative usage is plotted for the senators listed in `republican`.
tag_names = ['obamacare',
             'trumpcare',
             'taxreform',
             'netneutrality',
             'WhatAreTheyHiding',
             'Kavanaugh',
             'Russia', 'Ohio', 'jobs', 'healthcare']

dts = [get_tagdf(tn, date_tag_republican) for tn in tag_names]

# get_tagdf returns None for tags without data. Test for None explicitly
# instead of relying on the truthiness of a trace object.
data = [dt for dt in dts if dt is not None]

# The title repeats the filename text so this figure can be told apart from
# the other "Hashtags over time" figures, which are built the same way.
layout = dict(
    title = "Hashtags over time for republicans"
)

fig = dict(data=data, layout=layout)

py.iplot(fig, filename = 'Hashtags over time for republicans')


In [120]:
# Hashtags whose cumulative usage is plotted from date_tag_democrats, which
# holds the tags of every senator not listed in `republican`.
tag_names = ['obamacare',
             'trumpcare',
             'taxreform',
             'netneutrality',
             'WhatAreTheyHiding',
             'Kavanaugh',
             'Russia', 'Ohio']

dts = [get_tagdf(tn, date_tag_democrats) for tn in tag_names]

# get_tagdf returns None for tags without data. Test for None explicitly
# instead of relying on the truthiness of a trace object.
data = [dt for dt in dts if dt is not None]

# The title repeats the filename text so this figure can be told apart from
# the other "Hashtags over time" figures, which are built the same way.
layout = dict(
    title = "Hashtags over time for democrats"
)

fig = dict(data=data, layout=layout)

py.iplot(fig, filename = 'Hashtags over time for democrats')


In [110]:
# Plot distribution of top 5 hashtags for each senator

def count_all_hashtags(tags):
    """Count how often each hashtag occurs across all dates.

    Args:
        tags: Mapping from date to a list of tag lists.

    Returns:
        A ``defaultdict(int)`` mapping each hashtag to its number of occurrences.
    """
    totals = collections.defaultdict(int)
    for tag_lists in tags.values():
        for tag_list in tag_lists:
            for hashtag in tag_list:
                totals[hashtag] += 1
    return totals

# Grouped bar charts of the hashtags each senator used more than once,
# SENATORS_PER_PLOT senators per chart.
#
# The earlier counter started at 1 but was reset to 0 after each chart, so the
# first chart held 4 senators and later ones 5, and senators left over after
# the last full chart were never plotted. Batching on len(data) and flushing
# after the loop fixes both.
SENATORS_PER_PLOT = 4

data = []
for sen, tags in tags_by_name.items():
    num_tags_by_name = count_all_hashtags(tags)

    # Keep only hashtags that occur more than once.
    num_tags_by_name = {k:v for (k,v) in num_tags_by_name.items() if v > 1}

    data.append(
            go.Bar(
                x = list(num_tags_by_name.keys()),
                y = list(num_tags_by_name.values()),
                name = sen
            )
    )
    if len(data) == SENATORS_PER_PLOT:
        py.iplot(data, filename='hashtags_by_senator')
        data.clear()

# Plot whatever is left over from the last, partial batch.
if data:
    py.iplot(data, filename='hashtags_by_senator')
    data.clear()
    


    


In [114]:



def _repeated_hashtag_bar(sen, tags):
    """Return a bar trace of the hashtags in ``tags`` that occur more than once, named ``sen``."""
    counts = count_all_hashtags(tags)
    repeated = {k: v for (k, v) in counts.items() if v > 1}
    return go.Bar(
        x = list(repeated.keys()),
        y = list(repeated.values()),
        name = sen
    )


# Republicans: all senators in a single chart. Iterating a sorted copy gives
# the bars the same order on every run (set iteration order is not stable).
data = []
for sen in sorted(republican, key=str.lower):
    tags = tags_by_name[sen]
    data.append(_repeated_hashtag_bar(sen, tags))
py.iplot(data, filename='hashtags_by_senator')
data.clear()


# Democrats: DEMOCRATS_PER_PLOT senators per chart.
#
# The earlier counter started at 1 but was reset to 0 after each chart, so
# batch sizes differed (6, then 7) and the senators left over after the last
# full chart were never plotted. Batching on len(data) and flushing after the
# loop fixes both.
DEMOCRATS_PER_PLOT = 6

data = []
for sen in sorted(democrats, key=str.lower):
    tags = tags_by_name[sen]
    data.append(_repeated_hashtag_bar(sen, tags))
    if len(data) == DEMOCRATS_PER_PLOT:
        py.iplot(data, filename='hashtags_by_senator')
        data.clear()

# Plot whatever is left over from the last, partial batch.
if data:
    py.iplot(data, filename='hashtags_by_senator')
    data.clear()
    


