In [1]:
import pandas as pd
import random
import collections
from datetime import datetime, timedelta
from email.utils import parsedate_tz



def to_datetime(datestring):
    """Parse a date string with ``email.utils.parsedate_tz`` into a naive datetime.

    Surrounding whitespace is stripped before parsing. The first six parsed
    fields (year .. second) build the datetime, and the timezone offset
    reported by the parser (in seconds) is subtracted from it.
    """
    *fields, utc_offset = parsedate_tz(datestring.strip())
    wall_clock = datetime(*fields[:6])
    return wall_clock - timedelta(seconds=utc_offset)

# Load the header-less tweet dump and assign explicit column names.
tweets = pd.read_csv(
    "senators-1-tweets.csv",
    header=None,
    names=['hashtags', 'text', 'user', 'user_location', 'created_at'],
)
print(f'num tweets: {len(tweets)}')

def process_tweet(tweet):
    """Flatten a nested tweet mapping into a small flat dict.

    Reads ``entities.hashtags[*].text``, ``full_text``, ``user.screen_name``,
    ``user.location`` and ``created_at`` from ``tweet`` and returns them under
    the keys ``hashtags``, ``text``, ``user``, ``user_loc`` and ``created_at``.
    """
    return {
        'hashtags': [entry['text'] for entry in tweet['entities']['hashtags']],
        'text': tweet['full_text'],
        'user': tweet['user']['screen_name'],
        'user_loc': tweet['user']['location'],
        'created_at': tweet['created_at'],
    }


# Explode the tweets into one row per (tweet, hashtag) pair and collect
# per-hashtag statistics along the way:
#   hashtags_sum       -> number of occurrences of each normalised hashtag
#   hashtags_min_date  -> earliest datetime at which the hashtag was seen
#   hashtags_max_date  -> latest datetime at which the hashtag was seen
#   date_tag           -> list of [YYYY-MM-DD, 1] entries per hashtag
#   new_tags_df        -> copy of the tweet row with column 0 replaced by the
#                         single normalised hashtag and column 4 by the day
tweets_table = tweets.values
N = len(tweets_table)
hashtags_sum = collections.defaultdict(int)
hashtags_min_date = {}
hashtags_max_date = {}

date_tag = collections.defaultdict(list)

new_tags_df = []

for i, tw in enumerate(tweets_table):
    if i % 10000 == 0:
        print('{}%'.format(100./N*i), end=' ')

    # The timestamp belongs to the tweet, not to the tag: parse it once.
    date = to_datetime(tw[4])
    day = str(date.date())

    # Column 0 holds the stringified hashtag list (e.g. "['a', 'b']").
    # NOTE: a cell without hashtags ("[]") still yields one empty-string tag;
    # that behaviour is kept so the row counts downstream do not change.
    for tag in tw[0].replace('[', '').replace(']', '').replace('\'', '').split(","):
        l_tag = tag.strip().lower()
        hashtags_sum[l_tag] += 1
        date_tag[l_tag].append([day, 1])
        new_tw = tw.copy()
        new_tw[0] = l_tag
        new_tw[4] = day
        new_tags_df.append(new_tw)

        # Track the first/last appearance for EVERY tag of the tweet, keyed by
        # the normalised tag so it matches the keys of hashtags_sum.
        if l_tag not in hashtags_min_date or date < hashtags_min_date[l_tag]:
            hashtags_min_date[l_tag] = date
        if l_tag not in hashtags_max_date or date > hashtags_max_date[l_tag]:
            hashtags_max_date[l_tag] = date

new_tags_df = pd.DataFrame(new_tags_df)
    


num tweets: 449334
0.0% 2.2255159858813265% 4.451031971762653% 6.6765479576439795% 8.902063943525306% 11.127579929406632% 13.353095915287959% 15.578611901169285% 17.804127887050612% 20.02964387293194% 22.255159858813265% 24.48067584469459% 26.706191830575918% 28.931707816457244% 31.15722380233857% 33.3827397882199% 35.608255774101224% 37.833771759982554% 40.05928774586388% 42.28480373174521% 44.51031971762653% 46.73583570350786% 48.96135168938918% 51.18686767527051% 53.412383661151836% 55.637899647033166% 57.86341563291449% 60.08893161879582% 62.31444760467714% 64.53996359055847% 66.7654795764398% 68.99099556232112% 71.21651154820245% 73.44202753408378% 75.66754351996511% 77.89305950584642% 80.11857549172775% 82.34409147760908% 84.56960746349041% 86.79512344937173% 89.02063943525306% 91.24615542113439% 93.47167140701572% 95.69718739289704% 97.92270337877837% 

In [2]:
# (count, hashtag) pairs ordered from the most to the least frequent hashtag.
l_tags = sorted(
    ((count, name) for name, count in hashtags_sum.items()),
    reverse=True,
)

In [3]:
import plotly.offline as py
import plotly.figure_factory as ff

# Enable inline rendering of plotly figures in the notebook.
py.init_notebook_mode(connected=True)

TOP_K = 20  # number of most frequent hashtags shown in the Gantt chart
df = list()  # Gantt rows, filled further down in this cell
# Largest occurrence count among the TOP_K first entries of l_tags.
max_tag = max(l_tags[rank][0] for rank in range(TOP_K))

def get_bucket(max_tag, num_buckets, my_val):
    """Return the label of the equal-width bucket that contains ``my_val``.

    The range ``(0, max_tag]`` is divided into ``num_buckets`` intervals that
    are open on the left and closed on the right. The label has the form
    ``"<lower> < <upper>"`` with both bounds truncated to int. Values that fall
    into none of the intervals get the label ``"< <max_tag>"``.
    """
    width = max_tag / num_buckets
    edges = ((idx * width, (idx + 1) * width) for idx in range(num_buckets))
    for lower, upper in edges:
        if lower < my_val <= upper:
            return f"{int(lower)} < {int(upper)}"
    return f"< {max_tag}"

NUM_BUCKETS = 10

# One Gantt row per top hashtag: the bar spans first to last appearance and
# the bucket label of its occurrence count is used as the colour index.
for i in range(TOP_K):
    tag = l_tags[i][1]
    s_date, e_date = hashtags_min_date[tag], hashtags_max_date[tag]
    s_date, e_date = (f"{d.year}-{d.month}-{d.day}" for d in (s_date, e_date))
    bucket = get_bucket(max_tag, NUM_BUCKETS, l_tags[i][0])
    df.append({'Task': tag, 'Start': s_date, 'Finish': e_date, 'Resource': str(bucket)})

# Currently not passed to create_gantt.
colors = "Viridis" # ["#FCB711", "#F37021", "#CC004C", "#6460AA", "#0089D0", "#0DB14B"]

# Rows are handed over in reverse order.
fig = ff.create_gantt(
    list(reversed(df)),
    bar_width=0.25,
    title='Hashtag Over Time',
    showgrid_x=True,
    show_colorbar=True,
    index_col='Resource',
)
py.iplot(fig)

In [4]:
import plotly.graph_objs as go

def get_tagdf(tag_name, cumsum=True):
    """Build a plotly ``Scatter`` trace of the per-day usage of one hashtag.

    Parameters
    ----------
    tag_name : str
        Key into the module-level ``date_tag`` mapping; also used as the
        trace name.
    cumsum : bool, default True
        If true, plot the running total of the daily counts instead of the
        daily counts themselves.

    Returns
    -------
    plotly.graph_objs.Scatter
        x holds the dates, y the (cumulative) counts. A hashtag without any
        recorded entry yields an empty trace.
    """
    # .get() keeps an unknown tag from being inserted into the defaultdict.
    entries = date_tag.get(tag_name, [])
    if not entries:
        return go.Scatter(x=[], y=[], name=tag_name)

    # groupby orders the dates itself, so the shared list is left unsorted
    # and unmodified.
    df_date_tag = pd.DataFrame(entries).groupby(0, sort=True).sum()
    if cumsum:
        df_date_tag = df_date_tag.cumsum()
    return go.Scatter(x=df_date_tag.index, y=df_date_tag[1], name=tag_name)

# Cumulative usage of four selected hashtags (get_tagdf defaults to cumsum=True).
dt1, dt2, dt3, dt4 = (
    get_tagdf(name)
    for name in ('obamacare', 'trumpcare', 'taxreform', 'netneutrality')
)

data = [dt1, dt2, dt3, dt4]

layout = {'title': "Hashtags over time"}

fig = {'data': data, 'layout': layout}

py.iplot(fig, filename='Hashtags over time')


In [5]:
# Same four hashtags as daily counts (cumsum disabled).
dt1, dt2, dt3, dt4 = (
    get_tagdf(name, cumsum=False)
    for name in ('obamacare', 'trumpcare', 'taxreform', 'netneutrality')
)

data = [dt1, dt2, dt3, dt4]

layout = {'title': "Hashtags over time"}

fig = {'data': data, 'layout': layout}

py.iplot(fig, filename='Hashtags over time')

In [6]:
#### Hashtags of a concrete politician over time
# Politicians re-elected 2019:
# SenFeinstein, SenAngusKing, SenMurphyOffice, SenatorCarper, maziehirono, SenWarren
# SenStabenow, amyklobuchar, SenatorWicker, SenatorTester, SenatorFischer, SenatorMenendez
# MartinHeinrich, SenGillibrand, SenSherrodBrown, SenBobCasey, SenWhitehouse, SenTedCruz
# SenSanders, timkaine, SenatorCantwell, Sen_JoeManchin, SenatorBaldwin, SenJohnBarrasso

# Twitter handles of the senators listed above.
sen_2019 = [
    "SenFeinstein", "SenAngusKing", "SenMurphyOffice", "SenatorCarper",
    "maziehirono", "SenWarren", "SenStabenow", "amyklobuchar",
    "SenatorWicker", "SenatorTester", "SenatorFischer", "SenatorMenendez",
    "MartinHeinrich", "SenGillibrand", "SenSherrodBrown", "SenBobCasey",
    "SenWhitehouse", "SenTedCruz", "SenSanders", "timkaine",
    "SenatorCantwell", "Sen_JoeManchin", "SenatorBaldwin", "SenJohnBarrasso",
]
print(len(sen_2019))
# Keep only the rows whose user column (2) is one of those handles.
tweets_by_sen2019 = new_tags_df[new_tags_df[2].isin(sen_2019)]
# Number of distinct handles that actually occur in the data.
print(tweets_by_sen2019.groupby(2).ngroups)
# rows per senator (non-null entries of column 0)
tweets_by_sen2019.groupby(2)[0].count()

24
24


2
MartinHeinrich     10864
SenAngusKing        4800
SenBobCasey         5848
SenFeinstein        6436
SenGillibrand       8284
SenJohnBarrasso     5238
SenMurphyOffice      228
SenSanders          1634
SenSherrodBrown     5698
SenStabenow         3242
SenTedCruz          7354
SenWarren           2798
SenWhitehouse      11870
Sen_JoeManchin      6476
SenatorBaldwin     11950
SenatorCantwell    15696
SenatorCarper       8568
SenatorFischer      8960
SenatorMenendez    13798
SenatorTester       8870
SenatorWicker       8046
amyklobuchar        2638
maziehirono         7114
timkaine            3522
Name: 0, dtype: int64

In [7]:
import plotly.graph_objs as go
import numpy as np

def get_timeseries_from_dates(time_list, name,  cumsum=True):
    """Count how often each date occurs and return the counts as a Scatter trace.

    Parameters
    ----------
    time_list : sequence
        One date value per event; equal values are grouped together.
    name : str
        Name of the resulting trace.
    cumsum : bool, default True
        If true, plot the running total over the sorted dates instead of the
        per-date counts.

    Returns
    -------
    plotly.graph_objs.Scatter
        x holds the distinct dates, y the (cumulative) counts.
    """
    dates = list(time_list)
    # Keep dates and counts in separate, individually typed columns. Stacking
    # them into a single NumPy array would coerce the numeric ones to strings
    # whenever the dates arrive as plain str instead of object dtype.
    df_date_tag = pd.DataFrame({0: dates, 1: np.ones(len(dates))})
    df_date_tag = df_date_tag.groupby(0).sum()
    if cumsum:
        df_date_tag = df_date_tag.cumsum()
    return go.Scatter(x=df_date_tag.index, y=df_date_tag[1], name=name)

def by_senator(sen, cumsum=True):
    """Return the date time series trace for all rows of one senator.

    Selects the rows of ``tweets_by_sen2019`` whose user column (2) equals
    ``sen`` and passes their date column (4) to ``get_timeseries_from_dates``,
    using ``sen`` as the trace name.
    """
    is_sen = tweets_by_sen2019[2] == sen
    dates = tweets_by_sen2019.loc[is_sen].values[:, 4]
    return get_timeseries_from_dates(dates, sen, cumsum)

# dt1..dt4 all hold a trace for the same senator; they are not used in the figure below.
dt1, dt2, dt3, dt4 = (by_senator("SenSanders") for unused in range(4))

# One cumulative trace per senator in sen_2019.
data = list(map(by_senator, sen_2019)) #, dt2, dt3, dt4]

layout = {'title': "Cumsum of tweets from senators over time"}

fig = {'data': data, 'layout': layout}

py.iplot(fig, filename='Hashtags over time')
 




In [8]:
# Plot the cumulative usage of each senator's `topk` most used hashtags,
# plus one trace ('others') for all of their remaining hashtags.

for sen in sen_2019:
    topk = 8
    tw_by_sen = tweets_by_sen2019.loc[tweets_by_sen2019[2] == sen]
    # Rank hashtags (column 0) by their number of rows, counted on column 1.
    top_k_tags = tw_by_sen.groupby(0).count().nlargest(topk, 1).index.values

    # Iterate over the tags actually found: a senator may have used fewer
    # than `topk` distinct hashtags, so indexing with range(topk) can overrun.
    data = [get_timeseries_from_dates(tw_by_sen[tw_by_sen[0] == tag].values[:, 4], tag)
            for tag in top_k_tags]
    data.append(get_timeseries_from_dates(tw_by_sen[~tw_by_sen[0].isin(top_k_tags)].values[:, 4], 'others',
                                  cumsum=True))

    layout = dict(
        title = f"Cumsum of tweets from @{sen} over time"
    )

    fig = dict(data=data, layout=layout)

    py.iplot(fig, filename = f"Cumsum of tweets from @{sen} over time")




In [18]:
# Plot the per-day usage (not cumulative) of each senator's top 5 hashtags,
# restricted to rows dated after 2017-06-01.

for sen in sen_2019:
    topk = 5
    tw_by_sen = tweets_by_sen2019.loc[tweets_by_sen2019[2] == sen]
    # Column 4 holds YYYY-MM-DD strings (as the literal below implies), so a
    # plain string comparison orders them chronologically.
    tw_by_sen = tw_by_sen[tw_by_sen[4] > '2017-06-01']
    top_count = tw_by_sen.groupby(0).count()
    top_k_tags = top_count.nlargest(topk, 1).index.values

    # The trace name carries the hashtag and its total count. .loc[tag, 1]
    # reads the scalar directly instead of calling int() on a one-element
    # Series, which pandas has deprecated.
    data = [get_timeseries_from_dates(tw_by_sen[tw_by_sen[0] == tag].values[:, 4],
                                      tag + " #" + str(int(top_count.loc[tag, 1])),
                                      cumsum=False)
            for tag in top_k_tags]

    # The traces are per-day counts (cumsum=False), so the title says
    # "Number", in line with the filename.
    layout = dict(
        title = f"Number of tweets from @{sen} over time"
    )

    fig = dict(data=data, layout=layout)

    py.iplot(fig, filename = f"Number of tweets from @{sen} over time")




In [10]:
# Element-wise check of the date column (4) against the string '2017'.
tw_by_sen[4].gt('2017')

402       True
406       True
416       True
425       True
2033      True
2037      True
2046      True
2047      True
2048      True
2049      True
3097      True
3098      True
3103      True
3104      True
3850      True
3851      True
4483      True
4508      True
4509      True
4510      True
5399      True
5405      True
5407      True
5408      True
5409      True
5410      True
5411      True
5418      True
5419      True
5435      True
          ... 
622186    True
622203    True
623075    True
623076    True
623077    True
623087    True
623110    True
624430    True
624431    True
624432    True
624436    True
624437    True
624441    True
624446    True
624447    True
626881    True
626882    True
626935    True
627555    True
627560    True
627561    True
627562    True
628836    True
628837    True
628850    True
628851    True
628853    True
630576    True
630591    True
630592    True
Name: 4, Length: 820, dtype: bool

In [11]:
# Plot Senator Cruz: per-day usage of his Texas-related hashtags compared
# with all of his other hashtags.

tw_by_sen = tweets_by_sen2019.loc[tweets_by_sen2019[2] == "SenTedCruz"]
top_k_tags = np.array(['texas', 'tx', 'texans'])

data = [get_timeseries_from_dates(tw_by_sen[tw_by_sen[0] == tag].values[:, 4], tag, cumsum=False)
        for tag in top_k_tags]

data.append(get_timeseries_from_dates(tw_by_sen[~tw_by_sen[0].isin(top_k_tags)].values[:, 4], 'others',
                                  cumsum=False))

# The traces are per-day counts (cumsum=False), so the title says "Number".
layout = dict(
    title = "Number of tweets from @SenTedCruz over time"
)

fig = dict(data=data, layout=layout)

# Name the senator explicitly: `sen` is a stale loop variable left over from
# an earlier cell and does not refer to SenTedCruz.
py.iplot(fig, filename = "Num tweets from @SenTedCruz at time")


In [12]:
# Rows whose hashtag (column 0) is not one of top_k_tags.
tw_by_sen.loc[~tw_by_sen[0].isin(top_k_tags)]

Unnamed: 0,0,1,2,3,4
969,fy18ndaa,Glad to see @POTUS sign #FY18NDAA into law. Th...,SenTedCruz,"Houston, Texas",2017-12-12
979,guard381,Happy 381st Birthday to the U.S. @NationalGuar...,SenTedCruz,"Houston, Texas",2017-12-13
980,iran,RT @USUN: TUNE IN: Ambassador Nikki Haley will...,SenTedCruz,"Houston, Texas",2017-12-14
989,fy18ndaa,The #FY18NDAA supports US Navy port calls with...,SenTedCruz,"Houston, Texas",2017-12-15
990,fy18ndaa,"This past Monday, President Trump signed #FY18...",SenTedCruz,"Houston, Texas",2017-12-15
2291,amarillo,Glad to speak about jobs and economic growth w...,SenTedCruz,"Houston, Texas",2017-08-18
2295,ussjohnsmccain,Prayers for our sailors injured in #USSJohnSMc...,SenTedCruz,"Houston, Texas",2017-08-21
2321,smallbiz,Strong trade relations benefit our #smallbiz o...,SenTedCruz,"Houston, Texas",2017-08-22
2323,laredo,Strong trade relations benefit our #smallbiz o...,SenTedCruz,"Houston, Texas",2017-08-22
2324,edinburg,Enjoyed speaking w agriculture leaders &amp; f...,SenTedCruz,"Houston, Texas",2017-08-22
