In [1]:
import numpy as np
import os
import scipy.sparse as sparse
from scipy.stats import bernoulli, poisson
import analysis_utils_mine as utils

import plotly.graph_objects as go
import chart_studio
import json
import pandas as pd
import ast
from datetime import datetime
import torch
import pickle

In [2]:
# Root folder that holds both the cleaned tweet data and the fitted model.
source_dir = os.path.join("../data/tweets_cong_115_116/")

# Text data that the TBIP model was fit on.
data_dir = os.path.join(source_dir, "clean2")
counts, vocabulary, author_indices, author_map = utils.load_text_data(data_dir)

# Fitted variational parameters (a loc/scale pair for each model quantity).
param_dir = os.path.join(
    source_dir,
    "tbip-og-k50-expanded-vocab-with-mallet-scaled-topics/params/",
)
(
    document_loc,
    document_scale,
    objective_topic_loc,
    objective_topic_scale,
    ideological_topic_loc,
    ideological_topic_scale,
    ideal_point_loc,
    ideal_point_scale,
) = utils.load_tbip_parameters(param_dir)

# Point estimates: exp(loc + scale^2 / 2) for the document and objective-topic
# parameters; the ideological topics and ideal points use their loc as-is.
document_mean = np.exp(document_loc + document_scale ** 2 / 2)
objective_topic_mean = np.exp(
    objective_topic_loc + objective_topic_scale ** 2 / 2
)
ideological_topic_mean = ideological_topic_loc
ideal_point_mean = ideal_point_loc

In [3]:
# Sanity-check the shapes of the point estimates.
print(document_mean.shape)
print(objective_topic_mean.shape)
print(ideological_topic_mean.shape)
print(ideal_point_mean.shape)

# Flip the sign of the ideal-point axis (ideal points and ideological topic
# loadings are negated together, so their products are unchanged).
# Both values are derived from arrays that this cell never reassigns
# (`ideal_point_loc`, and `ideological_topic_mean`, which still refers to the
# loaded, unflipped ideological loc), so re-running the cell does not flip the
# axis back.
# NOTE(review): `ideological_topic_mean` itself is left unflipped, as before.
ideal_point_mean = -1 * ideal_point_loc
ideological_topic_loc = -1 * ideological_topic_mean

(295327, 50)
(50, 9343)
(50, 9343)
(471,)


In [4]:
# Print the topic listings at several ideal-point values using the sign-flipped
# ideological loc. The last argument (20) matches the 20 terms listed per entry.
utils.print_topics(objective_topic_loc, 
                   objective_topic_scale, 
                   ideological_topic_loc, 
                   ideological_topic_scale, 
                   vocabulary, 
                   20)

['Ideal Point = -1.0, Topic = 0: history, american, african, first, african american, woman, asian, friend, man, elected, african americans, american woman, advocate, true, museum, congresswoman, dear, sad, passing, native'
 'Ideal Point = -0.5, Topic = 0: history, american, first, african, friend, woman, true, african american, man, passing, advocate, colleague, rest, dedicated, native, former, sad, asian, elected, missed'
 'Ideal Point = 0.0, Topic = 0: friend, american, history, true, passing, first, man, former, colleague, woman, rest, dedicated, missed, hero, native, advocate, bush, african, sad, hear'
 'Ideal Point = 0.5, Topic = 0: friend, passing, former, true, missed, american, bush, hero, dedicated, colleague, rest, man, native, first, hear, history, advocate, sad, woman, patriot'
 'Ideal Point = 1.0, Topic = 0: friend, bush, passing, former, hero, missed, true, dedicated, rest, colleague, hear, man, fellow, native, american, away, texan, american hero, patriot, good friend'


In [9]:
# Write the topic listing to disk by redirecting stdout into the file.
# `%%capture cap` is not used here because IPython binds the capture variable
# only after the capturing cell has finished, so `cap.stdout` cannot be read
# from inside that same cell (NameError on a fresh kernel, stale text otherwise).
# NOTE(review): assumes utils.print_topics emits its listing via print() on
# sys.stdout -- confirm against analysis_utils_mine.
import contextlib

with open('../../tweets_results/topics.txt', 'w') as f:
    with contextlib.redirect_stdout(f):
        utils.print_topics(objective_topic_loc,
                           objective_topic_scale,
                           ideological_topic_loc,
                           ideological_topic_scale,
                           vocabulary,
                           20)

In [5]:
# Mapping Bioguide ID -> Twitter user id, loaded from a pickle.
# NOTE(review): pickle.load can execute arbitrary code; only load this file
# if it comes from a trusted source.
with open(os.path.join(source_dir, 'social_media_data_bioguide_to_twitter.pkl'), 'rb') as f:
    bid_to_twitter_uid = pickle.load(f)
print(len(bid_to_twitter_uid))

# Reverse lookup: Twitter user id -> Bioguide ID.
twitter_uid_to_bid = {v: k for k, v in bid_to_twitter_uid.items()}

530


In [7]:
# Legislator metadata records; the context manager closes the file handle.
with open('../data/floor_speeches_congs_115_116/legislator-info-1990-2020.json') as f:
    legis_info = json.load(f)
print(len(legis_info))

1880


In [8]:
# Index the legislator records by their Bioguide ID.
legis_id_to_info = {record['id']['bioguide']: record for record in legis_info}

In [9]:
# Party recorded on each author's last listed term, in author_map order.
parties = [
    legis_id_to_info[twitter_uid_to_bid[int(author_uid)]]['terms'][-1]['party']
    for author_uid in author_map
]
print(len(parties))

471


In [10]:
# The 'wikipedia' id of each author (used as the hover label), in author_map order.
names = [
    legis_id_to_info[twitter_uid_to_bid[int(author_uid)]]['id']['wikipedia']
    for author_uid in author_map
]
print(len(names))

471


In [11]:
# Marker colour per author. Parties other than the two major ones get a
# neutral colour instead of being drawn as Republican.
PARTY_COLORS = {"Democrat": "steelblue", "Republican": "crimson"}
colors = np.array([PARTY_COLORS.get(p, "gray") for p in parties])
def get_ideological_topics(objective_topic_loc, 
                           objective_topic_scale,
                           ideological_topic_loc, 
                           ideological_topic_scale,
                           ideal_point):
    """Return topic-word intensities evaluated at a given ideal point.

    Computes, element-wise,
    exp(objective_loc + ideal_point * ideological_loc
        + (objective_scale^2 + ideal_point^2 * ideological_scale^2) / 2).

    Args:
        objective_topic_loc: Loc of the objective topics.
        objective_topic_scale: Scale of the objective topics.
        ideological_topic_loc: Loc of the ideological topics.
        ideological_topic_scale: Scale of the ideological topics.
        ideal_point: Ideal point at which to evaluate the topics.

    Returns:
        Array with the broadcast shape of the inputs.
    """
    shifted_loc = objective_topic_loc + ideal_point * ideological_topic_loc
    combined_variance = (objective_topic_scale ** 2 +
                         ideal_point ** 2 * ideological_topic_scale ** 2)
    return np.exp(shifted_loc + combined_variance / 2)

In [12]:
# Array versions of the author ids and their (sign-flipped) ideal points.
ideal_points = np.array(ideal_point_mean)
author_map = np.array(author_map)
print(author_map.shape)
print(ideal_points.shape)

(471,)
(471,)


In [13]:
# Bioguide ID -> ideal point, pairing author_map[i] with ideal_points[i].
bid_to_tbip = {
    twitter_uid_to_bid[int(author_uid)]: ideal_points[position]
    for position, author_uid in enumerate(author_map)
}
print(len(bid_to_tbip))

471


In [21]:
# Persist the Bioguide ID -> ideal point mapping; the context manager makes
# sure the file is flushed and closed.
with open('../../tweets_results/bid_to_tbip_tweets.pkl', 'wb') as f:
    pickle.dump(bid_to_tbip, f)

In [14]:
import chart_studio
import chart_studio.plotly as py

# Credentials are read from the environment so no secret is stored in the
# notebook. Set CHART_STUDIO_USERNAME and CHART_STUDIO_API_KEY before running.
# NOTE(review): the API key that used to be hard-coded here has been exposed
# and should be revoked/regenerated.
chart_studio.tools.set_credentials_file(
    username=os.environ["CHART_STUDIO_USERNAME"],
    api_key=os.environ["CHART_STUDIO_API_KEY"],
)

In [15]:
def save_interactive(topic_number):
    """Build the interactive word-usage figure for one topic and save it as HTML.

    The figure contains a strip of author ideal points (hover shows the name),
    one horizontal bar chart of the topic's top words per slider position, and
    a slider that selects which bar chart is visible. Reads the notebook-level
    names `ideal_points`, `names`, `colors`, `vocabulary` and the four topic
    parameter arrays.

    Args:
        topic_number: Row index of the topic in the topic parameter arrays.

    Returns:
        None. The figure is written to
        ../../tweets_results/interactive_htmls/topic<topic_number>_interactive.html
    """
    topic_name = 'Topic ' + str(topic_number)

    fig = go.Figure(layout=go.Layout(
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)'))
    x_min = -1.0
    x_max = 1.0
    diff = 0.05
    offset = -x_min / diff
    scatterplot_location = -0.53

    # Slider grid: step index -> ideal point, from x_min to x_max in `diff`
    # increments. round() guards the step count against float error.
    num_steps = int(round((x_max - x_min) / diff)) + 1
    ideal_point_dict = {i: (i - offset) * diff for i in range(num_steps)}

    # Traces 0 and 1 are always visible; bar traces start at this index.
    num_fixed_traces = 2

    # Add black line.
    fig.add_trace(go.Scatter(x=[x_min, x_max],
                             y=[scatterplot_location, scatterplot_location],
                             line=dict(color="black", width=1),
                             marker=dict(size=1),
                             hoverinfo='skip'))

    # Add trace for all representatives.
    fig.add_trace(go.Scatter(
        mode="markers",
        x=ideal_points,
        y=scatterplot_location * np.ones(len(ideal_points)),
        text=[x for x in names],
        hoverinfo="x+text",
        marker=dict(color=colors, size=8)))

    num_top_words = 10

    # Add bar plots, one (initially hidden) trace per slider position.
    for step in range(num_steps):
        ideological_topic_mean = get_ideological_topics(objective_topic_loc,
                                                        objective_topic_scale,
                                                        ideological_topic_loc,
                                                        ideological_topic_scale,
                                                        ideal_point_dict[step])
        topic_intensities = ideological_topic_mean[topic_number]
        # A single argsort drives both words and bar lengths so each label is
        # guaranteed to sit on its own bar (ascending order, largest last).
        top_word_inds = np.argsort(-topic_intensities)[:num_top_words][::-1]
        top_topic_intensities = topic_intensities[top_word_inds]
        top_topic_words = vocabulary[top_word_inds]
        fig.add_trace(
            go.Bar(
                visible=False,
                x=top_topic_intensities * 5,
                orientation='h',
                text=top_topic_words,
                textposition='outside',
                marker_color='rgb(175,122,197)',
                y0=0.529,
                dy=0.214,
                base=-0.3,
                hoverinfo='skip',
            ))

    # Hide all axis decoration and disable zooming on both axes.
    hidden_axis = {'showgrid': False,
                   'showline': False,
                   'zeroline': False,
                   'showticklabels': False,
                   'fixedrange': True}
    fig.update_layout(yaxis=dict(hidden_axis), xaxis=dict(hidden_axis))
    fig.update_yaxes(range=[-0.60, 2.5])
    fig.update_xaxes(range=[x_min - 0.01, x_max + 0.02])

    # Default slider position: the middle step. The visible bar trace and the
    # slider's `active` index are derived from the same step so they always
    # agree (bar trace index = num_fixed_traces + step index).
    default_step = num_steps // 2
    fig.data[num_fixed_traces + default_step].visible = True

    # Create and add slider: each step shows the fixed traces plus one bar trace.
    steps = []
    for step_ind in range(num_steps):
        visible = [True] * num_fixed_traces + [False] * num_steps
        visible[num_fixed_traces + step_ind] = True
        steps.append(dict(
            method="update",
            args=[{"visible": visible}],
            label="{:.2f}".format(ideal_point_dict[step_ind]),
        ))

    sliders = [dict(
        active=default_step,
        currentvalue={"prefix": "Ideal Point: "},
        y=0.35,
        steps=steps)]

    fig.update_layout(sliders=sliders, showlegend=False,
                      title={'text': "Word Usage as a Function of Ideal Point (Topic: {})".format(topic_name),
                             'x': 0.5,
                             'y': 0.9},
                      xaxis_title='Representative ideal points (hover to see names)',
                      annotations=[
                          dict(xref='paper',
                               yref='paper',
                               x=0.5, y=0.255,
                               font={'size': 14},
                               showarrow=False,
                               text='Move slider to change ideal point')
                      ])
    fig.write_html("../../tweets_results/interactive_htmls/topic" + str(topic_number) + "_interactive.html")

In [16]:
# Number of topics = number of rows of the objective topic matrix.
num_topics = objective_topic_mean.shape[0]
print(num_topics)

50


In [17]:
# Render and save one interactive HTML figure per topic.
for topic_ind in range(num_topics):
    save_interactive(topic_ind)

In [18]:
# Shape of the per-document topic intensities before any filtering.
print(document_mean.shape)

(295327, 50)


In [20]:
# Tweets the model was fit on (columns: Tweet ID, Author ID, Text, Timestamp).
tweets_data = pd.read_csv(os.path.join(source_dir, "finalized_tbip_tweets_sampled2.csv"))
# DataFrame.info() prints its report itself and returns None, so it is called
# directly rather than wrapped in print(), which only added a stray "None".
tweets_data.info()
tweets_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 295327 entries, 0 to 295326
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   Tweet ID   295327 non-null  int64 
 1   Author ID  295327 non-null  int64 
 2   Text       295327 non-null  object
 3   Timestamp  295327 non-null  object
dtypes: int64(2), object(2)
memory usage: 9.0+ MB
None


Unnamed: 0,Tweet ID,Author ID,Text,Timestamp
0,996837967976194048,818554054309715969,The skyrocketing cost of life-saving medicatio...,2018-05-16 19:41:06
1,878358077145325569,818554054309715969,To all of the brave women who have had to say ...,2017-06-23 21:04:19
2,979935734701412353,818554054309715969,#Passover is a beautiful time to remember our ...,2018-03-31 04:17:40
3,1040330596545904643,818554054309715969,I was glad to help pass bipartisan legislation...,2018-09-13 20:05:17
4,923648345394040833,818554054309715969,GOP budget does nothing to help Nevada's middl...,2017-10-26 20:31:40


In [22]:
# Final research table with one row per legislator.
# NOTE: that table is itself built from the ideal-point estimates; the result
# files written further down are restricted to the legislators retained in it.
final_legis_info_and_tbip_data = pd.read_csv('../../legislator_info_and_tbip_congresses_115_and_116.csv')

included_bid_to_tbip = dict(
    zip(final_legis_info_and_tbip_data.Bioguide_ID,
        final_legis_info_and_tbip_data.TBIP_Tweets)
)

# Keep the Bioguide IDs whose TBIP_Tweets value is not NaN.
relev_bids = [
    legislator_bid
    for legislator_bid, tbip_value in included_bid_to_tbip.items()
    if not np.isnan(tbip_value)
]
print(len(relev_bids))

471


In [23]:
# Parse the timestamp strings into datetime64 values.
tweets_data["Timestamp"] = pd.to_datetime(tweets_data["Timestamp"])
# info() prints its own report and returns None; no print() wrapper needed.
tweets_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 295327 entries, 0 to 295326
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   Tweet ID   295327 non-null  int64         
 1   Author ID  295327 non-null  int64         
 2   Text       295327 non-null  object        
 3   Timestamp  295327 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 9.0+ MB
None


In [24]:
# Per-tweet Bioguide ID (looked up from the Twitter author id), text and timestamp.
speaker_ids_from_data = [
    twitter_uid_to_bid[author_uid] for author_uid in tweets_data['Author ID']
]
texts_from_data = tweets_data['Text'].tolist()
dates_from_data = list(tweets_data['Timestamp'])

In [25]:
# Keep only tweets whose author is one of the retained legislators.
# Membership is tested against a set: testing against the `relev_bids` list
# would rescan that list for every one of the tweets.
relev_bid_set = set(relev_bids)
retain_inds = [
    i for i, bid in enumerate(speaker_ids_from_data) if bid in relev_bid_set
]
all_relev_ids = [speaker_ids_from_data[i] for i in retain_inds]
all_relev_texts = [texts_from_data[i] for i in retain_inds]
all_relev_dates = [dates_from_data[i] for i in retain_inds]
print(len(retain_inds))

295327


In [26]:
# Per-document topic proportions for the retained tweets: softmax across the
# topic axis (axis 1) of the document intensities.
# The intensities are recomputed from the variational parameters instead of
# being read from `document_mean`, because this cell overwrites `document_mean`;
# reading it back would filter and softmax a second time if the cell is re-run.
# NOTE(review): softmax is applied to the exponentiated means rather than
# normalising them by their row sum -- confirm this is the intended definition
# of a topic proportion.
document_intensity_all = np.exp(document_loc + document_scale ** 2 / 2)
document_mean = document_intensity_all[retain_inds]
document_mean = np.array(torch.softmax(torch.from_numpy(document_mean), 1))
print(document_mean.shape)

(295327, 50)


In [27]:
# One row per retained tweet: author, text and timestamp.
texts_topics = pd.DataFrame({
    "Bioguide_ID": all_relev_ids,
    "Text": all_relev_texts,
    "Timestamp": all_relev_dates,
})

In [28]:
# Add one proportion column per topic.
# Values are rounded to float32 precision with a vectorised cast (replacing a
# per-element float -> str -> float32 round trip over every cell) and stored
# as float64, which is the column dtype the frame had before.
for topic_ind in range(document_mean.shape[1]):
    topic_column = document_mean[:, topic_ind].astype(np.float32).astype(np.float64)
    texts_topics['Topic' + str(topic_ind)] = topic_column
    

In [29]:
# info() prints its own report and returns None; no print() wrapper needed.
texts_topics.info()
texts_topics

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 295327 entries, 0 to 295326
Data columns (total 53 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Bioguide_ID  295327 non-null  object        
 1   Text         295327 non-null  object        
 2   Timestamp    295327 non-null  datetime64[ns]
 3   Topic0       295327 non-null  float64       
 4   Topic1       295327 non-null  float64       
 5   Topic2       295327 non-null  float64       
 6   Topic3       295327 non-null  float64       
 7   Topic4       295327 non-null  float64       
 8   Topic5       295327 non-null  float64       
 9   Topic6       295327 non-null  float64       
 10  Topic7       295327 non-null  float64       
 11  Topic8       295327 non-null  float64       
 12  Topic9       295327 non-null  float64       
 13  Topic10      295327 non-null  float64       
 14  Topic11      295327 non-null  float64       
 15  Topic12      295327 non-null  floa

Unnamed: 0,Bioguide_ID,Text,Timestamp,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,...,Topic40,Topic41,Topic42,Topic43,Topic44,Topic45,Topic46,Topic47,Topic48,Topic49
0,R000608,The skyrocketing cost of life-saving medicatio...,2018-05-16 19:41:06,4.963950e-10,3.773803e-10,5.296226e-10,5.210248e-10,6.010184e-10,3.932444e-10,1.102688e-09,...,6.037760e-10,4.906782e-10,9.130554e-10,3.175643e-10,3.456588e-07,5.551402e-10,4.296170e-10,9.850813e-10,3.213561e-10,4.410904e-10
1,R000608,To all of the brave women who have had to say ...,2017-06-23 21:04:19,2.729059e-02,8.922050e-03,8.401409e-03,1.085479e-02,9.115369e-03,9.636827e-03,9.053043e-03,...,7.630489e-03,1.434223e-02,1.058010e-02,1.330151e-02,9.308327e-03,7.347345e-03,3.608839e-02,1.301936e-02,6.946318e-03,1.020903e-02
2,R000608,#Passover is a beautiful time to remember our ...,2018-03-31 04:17:40,8.554869e-06,2.751229e-06,3.428339e-06,2.412011e-06,9.962156e-01,3.167376e-06,2.588517e-06,...,2.689379e-06,2.811784e-06,2.209202e-06,2.464343e-06,2.612589e-06,4.027968e-06,1.879338e-06,2.431450e-06,2.529632e-06,2.432784e-06
3,R000608,I was glad to help pass bipartisan legislation...,2018-09-13 20:05:17,5.748105e-08,2.425721e-08,3.063078e-08,2.513337e-08,1.931524e-08,3.416139e-07,3.907677e-08,...,3.169257e-08,2.638630e-08,5.060026e-08,2.274637e-08,2.190512e-08,1.452587e-07,2.440651e-08,4.107147e-08,3.119926e-08,2.735903e-08
4,R000608,GOP budget does nothing to help Nevada's middl...,2017-10-26 20:31:40,4.652912e-07,4.193509e-07,7.384368e-07,4.020405e-07,4.980043e-07,7.877406e-07,4.075310e-07,...,4.046053e-07,4.380161e-07,4.553031e-07,3.635515e-07,6.520915e-07,3.766006e-01,6.594825e-07,4.825466e-07,6.208430e-07,4.546258e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295322,B001311,"RT @RepMattGaetz: ""If you accept all of their ...",2019-12-04 18:45:07,1.279216e-02,1.771166e-02,2.196138e-02,2.256744e-02,1.292124e-02,1.239425e-02,2.180657e-02,...,1.794208e-02,1.612906e-02,1.843484e-02,1.375183e-02,2.208498e-02,1.362797e-02,1.436380e-02,2.104737e-02,1.914472e-02,1.093400e-02
295323,B001311,Merry Christmas! üéÑ https://t.co/QsShG6aykE,2019-12-25 18:23:45,1.702594e-03,3.533124e-03,2.896988e-03,1.981668e-03,8.997316e-01,1.514078e-03,1.373954e-03,...,1.402275e-03,2.444467e-03,2.141321e-03,2.220500e-03,1.573489e-03,1.365747e-03,1.348756e-03,1.862440e-03,2.171565e-03,1.311586e-03
295324,B001311,Been working hard w/ @Statedept to help get US...,2020-03-27 19:47:47,9.076281e-03,1.005910e-02,9.407829e-03,8.749188e-03,1.232147e-01,2.913292e-02,2.191692e-01,...,1.266893e-02,8.269025e-03,8.378092e-03,7.247412e-03,1.083687e-02,1.176402e-02,1.019323e-02,1.048611e-02,1.633170e-02,1.062113e-02
295325,B001311,üëè @Allstate is REFUNDING $$$ hundreds of milli...,2020-04-07 20:08:57,1.954941e-07,4.706637e-07,2.085140e-07,2.767639e-07,3.550961e-07,3.451636e-07,4.695545e-07,...,4.859219e-07,1.910311e-07,3.031047e-07,6.710041e-05,4.531999e-07,5.317777e-07,2.642632e-07,3.339371e-07,3.660530e-07,5.856515e-07


In [30]:
# Save the per-tweet topic proportions (without the row index).
texts_topics.to_csv('../../tweets_results/texts_topic_proportions.csv',
                    index=False)


In [31]:
# Bioguide ID -> row positions (into the retained tweets) of that author's tweets.
relev_bid_to_inds = {legislator_bid: [] for legislator_bid in relev_bids}
for row_position, author_bid in enumerate(all_relev_ids):
    relev_bid_to_inds[author_bid].append(row_position)

In [32]:
def get_mean_topic_props_author(X, bid, author_to_inds):
    """Average the rows of X that belong to one author.

    Args:
        X: 2-D array with one row per document.
        bid: Author key to look up in `author_to_inds`.
        author_to_inds: Mapping from author key to the list of that author's
            row indices in X.

    Returns:
        Array of shape (1, X.shape[1]) holding the column-wise mean of the
        author's rows. The width is taken from X itself rather than from a
        notebook-level variable, so the function works for any column count.
    """
    return np.mean(X[author_to_inds[bid]], axis=0, keepdims=True)

In [33]:
# Stack the per-author mean topic proportions, one row per retained legislator.
relev_bid_avg_topic_props = np.concatenate(
    [
        get_mean_topic_props_author(document_mean, legislator_bid, relev_bid_to_inds)
        for legislator_bid in relev_bids
    ],
    0,
)
print(relev_bid_avg_topic_props.shape)

(471, 50)


In [34]:
# One row per retained legislator, keyed by Bioguide ID.
author_topic_props = pd.DataFrame({"Bioguide_ID": relev_bids})

In [35]:
# Add one averaged-proportion column per topic.
for topic_ind in range(num_topics):
    column_name = 'Topic{}'.format(topic_ind)
    column_values = list(relev_bid_avg_topic_props[:, topic_ind])
    author_topic_props[column_name] = column_values
    

In [36]:
# info() prints its own report and returns None; no print() wrapper needed.
author_topic_props.info()
author_topic_props

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 471 entries, 0 to 470
Data columns (total 51 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Bioguide_ID  471 non-null    object 
 1   Topic0       471 non-null    float64
 2   Topic1       471 non-null    float64
 3   Topic2       471 non-null    float64
 4   Topic3       471 non-null    float64
 5   Topic4       471 non-null    float64
 6   Topic5       471 non-null    float64
 7   Topic6       471 non-null    float64
 8   Topic7       471 non-null    float64
 9   Topic8       471 non-null    float64
 10  Topic9       471 non-null    float64
 11  Topic10      471 non-null    float64
 12  Topic11      471 non-null    float64
 13  Topic12      471 non-null    float64
 14  Topic13      471 non-null    float64
 15  Topic14      471 non-null    float64
 16  Topic15      471 non-null    float64
 17  Topic16      471 non-null    float64
 18  Topic17      471 non-null    float64
 19  Topic18 

Unnamed: 0,Bioguide_ID,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,...,Topic40,Topic41,Topic42,Topic43,Topic44,Topic45,Topic46,Topic47,Topic48,Topic49
0,A000055,0.022957,0.034588,0.018237,0.017258,0.014678,0.020173,0.010822,0.034457,0.013750,...,0.021959,0.016912,0.010786,0.021198,0.009488,0.011509,0.006141,0.007452,0.012618,0.030016
1,A000367,0.014456,0.016591,0.017763,0.017173,0.027282,0.019257,0.010935,0.009101,0.010002,...,0.026245,0.018628,0.013234,0.012326,0.009139,0.013505,0.011133,0.011980,0.012391,0.011835
2,A000370,0.019121,0.016536,0.017995,0.010555,0.015216,0.019707,0.022182,0.101135,0.029285,...,0.014269,0.014094,0.020195,0.016259,0.022285,0.019404,0.025066,0.007491,0.012865,0.014002
3,A000371,0.016413,0.021756,0.026164,0.009264,0.025473,0.014424,0.033159,0.016009,0.059210,...,0.019139,0.010741,0.005105,0.024956,0.013815,0.015092,0.012697,0.004854,0.013239,0.009068
4,A000372,0.009323,0.030266,0.037240,0.021548,0.034366,0.043011,0.018999,0.040080,0.040374,...,0.016162,0.018643,0.013643,0.023351,0.016187,0.007704,0.010989,0.027051,0.016290,0.013758
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
466,Y000033,0.030443,0.027125,0.018845,0.019399,0.028535,0.023730,0.008535,0.012005,0.011057,...,0.024970,0.014881,0.012594,0.013978,0.017863,0.011236,0.009194,0.011527,0.013420,0.020649
467,Y000062,0.011217,0.023819,0.025872,0.043340,0.026072,0.012256,0.020057,0.014939,0.012956,...,0.015985,0.006912,0.011419,0.009806,0.021181,0.051542,0.025068,0.012607,0.017957,0.017291
468,Y000065,0.014928,0.019125,0.006034,0.021565,0.023981,0.019317,0.011259,0.032313,0.016154,...,0.020324,0.011322,0.008861,0.024236,0.017501,0.006270,0.008622,0.011532,0.013311,0.015742
469,Y000066,0.013007,0.019855,0.031433,0.034459,0.024164,0.011088,0.009982,0.030010,0.042604,...,0.017244,0.012494,0.013399,0.016834,0.012615,0.009568,0.006890,0.021387,0.011763,0.014933


In [37]:
# Save the per-author average topic proportions (without the row index).
author_topic_props.to_csv('../../tweets_results/topic_proportions_per_author.csv',
                          index=False)