## Data Collection  

The output from this notebook will be a two-column dataframe that contains raw tweets in one column and the classification (Liberal or Conservative) in the second column. 

In [92]:

import pandas as pd
import numpy as np
import tweepy
import tabula
import os
import pickle
import boto3
# S3 resource handle and target bucket used later to upload the pickled outputs
s3 = boto3.resource('s3')
bucket_name = "msds-practicum-carey"

# Twitter API credentials are read from environment variables so they never
# appear in the notebook; os.environ.get returns None for any that are unset
c_key = os.environ.get('tw_c_key')
c_sec = os.environ.get('tw_c_sec')
atk = os.environ.get('tw_ac_tok')
ats = os.environ.get('tw_ac_sec')

# OAuth handler built from the consumer key/secret plus the access token/secret
auth = tweepy.OAuthHandler(c_key, c_sec)
auth.set_access_token(atk, ats)

# API client shared by the timeline requests in this notebook.
# NOTE(review): the two wait_on_rate_limit* flags presumably make the client
# sleep through rate limits and print a notice — confirm against the installed
# tweepy version.
api = tweepy.API(auth, wait_on_rate_limit_notify=True,
                wait_on_rate_limit=True)

# Allow up to 200 rows when a DataFrame is displayed
pd.set_option('display.max_rows', 200)

***

In [2]:
# Load the list of current legislators, keeping only the columns used downstream
congress_url = "https://theunitedstates.io/congress-legislators/legislators-current.csv"
congress_cols = ['last_name', 'first_name', 'full_name', 'party', 'type', 'state', 'twitter']
congress_df = pd.read_csv(congress_url, usecols=congress_cols)


### Check for missing Twitter Handles

In [3]:
 congress_df.loc[congress_df['twitter'].isna()]


Unnamed: 0,last_name,first_name,full_name,type,state,party,twitter
33,Amash,Justin,Justin Amash,rep,MI,Independent,
62,Clay,Wm.,Wm. Lacy Clay,rep,MO,Democrat,
182,Peterson,Collin,Collin C. Peterson,rep,MN,Democrat,
314,Kaine,Timothy,Tim Kaine,sen,VA,Democrat,
372,Comer,James,James Comer,rep,KY,Republican,
425,Gianforte,Greg,Greg Gianforte,rep,MT,Republican,
534,Bishop,Dan,Dan Bishop,rep,NC,Republican,
535,Murphy,Gregory,Gregory F. Murphy,rep,NC,Republican,
536,Loeffler,Kelly,Kelly Loeffler,sen,GA,Republican,


A few Twitter handles are missing. Since it is a small number, I will fill these in manually. 

Georgia swore in a new senator, Kelly Loeffler, to replace the retired Sen. Johnny Isakson. I've elected to keep Johnny Isakson's name in the list in order to capture all of his tweets.

In [13]:
# Handles that were missing from the source CSV, keyed by the legislator's
# `full_name`. Rows are matched by name rather than by hard-coded row label:
# the CSV is fetched from a live URL, so row positions shift whenever the
# upstream file changes and a fixed label would then patch the wrong person.
missing_handles = {
    'Justin Amash': 'justinamash',
    'Wm. Lacy Clay': 'LacyClayMO1',
    'Collin C. Peterson': 'collinpeterson',
    'Tim Kaine': 'timkaine',
    'James Comer': 'KYComer',
    'Greg Gianforte': 'GregForMontana',
    'Dan Bishop': 'jdanbishop',
    'Gregory F. Murphy': 'DrGregMurphy1',
    'Kelly Loeffler': 'SenatorLoeffler',
}

for full_name, handle in missing_handles.items():
    # Only fill rows whose handle is actually missing, so a handle that later
    # appears upstream is never overwritten by this manual patch.
    needs_handle = (congress_df['full_name'] == full_name) & congress_df['twitter'].isna()
    congress_df.loc[needs_handle, 'twitter'] = handle

# Should report 0 once every gap has been filled
print(f"Missing Twitter Handles: {congress_df['twitter'].isna().sum()}")



In [14]:
congress_df.groupby('party').count()

Unnamed: 0_level_0,last_name,first_name,full_name,type,state,twitter
party,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Democrat,281,281,281,281,281,281
Independent,3,3,3,3,3,3
Republican,253,253,253,253,253,253


In [15]:

congress_df.loc[congress_df['party'].eq('Independent')]

Unnamed: 0,last_name,first_name,full_name,type,state,party,twitter
8,Sanders,Bernard,Bernard Sanders,sen,VT,Independent,SenSanders
33,Amash,Justin,Justin Amash,rep,MI,Independent,justinamash
287,King,Angus,"Angus S. King, Jr.",sen,ME,Independent,SenAngusKing


In [16]:
# Fold the three Independents into one of the two major parties so that every
# legislator receives a binary label. Rows are matched on `full_name` instead
# of hard-coded row labels because the CSV comes from a live URL and row
# positions are not stable across downloads.
independent_alignment = {
    'Bernard Sanders': 'Democrat',
    'Justin Amash': 'Republican',
    'Angus S. King, Jr.': 'Democrat',
}
for full_name, party in independent_alignment.items():
    congress_df.loc[congress_df['full_name'] == full_name, 'party'] = party

# 'L' (liberal) for Democrats, 'C' (conservative) for everyone else
congress_df['lean'] = np.where(congress_df['party'] == 'Democrat', 'L', 'C')
congress_df

Unnamed: 0,last_name,first_name,full_name,type,state,party,twitter,lean
0,Brown,Sherrod,Sherrod Brown,sen,OH,Democrat,SenSherrodBrown,L
1,Cantwell,Maria,Maria Cantwell,sen,WA,Democrat,SenatorCantwell,L
2,Cardin,Benjamin,Benjamin L. Cardin,sen,MD,Democrat,SenatorCardin,L
3,Carper,Thomas,Thomas R. Carper,sen,DE,Democrat,SenatorCarper,L
4,Casey,Robert,"Robert P. Casey, Jr.",sen,PA,Democrat,SenBobCasey,L
...,...,...,...,...,...,...,...,...
532,Golden,Jared,Jared F. Golden,rep,ME,Democrat,repgolden,L
533,Keller,Fred,Fred Keller,rep,PA,Republican,RepFredKeller,C
534,Bishop,Dan,Dan Bishop,rep,NC,Republican,jdanbishop,C
535,Murphy,Gregory,Gregory F. Murphy,rep,NC,Republican,DrGregMurphy1,C


## Get tweets from congress

In [22]:
def get_tweets(handle):
    """Return the text of every tweet the API serves for ``handle``.

    The timeline is paged backwards 200 tweets at a time: each request asks
    for tweets whose id is strictly below the oldest id collected so far, and
    paging stops when a request comes back empty.

    Parameters
    ----------
    handle : str
        Twitter screen name to download.

    Returns
    -------
    list of str
        Tweet texts in the order the API returned them. The list is empty
        when the account has no retrievable tweets. If the API raises
        ``tweepy.TweepError`` part-way through, the error is reported and the
        tweets collected up to that point are returned.
    """
    print(f'...Getting Tweets for {handle}')
    tweets = []
    try:
        # get the most recent 200 tweets
        new_tweets = api.user_timeline(screen_name=handle, count=200)

        # An empty first page (account with no tweets) skips the loop entirely,
        # so tweets[-1] is never evaluated on an empty list.
        while len(new_tweets) > 0:
            # add current batch to the tweets list
            tweets.extend(new_tweets)

            # one below the oldest id seen so far, so the next page does not
            # repeat the last tweet of this one
            old = tweets[-1].id - 1

            # get next 200 tweets
            new_tweets = api.user_timeline(screen_name=handle,
                                           count=200,
                                           max_id=old)
    except tweepy.TweepError:
        print(f'error with {handle} in function')

    # Built outside the try block so the return value always exists, even
    # after an API error.
    return [tweet.text for tweet in tweets]

In [23]:

# Twitter handles split by assigned lean ('L' = liberal, 'C' = conservative)
liberal_handle_list = congress_df.loc[congress_df['lean'] == 'L', 'twitter']
conservative_handle_list = congress_df.loc[congress_df['lean'] == 'C', 'twitter']

In [24]:

# Download the timeline of every liberal-leaning handle. The running list is
# re-pickled after each account so an interrupted run loses at most one
# account's worth of work.
os.makedirs('outdata', exist_ok=True)  # the checkpoint write fails without it

lib_tweets = []
for name in liberal_handle_list:
    try:
        tweets_temp = get_tweets(name)
        lib_tweets.extend(tweets_temp)
        with open('outdata/lib_list.pkl', 'wb') as f:
            pickle.dump(lib_tweets, f)
        print(f'Lib_tweets len: {len(lib_tweets)}')
    except Exception as err:
        # `Exception` rather than a bare except, so Ctrl-C / kernel interrupt
        # still stops this long-running loop; the cause is reported instead
        # of being silently discarded.
        print(f"problem with {name} in loop: {err!r}")
    
        
    
    

...Getting Tweets for SenSherrodBrown
Lib_tweets len: 3245
...Getting Tweets for SenatorCantwell
Lib_tweets len: 6458
...Getting Tweets for SenatorCardin
Lib_tweets len: 9690
...Getting Tweets for SenatorCarper
Lib_tweets len: 12892
...Getting Tweets for SenBobCasey
Lib_tweets len: 16110
...Getting Tweets for SenFeinstein
Lib_tweets len: 19312
...Getting Tweets for SenAmyKlobuchar
Lib_tweets len: 19939
...Getting Tweets for SenatorMenendez
Lib_tweets len: 23186
...Getting Tweets for SenSanders
Lib_tweets len: 26424
...Getting Tweets for SenStabenow
Lib_tweets len: 29632
...Getting Tweets for SenatorTester
Lib_tweets len: 32829
...Getting Tweets for SenWhitehouse
Lib_tweets len: 36057
...Getting Tweets for SenatorDurbin
Lib_tweets len: 39303
...Getting Tweets for SenJeffMerkley
Lib_tweets len: 42504
...Getting Tweets for SenJackReed
Lib_tweets len: 45751
...Getting Tweets for SenatorShaheen
Lib_tweets len: 48984
...Getting Tweets for SenatorTomUdall
Lib_tweets len: 52189
...Getting Twee

In [28]:
# Download the timeline of every conservative-leaning handle. The running
# list is re-pickled after each account so an interrupted run loses at most
# one account's worth of work.
os.makedirs('outdata', exist_ok=True)  # the checkpoint write fails without it

conservative_tweets = []
for name in conservative_handle_list:
    try:
        tweets_temp = get_tweets(name)
        conservative_tweets.extend(tweets_temp)
        with open('outdata/con_list.pkl', 'wb') as f:
            pickle.dump(conservative_tweets, f)
        print(f'con_tweets len: {len(conservative_tweets)}')
    except Exception as err:
        # `Exception` rather than a bare except, so Ctrl-C / kernel interrupt
        # still stops this long-running loop; the cause is reported instead
        # of being silently discarded.
        print(f"problem with {name} in loop: {err!r}")
    
    

...Getting Tweets for SenJohnBarrasso
con_tweets len: 3209
...Getting Tweets for SenatorWicker
con_tweets len: 6408
...Getting Tweets for SenAlexander
con_tweets len: 9637
...Getting Tweets for SenatorCollins
con_tweets len: 12880
...Getting Tweets for JohnCornyn
con_tweets len: 16095
...Getting Tweets for SenatorEnzi
con_tweets len: 19302
...Getting Tweets for GrahamBlog
con_tweets len: 22540
...Getting Tweets for InhofePress
con_tweets len: 25736
...Getting Tweets for McConnellPress
con_tweets len: 28943
...Getting Tweets for SenatorRisch
con_tweets len: 30786
...Getting Tweets for SenPatRoberts
con_tweets len: 34010
...Getting Tweets for Robert_Aderholt
con_tweets len: 35779
...Getting Tweets for justinamash
con_tweets len: 38999
...Getting Tweets for RepGusBilirakis
con_tweets len: 42207
...Getting Tweets for RepRobBishop
problem with RepRobBishop in loop
...Getting Tweets for MarshaBlackburn
con_tweets len: 45453
...Getting Tweets for RoyBlunt
con_tweets len: 48699
...Getting Twee

### Manually fix the problems that arose for a few Conservative Handles

In [41]:

# Rep. Roger Marshall's Twitter handle needed to be updated: fetch his timeline
# under the corrected handle, append it to the conservative list, and rewrite
# the on-disk checkpoint.
marsh_tweets = get_tweets('RogerMarshallMD')
conservative_tweets.extend(marsh_tweets)
with open('outdata/con_list.pkl', 'wb') as f:
            pickle.dump(conservative_tweets, f)

...Getting Tweets for RogerMarshallMD


In [42]:
# Rep. Lance Gooden's Twitter handle needed to be updated: fetch his timeline
# under the corrected handle, append it to the conservative list, and rewrite
# the on-disk checkpoint.
gooden_tweets = get_tweets('Lancegooden')
conservative_tweets.extend(gooden_tweets)
with open('outdata/con_list.pkl', 'wb') as f:
            pickle.dump(conservative_tweets, f)

...Getting Tweets for Lancegooden


## Add Current Executive Branch

In [43]:
# Add President Trump to the Conservative Tweet List, rewrite the on-disk
# checkpoint, and print the new total number of conservative tweets
trump_tweets = get_tweets('realDonaldTrump')
conservative_tweets.extend(trump_tweets)
with open('outdata/con_list.pkl', 'wb') as f:
            pickle.dump(conservative_tweets, f)
print(f'{len(conservative_tweets)}')

...Getting Tweets for realDonaldTrump
594399


In [46]:
# Helper for appending one more account to the conservative list
def add_tweets_to_con_list(handle):
    """Fetch ``handle``'s tweets, append them to ``conservative_tweets``,
    rewrite ``outdata/con_list.pkl`` and print the updated total.

    Mutates the module-level ``conservative_tweets`` list in place and
    returns nothing.
    """
    fetched = get_tweets(handle)
    conservative_tweets.extend(fetched)

    # Overwrite the checkpoint with the full, updated list
    with open('outdata/con_list.pkl', 'wb') as pkl_out:
        pickle.dump(conservative_tweets, pkl_out)

    print(f'Conservative Tweets: {len(conservative_tweets)}')
    

In [45]:
# Add Vice President Pence's tweets (the @VP account) to the conservative list
add_tweets_to_con_list('VP')

...Getting Tweets for VP
597644


### Add the previous Executive Branch and Current Democratic Presidential Candidates (that aren't in congress)

In [47]:
def add_tweets_to_lib_list(handle):
    """Fetch ``handle``'s tweets, append them to ``lib_tweets``, rewrite
    ``outdata/lib_list.pkl`` and print the updated total.

    Mutates the module-level ``lib_tweets`` list in place and returns nothing.
    """
    fetched = get_tweets(handle)
    lib_tweets.extend(fetched)

    # Overwrite the checkpoint with the full, updated list
    with open('outdata/lib_list.pkl', 'wb') as pkl_out:
        pickle.dump(lib_tweets, pkl_out)

    print(f'Liberal Tweets: {len(lib_tweets)}')

In [48]:
# Former executive branch and Democratic presidential candidates who are not
# in Congress, added to the liberal list in this order
extra_liberal_handles = (
    'BarackObama',
    'JoeBiden',
    'MikeBloomberg',
    'DevalPatrick',
    'TomSteyer',
    'PeteButtigieg',
    'JohnDelaney',
    'AndrewYang',
)
for extra_handle in extra_liberal_handles:
    add_tweets_to_lib_list(extra_handle)



...Getting Tweets for BarackObama
Liberal Tweets: 731403
...Getting Tweets for JoeBiden
Liberal Tweets: 734630
...Getting Tweets for MikeBloomberg
Liberal Tweets: 737863
...Getting Tweets for DevalPatrick
Liberal Tweets: 739782
...Getting Tweets for TomSteyer
Liberal Tweets: 742994
...Getting Tweets for PeteButtigieg
Liberal Tweets: 746216
...Getting Tweets for JohnDelaney
Liberal Tweets: 749432
...Getting Tweets for AndrewYang
Liberal Tweets: 752662


In [93]:
# Upload both pickled tweet lists to the S3 bucket: (local path, object key)
for local_path, object_key in (
    ('outdata/lib_list.pkl', 'lib_list.pkl'),
    ('outdata/con_list.pkl', 'con_list.pkl'),
):
    s3.meta.client.upload_file(local_path, bucket_name, object_key)

## Check for Data Imbalance  

The data does not look terribly imbalanced: liberals account for roughly 56% of the tweets and conservatives for roughly 44%. 

In [59]:
# Share of the collected tweets contributed by each class
n_con = len(conservative_tweets)
n_lib = len(lib_tweets)
n_total = n_lib + n_con

percent_con_tweets = n_con / n_total
percent_lib_tweets = n_lib / n_total

print(f'Proportion of Tweets from Conservatives: {percent_con_tweets}')
print(f'Proportion of Tweets from Liberals: {percent_lib_tweets}')

Proportion of Tweets from Conservatives: 0.442598936833577
Proportion of Tweets from Liberals: 0.557401063166423


## Create a Combined Dataframe of tweets with labels

In [88]:
# read in lists of tweets
# `lib_list` / `con_list` are loaded from the checkpoints written by the
# collection cells, so this cell runs on a fresh kernel without repeating the
# download. These pickles are produced by this notebook itself; do not point
# this at untrusted files, since unpickling can execute arbitrary code.
with open('outdata/lib_list.pkl', 'rb') as f:
    lib_list = pickle.load(f)
with open('outdata/con_list.pkl', 'rb') as f:
    con_list = pickle.load(f)

# create liberal and conservative dataframes: one row per tweet plus its label
lib_df = pd.DataFrame({'tweet': lib_list})
lib_df['class'] = "L"

con_df = pd.DataFrame({'tweet': con_list})
con_df['class'] = "C"

# combine both classes, shuffle the rows, and renumber the index from 0.
# The fixed random_state makes the shuffle reproducible across runs.
tweet_df = (
    pd.concat([lib_df, con_df])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# view dataframe
tweet_df

Unnamed: 0,tweet,class
0,RT @aafb: Congrats to ⁦@RepOHalleran⁩ &amp; ⁦@...,L
1,Great to meet the new Lake County Farm Bureau ...,L
2,Congratulations to @waynestcollege women's rug...,C
3,Great to meet with the Erickson Air Crane team...,C
4,Always wonderful to be part of the Back to Sch...,L
...,...,...
1350301,We should be upholding the National Environmen...,L
1350302,"If anything is to be investigated, I think we ...",C
1350303,TODAY: Federal judge rules in favor of House R...,C
1350304,"In the words of an old proverb, ""A hit dog wil...",L


In [None]:
# Persist the labelled, shuffled dataframe locally before uploading it
with open('outdata/tweet_df.pkl', 'wb') as pkl_out:
    pickle.dump(tweet_df, pkl_out)

In [94]:
# Upload the pickled dataframe to the AWS S3 bucket
s3.meta.client.upload_file(
    Filename='outdata/tweet_df.pkl',
    Bucket=bucket_name,
    Key='tweet_df.pkl',
)


In [98]:
# Delete the local pickles once the S3 uploads above have been made
for leftover_path in (
    'outdata/con_list.pkl',
    'outdata/lib_list.pkl',
    'outdata/tweet_df.pkl',
):
    os.remove(leftover_path)