In [1]:
# Extracting Common Phrases with term frequencies (binary counts from CountVectorizer, not tf-idf weights)

from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

* `max_df=0.8` means exclude phrases that appear in more than 80% of the documents (similar to stop words, these are unlikely to be informative since they are so common)

* `min_df=50` means that the phrase must appear in at least 50 documents (tweets) in the corpus to be included in the analysis (I used 50 because the research paper I mentioned did too, though you may experiment with different cutoffs)

* `ngram_range=(1,2)` means include one-word and two-word phrases (you could easily set it to `(1,3)` to also include trigrams / three-word phrases). Note that the code cell below actually uses `ngram_range=(1,5)`, i.e. phrases of one to five words.

* `binary=True` means to only count whether a word occurs at all in a given document (i.e. 0 or 1), rather than counting exactly how many times it occurs (i.e. 0 or 1 or 2 or 3 or…)

* `stop_words=nltk_stop_words` just plugs in the NLTK stop word list, which is set up on the line just before the `CountVectorizer` call, so that words such as “of” and “to” are not included

In [2]:
# English stop words from NLTK; they are dropped before n-grams are built.
nltk_stop_words = stopwords.words('english')

# Binary document-term vectorizer: each phrase is recorded as present/absent per
# tweet, keeping phrases of one to five words that occur in at least 50 tweets
# and in no more than 80% of them.
tf_vectorizer = CountVectorizer(
    stop_words=nltk_stop_words,
    ngram_range=(1, 5),
    binary=True,
    min_df=50,
    max_df=0.8,
)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
#from progressbar import progressbar
import networkx as nx
import pandas as pd
#import pdb
import os

In [4]:
# Load the tweet corpus: one row per tweet, with the author's name, handle and party.
df = pd.read_csv('tweets.csv')
# df.at[df.twitter =='SenSanders', 'party'] = 'Democrat' # for ideological purposes
# Leave the frame as the last expression so the notebook renders a preview.
df

Unnamed: 0.1,Unnamed: 0,name,twitter,party,tweet,tweet_published
0,0,Sherrod Brown,SenSherrodBrown,Democrat,"This year, #WorkersMemorialDay is particularly...",2020-04-28
1,1,Sherrod Brown,SenSherrodBrown,Democrat,"Without the #ACA, health insurers could discri...",2020-03-23
2,2,Sherrod Brown,SenSherrodBrown,Democrat,Just voted to send more than 15 million in eme...,2020-03-05
3,3,Sherrod Brown,SenSherrodBrown,Democrat,Mitch McConnell has wasted four days in the mi...,2020-03-17
4,4,Sherrod Brown,SenSherrodBrown,Democrat,"TUNE IN: At 12:25 PM, I'm going Live with my f...",2020-05-21
...,...,...,...,...,...,...
64327,64327,Kelly Loeffler,SenatorLoeffler,Republican,Families in the USA - a pillar of my USA RISE ...,2020-05-18
64328,64328,Kelly Loeffler,SenatorLoeffler,Republican,"Two weeks ago, I spoke on the Senate floor con...",2020-03-19
64329,64329,Kelly Loeffler,SenatorLoeffler,Republican,I also hosted a statewide call w/@GACities &am...,2020-03-17
64330,64330,Kelly Loeffler,SenatorLoeffler,Republican,Outstanding work happening in Augusta to take ...,2020-04-23


In [5]:
# Learn the phrase vocabulary from every tweet in the corpus.
print("Fitting...")
tf_vectorizer.fit(df['tweet'].tolist())

Fitting...


CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.8, max_features=None, min_df=50,
                ngram_range=(1, 5), preprocessor=None,
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...],
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [6]:
# Build the per-phrase table from the document-term matrix (rows = tweets in
# `df` order, columns = phrases in vocabulary order).
term_frequencies = tf_vectorizer.fit_transform(df.tweet.tolist())

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever this installation provides so the cell
# runs on both old and current versions.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    feature_names = tf_vectorizer.get_feature_names_out()
else:
    feature_names = tf_vectorizer.get_feature_names()

# The default RangeIndex of this frame doubles as the phrase's column position
# in `term_frequencies`.
phrases_df = pd.DataFrame(data=feature_names, columns=['phrase'])

# The vectorizer is binary, so a column sum is the number of tweets that
# contain the phrase at least once.
phrases_df['total_occurrences'] = term_frequencies.sum(axis=0).T

# Keep the 100 most widespread phrases.
ans = phrases_df.sort_values(by='total_occurrences', ascending=False).head(100)
# NOTE(review): the file name says "top_20" but 100 rows are written; the name
# is kept unchanged so anything reading this file keeps working.
ans.to_csv('top_20_overall.csv', index=False)

In [7]:
# Display the 100 phrases with the highest total_occurrences.
ans

Unnamed: 0,phrase,total_occurrences
1075,coronavirus,22908
1229,covid19,21500
3694,pandemic,15854
1200,covid,13870
41,19,13641
...,...,...
1116,coronavirus pandemic,2204
3210,masks,2201
1993,first,2194
3250,medical,2185


# Step 3: Measuring Politically Polarized Phrases
First things first, we will need to split up the Democrat-authored and Republican-authored texts, and then get their term frequency matrices. The pandas dataframe makes this pretty easy:

In [9]:
# Re-encode each party's tweets with the already-fitted vocabulary so both
# matrices share the same phrase columns.
dem_tfs = tf_vectorizer.transform(df.loc[df['party'] == 'Democrat', 'tweet'].tolist())
rep_tfs = tf_vectorizer.transform(df.loc[df['party'] == 'Republican', 'tweet'].tolist())

In [10]:
# Number of tweets (matrix rows) authored by each party.
n_dem_docs, n_rep_docs = dem_tfs.shape[0], rep_tfs.shape[0]
print("{} Dem docs, {} Rep docs".format(n_dem_docs, n_rep_docs))

42820 Dem docs, 21234 Rep docs


In [12]:
# Per-phrase tweet counts for each party (column sums of the binary matrices).
total_dem_tfs, total_rep_tfs = (
    party_tfs.sum(axis=0) for party_tfs in (dem_tfs, rep_tfs)
)
total_tfs = total_dem_tfs + total_rep_tfs

# Share of each party's tweets that contain the phrase.
p_dem, p_rep = total_dem_tfs / n_dem_docs, total_rep_tfs / n_rep_docs

Now for the juicy part: figuring out which phrases are politically charged. Here is the part of the paper that explains the method for this (the key formula is highlighted):


Next, we extracted all unigrams and bigrams from the cleaned corpus and scored them. We computed the probability of each unigram and bigram $g$ appearing in Republican- and Democrat-authored text as $Pr_R(g)$ and $Pr_D(g)$. We then compute the partisan bias $y$ of each $g$ as

$$y(g) = \frac{Pr_R(g) - Pr_D(g)}{Pr_R(g) + Pr_D(g)}$$

$y$ scores range over $[-1, 1]$, with $1$ ($-1$) indicating that a phrase is used exclusively by Republicans (Democrats).

In [13]:
# Partisan bias per phrase: +1 means only Republican tweets use it, -1 only
# Democrat tweets. A phrase used by neither party gives 0/0 here.
bias = (p_rep - p_dem) / (p_dem + p_rep)

In [14]:
# Attach the per-phrase statistics to the phrase table. Each source is a
# 1 x n_phrases row, so it is transposed to line up with the table's rows.
for column_name, row_vector in (
    ('bias_score', bias),
    ('p_dem', p_dem),
    ('p_rep', p_rep),
    ('n_dem', total_dem_tfs),
    ('n_rep', total_rep_tfs),
):
    phrases_df[column_name] = row_vector.T

In [15]:
# Progress banner only; this cell performs no computation.
print("Counting senators...")

Counting senators...


In [18]:
# Export every phrase with its statistics, most widespread first.
(
    phrases_df
    .sort_values(by='total_occurrences', ascending=False)
    .to_csv('all_phrases.csv', index=False)
)

In [22]:
print("Most Democratic...")
# Lowest bias scores first: the 200 phrases leaning most strongly Democrat.
# Copy so that later column additions do not touch phrases_df.
top_dem = phrases_df.sort_values('bias_score').iloc[:200].copy()
top_dem.iloc[:5]

Most Democratic...


Unnamed: 0,phrase,total_occurrences,bias_score,p_dem,p_rep,n_dem,n_rep
3578,ny19,57,-1.0,0.001331,0.0,57,0
4795,staters,122,-1.0,0.002849,0.0,122,0
541,away health,67,-1.0,0.001565,0.0,67,0
544,az01,216,-1.0,0.005044,0.0,216,0
1838,facebook com events,75,-1.0,0.001752,0.0,75,0


In [23]:
# Count how many distinct authors used each phrase, then keep phrases used by
# more than two of them and export the top 50.
#
# The count is read from the fitted document-term matrix rather than from a
# substring search over the raw tweets. Phrases come out of the vectorizer
# lowercased, with punctuation stripped and stop words removed, so a raw-text
# search misses capitalised uses, can never match phrases such as
# "facebook com events" (from "facebook.com/events") or "away health", and
# matches inside longer tokens (e.g. "19" inside "2019").
#
# Relies on two alignments set up earlier in the notebook: rows of
# `term_frequencies` follow the row order of `df`, and the index labels of
# `top_dem` (inherited from `phrases_df`) are the phrases' column positions.
doc_term_by_phrase = term_frequencies.tocsc()  # CSC makes column slicing cheap
top_dem['n_senators'] = [
    len(df['name'].iloc[doc_term_by_phrase[:, feature_idx].nonzero()[0]].unique())
    for feature_idx in top_dem.index
]
top_dem = top_dem[top_dem.n_senators > 2]
top_dem.head(50).to_csv('top_50_democrat.csv', index=False)

In [25]:
print("Most Republican:")
# Highest bias scores first: the 200 phrases leaning most strongly Republican.
top_rep = phrases_df.sort_values(by='bias_score', ascending=False).head(200).copy()

# Count how many distinct authors used each phrase, read from the fitted
# document-term matrix rather than from a substring search over the raw tweets.
# Phrases come out of the vectorizer lowercased, with punctuation stripped and
# stop words removed, so a raw-text search misses capitalised uses, cannot
# match phrases that spanned punctuation or a removed stop word, and matches
# inside longer tokens.
#
# Relies on two alignments set up earlier in the notebook: rows of
# `term_frequencies` follow the row order of `df`, and the index labels of
# `top_rep` (inherited from `phrases_df`) are the phrases' column positions.
doc_term_by_phrase = term_frequencies.tocsc()  # CSC makes column slicing cheap
top_rep['n_senators'] = [
    len(df['name'].iloc[doc_term_by_phrase[:, feature_idx].nonzero()[0]].unique())
    for feature_idx in top_rep.index
]

# Keep phrases used by more than two distinct authors and export the top 50.
top_rep = top_rep[top_rep.n_senators > 2]
top_rep.head(50).to_csv('top_50_republican.csv', index=False)

Most Republican:
