In [1]:
# Extracting Common Phrases with binary document counts (CountVectorizer, not TF-IDF)

from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

* `max_df=0.8` means exclude phrases that appear in more than 80% of the documents (similar to stop words, these are unlikely to be informative since they are so common)

* `min_df=50` means that a phrase must appear in at least 50 documents (here, tweets) to be included in the analysis (I used 50 because the research paper I mentioned did too, though you may experiment with different cutoffs)

* `ngram_range=(1,5)` means include phrases of one to five consecutive words (a narrower range such as `(1,2)` would keep only one-word and two-word phrases, and `(1,3)` would add trigrams / three-word phrases)
* `binary=True` means to only count whether a word occurs at all in a given document (i.e. 0 or 1), rather than counting exactly how many times it occurs (i.e. 0 or 1 or 2 or 3 or…)

* `stop_words=nltk_stop_words` just plugs in the NLTK stop word list (assigned to `nltk_stop_words` on the line just before the vectorizer is created), so that words such as “of” and “to” are not included

In [2]:
# English stop-word list from NLTK; handed to the vectorizer below so these
# words are filtered out of the vocabulary.
nltk_stop_words = stopwords.words('english')

# Vectorizer that turns each document into 0/1 indicators per phrase.
tf_vectorizer = CountVectorizer(
    stop_words=nltk_stop_words,  # drop the NLTK English stop words
    ngram_range=(1, 5),          # phrases of one up to five consecutive words
    binary=True,                 # record presence (0/1), not the per-document count
    min_df=50,                   # keep phrases found in at least 50 documents
    max_df=0.8,                  # drop phrases found in more than 80% of documents
)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
#from progressbar import progressbar
#import networkx as nx
import pandas as pd
#import pdb
import os

In [4]:
# Read the tweet dataset (a CSV in the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='tweets.csv')
# Disabled override, kept for reference (relabels SenSanders as Democrat for
# ideological purposes). NOTE(review): if re-enabled, a boolean mask needs
# `.loc` rather than `.at`.
# df.at[df.twitter =='SenSanders', 'party'] = 'Democrat'
# Bare last expression: the frame is rendered as this cell's output.
df

Unnamed: 0.1,Unnamed: 0,name,twitter,party,tweet,tweet_published
0,0,Sherrod Brown,SenSherrodBrown,Democrat,"My plan calls for widespread testing, intensiv...",2020-05-21
1,1,Sherrod Brown,SenSherrodBrown,Democrat,This is dangerous. The President should not be...,2020-03-24
2,2,Sherrod Brown,SenSherrodBrown,Democrat,LIVE: @SenBooker and I discuss the best ways t...,2020-05-21
3,3,Sherrod Brown,SenSherrodBrown,Democrat,Thank you @SenWhitehouse for joining my daily ...,2020-04-27
4,4,Sherrod Brown,SenSherrodBrown,Democrat,NEW: I've released my requirements for a coron...,2020-03-19
...,...,...,...,...,...,...
697,697,"Robert P. Casey, Jr.",SenBobCasey,Democrat,Republican politicians spent three years attac...,2020-03-22
698,698,"Robert P. Casey, Jr.",SenBobCasey,Democrat,"This year, because of COVID-19, moms across th...",2020-05-10
699,699,"Robert P. Casey, Jr.",SenBobCasey,Democrat,Just one example: After fearing he had coronav...,2020-02-26
700,700,"Robert P. Casey, Jr.",SenBobCasey,Democrat,HAPPENING SOON: I’m hosting a digital town hal...,2020-03-19


In [5]:
# Learn the phrase vocabulary from the raw tweet texts. `fit` is the last
# expression in the cell, so the fitted vectorizer's repr is displayed.
print("Fitting...")
tf_vectorizer.fit(df['tweet'].tolist())

Fitting...


CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.8, max_features=None, min_df=50,
                ngram_range=(1, 5), preprocessor=None,
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...],
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [16]:
# Build a table of phrase -> number of tweets containing it, then save the
# most frequent phrases to CSV.
#
# fit_transform (re)learns the vocabulary and returns the document-term
# matrix. The vectorizer was created with binary=True, so every entry is 0/1
# and a column sum equals the number of tweets in which the phrase appears.
term_frequencies = tf_vectorizer.fit_transform(df.tweet.tolist())

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever method this installation provides so
# the cell runs on both old and current versions.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    feature_names = tf_vectorizer.get_feature_names()

phrases_df = pd.DataFrame(data=feature_names, columns=['phrase'])
# Summing over axis 0 collapses the documents; the transpose turns the
# resulting row into a column so it lines up with the phrase rows.
phrases_df['total_occurrences'] = term_frequencies.sum(axis=0).T

# Most frequent phrases first; keep at most the first 100 rows.
ans = phrases_df.sort_values(by='total_occurrences', ascending=False).head(100)
# NOTE(review): the file name says "top_20" but up to 100 rows are written —
# confirm which of the two is intended before relying on this file.
ans.to_csv('top_20_overall.csv', index=False)

In [17]:
# Display the sorted phrase table (`ans`) built in the previous cell.
ans

Unnamed: 0,phrase,total_occurrences
10,covid19,354
20,pandemic,215
15,health,166
7,coronavirus,125
4,amp,116
19,need,113
8,covid,102
3,americans,97
0,19,95
17,help,94
