# Sentiment analysis and topic modeling of Tweets regarding vaccines in late November, early December 2020

## Part 1: Importing data, creating data frame 

In [1]:
# import initial libraries 
import pandas as pd
import numpy as np

In [2]:
#import json file (converted to csv) of tweets after duplicates have been removed
#deduplication reduced data set from 472100 tweets to 192411 tweets
df = pd.read_csv("/Users/mymac/desktop/vaccines_twitter_analysis/vaccine_data/combined_tweets.csv")

In [3]:
#get basic info about dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192411 entries, 0 to 192410
Data columns (total 37 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   id                            192411 non-null  int64  
 1   tweet_url                     192411 non-null  object 
 2   created_at                    192411 non-null  object 
 3   parsed_created_at             192411 non-null  object 
 4   user_screen_name              192411 non-null  object 
 5   text                          192411 non-null  object 
 6   tweet_type                    192411 non-null  object 
 7   coordinates                   106 non-null     object 
 8   hashtags                      25206 non-null   object 
 9   media                         16097 non-null   object 
 10  urls                          67731 non-null   object 
 11  favorite_count                192411 non-null  int64  
 12  in_reply_to_screen_name       106905 non-nul

In [4]:
# explore: what are the columns?
df.columns

Index(['id', 'tweet_url', 'created_at', 'parsed_created_at',
       'user_screen_name', 'text', 'tweet_type', 'coordinates', 'hashtags',
       'media', 'urls', 'favorite_count', 'in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_user_id', 'lang', 'place',
       'possibly_sensitive', 'retweet_count', 'retweet_or_quote_id',
       'retweet_or_quote_screen_name', 'retweet_or_quote_user_id', 'source',
       'user_id', 'user_created_at', 'user_default_profile_image',
       'user_description', 'user_favourites_count', 'user_followers_count',
       'user_friends_count', 'user_listed_count', 'user_location', 'user_name',
       'user_statuses_count', 'user_time_zone', 'user_urls', 'user_verified'],
      dtype='object')

In [5]:
#count number of different languages in data set
count_lang = df['lang'].unique()
print(len(count_lang), count_lang)

59 ['en' 'fr' 'ml' 'und' 'it' 'et' 'mr' 'hi' 'da' 'in' 'ja' 'kn' 'tl' 'pl'
 'es' 'th' 'ca' 'sv' 'el' 'ar' 'te' 'zh' 'pt' 'ne' 'gu' 'nl' 'or' 'lt'
 'ru' 'de' 'ko' 'tr' 'ta' 'pa' 'no' 'cs' 'hu' 'vi' 'fi' 'fa' 'ro' 'sr'
 'ht' 'iw' 'ur' 'km' 'ka' 'bn' 'is' 'cy' 'bg' 'uk' 'sl' 'lo' 'lv' 'eu'
 'my' 'si' 'ps']


In [6]:
# tweets are in 59 different languages. 
# I'll be working only with tweets in English
# so, I'll drop tweets in all other languages
df = df[df.lang == 'en']
df.shape

(179672, 37)

In [7]:
#dropping non-English tweets has reduced the data set to 179,672 tweets 

#there are 37 columns
#now, I will get rid of unnecessary columns
#some columns I'm not certain of needing later on or not, so I'll keep those
df = df.drop(['tweet_url', 'created_at', 'media', 'urls','in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_user_id', 'retweet_or_quote_id',
       'retweet_or_quote_screen_name', 'retweet_or_quote_user_id', 'source',
       'user_created_at', 'user_name', 'user_verified', 'user_friends_count', 'user_listed_count',
       'user_statuses_count', 'user_default_profile_image', 'user_description',
       'user_favourites_count', 'user_followers_count'], axis=1) 

In [8]:
#checking which columns are left
df.columns

Index(['id', 'parsed_created_at', 'user_screen_name', 'text', 'tweet_type',
       'coordinates', 'hashtags', 'favorite_count', 'lang', 'place',
       'possibly_sensitive', 'retweet_count', 'user_id', 'user_location',
       'user_time_zone', 'user_urls'],
      dtype='object')

In [9]:
len(df.columns)
# columns reduced from 37 to 16 

16

In [10]:
#dataframe has been reduced to 15 columns. I'll likely drop more columns later 

#now, check how many of the tweets include geo-coordinates
no_coordinates = df[df.coordinates != 'NaN']
len(no_coordinates)
#none of the tweets include geo-coordinates 


179672

In [11]:
df.head(5)

Unnamed: 0,id,parsed_created_at,user_screen_name,text,tweet_type,coordinates,hashtags,favorite_count,lang,place,possibly_sensitive,retweet_count,user_id,user_location,user_time_zone,user_urls
0,1333608117242720256,2020-12-01 03:05:27+00:00,SpeakerMentors,@Hobie_SHRED I don't think so...this is the fi...,reply,,,0,en,,,0,2439831350,,,
1,1333608115816828928,2020-12-01 03:05:26+00:00,JoyceWhyfor,@DrMorien1 i voted for trump but he keeps talk...,reply,,,0,en,,,0,3133624726,"Mexico, Maine",,
2,1333608112314576897,2020-12-01 03:05:25+00:00,BrianCCox2,@ighaworth Thank goodness the Harris-Biden duo...,reply,,,0,en,,,0,741455733875716096,"Texas, USA",,
3,1333608107751170048,2020-12-01 03:05:24+00:00,TheHops31,Covid-19 vaccine: Moderna applies for FDA auth...,original,,,0,en,,True,0,379140897,"New York, New York",,http://devilsandpinstripes.blogspot.com
4,1333608105515438080,2020-12-01 03:05:24+00:00,JuCamarote,@LusyLuck @bleedinCubBlue @Liliana22207796 @Dr...,reply,,,0,en,,,0,168259094,"San Diego, CA",,


In [12]:
#before moving forward, I'd like to reduce the data set more - by removing some additional unnecessary columns, and maybe some rows

#dropping more unnecessary columns
df = df.drop(['user_urls', 'user_time_zone', 'place', 'coordinates'], axis=1)

In [13]:
#checking start and end date times of the data 
#tweets collected (non-continuously) from 2020-11-28 17:40:59+00:00 to 2020-12-01 03:05:27+00:00
df.tail(5)

Unnamed: 0,id,parsed_created_at,user_screen_name,text,tweet_type,hashtags,favorite_count,lang,possibly_sensitive,retweet_count,user_id,user_location
192403,1332741303503712261,2020-11-28 17:41:02+00:00,globalbreaking_,Covid-19 vaccine: GPs in NI plan rollout for o...,original,,2,en,,0,1239226109981601793,London•Paris•Washington D.C.
192406,1332741297375830018,2020-11-28 17:41:01+00:00,partyboitopher,If you're pro covid vaccine im just gonna assu...,original,,0,en,,0,1035828543563784192,
192407,1332741297178816512,2020-11-28 17:41:01+00:00,1Pembswolf,December 10th for the first vaccine roll out.,original,,0,en,,0,317934037,"Wales, United Kingdom"
192409,1332741293781495809,2020-11-28 17:41:00+00:00,sdutIdeas,I’m a doctor who had COVID-19. A vaccine will ...,original,,4,en,False,1,4214805732,"San Diego, California"
192410,1332741291541663751,2020-11-28 17:40:59+00:00,ronaldtheshort,As we learn this morning that someone else we ...,original,,13,en,,0,20603640,"Austin, TX"


## Part 2: Vader analysis
I want to start exploring this dataset using the Vader sentiment analysis tool, to see if it can give me an initial sense regarding polarity of sentiment in the texts.

Vader gives each Tweet a rating between -1 and 1, with -1 indicating a negative sentiment and 1 indicating a positive sentiment. 

In [14]:
#import Vader

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

In [15]:
#testing Vader on a single tweet - the first tweet in my data set

documents = np.array(df['text'])
tweet1 = documents[0:1]
tweet = tweet1

def sentiment_analyzer_scores(tweet):
    score = analyzer.polarity_scores(tweet)
    return score, tweet

sentiment_analyzer_scores(tweet)



({'neg': 0.112, 'neu': 0.858, 'pos': 0.03, 'compound': -0.7096},
 array(["@Hobie_SHRED I don't think so...this is the first time vaccines were prepared BEFORE the testing started, so they are ahead of schedule...the investment in ones that didn't work will be a big expense, but not as big as the death count without it."],
       dtype=object))

The Vader test gives the first tweet a result of -.7, which is quite negative. 

A read of the text reveals that the negativity is not towards vaccines themselves, but rather towards the pandemic - with a reference to the "deah count." 

I'll look at the Vader scores of some other random tweets from the data set, to get a sense of what the sentiment scores are actually picking up on... 

In [16]:
#testing Vader on another tweet 

tweet2 = documents[1:2]
tweet = tweet2

def sentiment_analyzer_scores(tweet):
    score = analyzer.polarity_scores(tweet)
    return score, tweet

sentiment_analyzer_scores(tweet)

({'neg': 0.129, 'neu': 0.871, 'pos': 0.0, 'compound': -0.4782},
 array(["@DrMorien1 i voted for trump but he keeps talking about the vaccines, and if you listen to anthony patch it's not good"],
       dtype=object))

The Vader test of the second tweet is also negative: -0.4.

In this case, a read of the text reveals a negativity towards vaccines in general. 

In [17]:
tweet3 = documents[2:3]
tweet = tweet3

def sentiment_analyzer_scores(tweet):
    score = analyzer.polarity_scores(tweet)
    return score, tweet

sentiment_analyzer_scores(tweet)

({'neg': 0.111, 'neu': 0.651, 'pos': 0.238, 'compound': 0.7404},
 array(['@ighaworth Thank goodness the Harris-Biden duo are so kind to offer this option vs the dangerous and ineffective vaccine that most Americans won’t be able to afford being offered by Trump. 🙄'],
       dtype=object))

The vader score of tweet 3 is very positive at 0.7.

A read of the text shows the positivity is directed at the Biden-Harris ticket, with very NEGATIVE sentiment towards Trump and any vaccine "offered by Trump." I find it very interesting that Vader scored this tweet as positive - given that it expresses both positive and negative feelings very stronly. 

In [18]:
tweet4 = documents[3:4]
tweet = tweet4

def sentiment_analyzer_scores(tweet):
    score = analyzer.polarity_scores(tweet)
    return score, tweet

sentiment_analyzer_scores(tweet)

({'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0},
 array(['Covid-19 vaccine: Moderna applies for FDA authorization - CNN https://t.co/4lGcFv6Fsb'],
       dtype=object))

Vader scores tweet 4 as neutral - I assume because it's a retweet of a credible news source. 

In [19]:
tweet5 = documents[4:5]
tweet = tweet5

def sentiment_analyzer_scores(tweet):
    score = analyzer.polarity_scores(tweet)
    return score, tweet

sentiment_analyzer_scores(tweet)

({'neg': 0.073, 'neu': 0.748, 'pos': 0.179, 'compound': 0.7264},
 array(['@LusyLuck @bleedinCubBlue @Liliana22207796 @DrLeanaWen “Without any side effects “ doesn’t exist. Everything has side effects. Regulatory agencies will review the efficacy and safety data and assess and if the benefits are more significant than the risks the vaccine will be approved and the risks listed on the label/package insert.'],
       dtype=object))

Vader scores tweet 5 as very positive. 

A read of the text reveals that the vocabulary is quite scientific; I'm surprised this tweet's score is not closer to neutral. 


In [20]:
tweet6 = documents[5:6]
tweet = tweet6

def sentiment_analyzer_scores(tweet):
    score = analyzer.polarity_scores(tweet)
    return score, tweet

sentiment_analyzer_scores(tweet)

({'neg': 0.166, 'neu': 0.834, 'pos': 0.0, 'compound': -0.8389},
 array(['Roger Howell shut down nfl until everyone person gets a vaccine. Why are you putting lives at risks . It’s not  like you don’t make a lot of money on merchandizing that can you hold you over? A fb was in tears almost because he has to make a living and worry about covid???'],
       dtype=object))

Vader score of tweet 6 is the most negative so far: -0.8.
    
The text expresses negativity not towards vaccines but rather towards the pandemic; similar to tweet 1, which also had a very negative score. 

In [21]:
tweet8 = documents[15:16]
tweet = tweet8

def sentiment_analyzer_scores(tweet):
    score = analyzer.polarity_scores(tweet)
    return score, tweet

sentiment_analyzer_scores(tweet)

({'neg': 0.092, 'neu': 0.561, 'pos': 0.347, 'compound': 0.7731},
 array(['@centralfornia @NYorNothing @VoopaOfficial Yes true, but..... no use arguing this here. To answer the OP - yes I will take the vaccine.'],
       dtype=object))

Vader score of tweet 8 is high: 0.7.
    
The text does indeed reveal a positive sentiment towards a vaccine. 

In [22]:
tweet9 = documents[500:501]
tweet = tweet9

def sentiment_analyzer_scores(tweet):
    score = analyzer.polarity_scores(tweet)
    return score, tweet

sentiment_analyzer_scores(tweet)

({'neg': 0.0, 'neu': 0.871, 'pos': 0.129, 'compound': 0.6369},
 array(['@KamalaHarris @Railli3 Jokes.... 😂😂😂. The clone came up with a vaccine and he isn’t and wasn’t even in power 🤷\u200d♀️. Joe LIEden and neither you....will hold office for much longer. Enjoy your make believe while you can....'],
       dtype=object))

The Vader score of tweet 9 is quite positive: 0.64.

Interestingly, the sentiment of the text is all over the place - n egative towards Biden-Haris, perhaps positive towars a vaccine - it's not clear even to a human reader. 

In [23]:
tweet10 = documents[1000:1001]
tweet = tweet10

def sentiment_analyzer_scores(tweet):
    score = analyzer.polarity_scores(tweet)
    return score, tweet

sentiment_analyzer_scores(tweet)

({'neg': 0.0, 'neu': 0.838, 'pos': 0.162, 'compound': 0.2732},
 array(["@raganbeth Except with them it's going to be mandatory. You take the vaccine or they take your livelihood."],
       dtype=object))

The Vader score of tweet 10 is somewhat positive: 0.27.
    
A human read of the text reveals, however, a somewhat dystopian sentiment about forced vaccinations -- not exactly positive. 

INITIAL IMPRESSION OF VADER SENTIMENT ANALYSIS: The Vader analysis of unprocessed text from tweets is quite superficial and in many cases misleading.  

I could process the tweets in hopes of getting a more refined output from Vader, and then grouping the positive/neutral/negative texts into clusters in order to find similarities between them. 

However, I think it's better to focus my efforts on another path, doing my own modeling.

To conclude the Vader analysis section, I'll include a breakdown of the distribution of sentiment across my data set (below). 

In [24]:
# run Vader on full dataset 
sentiment_analyzer = SentimentIntensityAnalyzer()

def sentiment_analysis(text):
    return sentiment_analyzer.polarity_scores(text)['compound']

df['sentiment_score'] = df.text.apply(lambda x: sentiment_analysis(x))

In [25]:
# look at basic stats of the sentiment score for full dataset
df.describe()
#mean is on the slighly positive side at 0.05 (with zero being a neutral score)
#std of 0.48

Unnamed: 0,id,favorite_count,retweet_count,user_id,sentiment_score
count,179672.0,179672.0,179672.0,179672.0,179672.0
mean,1.333256e+18,11.516998,2.006768,4.463303e+17,0.057929
std,255893400000000.0,496.171748,83.790658,5.553048e+17,0.484655
min,1.332741e+18,0.0,0.0,324.0,-0.9971
25%,1.333093e+18,0.0,0.0,210226000.0,-0.296
50%,1.333333e+18,0.0,0.0,2406998000.0,0.0
75%,1.333469e+18,2.0,0.0,1.06544e+18,0.4434
max,1.333608e+18,155735.0,24783.0,1.333592e+18,0.9998


In [26]:
# look at distribution of sentiment score 

df['very_negative'] = df.sentiment_score.apply(lambda x: x < -0.5)
df['negative'] = df.sentiment_score.apply(lambda x: x >= -0.5 and x < 0)
df['neutral'] = df.sentiment_score.apply(lambda x: x == 0)
df['positive'] = df.sentiment_score.apply(lambda x: x > 0 and x <= 0.5)
df['very_positive'] = df.sentiment_score.apply(lambda x: x > 0.5)

count_very_negative = list(df[df.very_negative == True].count())[0]
count_negative = list(df[df.negative == True].count())[0] 
count_neutral = list(df[df.neutral == True].count())[0]
count_positive = list(df[df.positive == True].count())[0]
count_very_positive = list(df[df.very_positive == True].count())[0] 

df['negative'] = df.sentiment_score.apply(lambda x: x >= -0.5 and x < 0)
df['neutral'] = df.sentiment_score.apply(lambda x: x == 0)
df['positive'] = df.sentiment_score.apply(lambda x: x > 0 and x <= 0.5)
df['very_positive'] = df.sentiment_score.apply(lambda x: x > 0.5)

sentiment_counts = [count_very_negative, count_negative, count_neutral, count_positive, count_very_positive]


In [27]:
sentiment_counts 
#distribution (list below of totals) is skewed to the left, as indicated by the slightly positive mean score 

[28655, 32954, 37678, 42111, 38274]

In [28]:


# import matplotlib.pyplot as plt
# %matplotlib inline
# import seaborn as sns

# # plot histogram of sentiments
# fig = plt.figure(figsize=(10,5))

# plt.hist(
#     sentiment_counts,
#     bins=20,
#     color='#60505C')

# plt.title('Distribution - Article Word Count', fontsize=16)
# plt.ylabel('Frequency', fontsize=12)
# plt.xlabel('Word Count', fontsize=12)
# plt.yticks(np.arange(0, 50, 5))
# plt.xticks(np.arange(0, 2700, 200))

# file_name = 'hist'

# fig.savefig(
#     file_path + file_name + '.png',
#     dpi=fig.dpi,
#     bbox_inches='tight'
# )


## Part 3: Processing of data in preparation for text analysis

In [29]:
#reminder: my "corpus" is the numpy array called "documents"
documents = df['text']

In [30]:
#I imported this code from a great project on GitHub - I'm adapting it for my purposes. Credit to: 
#source: https://github.com/robsalgado/personal_data_science_projects/blob/master/topic_modeling_nmf/nlp_topic_utils.ipynb

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import string
import re

import nltk
#from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import TweetTokenizer, RegexpTokenizer
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag

# Contraction map
c_dict = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "I would",
    "i'd've": "I would have",
    "i'll": "I will",
    "i'll've": "I will have",
    "i'm": "I am",
    "i've": "I have",
    "isn't": "is not",
    "it'd": "it had",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there had",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'alls": "you alls",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had",
    "you'd've": "you would have",
    "you'll": "you you will",
    "you'll've": "you you will have",
    "you're": "you are",
    "you've": "you have"
}

# Compiling the contraction dict
c_re = re.compile('(%s)' % '|'.join(c_dict.keys()))

# List of stop words
add_stop = ['said', 'say', '...', 'like']
stop_words = ENGLISH_STOP_WORDS.union(add_stop)

# List of punctuation
punc = list(set(string.punctuation))


# Splits words on white spaces (leaves contractions intact) and splits out
# trailing punctuation
def casual_tokenizer(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    return tokens


def expandContractions(text, c_re=c_re):
    def replace(match):
        return c_dict[match.group(0)]
    return c_re.sub(replace, text)

def strip_all_entities(text):
    entity_prefixes = ['@','#']
    for separator in  string.punctuation:
        if separator not in entity_prefixes :
            text = text.replace(separator,' ')
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in entity_prefixes:
                words.append(word)
    return ' '.join(words)


def process_text(text):
    text = casual_tokenizer(text)
    text = [each.lower() for each in text]
    text = [re.sub('[0-9]+', '', each) for each in text]
    text = [expandContractions(each, c_re=c_re) for each in text]
    text = [w for w in text if w not in punc]
    #text = [w for w in text if w not in stop_words]
    text = [each for each in text if len(each) > 1]
    text = [each for each in text if ' ' not in each]
    return text


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mymac/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [31]:
#process text
df['processed_text'] = df['text'].apply(process_text)

In [32]:
df.head()

Unnamed: 0,id,parsed_created_at,user_screen_name,text,tweet_type,hashtags,favorite_count,lang,possibly_sensitive,retweet_count,user_id,user_location,sentiment_score,very_negative,negative,neutral,positive,very_positive,processed_text
0,1333608117242720256,2020-12-01 03:05:27+00:00,SpeakerMentors,@Hobie_SHRED I don't think so...this is the fi...,reply,,0,en,,0,2439831350,,-0.7096,True,False,False,False,False,"[@hobie_shred, think, so, ..., this, is, the, ..."
1,1333608115816828928,2020-12-01 03:05:26+00:00,JoyceWhyfor,@DrMorien1 i voted for trump but he keeps talk...,reply,,0,en,,0,3133624726,"Mexico, Maine",-0.4782,False,True,False,False,False,"[@drmorien, voted, for, trump, but, he, keeps,..."
2,1333608112314576897,2020-12-01 03:05:25+00:00,BrianCCox2,@ighaworth Thank goodness the Harris-Biden duo...,reply,,0,en,,0,741455733875716096,"Texas, USA",0.7404,False,False,False,False,True,"[@ighaworth, thank, goodness, the, harris-bide..."
3,1333608107751170048,2020-12-01 03:05:24+00:00,TheHops31,Covid-19 vaccine: Moderna applies for FDA auth...,original,,0,en,True,0,379140897,"New York, New York",0.0,False,False,True,False,False,"[covid, vaccine, moderna, applies, for, fda, a..."
4,1333608105515438080,2020-12-01 03:05:24+00:00,JuCamarote,@LusyLuck @bleedinCubBlue @Liliana22207796 @Dr...,reply,,0,en,,0,168259094,"San Diego, CA",0.7264,False,False,False,False,True,"[@lusyluck, @bleedincubblue, @liliana, @drlean..."


In [33]:
df['processed_text'][0:3]

0    [@hobie_shred, think, so, ..., this, is, the, ...
1    [@drmorien, voted, for, trump, but, he, keeps,...
2    [@ighaworth, thank, goodness, the, harris-bide...
Name: processed_text, dtype: object

In [34]:
#NOTE TO SELF: need to remove stopwords parts-of-speech tagging, following by lemmatizing
#then processing will be complete

def pos_tagging(text):
    pos_tag = [pos_tag(word) for word in processed_text]
    
df['pos_tagged'] = df.processed_text.apply(lambda x: pos_tag(x)) 


In [35]:
df.head(5)

Unnamed: 0,id,parsed_created_at,user_screen_name,text,tweet_type,hashtags,favorite_count,lang,possibly_sensitive,retweet_count,user_id,user_location,sentiment_score,very_negative,negative,neutral,positive,very_positive,processed_text,pos_tagged
0,1333608117242720256,2020-12-01 03:05:27+00:00,SpeakerMentors,@Hobie_SHRED I don't think so...this is the fi...,reply,,0,en,,0,2439831350,,-0.7096,True,False,False,False,False,"[@hobie_shred, think, so, ..., this, is, the, ...","[(@hobie_shred, VBN), (think, VBP), (so, RB), ..."
1,1333608115816828928,2020-12-01 03:05:26+00:00,JoyceWhyfor,@DrMorien1 i voted for trump but he keeps talk...,reply,,0,en,,0,3133624726,"Mexico, Maine",-0.4782,False,True,False,False,False,"[@drmorien, voted, for, trump, but, he, keeps,...","[(@drmorien, NN), (voted, VBD), (for, IN), (tr..."
2,1333608112314576897,2020-12-01 03:05:25+00:00,BrianCCox2,@ighaworth Thank goodness the Harris-Biden duo...,reply,,0,en,,0,741455733875716096,"Texas, USA",0.7404,False,False,False,False,True,"[@ighaworth, thank, goodness, the, harris-bide...","[(@ighaworth, JJ), (thank, NN), (goodness, NN)..."
3,1333608107751170048,2020-12-01 03:05:24+00:00,TheHops31,Covid-19 vaccine: Moderna applies for FDA auth...,original,,0,en,True,0,379140897,"New York, New York",0.0,False,False,True,False,False,"[covid, vaccine, moderna, applies, for, fda, a...","[(covid, JJ), (vaccine, NN), (moderna, NN), (a..."
4,1333608105515438080,2020-12-01 03:05:24+00:00,JuCamarote,@LusyLuck @bleedinCubBlue @Liliana22207796 @Dr...,reply,,0,en,,0,168259094,"San Diego, CA",0.7264,False,False,False,False,True,"[@lusyluck, @bleedincubblue, @liliana, @drlean...","[(@lusyluck, JJ), (@bleedincubblue, NNP), (@li..."


In [36]:
pos_tagged = df['pos_tagged']
type(pos_tagged)

pandas.core.series.Series

In [43]:
# #remove stop words
def remove_stopwords(text):
    text = [w for w in text if w not in stop_words]
    return text

df['pos_tagged'].apply(remove_stopwords)

0         [(@hobie_shred, VBN), (think, VBP), (so, RB), ...
1         [(@drmorien, NN), (voted, VBD), (for, IN), (tr...
2         [(@ighaworth, JJ), (thank, NN), (goodness, NN)...
3         [(covid, JJ), (vaccine, NN), (moderna, NN), (a...
4         [(@lusyluck, JJ), (@bleedincubblue, NNP), (@li...
                                ...                        
192403    [(covid, JJ), (vaccine, NN), (gps, NN), (in, I...
192406    [(if, IN), (pro, JJ), (covid, FW), (vaccine, N...
192407    [(december, NN), (th, NN), (for, IN), (the, DT...
192409    [(doctor, NN), (who, WP), (had, VBD), (covid, ...
192410    [(as, IN), (we, PRP), (learn, VBP), (this, DT)...
Name: pos_tagged, Length: 179672, dtype: object

In [46]:
type(df['pos_tagged'][0])

list

In [47]:
df['pos_tagged'][0]

[('@hobie_shred', 'VBN'),
 ('think', 'VBP'),
 ('so', 'RB'),
 ('...', ':'),
 ('this', 'DT'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('first', 'JJ'),
 ('time', 'NN'),
 ('vaccines', 'NNS'),
 ('were', 'VBD'),
 ('prepared', 'VBN'),
 ('before', 'IN'),
 ('the', 'DT'),
 ('testing', 'NN'),
 ('started', 'VBD'),
 ('so', 'RB'),
 ('they', 'PRP'),
 ('are', 'VBP'),
 ('ahead', 'RB'),
 ('of', 'IN'),
 ('schedule', 'NN'),
 ('...', ':'),
 ('the', 'DT'),
 ('investment', 'NN'),
 ('in', 'IN'),
 ('ones', 'NNS'),
 ('that', 'WDT'),
 ('work', 'VBP'),
 ('will', 'MD'),
 ('be', 'VB'),
 ('big', 'JJ'),
 ('expense', 'NN'),
 ('but', 'CC'),
 ('not', 'RB'),
 ('as', 'RB'),
 ('big', 'JJ'),
 ('as', 'IN'),
 ('the', 'DT'),
 ('death', 'NN'),
 ('count', 'NN'),
 ('without', 'IN'),
 ('it', 'PRP')]

In [56]:
#lemmatizing
from nltk.stem import WordNetLemmatizer 
wordnet = WordNetLemmatizer() 

lemmatized = [[wordnet.lemmatize(word[0]) for word in words] for words in pos_tagged]


In [57]:
type(lemmatized)

list

In [58]:
len(lemmatized)

179672

In [64]:
lemmatized[500]

['@kamalaharris',
 '@railli',
 'joke',
 '...',
 'the',
 'clone',
 'came',
 'up',
 'with',
 'vaccine',
 'and',
 'he',
 'isn',
 'and',
 'wasn',
 'even',
 'in',
 'power',
 'joe',
 'lieden',
 'and',
 'neither',
 'you',
 '...',
 'will',
 'hold',
 'office',
 'for',
 'much',
 'longer',
 'enjoy',
 'your',
 'make',
 'believe',
 'while',
 'you',
 'can',
 '...']

In [67]:
df.head()

Unnamed: 0,id,parsed_created_at,user_screen_name,text,tweet_type,hashtags,favorite_count,lang,possibly_sensitive,retweet_count,user_id,user_location,sentiment_score,very_negative,negative,neutral,positive,very_positive,processed_text,pos_tagged
0,1333608117242720256,2020-12-01 03:05:27+00:00,SpeakerMentors,@Hobie_SHRED I don't think so...this is the fi...,reply,,0,en,,0,2439831350,,-0.7096,True,False,False,False,False,"[@hobie_shred, think, so, ..., this, is, the, ...","[(@hobie_shred, VBN), (think, VBP), (so, RB), ..."
1,1333608115816828928,2020-12-01 03:05:26+00:00,JoyceWhyfor,@DrMorien1 i voted for trump but he keeps talk...,reply,,0,en,,0,3133624726,"Mexico, Maine",-0.4782,False,True,False,False,False,"[@drmorien, voted, for, trump, but, he, keeps,...","[(@drmorien, NN), (voted, VBD), (for, IN), (tr..."
2,1333608112314576897,2020-12-01 03:05:25+00:00,BrianCCox2,@ighaworth Thank goodness the Harris-Biden duo...,reply,,0,en,,0,741455733875716096,"Texas, USA",0.7404,False,False,False,False,True,"[@ighaworth, thank, goodness, the, harris-bide...","[(@ighaworth, JJ), (thank, NN), (goodness, NN)..."
3,1333608107751170048,2020-12-01 03:05:24+00:00,TheHops31,Covid-19 vaccine: Moderna applies for FDA auth...,original,,0,en,True,0,379140897,"New York, New York",0.0,False,False,True,False,False,"[covid, vaccine, moderna, applies, for, fda, a...","[(covid, JJ), (vaccine, NN), (moderna, NN), (a..."
4,1333608105515438080,2020-12-01 03:05:24+00:00,JuCamarote,@LusyLuck @bleedinCubBlue @Liliana22207796 @Dr...,reply,,0,en,,0,168259094,"San Diego, CA",0.7264,False,False,False,False,True,"[@lusyluck, @bleedincubblue, @liliana, @drlean...","[(@lusyluck, JJ), (@bleedincubblue, NNP), (@li..."


In [69]:
df['lemmatized'] = lemmatized

In [70]:
df.head()

Unnamed: 0,id,parsed_created_at,user_screen_name,text,tweet_type,hashtags,favorite_count,lang,possibly_sensitive,retweet_count,...,user_location,sentiment_score,very_negative,negative,neutral,positive,very_positive,processed_text,pos_tagged,lemmatized
0,1333608117242720256,2020-12-01 03:05:27+00:00,SpeakerMentors,@Hobie_SHRED I don't think so...this is the fi...,reply,,0,en,,0,...,,-0.7096,True,False,False,False,False,"[@hobie_shred, think, so, ..., this, is, the, ...","[(@hobie_shred, VBN), (think, VBP), (so, RB), ...","[@hobie_shred, think, so, ..., this, is, the, ..."
1,1333608115816828928,2020-12-01 03:05:26+00:00,JoyceWhyfor,@DrMorien1 i voted for trump but he keeps talk...,reply,,0,en,,0,...,"Mexico, Maine",-0.4782,False,True,False,False,False,"[@drmorien, voted, for, trump, but, he, keeps,...","[(@drmorien, NN), (voted, VBD), (for, IN), (tr...","[@drmorien, voted, for, trump, but, he, keep, ..."
2,1333608112314576897,2020-12-01 03:05:25+00:00,BrianCCox2,@ighaworth Thank goodness the Harris-Biden duo...,reply,,0,en,,0,...,"Texas, USA",0.7404,False,False,False,False,True,"[@ighaworth, thank, goodness, the, harris-bide...","[(@ighaworth, JJ), (thank, NN), (goodness, NN)...","[@ighaworth, thank, goodness, the, harris-bide..."
3,1333608107751170048,2020-12-01 03:05:24+00:00,TheHops31,Covid-19 vaccine: Moderna applies for FDA auth...,original,,0,en,True,0,...,"New York, New York",0.0,False,False,True,False,False,"[covid, vaccine, moderna, applies, for, fda, a...","[(covid, JJ), (vaccine, NN), (moderna, NN), (a...","[covid, vaccine, moderna, applies, for, fda, a..."
4,1333608105515438080,2020-12-01 03:05:24+00:00,JuCamarote,@LusyLuck @bleedinCubBlue @Liliana22207796 @Dr...,reply,,0,en,,0,...,"San Diego, CA",0.7264,False,False,False,False,True,"[@lusyluck, @bleedincubblue, @liliana, @drlean...","[(@lusyluck, JJ), (@bleedincubblue, NNP), (@li...","[@lusyluck, @bleedincubblue, @liliana, @drlean..."


In [72]:
df['lemmatized'][0:2]

0    [@hobie_shred, think, so, ..., this, is, the, ...
1    [@drmorien, voted, for, trump, but, he, keep, ...
Name: lemmatized, dtype: object

## Part 4: TF-IDF 


In [78]:
" ".join(df['lemmatized'][0])


'@hobie_shred think so ... this is the first time vaccine were prepared before the testing started so they are ahead of schedule ... the investment in one that work will be big expense but not a big a the death count without it'

In [88]:
df['final_docs'] = df['lemmatized'].apply(lambda x: " ".join(x))


In [113]:
final_docs = df['final_docs']

In [236]:
#create document term matrix with TFIDF

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# # initial tuning of parameters
# #set max_features to 2000 (specifies the number of most frequently occurring words for which we want to create feature vectors)
# # set min_df to 5 (word must occur in at least 5 documents)
# # set max_df to 0.85 (word must not occur in more than 80 percent of the documents) 

tfidfconverter = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.85, ngram_range=(1, 2), stop_words='english')  
doc_term_matrix_1 = tfidfconverter.fit_transform(df['final_docs'].values.astype('U'))


In [237]:
type(doc_term_matrix_1)

scipy.sparse.csr.csr_matrix

In [238]:
doc_term_matrix_1.shape

(179672, 2000)

## Part 5: LDA modeling to identify latent topics 

For Parts 5 and 6, I was inspired by and adapted some code from a Github project that uses LDA and NMF modeling:
https://stackabuse.com/python-for-nlp-topic-modeling/


In [245]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(max_df=0.8, min_df=20, stop_words='english')
doc_term_matrix_2 = count_vect.fit_transform(df['final_docs'].values.astype('U'))

In [253]:
doc_term_matrix_2

<179672x8803 sparse matrix of type '<class 'numpy.int64'>'
	with 2047256 stored elements in Compressed Sparse Row format>

Use LDA to create topics,  along with the probability distribution for each word in our vocabulary for each topic:

In [254]:
from sklearn.decomposition import LatentDirichletAllocation

LDA = LatentDirichletAllocation(n_components=10, random_state=42)
LDA.fit(doc_term_matrix_2)

LatentDirichletAllocation(random_state=42)

In [272]:
import random

for i in range(10):
    random_id = random.randint(0,len(count_vect.get_feature_names()))
    print(count_vect.get_feature_names()[random_id])

union
jr
formaldehyde
bullying
start
desperately
hurry
unnecessary
screaming
sorted


In [273]:
# find 10 words with the highest probability for the first topic...
# to get first topic, se the components_ attribute and pass a 0 index as the value
first_topic = LDA.components_[0]

"To sort the indexes according to probability values, we can use the argsort() function. Once sorted, the 10 words with the highest probabilities will now belong to the last 10 indexes of the array. The following script returns the indexes of the 10 words with the highest probabilities:"

In [274]:
top_topic_words = first_topic.argsort()[-10:]

In [275]:
for i in top_topic_words:
    print(count_vect.get_feature_names()[i])

going
india
country
good
state
news
china
pm
covid
https


In [276]:
for i,topic in enumerate(LDA.components_):
    print(f'Top 10 words for topic #{i}:')
    print([count_vect.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print('\n')


Top 10 words for topic #0:
['going', 'india', 'country', 'good', 'state', 'news', 'china', 'pm', 'covid', 'https']


Top 10 words for topic #1:
['president', 'scientist', 'speed', 'us_fda', 'credit', 'biden', 'did', 'wa', 'realdonaldtrump', 'trump']


Top 10 words for topic #2:
['ha', 'death', 'immunity', 'need', 'rate', 'virus', 'risk', 'https', 'covid', 'people']


Top 10 words for topic #3:
['people', 'world', 'polio', 'just', 'gate', 'year', 'wa', 'make', 'like', 'https']


Top 10 words for topic #4:
['plan', 'news', 'pfizer', 'ha', 'distribution', 'trump', 'pandemic', 'need', 'covid', 'https']


Top 10 words for topic #5:
['know', 'need', 'going', 'think', 'like', 'don', 'want', 'mask', 'just', 'people']


Top 10 words for topic #6:
['year', 'trudeau', 'canadian', 'ha', 'getting', 'canada', 'shot', 'covid', 'https', 'flu']


Top 10 words for topic #7:
['mrna', 'year', 'covid', 'term', 'know', 'virus', 'ha', 'long', 'effect', 'trial']


Top 10 words for topic #8:
['effective', 'dos

In [277]:
topic_values = LDA.transform(doc_term_matrix_2)
topic_values.shape

(179672, 10)

In [295]:
df['LDA_topic'] = topic_values.argmax(axis=1)

In [296]:
df.head()

Unnamed: 0,id,parsed_created_at,user_screen_name,text,tweet_type,hashtags,favorite_count,lang,possibly_sensitive,retweet_count,...,positive,very_positive,processed_text,pos_tagged,lemmatized,final_docs,Topic,LDA Topic,NMF Topic,LDA_topic
0,1333608117242720256,2020-12-01 03:05:27+00:00,SpeakerMentors,@Hobie_SHRED I don't think so...this is the fi...,reply,,0,en,,0,...,False,False,"[@hobie_shred, think, so, ..., this, is, the, ...","[(@hobie_shred, VBN), (think, VBP), (so, RB), ...","[@hobie_shred, think, so, ..., this, is, the, ...",@hobie_shred think so ... this is the first ti...,1,3,1,1
1,1333608115816828928,2020-12-01 03:05:26+00:00,JoyceWhyfor,@DrMorien1 i voted for trump but he keeps talk...,reply,,0,en,,0,...,False,False,"[@drmorien, voted, for, trump, but, he, keeps,...","[(@drmorien, NN), (voted, VBD), (for, IN), (tr...","[@drmorien, voted, for, trump, but, he, keep, ...",@drmorien voted for trump but he keep talking ...,1,1,4,4
2,1333608112314576897,2020-12-01 03:05:25+00:00,BrianCCox2,@ighaworth Thank goodness the Harris-Biden duo...,reply,,0,en,,0,...,False,True,"[@ighaworth, thank, goodness, the, harris-bide...","[(@ighaworth, JJ), (thank, NN), (goodness, NN)...","[@ighaworth, thank, goodness, the, harris-bide...",@ighaworth thank goodness the harris-biden duo...,1,1,4,4
3,1333608107751170048,2020-12-01 03:05:24+00:00,TheHops31,Covid-19 vaccine: Moderna applies for FDA auth...,original,,0,en,True,0,...,False,False,"[covid, vaccine, moderna, applies, for, fda, a...","[(covid, JJ), (vaccine, NN), (moderna, NN), (a...","[covid, vaccine, moderna, applies, for, fda, a...",covid vaccine moderna applies for fda authoriz...,0,8,5,5
4,1333608105515438080,2020-12-01 03:05:24+00:00,JuCamarote,@LusyLuck @bleedinCubBlue @Liliana22207796 @Dr...,reply,,0,en,,0,...,False,True,"[@lusyluck, @bleedincubblue, @liliana, @drlean...","[(@lusyluck, JJ), (@bleedincubblue, NNP), (@li...","[@lusyluck, @bleedincubblue, @liliana, @drlean...",@lusyluck @bleedincubblue @liliana @drleanawen...,2,7,1,1


## Part 6: Topic modeling, using NMF



In [297]:
from sklearn.decomposition import NMF

nmf = NMF(n_components=10, random_state=42)
nmf.fit(doc_term_matrix_1 )


NMF(n_components=10, random_state=42)

In [298]:
#randomly get ten words from our vocabulary 
import random

for i in range(10):
    random_id = random.randint(0,len(tfidfconverter.get_feature_names()))
    print(tfidfconverter.get_feature_names()[random_id])

rich
effect
think vaccine
took
confirmed
evil
billion
return
mollyjongfast
cause


In [290]:
first_topic = nmf.components_[0]
top_topic_words = first_topic.argsort()[-10:]

These indexes can now be passed to the tfidfconverter object to retrieve the actual words...

In [291]:
for i in top_topic_words:
    print(tfidfconverter.get_feature_names()[i])

distribution
trial
health
say
pandemic
read
vaccines
pm
news
https


Print the ten words with highest probabilities for each of the topics:



In [292]:
for i,topic in enumerate(nmf.components_):
    print(f'Top 10 words for topic #{i}:')
    print([tfidfconverter.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic #0:
['distribution', 'trial', 'health', 'say', 'pandemic', 'read', 'vaccines', 'pm', 'news', 'https']


Top 10 words for topic #1:
['make', 'going', 'think', 'want', 'wa', 'know', 'like', 'virus', 'ha', 'just']


Top 10 words for topic #2:
['distribution', 'getting covid', 'covidvaccine', 'pfizer', 'available', 'say', 'pfizer covid', 'vaccine covid', 'covid vaccine', 'covid']


Top 10 words for topic #3:
['vp', 'asap', 'don need', 'covid pandemic', 'vaccine covid', 'pfizer', 'pandemic', 'vaccine need', 'need vaccine', 'need']


Top 10 words for topic #4:
['president', 'did', 'came vaccine', 'came', 'credit', 'biden', 'realdonaldtrump us_fda', 'us_fda', 'trump', 'realdonaldtrump']


Top 10 words for topic #5:
['pfizer', 'emergency use', 'moderna vaccine', 'use', 'effective', 'authorization', 'approval', 'fda', 'emergency', 'moderna']


Top 10 words for topic #6:
['cdc', 'administration', 'trump administration', 'airline', 'moderna ask', 'dos moderna', 'want', 'dis

Initial "human" analysis of topics detected using NMF: 


Top 10 words for topic #0:
['distribution', 'trial', 'health', 'say', 'pandemic', 'read', 'vaccines', 'pm', 'news', 'https']
TOPIC 0: generic discussion of news RE vaccine distribution 


Top 10 words for topic #1:
['make', 'going', 'think', 'want', 'wa', 'know', 'like', 'virus', 'ha', 'just']
TOPIC 1: generic  



Top 10 words for topic #2:
['distribution', 'getting covid', 'covidvaccine', 'pfizer', 'available', 'say', 'pfizer covid', 'vaccine covid', 'covid vaccine', 'covid']
TOPIC 2: focused discussion of Pfizer covid vaccine  


Top 10 words for topic #3:
['vp', 'asap', 'don need', 'covid pandemic', 'vaccine covid', 'pfizer', 'pandemic', 'vaccine need', 'need vaccine', 'need']
TOPIC 3: focused discussion of NEED regarding Pfizer covid vaccine  
* examine tweets to see what NEED is referring to... 


Top 10 words for topic #4:
['president', 'did', 'came vaccine', 'came', 'credit', 'biden', 'realdonaldtrump us_fda', 'us_fda', 'trump', 'realdonaldtrump']
TOPIC 4: U.S. political discussion regarding the vaccine, specifically whether Trump will get credit for speed of vaccine... 


Top 10 words for topic #5:
['pfizer', 'emergency use', 'moderna vaccine', 'use', 'effective', 'authorization', 'approval', 'fda', 'emergency', 'moderna']
TOPIC 5: focused discussion of Moderna covid vaccine 


Top 10 words for topic #6:
['cdc', 'administration', 'trump administration', 'airline', 'moderna ask', 'dos moderna', 'want', 'distribute', 'https', 'vaccine https']
TOPIC 6: general U.S. discussion of covid vaccine, regarding politics, business, life (ranging from distribution of the vaccine to whether people will need a vaccine for air travel, etc.) 


Top 10 words for topic #7:
['like flu', 'swine flu', 'swine', 'vaccine flu', 'vaccine year', 'flu shot', 'shot', 'year', 'flu vaccine', 'flu']
TOPIC 7: discussion of the covid vaccine in relation to flu vaccine
* Very interesting topic - look at tweets to see if they seem to be politically-slanted or more of a scientific tone (are people downplaying the need for covid vacine, saying it's no more necessary than a flue vaccine - or are they informing themselves or others about the flu vaccine, history of flu vaccine and history of swine flu, as part of broader discussion of covid? Or something else entirely?) 


Top 10 words for topic #8:
['week', 'report', 'pfizer coronavirus', 'ha', 'health', 'dos', 'say', 'american', 'coronavirus vaccine', 'coronavirus']
TOPIC 8: focused discussion of Pfizer vaccine 


Top 10 words for topic #9:
['million people', 'don', 'vaccinated', 'die', 'million', 'want', 'think', 'vaccine people', 'people vaccine', 'people']
TOPIC 8: this topic is dominated by extremely strong words and expressions - "want", "die", "million." ALso, "people" is listed several times. Fascinating.


The following script adds the topics to the data set and displays the first five rows:

In [299]:
topic_values = nmf.transform(doc_term_matrix_1)
df['NMF_topic'] = topic_values.argmax(axis=1)
df.head()

Unnamed: 0,id,parsed_created_at,user_screen_name,text,tweet_type,hashtags,favorite_count,lang,possibly_sensitive,retweet_count,...,very_positive,processed_text,pos_tagged,lemmatized,final_docs,Topic,LDA Topic,NMF Topic,LDA_topic,NMF_topic
0,1333608117242720256,2020-12-01 03:05:27+00:00,SpeakerMentors,@Hobie_SHRED I don't think so...this is the fi...,reply,,0,en,,0,...,False,"[@hobie_shred, think, so, ..., this, is, the, ...","[(@hobie_shred, VBN), (think, VBP), (so, RB), ...","[@hobie_shred, think, so, ..., this, is, the, ...",@hobie_shred think so ... this is the first ti...,1,3,1,1,1
1,1333608115816828928,2020-12-01 03:05:26+00:00,JoyceWhyfor,@DrMorien1 i voted for trump but he keeps talk...,reply,,0,en,,0,...,False,"[@drmorien, voted, for, trump, but, he, keeps,...","[(@drmorien, NN), (voted, VBD), (for, IN), (tr...","[@drmorien, voted, for, trump, but, he, keep, ...",@drmorien voted for trump but he keep talking ...,1,1,4,4,4
2,1333608112314576897,2020-12-01 03:05:25+00:00,BrianCCox2,@ighaworth Thank goodness the Harris-Biden duo...,reply,,0,en,,0,...,True,"[@ighaworth, thank, goodness, the, harris-bide...","[(@ighaworth, JJ), (thank, NN), (goodness, NN)...","[@ighaworth, thank, goodness, the, harris-bide...",@ighaworth thank goodness the harris-biden duo...,1,1,4,4,4
3,1333608107751170048,2020-12-01 03:05:24+00:00,TheHops31,Covid-19 vaccine: Moderna applies for FDA auth...,original,,0,en,True,0,...,False,"[covid, vaccine, moderna, applies, for, fda, a...","[(covid, JJ), (vaccine, NN), (moderna, NN), (a...","[covid, vaccine, moderna, applies, for, fda, a...",covid vaccine moderna applies for fda authoriz...,0,8,5,5,5
4,1333608105515438080,2020-12-01 03:05:24+00:00,JuCamarote,@LusyLuck @bleedinCubBlue @Liliana22207796 @Dr...,reply,,0,en,,0,...,True,"[@lusyluck, @bleedincubblue, @liliana, @drlean...","[(@lusyluck, JJ), (@bleedincubblue, NNP), (@li...","[@lusyluck, @bleedincubblue, @liliana, @drlean...",@lusyluck @bleedincubblue @liliana @drleanawen...,2,7,1,1,1


## Part 7: Initial analysis of topic modeling results

Initial look at topic breakdowns suggests that most of the topics make sense: tweets seem to have been divided up into logical categories (ones about vaccines themselves, ones about politics, ones about the vaccine brands, etc.)

The NMF breakdown of topics looks much more precise to me, so I'll focus further analysis on those topics. 

Next step: count how many tweets fit into each class for NMF.

In [316]:
#get NMF counts
df[df.NMF_topic == 0].count()

id                    20461
parsed_created_at     20461
user_screen_name      20461
text                  20461
tweet_type            20461
hashtags               3992
favorite_count        20461
lang                  20461
possibly_sensitive    19361
retweet_count         20461
user_id               20461
user_location         15075
sentiment_score       20461
very_negative         20461
negative              20461
neutral               20461
positive              20461
very_positive         20461
processed_text        20461
pos_tagged            20461
lemmatized            20461
final_docs            20461
Topic                 20461
LDA Topic             20461
NMF Topic             20461
LDA_topic             20461
NMF_topic             20461
dtype: int64

In [317]:
#get NMF counts
df[df.NMF_topic == 1].count()

id                    64518
parsed_created_at     64518
user_screen_name      64518
text                  64518
tweet_type            64518
hashtags               4190
favorite_count        64518
lang                  64518
possibly_sensitive     6619
retweet_count         64518
user_id               64518
user_location         41218
sentiment_score       64518
very_negative         64518
negative              64518
neutral               64518
positive              64518
very_positive         64518
processed_text        64518
pos_tagged            64518
lemmatized            64518
final_docs            64518
Topic                 64518
LDA Topic             64518
NMF Topic             64518
LDA_topic             64518
NMF_topic             64518
dtype: int64

In [318]:
#get NMF counts
df[df.NMF_topic == 2].count()

id                    25513
parsed_created_at     25513
user_screen_name      25513
text                  25513
tweet_type            25513
hashtags               6380
favorite_count        25513
lang                  25513
possibly_sensitive    14625
retweet_count         25513
user_id               25513
user_location         18903
sentiment_score       25513
very_negative         25513
negative              25513
neutral               25513
positive              25513
very_positive         25513
processed_text        25513
pos_tagged            25513
lemmatized            25513
final_docs            25513
Topic                 25513
LDA Topic             25513
NMF Topic             25513
LDA_topic             25513
NMF_topic             25513
dtype: int64

In [319]:
#get NMF counts
df[df.NMF_topic == 3].count()

id                    8148
parsed_created_at     8148
user_screen_name      8148
text                  8148
tweet_type            8148
hashtags               730
favorite_count        8148
lang                  8148
possibly_sensitive    2435
retweet_count         8148
user_id               8148
user_location         5404
sentiment_score       8148
very_negative         8148
negative              8148
neutral               8148
positive              8148
very_positive         8148
processed_text        8148
pos_tagged            8148
lemmatized            8148
final_docs            8148
Topic                 8148
LDA Topic             8148
NMF Topic             8148
LDA_topic             8148
NMF_topic             8148
dtype: int64

In [322]:
#get NMF counts
df[df.NMF_topic == 4].count()

id                    15253
parsed_created_at     15253
user_screen_name      15253
text                  15253
tweet_type            15253
hashtags               1365
favorite_count        15253
lang                  15253
possibly_sensitive     2294
retweet_count         15253
user_id               15253
user_location          8076
sentiment_score       15253
very_negative         15253
negative              15253
neutral               15253
positive              15253
very_positive         15253
processed_text        15253
pos_tagged            15253
lemmatized            15253
final_docs            15253
Topic                 15253
LDA Topic             15253
NMF Topic             15253
LDA_topic             15253
NMF_topic             15253
dtype: int64

In [323]:
#get NMF counts
df[df.NMF_topic == 6].count()

id                    7040
parsed_created_at     7040
user_screen_name      7040
text                  7040
tweet_type            7040
hashtags              1006
favorite_count        7040
lang                  7040
possibly_sensitive    7032
retweet_count         7040
user_id               7040
user_location         5182
sentiment_score       7040
very_negative         7040
negative              7040
neutral               7040
positive              7040
very_positive         7040
processed_text        7040
pos_tagged            7040
lemmatized            7040
final_docs            7040
Topic                 7040
LDA Topic             7040
NMF Topic             7040
LDA_topic             7040
NMF_topic             7040
dtype: int64

In [324]:
#get NMF counts
df[df.NMF_topic == 7].count()

id                    6638
parsed_created_at     6638
user_screen_name      6638
text                  6638
tweet_type            6638
hashtags               592
favorite_count        6638
lang                  6638
possibly_sensitive    1377
retweet_count         6638
user_id               6638
user_location         4385
sentiment_score       6638
very_negative         6638
negative              6638
neutral               6638
positive              6638
very_positive         6638
processed_text        6638
pos_tagged            6638
lemmatized            6638
final_docs            6638
Topic                 6638
LDA Topic             6638
NMF Topic             6638
LDA_topic             6638
NMF_topic             6638
dtype: int64

In [325]:
#get NMF counts
df[df.NMF_topic == 8].count()

id                    6543
parsed_created_at     6543
user_screen_name      6543
text                  6543
tweet_type            6543
hashtags              1730
favorite_count        6543
lang                  6543
possibly_sensitive    4805
retweet_count         6543
user_id               6543
user_location         4868
sentiment_score       6543
very_negative         6543
negative              6543
neutral               6543
positive              6543
very_positive         6543
processed_text        6543
pos_tagged            6543
lemmatized            6543
final_docs            6543
Topic                 6543
LDA Topic             6543
NMF Topic             6543
LDA_topic             6543
NMF_topic             6543
dtype: int64

In [326]:
#get NMF counts
df[df.NMF_topic == 9].count()

id                    15214
parsed_created_at     15214
user_screen_name      15214
text                  15214
tweet_type            15214
hashtags               1096
favorite_count        15214
lang                  15214
possibly_sensitive     3006
retweet_count         15214
user_id               15214
user_location          9663
sentiment_score       15214
very_negative         15214
negative              15214
neutral               15214
positive              15214
very_positive         15214
processed_text        15214
pos_tagged            15214
lemmatized            15214
final_docs            15214
Topic                 15214
LDA Topic             15214
NMF Topic             15214
LDA_topic             15214
NMF_topic             15214
dtype: int64

In [329]:
type(df[df.NMF_topic == 9])

pandas.core.frame.DataFrame

In [330]:
df[df.NMF_topic == 9].shape

(15214, 27)

In [333]:
nmf_topic9_tweets = df[df.NMF_topic == 9]

In [334]:
type(nmf_topic9_tweets)

pandas.core.frame.DataFrame

In [341]:
pd.set_option('display.max_colwidth', -1)
nmf_topic9_tweets[20:30]['text']

  """Entry point for launching an IPython kernel.


382    @DonaldJTrumpJr The Governor’s in every state are the ones who let all these people die, New York, Cuomo, Cali, Newsome. The OBiden admin sourced out production of Med equipment to China. Trump commissioned AMERICAN Companies to build and make a vaccine. I’d say that’s a dam good job                         
388    @WeHave2BeBetter Is how i see it. So far I heard most people in study had no side effects. I belive this will be Moderna's vaccine which requires vaccine to stay in regular refrigerator temp for up to 3 days after being removed from freezer.                                                                    
401    @mtmalinen Got it. Instead you'll risk your health to a new disease whose long-term effects are unknown. But which is known to very often have serious medium-term effects even in younger people. While the vaccines are known to have none. Smart move.                                                            
424    @bkkid19 @NYGovCuomo I hope people like yo

In [342]:
df[df.NMF_topic == 7][20:30]['text']

830     Y’all out here eating corn fed cow and swine and worried about what’s in a vaccine                                                                                                                                                                                                                                                           
871     @MattWalshBlog We have a flu vaccine.                                                                                                                                                                                                                                                                                                        
927     @Averysdaddy84 And it’s new so it took time to figure out effective treatments and develop a vaccine. We have decades of experience with the flu. If we had been giving a clear direct message from our leaders we may have been closer to being on the same page. However, that was not the case                   

In [354]:
df[df.NMF_topic == 9][14004:14005]['text']

178072    @williamlegate The Bill Gates vaccine is for poor people but I'll be able to listen to podcast in my own head.
Name: text, dtype: object

In [352]:
df[df.NMF_topic == 9][3333:3334]['text']

38539    @stacy_rumpf @voxdotcom The health care workers in my family don’t want the vaccine, not for them, or their children. Healthy people don’t need it anyway. It’s not worth the risk. It’s new and untested
Name: text, dtype: object

In [355]:
df[df.NMF_topic == 9][5555:5556]['text']

65985    @donwinslow People just can't put things aside for 1 year so they and others can stay healthy while a vaccine is developed and distributed. Selfish idiots.
Name: text, dtype: object

In [358]:
df[df.NMF_topic == 9][10000:10001]['text']

126392    on top of that, the vaccine won't just automatically be an instant hit. in fact, reports of some pretty shitty side effects (even mild cases of covid) are gonna prevent people from following through with the full vaccination
Name: text, dtype: object

In [360]:
df[df.NMF_topic == 9][2222:2223]['text']

26313    @MichealMartinTD A lot of people I spoke to say they won't be taking the vaccine. Lots of people are worried about serious adverse effects. I hear the government has indemnified the vaccine manufacturers against any public liability.
Name: text, dtype: object