In [None]:
'''
Breaks the large raw tweet data file into manageable portions

'''


import pandas as pd

#setting file size to 10000 tweets
chunk_size = 10000
batch_no = 1

print("Beginning to parse tweets...")

#Adjust path name per file, this is the small test file I am running currently
for chunk in pd.read_csv("./tweets_01-04.csv", chunksize = chunk_size, error_bad_lines=False):
        chunk.to_csv("covid_data" + str(batch_no) + ".csv", index = False)
        batch_no += 1

print("Finished parsing tweets.")

In [1]:

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')
import csv
import glob
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import pyLDAvis
import spacy
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
from tqdm.notebook import tqdm

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  from collections import Mapping


In [2]:
'''
 creates corpus for topic modelling 
'''
import re

#Correct Dates start at file 700

sentimentDict = {}
documents = []
path = "./covid_data*"


print("Beginning to create corpus...")
#I put the smaller chunked tweet files in to a directory called revisedCovidData
for filename in tqdm(glob.glob(path)):
    if int(filename[12:-4]) >= 700:
        with open(filename, 'r', encoding="utf-8") as rawTweets:
            #open as CSV iterator
            readCSV = csv.reader(rawTweets)
            next(readCSV)
            #Iterate through individual tweets
            tweet_count = 0
            for line in readCSV:
                if tweet_count%100 == 0:
                    result = re.sub(r"http\S+", "", line[1])
                    documents.append(result)
                tweet_count += 1
    #break    
        
print(len(documents))
print(documents[-21])



Beginning to create corpus...


HBox(children=(FloatProgress(value=0.0, max=5635.0), HTML(value='')))


493525
New coronavirus
Transmission is spreading throughout in Japan
There is no escape in Japan 


In [3]:
def lemmatize_stemming(text):
    stemmer = SnowballStemmer("english")
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and token not in ["rtrt", "https", 'rt']:
            result.append(lemmatize_stemming(token))
    return result

In [4]:
doc_sample = documents[-21]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['New', 'coronavirus\nTransmission', 'is', 'spreading', 'throughout', 'in', 'Japan\nThere', 'is', 'no', 'escape', 'in', 'Japan', '']


 tokenized and lemmatized document: 
['new', 'coronavirus', 'transmiss', 'spread', 'japan', 'escap', 'japan']


In [5]:
preprocessed_docs = []
print("processing the documents")
for tweet in tqdm(documents):
    preprocessed_docs.append(preprocess(tweet))

processing the documents


HBox(children=(FloatProgress(value=0.0, max=493525.0), HTML(value='')))




In [6]:

print("Creating dictionary")
dictionary = gensim.corpora.Dictionary(preprocessed_docs)

Creating dictionary


In [7]:
#filters out any token in 15 or fewer, more than half, and only the 15,000 most common
#This part can be looked at for changes
dictionary.filter_extremes(no_below = 15, no_above = 0.5, keep_n = 15000)
print(len(dictionary))

15000


In [8]:
print(preprocessed_docs[5])
bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_docs]

['seoul', 'concert', 'cancel', 'coronavirus', 'instead', 'upset', 'fan', 'start', 'virtuous', 'cycl', 'donat', 'refund', 'prevent', 'spread', 'coronavirus', 'pm', 'donat', 'armi', 'amp', 'bts', 'member', 'name']


In [9]:
from gensim import corpora, models
id2word = corpora.Dictionary(preprocessed_docs)
texts = preprocessed_docs
corpus = [id2word.doc2bow(text) for text in texts]
print(corpus[:1])

[[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 2), (12, 1), (13, 1), (14, 1)]]


In [10]:
from pprint import pprint
import os
from gensim.models.wrappers import LdaMallet
os.environ.update({'MALLET_HOME':r'C:/Users/user/Documents/TopicModelling/Covid_Data/mallet-2.0.8'}) 
#You should update this path as per the path of Mallet directory on your system.
mallet_path = r'C:/Users/user/Documents/TopicModelling/Covid_Data/mallet-2.0.8/bin/mallet' 
#You should update this path as per the path of Mallet directory on your system.

In [11]:
ldamallet = gensim.models.wrappers.LdaMallet(
   mallet_path, corpus=corpus, num_topics=20, id2word=id2word
)
pprint(ldamallet.show_topics(formatted=False))

[(1,
  [('coronavirus', 0.08983900668013145),
   ('outbreak', 0.038771200118047425),
   ('fight', 0.03324387552145297),
   ('donat', 0.02935199958191538),
   ('million', 0.028927766659596118),
   ('win', 0.028890876840264007),
   ('support', 0.02747369294758879),
   ('cancel', 0.022057037808990664),
   ('hope', 0.019520862729908113),
   ('affect', 0.014866597190840257)]),
 (9,
  [('amp', 0.146507251256644),
   ('india', 0.02167560547391712),
   ('govt', 0.014093653046008802),
   ('lockdown', 0.01389824190095962),
   ('pm', 0.008568027129079583),
   ('ccp', 0.008090021405036197),
   ('indian', 0.007648091584694197),
   ('stand', 0.007575939777291421),
   ('power', 0.007088915077322687),
   ('muslim', 0.006998725318069218)]),
 (13,
  [('health', 0.09099601029297198),
   ('public', 0.044122854647182415),
   ('care', 0.025664557709105505),
   ('emerg', 0.024401543945796644),
   ('worker', 0.02164239004697939),
   ('risk', 0.021429920441936777),
   ('protect', 0.01920784248919946),
   ('dis

In [None]:
ldamallet.save("ldamallet2.model")

In [14]:
#0 = reopen, 15 = stay home for ldamallet

In [15]:
def is_topic(text, topic_num):
    tokens = preprocess(text.lower())
    return ldamallet[dictionary.doc2bow(token for token in tokens)][topic_num][1]
def update(date):
    u = date   
    d = dt.timedelta(days=7)
    t = u + d
    return t

In [19]:
#TODO: Fix sentiment averages, how to calculate averages during streaming
#keep counter

import datetime as dt
import csv
import pandas as pd
import textblob
import glob


path = "./covid_data*"
abbr_list = ['ak', 'al', 'ar', 'az', 'ca', 'co', 
             'ct', 'dc', 'de', 'fl', 'ga', 'hi', 
             'ia', 'id', 'il', 'in', 'ks', 'ky', 
             'la', 'ma', 'md', 'me', 'mi', 'mn', 
             'ms', 'mo', 'mt', 'nc', 'ne', 'nh', 
             'nj', 'nm', 'nv', 'ny', 'nd', 'oh', 
             'ok', 'or', 'pa', 'ri', 'sc', 'sd', 
             'tn', 'tx', 'ut', 'vt', 'va', 'wa', 
             'wv', 'wi', 'wy']

column_names = ["Start Date", "State", "Social Dist Sentiment", "Reopening Sentiment", "Other Sentiments"]

#starting one week before start date so it updates correctly
first = dt.datetime(2020, 1, 20)
weekStart = first
weekEnd = update(weekStart)
weekString = ''
startDate = dt.datetime(2020, 1, 27)
sentimentDict = {}

for filename in tqdm(glob.glob(path)):
    #start at document 700, as any earlier is outside our range, thus futile
    if int(filename[12:-4]) >= 700:
        with open(filename, 'r', encoding="utf-8") as rawTweets:
            #open as CSV iterator
            readCSV = csv.reader(rawTweets)
            for line in readCSV:
                #change date into datetime object
                #Format == Thu Jan 23 15:41:43 +0000 2020 
                if line[4] != 'created_at':
                    date = dt.datetime.strptime(line[4], '%a %b %d %H:%M:%S +0000 %Y')
                    #make sure they are the same week
                    if date >= weekStart and date < weekEnd:
                        weekString = dt.datetime.strftime(weekStart, "%Y, %m, %d")
                        if line[8] in abbr_list:
                            if is_topic(line[1], 15) > 0.05: #Check this value maybe too high
                                #calls text of each tweet as a TextBlob object
                                text = textblob.TextBlob(line[1])
                                #week - state - 3 avg sentiments
                                if sentimementDict[weekString][line[8]][0][0] != 0:
                                    sentimementDict[weekString][line[8]][0].append(text.sentiment.polarity)
                                else:
                                    sentimementDict[weekString][line[8]][0][0] = (text.sentiment.polarity)
                            elif is_topic(line[1], 0) > 0.05:
                                #calls text of each tweet as a TextBlob object
                                text = textblob.TextBlob(line[1])
                                #line[8] = state; if this state is already in the dictionary, the sentiment gets averaged
                                if sentimementDict[weekString][line[8]][1][0] != 0:
                                    sentimementDict[weekString][line[8]][1].append((text.sentiment.polarity))
                                else:
                                    sentimementDict[weekString][line[8]][1][0]  = (text.sentiment.polarity)
                            else:
                                #calls text of each tweet as a TextBlob object
                                text = textblob.TextBlob(line[1])
                                #line[8] = state; if this state is already in the dictionary, the sentiment gets averaged
                                if sentimementDict[weekString][line[8]][2][0] != 0:
                                    sentimementDict[weekString][line[8]][2].append(text.sentiment.polarity)
                                else:
                                    sentimementDict[weekString][line[8]][2][0] = (text.sentiment.polarity)
                    #if tweet is from the next week, updates the start date
                    elif date >= weekEnd:
                        #add a week
                        weekStart = update(weekStart)
                        #create a dictionary with each state as key and a list of 0's as values
                        statesDict = {}
                        #iterate each state name into the keys
                        for state in abbr_list:
                            #initialize the dictionary
                            statesDict[state] = [[0], [0], [0]]
                        #add this dictionary to the larger one
                        weekString = dt.datetime.strftime(weekStart, "%Y, %m, %d")
                        sentimentDict.update({weekString: statesDict})
                        #make sure this is a line with data
                        if line[8] in abbr_list:
                            if is_topic(line[1], 15) > 0.05:
                                #calls text of each tweet as a TextBlob object
                                text = textblob.TextBlob(line[1])
                                avgSentiment = (text.sentiment.polarity)
                                sentimentDict[weekString][line[8]][0][0] = avgSentiment
                            elif is_topic(line[1], 0) > 0.05:
                                #calls text of each tweet as a TextBlob object
                                text = textblob.TextBlob(line[1])
                                #line[8] = state; if this state is already in the dictionary, the sentiment gets averaged
                                avgSentiment = (text.sentiment.polarity)
                                sentimentDict[weekString][line[8]][1][0] = avgSentiment
                            else:
                                #calls text of each tweet as a TextBlob object
                                text = textblob.TextBlob(line[1])
                                #line[8] = state; if this state is already in the dictionary, the sentiment gets averaged
                                avgSentiment = (text.sentiment.polarity)
                                sentimentDict[weekString][line[8]][2][0] = avgSentiment
                    elif date <= weekStart and date >= startDate:
                        oneWeek = dt.timedelta(days = 7)
                        previousWeekEnd = weekEnd - oneWeek
                        while (date >= previousWeekEnd):
                            previousWeekEnd = previousWeekEnd - oneWeek
                            previousWeekStart = previousWeekEnd - oneWeek
                        earlierWeekString = dt.strftime(previousWeekStart, "%Y, %m, %d")
                        if earlierWeekString not in SenimentDict:
                            sentimentDict.update({earlierWeekString: statesDict})
                            #make sure this is a line with data
                            if line[8] in abbr_list:
                                if is_topic(line[1], 15) > 0.05:
                                    #calls text of each tweet as a TextBlob object
                                    text = textblob.TextBlob(line[1])
                                    avgSentiment = (text.sentiment.polarity)
                                    sentimentDict[earlierWeekString][line[8]][0][0] = avgSentiment
                                elif is_topic(line[1], 0) > 0.05:
                                    #calls text of each tweet as a TextBlob object
                                    text = textblob.TextBlob(line[1])
                                    #line[8] = state; if this state is already in the dictionary, the sentiment gets averaged
                                    avgSentiment = (text.sentiment.polarity)
                                    sentimentDict[earlierWeekString][line[8]][1][0] = avgSentiment
                                else:
                                    #calls text of each tweet as a TextBlob object
                                    text = textblob.TextBlob(line[1])
                                    #line[8] = state; if this state is already in the dictionary, the sentiment gets averaged
                                    avgSentiment = (text.sentiment.polarity)
                                    sentimentDict[ealierWeekString][line[8]][2][0] = avgSentiment
                        else:
                            earlierWeekString = dt.strftime(previousWeekStart, "%Y, %m, %d")
                            if line[8] in abbr_list:
                                if is_topic(line[1], 15) > 0.05:
                                    text = textblob.TextBlob(line[1])
                                    #calls text of each tweet as a TextBlob object
                                    if sentimentDict[earlierWeekString][line[8]][0][0] != 0:
                                        sentimementDict[earlierWeekString][line[8]][0].append(text.sentiment.polarity)
                                    else:
                                        sentimentDict[earlierWeekString][line[8]][0][0] = (text.sentiment.polarity)
                                elif is_topic(line[1], 0) > 0.05:
                                    #calls text of each tweet as a TextBlob object
                                    text = textblob.TextBlob(line[1])
                                    #line[8] = state; if this state is already in the dictionary, the sentiment gets averaged
                                    if sentimentDict[earlierWeekString][line[8]][1][0] != 0:
                                        sentimentDict[earlierWeekString][line[8]][1].append(text.sentiment.polarity)
                                    else:
                                        sentimentDict[earlierWeekString][line[8]][1][0] = (text.sentiment.polarity)
                                else:
                                    #calls text of each tweet as a TextBlob object
                                    text = textblob.TextBlob(line[1])
                                    #line[8] = state; if this state is already in the dictionary, the sentiment gets averaged
                                    if sentimentDict[earlierWeekString][line[8]][2][0] != 0:
                                        sentimentDict[earlierWeekString][line[8]][2].append(text.sentiment.polarity)
                                    else:
                                        sentimentDict[earlierWeekString][line[8]][2][0] = (text.sentiment.polarity)
                        

HBox(children=(FloatProgress(value=0.0, max=5635.0), HTML(value='')))




OverflowError: date value out of range

In [None]:
"""
(Optional) writes dictionary to CSV file with rows of state, sentiment
"""

print("writing sentiments to file...")
with open("twitter_sentiments_byTopic.csv", "w") as outFile:
    writer = csv.writer(outFile)
    writer.writerow(["Start Date", "State", "Social Dist Sentiment", "Reopening Sentiment", "Other Sentiments"]
    for key, value in sentimentDict.items():
        for state, values in value:
                    for a, b, c in values:
                        writer.writerow([key, state, mean(a), mean(b), mean(c)])
print("finsished writing to file")