In [None]:
'''
Breaks the large raw tweet data file into manageable portions

'''


import pandas as pd

# Number of tweets (CSV rows) written to each output file.
chunk_size = 10000
# Suffix of the next output file; files are named covid_data<batch_no>.csv.
batch_no = 1

print("Beginning to parse tweets...")

#Adjust path name per file, this is the small test file I am running currently
source_csv = "./tweets_01-04.csv"

# Malformed rows are dropped with a warning instead of aborting the whole run.
# `error_bad_lines` was removed in pandas 2.0, so try its replacement first and
# fall back to the legacy keyword on pandas versions that predate it.
try:
    chunks = pd.read_csv(source_csv, chunksize=chunk_size, on_bad_lines="warn")
except TypeError:
    chunks = pd.read_csv(source_csv, chunksize=chunk_size, error_bad_lines=False)

for chunk in chunks:
    chunk.to_csv("covid_data" + str(batch_no) + ".csv", index=False)
    batch_no += 1

print("Finished parsing tweets.")

In [None]:
!pip install tqdm
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')
import csv
import glob
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import pyLDAvis
import spacy
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
from tqdm.notebook import tqdm

In [None]:
'''
Creates the corpus for topic modelling.

Reads the chunked tweet files matched by `path`, keeps every 100th tweet of
each file, strips URLs from the tweet text and collects the results in
`documents`.
'''
import re

sentimentDict = {}
documents = []
# The chunk files must live where this glob points; adjust it if they were
# moved into a sub-directory.
path = "./covid_data*"

num_files = 0

#Correct Dates start at file 700
first_batch = 700
# Only every `sample_every`-th tweet of a file goes into the corpus.
sample_every = 100
# Pulls the batch number out of ".../covid_data<N>.csv". Anything else the glob
# happens to match is skipped instead of crashing on int().
batch_pattern = re.compile(r"covid_data(\d+)\.csv$")

print("Beginning to create corpus...")
for filename in tqdm(glob.glob(path)):
    batch_match = batch_pattern.search(filename)
    if batch_match is None or int(batch_match.group(1)) < first_batch:
        continue
    num_files += 1
    # newline='' lets the csv module handle line breaks inside quoted tweet text.
    with open(filename, 'r', encoding="utf-8", newline='') as rawTweets:
        #open as CSV iterator
        readCSV = csv.reader(rawTweets)
        # Skip the header row; the default keeps an empty file from raising.
        next(readCSV, None)
        #Iterate through individual tweets
        for tweet_count, line in enumerate(readCSV):
            # line[1] is used as the tweet text; rows too short to have it are ignored.
            if tweet_count % sample_every == 0 and len(line) > 1:
                documents.append(re.sub(r"http\S+", "", line[1]))

print(len(documents))
# Show one sampled tweet as a sanity check, if enough were collected.
if len(documents) >= 21:
    print(documents[-21])



In [None]:
# Built once and shared: constructing a stemmer and a lemmatizer for every
# single token is wasted work when the whole corpus is processed.
_STEMMER = SnowballStemmer("english")
_LEMMATIZER = WordNetLemmatizer()


def lemmatize_stemming(text):
    """Lemmatize `text` as a verb, then stem the result.

    Parameters:
        text: a single token (string).

    Returns:
        The English Snowball stem of the WordNet verb lemma of `text`.
    """
    return _STEMMER.stem(_LEMMATIZER.lemmatize(text, pos='v'))
def preprocess(text):
    """Tokenize `text` and return the lemmatized, stemmed tokens worth keeping.

    Tokens that are gensim stopwords or one of the retweet/link leftovers
    ("rtrt", "https", "rt") are dropped; every remaining token is passed
    through `lemmatize_stemming`.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    retweet_noise = ["rtrt", "https", 'rt']
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if not (token in stopwords or token in retweet_noise)
    ]

In [None]:
# Pick one tweet from the corpus and show it before and after preprocessing.
doc_sample = documents[-21]

print('original document: ')
# Plain split on single spaces, so empty strings from repeated spaces are kept.
words = doc_sample.split(' ')
print(words)

print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

In [None]:
# Run every sampled tweet through `preprocess`, with a progress bar.
print("processing the documents")
preprocessed_docs = [preprocess(tweet) for tweet in tqdm(documents)]

In [None]:

# Build the token <-> id mapping from the preprocessed tweets.
print("Creating dictionary")
dictionary = gensim.corpora.Dictionary(documents=preprocessed_docs)

In [None]:
# Prune the vocabulary in place: rare-token cut-off of 15 documents, common-token
# cut-off of half of all documents, then cap at the 15,000 most frequent tokens.
vocab_limits = {"no_below": 15, "no_above": 0.5, "keep_n": 15000}
dictionary.filter_extremes(**vocab_limits)
print(len(dictionary))

In [None]:
# Show one preprocessed tweet, then convert every tweet to bag-of-words form
# using the (filtered) `dictionary`.
print(preprocessed_docs[5])
bow_corpus = list(map(dictionary.doc2bow, preprocessed_docs))


In [None]:
from gensim import corpora, models
# Second, unfiltered dictionary and bag-of-words corpus; these are the ones
# handed to the Mallet model.
texts = preprocessed_docs
id2word = corpora.Dictionary(texts)
corpus = list(map(id2word.doc2bow, texts))
print(corpus[:1])

In [None]:
from pprint import pprint
import os
from gensim.models.wrappers import LdaMallet
# Location of the Mallet installation. A MALLET_HOME that is already set in the
# environment is respected; otherwise the path below is used as the default.
#You should update this path as per the path of Mallet directory on your system.
os.environ.setdefault('MALLET_HOME', r'C:/Users/user/Documents/TopicModelling/Covid_Data/mallet-2.0.8')
# Path to the Mallet launcher inside that installation.
mallet_path = os.environ['MALLET_HOME'].rstrip('/\\') + '/bin/mallet'

In [None]:
# Fit a 20-topic Mallet LDA model on the unfiltered corpus / id2word pair.
ldamallet = LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)

# Print the topics as raw (word, weight) pairs rather than formatted strings.
pprint(ldamallet.show_topics(formatted=False))

In [None]:
#Topic ids in the ldamallet model: 11 = reopening, 5 = stay home. The sentiment cell below hard-codes these two ids.

In [None]:
def is_topic(text, topic_num):
    """Return the weight the trained Mallet model assigns to one topic for `text`.

    Parameters:
        text: raw tweet text.
        topic_num: id of the topic of interest.

    Returns:
        The weight of `topic_num` for this text, or 0.0 when the text has no
        token known to the model or the model reports no weight for the topic.
    """
    tokens = preprocess(text.lower())
    # Use the dictionary the model was trained with. The separately filtered
    # `dictionary` assigns different ids to the same tokens.
    bow = ldamallet.id2word.doc2bow(tokens)
    if not bow:
        # Nothing to infer from (e.g. the tweet was only a link or stopwords).
        return 0.0
    # Look the topic up by id rather than by list position, so a missing
    # topic gives 0.0 instead of the wrong entry or an IndexError.
    topic_weights = dict(ldamallet[bow])
    return topic_weights.get(topic_num, 0.0)
def update(date):
    """Return the datetime exactly one week (7 days) after `date`.

    Parameters:
        date: a datetime.datetime marking the start of a week.

    Returns:
        `date` shifted forward by seven days.
    """
    # The shift has to be applied to the argument itself; a fixed reference
    # date here would make every call return the same value.
    return date + dt.timedelta(days=7)

In [None]:
import datetime as dt
import csv
import re
import pandas as pd
import textblob
import glob

# Result: {week start "YYYY-MM-DD": {state abbreviation: [social distancing,
# reopening, other]}}, each entry being the mean TextBlob polarity of the
# tweets that fell into that week / state / topic slot (0 when there were none).
sentimentDict = {}
# Number of tweets behind each mean, same layout as sentimentDict.
sentimentCounts = {}
path = "./covid_data*"
abbr_list = ['ak', 'al', 'ar', 'az', 'ca', 'co',
             'ct', 'dc', 'de', 'fl', 'ga', 'hi',
             'ia', 'id', 'il', 'in', 'ks', 'ky',
             'la', 'ma', 'md', 'me', 'mi', 'mn',
             'ms', 'mo', 'mt', 'nc', 'ne', 'nh',
             'nj', 'nm', 'nv', 'ny', 'nd', 'oh',
             'ok', 'or', 'pa', 'ri', 'sc', 'sd',
             'tn', 'tx', 'ut', 'vt', 'va', 'wa',
             'wv', 'wi', 'wy']

column_names = ["Start Date", "State", "Social Dist Sentiment", "Reopening Sentiment", "Other Sentiments"]

# One week before the first reported week (kept for reference).
first = dt.datetime(2020, 1, 20)
# Start of the first reported week; earlier tweets are ignored.
startDate = dt.datetime(2020, 1, 27)
# Week of the most recently processed tweet; refreshed inside the loop.
weekStart = first
weekEnd = weekStart + dt.timedelta(days=7)
weekString = ''

one_week = dt.timedelta(days=7)
valid_states = set(abbr_list)
# Rows dropped because they were too short or had an unparsable date.
skipped_rows = 0


def _batch_number(filename):
    """Return the N of a ".../covid_data<N>.csv" path, or None for other names."""
    found = re.search(r"covid_data(\d+)\.csv$", filename)
    return int(found.group(1)) if found else None


def _topic_slot(tweet_text):
    """Return the sentiment-list index for a tweet: 0 social distancing, 1 reopening, 2 other."""
    if is_topic(tweet_text, 5) > 0.05:
        return 0
    if is_topic(tweet_text, 11) > 0.05:
        return 1
    return 2


for filename in tqdm(glob.glob(path)):
    #start at document 700, as any earlier is outside our range, thus futile
    batch = _batch_number(filename)
    if batch is None or batch < 700:
        continue
    # newline='' lets the csv module handle line breaks inside quoted tweet text.
    with open(filename, 'r', encoding="utf-8", newline='') as rawTweets:
        #open as CSV iterator
        readCSV = csv.reader(rawTweets)
        # Skip the header once per file; calling next() inside the loop would
        # throw away every other tweet.
        next(readCSV, None)
        for line in readCSV:
            # Columns used: 1 = tweet text, 4 = creation date, 9 = state.
            if len(line) < 10:
                skipped_rows += 1
                continue
            # Only rows tagged with a known state count; this also drops
            # placeholder values such as "Null".
            state = line[9].strip().lower()
            if state not in valid_states:
                continue
            #change date into datetime object
            #Format == Thu Jan 23 15:41:43 +0000 2020
            try:
                date = dt.datetime.strptime(line[4], '%a %b %d %H:%M:%S %z %Y')
            except ValueError:
                skipped_rows += 1
                continue
            # %z yields a timezone-aware value; convert to naive UTC so it can
            # be compared with the naive week boundaries.
            date = date.astimezone(dt.timezone.utc).replace(tzinfo=None)
            if date < startDate:
                continue
            # Bucket the tweet straight into its week. This works regardless of
            # the order in which files and rows arrive.
            weekStart = startDate + ((date - startDate) // one_week) * one_week
            weekEnd = weekStart + one_week
            weekString = weekStart.strftime('%Y-%m-%d')
            if weekString not in sentimentDict:
                # Fresh lists per week and state; sharing one dict between
                # weeks would make them overwrite each other.
                sentimentDict[weekString] = {abbr: [0, 0, 0] for abbr in abbr_list}
                sentimentCounts[weekString] = {abbr: [0, 0, 0] for abbr in abbr_list}
            # NOTE(review): each is_topic call queries the Mallet model, so this
            # is presumably the slow part of the loop - confirm before scaling up.
            slot = _topic_slot(line[1])
            polarity = textblob.TextBlob(line[1]).sentiment.polarity
            counts = sentimentCounts[weekString][state]
            means = sentimentDict[weekString][state]
            counts[slot] += 1
            # Incremental mean: every tweet carries equal weight, unlike a
            # repeated (old + new) / 2, which favours the latest tweets.
            means[slot] += (polarity - means[slot]) / counts[slot]

print("rows skipped as malformed:", skipped_rows)
                        

In [None]:
"""
(Optional) writes dictionary to CSV file with rows of state, sentiment
"""

print("writing sentiments to file...")
# newline="" is what the csv module expects for files it writes; without it
# some platforms emit a blank line after every row.
with open("twitter_sentiments_byTopic.csv", "w", newline="", encoding="utf-8") as outFile:
    writer = csv.writer(outFile)
    writer.writerow(["Start Date", "State", "Social Dist Sentiment", "Reopening Sentiment", "Other Sentiments"])
    # sentimentDict maps week -> state -> [social dist, reopening, other];
    # flatten it to one row per week and state so rows match the header.
    for week_start, states in sorted(sentimentDict.items()):
        for state, sentiments in states.items():
            writer.writerow([week_start, state, *sentiments])
print("finsished writing to file")