In [None]:
import nltk
import pandas as pd
import csv
import re
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Data clean
def data_clean(data:list):
    """Normalise raw tweet strings into lowercase, punctuation-free text.

    Each entry goes through these steps, in order:

    1. If the whole entry looks like the ``repr`` of a bytes object
       (``b'...'`` or ``b"..."``), the wrapper is removed.
    2. URLs and ``@mentions`` are deleted.
    3. Literal ``\\xNN`` escape sequences are deleted.
    4. Digits are deleted.
    5. Literal ``\\n`` sequences become a single space.
    6. Every remaining character that is neither a word character nor
       whitespace is deleted, and the result is lowercased.

    Args:
        data: Iterable of tweet texts.

    Returns:
        A list with one cleaned string per input entry, in the same order.
        Entries that are not strings (for example NaN from an empty CSV
        cell) produce an empty string so positions stay aligned.
    """
    # The wrapper is removed only as a matched pair. Stripping the characters
    # "b" and "'" individually would also eat the first letters of real text
    # ("b'bbq'" -> "q", "baby" -> "y").
    bytes_wrapper = re.compile(r'''b(["'])(.*)\1''', re.DOTALL)
    url = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                     r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention = re.compile(r'(@[A-Za-z0-9_]+)')
    hex_escape = re.compile(r'(\\x(.){2})')
    digit = re.compile(r'[0-9]')
    newline_escape = re.compile(r'(\\n)')
    punctuation = re.compile(r'[^\w\s]')

    text = []
    for t in data:
        if not isinstance(t, str):
            text.append('')
            continue
        wrapped = bytes_wrapper.fullmatch(t)
        cle = wrapped.group(2) if wrapped else t
        cle = url.sub('', cle)
        cle = mention.sub('', cle)
        cle = hex_escape.sub('', cle)
        cle = digit.sub('', cle)
        cle = newline_escape.sub(' ', cle)
        cle = punctuation.sub('', cle)
        text.append(cle.lower())

    return text

In [None]:
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer

In [None]:
def tweets_lemmatized(tweet_tokens, stopword_list:list):
    """Lemmatize a sequence of tokens and drop stopwords.

    The tokens are POS-tagged with ``nltk.pos_tag``. Tags starting with
    ``NN`` are lemmatized as nouns, tags starting with ``VB`` as verbs, and
    everything else as adjectives. Each lemma is lowercased and kept only
    if it is not in ``stopword_list``.

    Args:
        tweet_tokens: Sequence of word tokens (one string per word).
        stopword_list: Lowercase words to exclude from the result.

    Returns:
        List of lowercase lemmas in their original order, stopwords removed.
    """
    lemmatizer = WordNetLemmatizer()
    # Built once so each token costs a hash lookup instead of a list scan.
    stopword_set = set(stopword_list)
    lemmas = []
    for word, tag in pos_tag(tweet_tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemma = lemmatizer.lemmatize(word, pos).lower()
        if lemma not in stopword_set:
            lemmas.append(lemma)
    return lemmas

In [None]:
# One-time setup: uncomment to download the NLTK stopword corpus.
# nltk.download('stopwords')
from nltk.corpus import stopwords
# English stopword list; handed to tweets_lemmatized() when the monthly data is loaded.
stopword_List = stopwords.words('english')
# nltk.download('twitter_samples')

In [None]:
# Number of input folders read for each month (test_data_1 ... test_data_<folderNum>).
folderNum = 1

In [None]:
# Define a month iterator
from datetime import timedelta, date
def monthrange(m_start_date, m_end_date):
    """Yield one date per calendar month from the start month to the end month.

    Both endpoint months are included. Every yielded value is the first day
    of its month, built with ``m_start_date.replace(...)`` so the result has
    the same type as ``m_start_date``.

    Counting whole calendar months (rather than stepping in fixed 31-day
    increments) guarantees that no month is skipped or repeated, whatever
    day of the month the range starts or ends on.

    Args:
        m_start_date: First date of the range (``date`` or ``datetime``).
        m_end_date: Last date of the range.

    Yields:
        The first day of each month in the range, in chronological order.
        Nothing is yielded when the end month precedes the start month.
    """
    month_span = ((m_end_date.year - m_start_date.year) * 12
                  + (m_end_date.month - m_start_date.month))
    for offset in range(month_span + 1):
        # Zero-based month arithmetic so divmod carries cleanly into the year.
        extra_years, month_index = divmod(m_start_date.month - 1 + offset, 12)
        yield m_start_date.replace(year=m_start_date.year + extra_years,
                                   month=month_index + 1,
                                   day=1)

In [None]:
import spacy
import os

# Inclusive date range of the analysis; one CSV per month and input folder.
start_date = date(2020, 8, 1)
end_date = date(2021, 3, 23)

# monthrangeData[k] holds the cleaned, lemmatized tweets of the k-th month.
monthrangeData = []
for single_month in monthrange(start_date, end_date):
    month = single_month.strftime("%Y-%m")
    month_tweets = []
    for i in range(folderNum):
        # Same location as ".\\test_data_<n>\\<YYYY-MM>_hydrated.csv", built
        # with os.path.join so it also resolves on non-Windows systems.
        monthCSV = os.path.join(".", "test_data_" + str(i + 1), month + "_hydrated.csv")
        data = pd.read_csv(monthCSV)
        # Rows without text are read as NaN; drop them before cleaning.
        text = data_clean(data["text"].dropna())
        for tweet in text:
            # tweets_lemmatized expects word tokens. Passing whole tweets
            # makes the tagger treat each tweet as a single "word", so
            # nothing is lemmatized and no stopword is removed.
            tokens = tweet.split()
            lemmas = tweets_lemmatized(tokens, stopword_List) if tokens else []
            month_tweets.append(' '.join(lemmas))
    monthrangeData.append(month_tweets)

In [None]:
# spaCy pipeline whose token.pos_ tags drive the word filtering below.
sp = spacy.load('en_core_web_sm')
# Coarse part-of-speech tags to keep.
included_tags = {"VERB", "PROPN","NOUN"}
# spaCy was observed to tag auxiliary verbs as VERB, so these forms are removed explicitly.
# "have" is used both as an auxiliary and as a main verb; it is removed as well to reduce noise.
auxiliary_verb = {'have','be','is','are','am','was','were','being','been'}

In [None]:
def dataPoping(monthData):
    """Generator that hands back the entries of ``monthData`` one at a time, in order."""
    yield from monthData

In [None]:
# clean_monthrange[k] collects, for the k-th month, the text of every token
# whose POS tag is in included_tags and that is not a listed auxiliary verb.
clean_monthrange = []
md = dataPoping(monthrangeData)
for single_month in monthrange(start_date, end_date):
    print("Working on",single_month.strftime("%Y-%m"))
    month_tweets = next(md)
    kept_words = [
        token.text
        for tweet in month_tweets
        for token in sp(tweet)
        if token.pos_ in included_tags and token.text not in auxiliary_verb
    ]
    clean_monthrange.append(kept_words)

In [None]:
import os
# Output directory for the per-month result files. The trailing empty
# component makes os.path.join end the path with a separator, so on Windows
# the value is still ".\\analysis\\".
datafolder = os.path.join(".", "analysis", "")
try:
    # exist_ok=True keeps a notebook re-run from reporting a failure merely
    # because the directory was created by an earlier run.
    os.makedirs(datafolder, exist_ok=True)
except OSError:
    print ("Creation of the directory %s failed" % datafolder)
else:
    print ("Successfully created the directory %s " % datafolder)

In [None]:
from nltk import FreqDist
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# For every month: write the 100 most frequent words to a text file and
# render a word cloud image, both inside `datafolder`.
cmd = dataPoping(clean_monthrange)
for single_month, clean_tmp in zip(monthrange(start_date, end_date), cmd):
    this_month = single_month.strftime("%Y-%m")
    print("Working on",this_month)
    req_dist = FreqDist(clean_tmp)
    save_freq = datafolder + this_month + "-frequence.txt"
    # Explicit UTF-8: tweets may contain non-ASCII words that the platform's
    # default codec cannot encode.
    with open(save_freq, 'w', encoding='utf-8') as f_out:
        for word, freq in req_dist.most_common(100):
            f_out.write("{} {}\n".format(word, freq))
    str1 = ' '.join(str(e) for e in clean_tmp)
    try:
        wordcloud = WordCloud(max_words=100, background_color="white",width=8000, height=4000).generate(str1)
    except ValueError as err:
        # WordCloud refuses to draw when no usable word is left (e.g. a month
        # without tweets); skip the image instead of aborting the whole run.
        print("Skipping word cloud for", this_month, "-", err)
        continue
    save_img = datafolder + this_month + ".png"
    wordcloud.to_file(save_img)