# Boilerplate Code for Final Blog
*Similar to the `functions.ipynb` file, this notebook consolidates the long blocks of code that are run in the background so that they do not obscure what is presented in the blog text.*

In [None]:
# Import libraries
import os
import json
from nltk.tokenize import TweetTokenizer
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from random import sample
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
# Execute function definitions for later use
%run ../data_analysis/functions.ipynb

In [None]:
# DATA PROCESSING STEPS

def _load_tweet_files(directory, label, summary):
    """Read every JSON file in `directory` and return the tweets they contain.

    Each file is expected to hold a JSON list of tweet dictionaries. For every
    file that loads successfully, a `(label, tweet_count, filename_prefix)`
    tuple is appended to `summary`, where `filename_prefix` is the part of the
    file name before the first underscore. Files that cannot be read or parsed
    are reported and skipped so that one bad file does not abort the whole load.

    Parameters
    ----------
    directory : str
        Path of the folder holding the JSON files.
    label : str
        Tag stored as the first element of each summary tuple.
    summary : list
        List that receives one tuple per successfully loaded file (mutated).

    Returns
    -------
    list
        All tweets from the readable files, concatenated.
    """
    loaded = []
    # Sorted so the load order does not depend on the file system's listing order.
    for filename in sorted(os.listdir(directory)):
        try:
            # `with` closes the handle promptly; JSON text is UTF-8 by specification,
            # so do not rely on the platform's default encoding.
            with open(os.path.join(directory, filename), encoding='utf-8') as handle:
                tweets = json.load(handle)
            if not isinstance(tweets, list):
                raise ValueError(f'expected a JSON list, got {type(tweets).__name__}')
        except (OSError, ValueError) as err:
            # OSError: unreadable path (e.g. a sub-directory or permission problem).
            # ValueError: malformed JSON, undecodable bytes, or unexpected top-level type.
            print(f'Error reading {filename}: {err}')
            continue
        summary.append((label, len(tweets), filename.split('_')[0]))
        loaded.extend(tweets)
    return loaded


# Read in Tweet data and create two lists of dictionaries, recording one
# (period, tweet count, filename prefix) tuple per file in `summary`.
summary = []
pre_covid_tweets = _load_tweet_files('../../data/pre_covid', 'pre', summary)
post_covid_tweets = _load_tweet_files('../../data/post_covid', 'post', summary)

# Remove some tweets to maintain consistent sample sizes: randomly down-sample
# whichever corpus is larger to the size of the smaller one.
# NOTE: the sampling is unseeded, so the retained subset differs between runs;
# seed the `random` module beforehand if reproducible figures are required.
if len(pre_covid_tweets) >= len(post_covid_tweets):
    pre_covid_tweets = sample(pre_covid_tweets, len(post_covid_tweets))
else:
    post_covid_tweets = sample(post_covid_tweets, len(pre_covid_tweets))
    

In [None]:
# TEXT NORMALIZATION STEPS

# A single tokenizer instance is shared by both corpora.
tt = TweetTokenizer()

# Flatten each corpus into one long list of tokens by tokenizing the 'text'
# field of every tweet, in corpus order.
pre_toks = [
    token
    for tweet in pre_covid_tweets
    for token in tt.tokenize(tweet['text'])
]
post_toks = [
    token
    for tweet in post_covid_tweets
    for token in tt.tokenize(tweet['text'])
]

# Frequency lists: each Counter maps a token to its number of occurrences.
pre_dist, post_dist = Counter(pre_toks), Counter(post_toks)

In [2]:
def plot_keywords():
    """Plot key items comparing the pre- and post-COVID token frequency lists.

    Feeds the module-level `pre_dist` and `post_dist` counters to
    `calculate_keyness` (table printing disabled) and passes the result to
    `plot_keyitems`, labelling the two corpora 'Pre-COVID Tweets' and
    'Post-COVID Tweets'. Returns nothing.

    NOTE(review): `calculate_keyness` and `plot_keyitems` are not defined in
    this notebook; presumably they come from `functions.ipynb` -- confirm the
    meaning of `top=-1`, `keyness_threshold=-100000` and the `20` there.
    """
    corpus_labels = {
        'corpusA': 'Pre-COVID Tweets',
        'corpusB': 'Post-COVID Tweets',
    }
    keyness_table = calculate_keyness(
        pre_dist,
        post_dist,
        print_table=False,
        top=-1,
        keyness_threshold=-100000,
    )
    plot_keyitems(keyness_table, 20, **corpus_labels)