# Boilerplate Code for Final Blog
*Similar to the `functions.ipynb` file, this notebook is meant to consolidate the long blocks of code to be run in the background so that they do not obscure what is presented in the blog text.*

In [None]:
# Import libraries
import os
import json
from nltk.tokenize import TweetTokenizer
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from random import sample
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
# Execute function definitions for later use.
# NOTE(review): calculate_keyness() and plot_keyitems(), which are called
# further down in this notebook, are not defined in this notebook —
# presumably they are provided by the file executed here (confirm).
%run ../data_analysis/functions.ipynb

In [None]:
# DATA PROCESSING STEPS

# Read in Tweet data and create two lists of dictionaries, plus a per-file
# summary of (period label, number of tweets, filename prefix before the
# first '_').
pre_covid_tweets = []
post_covid_tweets = []
summary = []

# Both corpora are loaded by the same loop; only the label, the source
# directory and the destination list differ.
for label, directory, bucket in (
    ('pre', '../../data/pre_covid', pre_covid_tweets),
    ('post', '../../data/post_covid', post_covid_tweets),
):
    # Sorted so that files are always processed (and summarised) in the same
    # order; os.listdir() makes no ordering guarantee.
    for filename in sorted(os.listdir(directory)):
        try:
            # The context manager closes the handle even if reading fails.
            with open(f'{directory}/{filename}') as json_file:
                json_string = json_file.read()
            tweets = json.loads(json_string)
            summary.append((label, len(tweets), filename.split('_')[0]))
            bucket.extend(tweets)
        except Exception as err:
            # Best effort: an unreadable or malformed file is reported and
            # skipped. Catching Exception (rather than a bare except) keeps
            # KeyboardInterrupt / SystemExit working.
            print(f'Error reading {filename}: {err!r}')
            continue

# Remove some tweets to maintain consistent sample sizes: the larger corpus is
# randomly down-sampled to the size of the smaller one (when both have the
# same size the pre-covid list is just re-drawn in random order).
# NOTE(review): no random seed is set in this notebook, so the retained tweets
# differ from run to run — seed the RNG if reproducible figures are needed.
if len(pre_covid_tweets) >= len(post_covid_tweets):
    pre_covid_tweets = sample(pre_covid_tweets, len(post_covid_tweets))
else:
    post_covid_tweets = sample(post_covid_tweets, len(pre_covid_tweets))
    

In [None]:
# TEXT NORMALIZATION STEPS

# One tokenizer instance is shared by both corpora
tt = TweetTokenizer()

# Flatten the tokens of every tweet's 'text' into one token list per corpus
pre_toks = [
    token
    for entry in pre_covid_tweets
    for token in tt.tokenize(entry['text'])
]
post_toks = [
    token
    for entry in post_covid_tweets
    for token in tt.tokenize(entry['text'])
]

# Token frequency lists for each corpus
pre_dist = Counter(pre_toks)
post_dist = Counter(post_toks)

In [None]:
def plot_keywords():
    """Plot the key items of the pre- versus post-COVID token counts.

    Passes the module-level ``pre_dist`` and ``post_dist`` counters to
    ``calculate_keyness`` and hands its result to ``plot_keyitems``. Both
    helpers are defined outside this notebook, so the exact meaning of the
    arguments forwarded here (e.g. the positional ``20``) is not visible in
    this file — check their definitions before changing them.
    """
    keyness_options = {
        'print_table': False,
        'top': -1,
        'keyness_threshold': -100000,
    }
    keyness_table = calculate_keyness(pre_dist, post_dist, **keyness_options)

    plot_keyitems(
        keyness_table,
        20,
        corpusA='Pre-COVID Tweets',
        corpusB='Post-COVID Tweets',
    )

In [None]:
def tweet_freq_dist(sample_size=20):
    """Plot overlaid username histograms for users present in both corpora.

    Keeps only tweets from ``pre_covid_tweets`` / ``post_covid_tweets`` whose
    'username' occurs in both lists, draws a random sample of each set and
    plots how often each username occurs in the post-COVID sample (blue edge)
    and in the pre-COVID sample (red edge).

    Parameters
    ----------
    sample_size : int, default 20
        Maximum number of tweets drawn from each corpus. If a corpus has
        fewer matching tweets, all of them are used.

    Raises
    ------
    ValueError
        If no username occurs in both corpora.
    """
    # Find users who are represented in both corpora (sets make the
    # intersection and the membership tests below linear-time)
    pre_users = {t['username'] for t in pre_covid_tweets}
    post_users = {t['username'] for t in post_covid_tweets}
    longitudinal_users = pre_users & post_users
    if not longitudinal_users:
        raise ValueError('No users appear in both the pre- and post-COVID corpora')

    # Initialize tweet dataframes
    pre_df = pd.DataFrame(pre_covid_tweets)
    post_df = pd.DataFrame(post_covid_tweets)

    # Remove tweets that weren't posted by longitudinal users
    pre_df = pre_df[pre_df['username'].isin(longitudinal_users)]
    post_df = post_df[post_df['username'].isin(longitudinal_users)]

    # Cap the sample at the number of available rows; DataFrame.sample()
    # raises when asked for more rows than exist.
    post_sample = post_df.sample(min(sample_size, len(post_df)))
    pre_sample = pre_df.sample(min(sample_size, len(pre_df)))

    # Plot histograms of sampled tweet counts per longitudinal user,
    # post-covid first so the legend order below matches
    plt.hist(post_sample['username'], alpha=0.5, edgecolor='blue', density=False)
    plt.hist(pre_sample['username'], alpha=0.5, edgecolor='red', density=False)
    plt.title('Difference in Individual Antivax Tweet Frequency Before & After the COVID-19 Pandemic')
    plt.legend(['Post-COVID', 'Pre-COVID'])
    plt.xticks(rotation = 90)
    plt.xlabel('Username')
    plt.ylabel('Tweet Frequency Difference')
    plt.show()
    

In [None]:
# Users who are represented in both corpora. The result stays a list in
# pre_users order; the set is only a fast lookup for the membership test.
pre_users = list({t['username'] for t in pre_covid_tweets})
post_users = list({t['username'] for t in post_covid_tweets})
post_user_lookup = set(post_users)
longitudinal_users = [u for u in pre_users if u in post_user_lookup]

# Initialize tweet dataframes
pre_df = pd.DataFrame(pre_covid_tweets)
post_df = pd.DataFrame(post_covid_tweets)

# Remove tweets that weren't posted by longitudinal users. The explicit
# .copy() makes each result an independent frame, so adding the 'polarity'
# column below does not write to a slice (SettingWithCopyWarning).
pre_df = pre_df[pre_df['username'].isin(longitudinal_users)].copy()
post_df = post_df[post_df['username'].isin(longitudinal_users)].copy()

# Number of tweets per user in each period
pre_user_freq = Counter(pre_df['username'])
post_user_freq = Counter(post_df['username'])

# Users who are among the 20 most frequent posters in BOTH periods,
# kept in pre-covid frequency order
pre_top = [f[0] for f in pre_user_freq.most_common(20)]
post_top = [f[0] for f in post_user_freq.most_common(20)]
top_posters = [u for u in pre_top if u in post_top]

# Object for sentiment scoring
sid = SentimentIntensityAnalyzer()

# 'compound' entry of the analyzer's polarity scores for every tweet text
pre_df['polarity'] = pre_df['text'].map(lambda t: sid.polarity_scores(t)['compound'])
post_df['polarity'] = post_df['text'].map(lambda t: sid.polarity_scores(t)['compound'])

# Scored tweets restricted to the top posters
pre_top_df = pre_df[pre_df['username'].isin(top_posters)]
post_top_df = post_df[post_df['username'].isin(top_posters)]

In [None]:
# Plot both pre-covid and post-covid polarity scores side-by-side
def _sample_user_tweets(tweets_df, user, sample_size):
    """Return at most ``sample_size`` of ``user``'s rows, ordered by 'created_at'."""
    user_rows = tweets_df[tweets_df['username'] == user]
    if len(user_rows) > sample_size:
        # One draw of whole rows keeps each date paired with its own score.
        user_rows = user_rows.sample(sample_size)
    # Order the points along the x axis so the line reads as a trend.
    # NOTE(review): this is chronological only if 'created_at' values sort
    # chronologically (e.g. ISO-8601 strings or datetimes) — confirm.
    return user_rows.sort_values('created_at')


def top_poster_plot(user, sample_size=20):
    """Plot one user's tweet polarity before and after COVID-19, side by side.

    Reads the module-level ``pre_top_df`` and ``post_top_df`` frames (columns
    'username', 'created_at' and 'polarity' are used), takes a random sample
    of the user's tweets from each and draws 'polarity' against 'created_at'
    in a two-panel figure: pre-COVID in red on the left, post-COVID in blue
    on the right.

    Parameters
    ----------
    user : value compared against the 'username' column
        The user whose tweets are plotted.
    sample_size : int, default 20
        Maximum number of tweets plotted per panel. If the user has fewer
        tweets in a period, all of them are plotted.
    """
    pre_rows = _sample_user_tweets(pre_top_df, user, sample_size)
    post_rows = _sample_user_tweets(post_top_df, user, sample_size)

    # Define figure size
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

    panels = (
        (axes[0], pre_rows, 'red',
         f'{user}\'s Tweet Polarity Score Trends Before COVID-19'),
        (axes[1], post_rows, 'blue',
         f'{user}\'s Tweet Polarity Score Trends After COVID-19'),
    )
    for ax, rows, color, title in panels:
        ax.plot(rows['created_at'], rows['polarity'], color=color)
        ax.set_xlabel('Date')
        ax.set_ylabel('Polarity Score')
        # Rotate the labels matplotlib generated instead of overwriting them:
        # replacing tick labels without fixing the tick positions can attach
        # labels to the wrong points.
        ax.tick_params(axis='x', labelrotation=45)
        ax.set_title(title)


In [None]:
def sentiment_plots():
    """Call ``top_poster_plot`` once for every entry of ``top_posters``, in order."""
    for poster in top_posters:
        top_poster_plot(poster)
    