In [43]:
import tensorflow as tf
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime, timedelta
import locale
from scipy import stats

import os
import time

In [44]:
# Clone github repo containing the data as csv file
! rm -rf covid_data
! git clone https://github.com/PatrickNiccolai2/covid_data

Cloning into 'covid_data'...
remote: Enumerating objects: 2040, done.[K
remote: Counting objects: 100% (92/92), done.[K
remote: Compressing objects: 100% (67/67), done.[K
remote: Total 2040 (delta 24), reused 90 (delta 22), pack-reused 1948[K
Receiving objects: 100% (2040/2040), 334.22 MiB | 24.72 MiB/s, done.
Resolving deltas: 100% (95/95), done.
Checking out files: 100% (13/13), done.


In [45]:
# Download the tweets with sentiment data
!unzip /content/covid_data/tweets_with_lang -d /content/covid_data/unzip_tweets_with_lang

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/covid_data/unzip_tweets_with_lang/tweets_with_lang/lang_sent_shuffled_tweet_80172.txt  
  inflating: /content/covid_data/unzip_tweets_with_lang/tweets_with_lang/lang_sent_shuffled_tweet_8455.txt  
  inflating: /content/covid_data/unzip_tweets_with_lang/tweets_with_lang/lang_sent_shuffled_tweet_54445.txt  
  inflating: /content/covid_data/unzip_tweets_with_lang/tweets_with_lang/lang_sent_shuffled_tweet_65840.txt  
  inflating: /content/covid_data/unzip_tweets_with_lang/tweets_with_lang/lang_sent_shuffled_tweet_7766.txt  
  inflating: /content/covid_data/unzip_tweets_with_lang/tweets_with_lang/lang_sent_shuffled_tweet_74795.txt  
  inflating: /content/covid_data/unzip_tweets_with_lang/tweets_with_lang/lang_sent_shuffled_tweet_23815.txt  
  inflating: /content/covid_data/unzip_tweets_with_lang/tweets_with_lang/lang_sent_shuffled_tweet_43139.txt  
  inflating: /content/covid_data/unzip_tweets_with_lang/t

In [46]:
# Read all of the tweets + other data into an array
TWEETS_DIR = "/content/covid_data/unzip_tweets_with_lang/tweets_with_lang"

# (label, keywords) pairs, checked in this order. A later match overrides an
# earlier one, so a tweet naming both Moderna and Pfizer is labelled "pfizer".
MANUFACTURER_KEYWORDS = [
  ("sputnik", ("sputnik",)),
  ("sino", ("sinopharm", "sinovac")),
  ("moderna", ("moderna", "spikevax")),
  ("pfizer", ("pfizer", "biontech", "comirnaty")),
  ("janssen", ("janssen",)),
  ("astra", ("astrazeneca", "vaxzevria")),
]

all_files_text = []

# os.listdir returns entries in arbitrary order; sort them so the rows are
# appended in a reproducible order (the later drop_duplicates(keep="first")
# keeps whichever duplicate was appended first).
for filename in sorted(os.listdir(TWEETS_DIR)):
  # Only files whose name starts with "l" are tweet files
  if not filename.startswith("l"):
    continue

  # `with` guarantees the handle is closed even if parsing below raises
  with open(os.path.join(TWEETS_DIR, filename), "r") as tweet_file:
    lines_arr = tweet_file.read().split("\n\n")

  # The last six blank-line-separated fields are metadata.
  # Local names avoid shadowing the `time` module and the `id` builtin.
  lang = lines_arr[-1]
  sent = lines_arr[-2]
  tweet_id = lines_arr[-3]
  # NOTE(review): the original read `place` from [-5], the same field as
  # `coords`, and never read [-4]; [-4] matches the time/coords/place/id
  # ordering used below - confirm against the file format.
  place = lines_arr[-4]
  coords = lines_arr[-5]
  tweet_time = lines_arr[-6]

  # Everything before the metadata is the tweet itself. Re-join the
  # paragraphs in their original order with the separator they were split on
  # (previously they were concatenated back-to-front with no separator).
  tweet_text = "\n\n".join(lines_arr[:-6])
  lowered_text = tweet_text.lower()

  # Tweets mentioning "johnson" are left out of the dataset entirely
  # (behaviour kept from the original code).
  if "johnson" in lowered_text:
    continue

  # Search the tweet text for a manufacturer, if one is mentioned
  man = "no_man"
  for label, keywords in MANUFACTURER_KEYWORDS:
    if any(keyword in lowered_text for keyword in keywords):
      man = label

  # Add all the info from the file to all_files_text
  all_files_text.append([tweet_text, tweet_time, coords, place, tweet_id, man, sent, lang])


In [47]:
# Create a dataframe with all of the data
TWEET_COLUMNS = ["tweet_text", "time", "coords", "place", "tweet_id", "man", "sentiment", "language"]

all_tweets_df = (
  pd.DataFrame(all_files_text, columns=TWEET_COLUMNS)
  # The same tweet can appear in several files; keep its first occurrence
  .drop_duplicates(subset="tweet_id", keep="first")
  # Keep only tweets whose language field is "en"
  .loc[lambda frame: frame["language"] == "en"]
  # Later cells add columns to this frame; take an explicit copy so those
  # assignments act on an independent frame, not a filtered selection
  # (avoids pandas' SettingWithCopyWarning).
  .copy()
)

In [48]:
# Turn the raw timestamp strings into datetime objects
def str_to_datetime(in_str):
  """Parse the leading ``YYYY-MM-DD`` date of a timestamp string.

  Only the text before the first space is parsed; anything after it is
  ignored, so the returned datetime always falls on midnight.
  """
  date_part, _, _ = in_str.partition(" ")
  return datetime.strptime(date_part, "%Y-%m-%d")

# Parse every tweet's timestamp string into a date-only datetime column
all_tweets_df["datetime"] = all_tweets_df["time"].map(str_to_datetime)

In [49]:
# Collapse the stringified sentiment array into a single score
def sent_arr_to_val(sent_str):
  """Return the sentiment score encoded in ``sent_str``.

  ``sent_str`` is the text form of a nested array, e.g. ``"[[a b c]]"``.
  The two leading and two trailing characters (the brackets) are dropped
  and the rest is split on whitespace. The score is the third value
  (probability of positive sentiment) minus the first value (probability
  of negative sentiment).
  """
  tokens = sent_str[2:-2].split()
  positive_prob = float(tokens[2])
  negative_prob = float(tokens[0])
  return positive_prob - negative_prob

# One sentiment score per tweet (positive minus negative probability)
all_tweets_df["sent_avg"] = all_tweets_df["sentiment"].map(sent_arr_to_val)

In [50]:
# Split the tweets into one dataframe per brand, using the manufacturer label
manufacturer = all_tweets_df["man"]
moderna_df = all_tweets_df.loc[manufacturer.eq("moderna")]
pfizer_df = all_tweets_df.loc[manufacturer.eq("pfizer")]
astra_df = all_tweets_df.loc[manufacturer.eq("astra")]

In [51]:
# This cell combines each brand's tweets by date and gets the average sentiment for that date
aggregator = {'tweet_text' : 'count', 'time' : 'count', 'coords': 'count', 
              'tweet_id' : 'count', 'sentiment': 'count', 'sent_avg': 'mean'}

def _aggregate_per_day(brand_df):
  """Return one row per date with tweet counts and the mean ``sent_avg``.

  ``brand_df`` must have a "datetime" column plus every column named in
  ``aggregator``. The result has "datetime" as a regular column, in
  ascending date order.
  """
  # groupby sorts the group keys by default, so no extra sort is needed.
  # reset_index() turns the "datetime" index back into a column of that name,
  # so no rename is needed either.
  return brand_df.groupby("datetime").agg(aggregator).reset_index()

moderna_per_day = _aggregate_per_day(moderna_df)
pfizer_per_day = _aggregate_per_day(pfizer_df)
astra_per_day = _aggregate_per_day(astra_df)

In [52]:
# Kruskal-Wallis H-test on the per-day mean sentiment of the three brands
daily_means_by_brand = [
  per_day["sent_avg"] for per_day in (moderna_per_day, pfizer_per_day, astra_per_day)
]
stats.kruskal(*daily_means_by_brand)

KruskalResult(statistic=12.639948728118789, pvalue=0.001799989650066879)

In [53]:
# Chi-squared critical value at the 5% significance level with
# 2 degrees of freedom (three brands compared, minus one)
SIGNIFICANCE_LEVEL = 0.05
DEGREES_OF_FREEDOM = 2
stats.chi2.ppf(1 - SIGNIFICANCE_LEVEL, DEGREES_OF_FREEDOM)

5.991464547107979

In [54]:
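# So, the chi-squared critical value (alpha = 0.05, 2 degrees of freedom) is 5.99.
# This is less than the Kruskal-Wallis statistic of 12.6, so we reject the null
# hypothesis that the daily average sentiment of moderna, pfizer, and
# astrazeneca comes from the same distribution, i.e. we can conclude that
# the distributions are different (at least one brand differs from the others)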

In [55]:
def startMonth(date):
  """Return midnight on the first day of ``date``'s month as a ``datetime``.

  ``date`` may be any object exposing integer ``year`` and ``month``
  attributes (e.g. ``datetime`` or ``pandas.Timestamp``). Building the
  result directly avoids formatting and re-parsing a string, which
  ``strptime`` rejected for years with fewer than four digits.
  """
  return datetime(date.year, date.month, 1)

# Tag every per-day row with the first day of its month
for per_day_frame in (moderna_per_day, pfizer_per_day, astra_per_day):
  per_day_frame["month"] = per_day_frame["datetime"].apply(startMonth)


In [56]:
# Function to get data by month for a given dataframe
def get_data_for_months(dataframe, target_months=((2020, 12), (2021, 1), (2021, 2), (2021, 3))):
  """Collect the ``sent_val`` values of ``dataframe`` for selected months.

  Parameters
  ----------
  dataframe : DataFrame with a "month" column (datetime values marking the
    month each row belongs to) and a "sent_val" column.
  target_months : iterable of ``(year, month)`` pairs to keep. The default
    is December 2020 through March 2021, the months we compare to the
    other paper.

  Returns
  -------
  (data_arr, months_arr) : two parallel lists in chronological order.
    ``data_arr[i]`` is the "sent_val" Series of the rows in month i and
    ``months_arr[i]`` is that month as a "YYYY-MM" string. Months with no
    rows in ``dataframe`` are absent, so the lists may be shorter than
    ``target_months``.
  """
  wanted = set(target_months)

  data_arr = []
  months_arr = []
  # Walk the distinct month values in chronological order
  for month_start in sorted(dataframe["month"].unique()):
    stamp = pd.Timestamp(month_start)
    if (stamp.year, stamp.month) not in wanted:
      continue
    data_arr.append(dataframe.loc[dataframe["month"] == month_start, "sent_val"])
    # The first seven characters of the ISO representation are "YYYY-MM"
    months_arr.append(str(month_start)[0:7])

  return data_arr, months_arr

In [57]:
# For each brand: expose the per-tweet score as "sent_val", tag each tweet
# with the first day of its month, then gather the scores month by month
moderna_df = (
  moderna_df
  .rename(columns={"sent_avg": "sent_val"})
  .assign(month=lambda frame: frame["datetime"].apply(startMonth))
)
moderna_data_arr, moderna_months_arr = get_data_for_months(moderna_df)

pfizer_df = (
  pfizer_df
  .rename(columns={"sent_avg": "sent_val"})
  .assign(month=lambda frame: frame["datetime"].apply(startMonth))
)
pfizer_data_arr, pfizer_months_arr = get_data_for_months(pfizer_df)

astra_df = (
  astra_df
  .rename(columns={"sent_avg": "sent_val"})
  .assign(month=lambda frame: frame["datetime"].apply(startMonth))
)
astra_data_arr, astra_months_arr = get_data_for_months(astra_df)

In [58]:
# Chi-squared critical value at the 5% significance level with
# 3 degrees of freedom (four months compared, minus one)
SIGNIFICANCE_LEVEL = 0.05
DEGREES_OF_FREEDOM = 3
stats.chi2.ppf(1 - SIGNIFICANCE_LEVEL, DEGREES_OF_FREEDOM)

7.814727903251179

In [59]:
# Kruskal-Wallis H-test across moderna's four monthly sentiment samples
stats.kruskal(*(moderna_data_arr[month_idx] for month_idx in range(4)))

KruskalResult(statistic=2.953228993243689, pvalue=0.3988927021162145)

In [60]:
# Kruskal-Wallis H-test across pfizer's four monthly sentiment samples
stats.kruskal(*(pfizer_data_arr[month_idx] for month_idx in range(4)))

KruskalResult(statistic=12.966827460574418, pvalue=0.004708894892141813)

In [61]:
# Kruskal-Wallis H-test across astrazeneca's four monthly sentiment samples
stats.kruskal(*(astra_data_arr[month_idx] for month_idx in range(4)))

KruskalResult(statistic=14.325531121846787, pvalue=0.00249392695451474)

In [62]:
# Again, we compare the Kruskal-Wallis statistic to the chi-squared critical value (7.81).
# Interestingly, we can conclude that for pfizer and astrazeneca,
# different months have different distributions of sentiment;
# however, for moderna this is not the case

In [70]:
# Same monthly breakdown for the sputnik and sino tweets: select the brand,
# expose the per-tweet score as "sent_val", tag each tweet with the first
# day of its month, then gather the scores month by month
manufacturer = all_tweets_df["man"]

sputnik_df = (
  all_tweets_df.loc[manufacturer.eq("sputnik")]
  .rename(columns={"sent_avg": "sent_val"})
  .assign(month=lambda frame: frame["datetime"].apply(startMonth))
)
sino_df = (
  all_tweets_df.loc[manufacturer.eq("sino")]
  .rename(columns={"sent_avg": "sent_val"})
  .assign(month=lambda frame: frame["datetime"].apply(startMonth))
)

sputnik_data_arr, sputnik_months_arr = get_data_for_months(sputnik_df)
sino_data_arr, sino_months_arr = get_data_for_months(sino_df)

In [71]:
# Kruskal-Wallis H-test across sputnik's four monthly sentiment samples
stats.kruskal(*(sputnik_data_arr[month_idx] for month_idx in range(4)))

KruskalResult(statistic=7.107958499015075, pvalue=0.0685352238431948)

In [72]:
# Kruskal-Wallis H-test across sino's four monthly sentiment samples
stats.kruskal(*(sino_data_arr[month_idx] for month_idx in range(4)))

KruskalResult(statistic=4.137111459679279, pvalue=0.24703412762126067)