In [None]:
# Mount Google Drive at /content/drive; every path used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

In [None]:
# Install requirements after mounting drive, then comment out, restart runtime (use GPU for speeding up embeddings generation)
%%capture
%cd /content/drive/My Drive/covid_tweets/artifact/code
!pip install -r requirements.txt

In [None]:
import numpy as np
import pandas as pd
import re
from sklearn.metrics.pairwise import cosine_similarity
import gc
import math
from datetime import datetime
from datetime import timedelta
from os.path import exists
import os
import glob
from matplotlib import pyplot as plt
import seaborn as sns

from collections import Counter
import operator
import string
import scipy.sparse as sp
from multiprocessing import Pool
from scipy import stats as st
import sys

# NLTK resources used further down (stop words, tokenizer, lemmatizer data).
# A %%capture cell magic is only valid as the first line of a cell, so the download
# chatter is silenced with quiet=True instead.
import string
import nltk
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS
from nltk.stem import WordNetLemmatizer

from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
import pickle

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Folder (on the mounted drive) that holds every input file and receives every output file; ends with "/".
data_path = "/content/drive/My Drive/covid_tweets/artifact/data/"
WINDOW = 7 # outcome window, in days, on each side of an event date
preTreatmentPeriod = 30 # 30 days pre event (covariates are aggregated over this period)
interact_threshold = 0 # user considered interacting if interacting outside community more than interact_threshold fraction before the event

# Tweet-level/daily user level computations

## Obtain tweet embeddings on cleaned text and save

In [None]:
# %%capture
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

def tokenize(input):
  """Encode `input` with the module-level sentence-transformer `model` and return the result of `model.encode`."""
  embeddings = model.encode(input)
  return embeddings

In [None]:
def clean_tweet(text):
  """Remove @mentions and URLs from a tweet, collapse whitespace runs and trim both ends."""
  substitutions = (
      (r"@[A-Za-z0-9_]+", ""),  # user mentions
      (r"http\S+", ""),         # links
      (r"\s+", " "),            # any run of whitespace becomes one space
  )
  cleaned = text
  for pattern, replacement in substitutions:
    cleaned = re.sub(pattern, replacement, cleaned)
  return cleaned.strip()

In [None]:
# Tweet text joined with the tweet-level attributes on the tweet id.
df =  pd.read_csv(data_path+"tweet_text.csv", index_col=0) # columns: ['id', 'text']
df = df.merge(pd.read_csv(data_path +"tweet_level_data.csv"), on ="id", how="inner")
# Important columns in tweet_level_data.csv:
      #  'created_at', 'in_reply_to_user_id', 'user_id', 'id', 'valence_intensity', 'anger_intensity',
      #  'fear_intensity', 'sadness_intensity', 'joy_intensity',
      #  'user_friends_count',  'user_followers_count', 'retweet_count',
      #  'user_created_at', 'muslim_score', 'muslim', 'reply', 'reply_muslim']
userid = pd.read_csv(data_path+"final_clean_userid.csv") # user ids after filtering names
# .copy() makes the filtered frame independent, so the column assignments below are
# plain assignments rather than writes to a slice of the merged frame.
df = df[df.user_id.isin(userid.user_id)].copy()
df['text'] = df['text'].apply(clean_tweet)
# Unparseable timestamps become NaT instead of raising.
df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')

In [None]:
# Embed the tweets one `created_at` value at a time and write one CSV per day:
#   {data_path}embeddings/tweet_embeddings_{day}.csv
# Each day is embedded in batches that are first written to a scratch folder and then
# stitched together, so only one batch of embeddings is produced at a time.
temp_dir = f"{data_path}embeddings/temp"
# makedirs creates the output folder and the scratch folder in one go and is a no-op when they exist.
os.makedirs(temp_dir, exist_ok=True)

meta_cols = ['id', 'user_id', 'created_at', 'muslim']  # kept as text; all other columns are read as float32
batch_size = 10000

# Series.min()/max() skip NaT values left behind by errors='coerce'.
start_date = df.created_at.min()
end_date = df.created_at.max()
while start_date<=end_date:
  day_file = f"{data_path}embeddings/tweet_embeddings_{start_date}.csv"
  if exists(day_file):
      # already embedded in an earlier run
      start_date+=timedelta(days=1)
      continue
  print(start_date)
  subdata = df[df.created_at==start_date]
  if len(subdata) == 0:
      # nothing to embed for this day; skip instead of failing on an empty concat
      start_date+=timedelta(days=1)
      continue

  batch_files = []
  batches = math.ceil(len(subdata)/batch_size)
  for i in range(1,batches+1):
    dt = subdata[(i-1)*batch_size:i*batch_size].reset_index(drop=True)
    # the embedding columns are named 0..767 and line up with dt through the reset index
    dt = pd.concat([dt,pd.DataFrame(tokenize(dt['text'].tolist()))], axis = 1)
    del dt['text']
    batch_file = f"{temp_dir}/tweet_embedding_{str(i)}.csv"
    dt.to_csv(batch_file, index = False)
    batch_files.append(batch_file)
  del dt
  gc.collect()

  frames = []
  for filePath in batch_files:
      cols = list(pd.read_csv(filePath, nrows =1))
      # NOTE(review): every non-metadata column is cast to float32, which assumes that
      # all remaining columns of `df` are numeric - confirm against tweet_level_data.csv.
      new = pd.read_csv(filePath, dtype = np.float32, usecols =[c for c in cols if c not in meta_cols])
      dk = pd.read_csv(filePath, usecols = meta_cols, dtype = object)
      frames.append(dk.merge(new, left_index = True, right_index=True))
      os.remove(filePath)
  del new, dk
  gc.collect()

  data = pd.concat(frames)
  data.to_csv(day_file)
  start_date+=timedelta(days=1)

## GCS Computation (daily user level)

$q_i = c_i / m_i$, where $c_i$ is the vector of token frequencies and $m_i$ is the number of tokens in this tweet (i.e. $q_i$ is the normalized token-frequency vector).

$\rho_{-i}$ = vector of token probabilities within group $D$, excluding tweeter $i$ and any token used by fewer than two people.

In [None]:
def calculate_centroids(data):
  """Return (CH, CM): the column-wise mean of the first 768 columns of `data`
  for the rows with muslim == 0 (CH, Hindu centroid) and muslim == 1 (CM, Muslim centroid)."""
  def _centroid(flag):
    members = data.loc[data.muslim == flag]
    return members.iloc[:, :768].mean(axis=0)

  muslim_centroid = _centroid(1)
  hindu_centroid = _centroid(0)
  return (hindu_centroid, muslim_centroid)

def getGCS(row, CH, CM, nh, nm):
  """Score one user-day row.

  row: Series whose first 768 entries are the user's mean embedding and which also
       carries 'user_id', 'created_at' and 'muslim'.
  CH, CM: Hindu / Muslim centroids; nh, nm: number of users behind each centroid.

  The user's own vector is removed from the centroid of their own group before the
  distances are taken. The score is the distance to the own-group centroid's
  counterpart divided by the sum of both distances. Returns the row restricted to
  ['user_id', 'created_at', 'muslim', 'gcs'].
  """
  q = row.iloc[0:768].values # mean embedding of this user
  is_muslim = row['muslim']

  if is_muslim:
    # leave-one-out Muslim centroid
    muslim_centroid = (CM*nm-q)/(nm-1)
    hindu_centroid = CH
  else:
    # leave-one-out Hindu centroid
    muslim_centroid = CM
    hindu_centroid = (CH*nh-q)/(nh-1)

  m = np.linalg.norm(q-muslim_centroid) # distance from Muslim centroid
  h = np.linalg.norm(q-hindu_centroid) # distance from Hindu centroid

  numerator = h if is_muslim else m
  row['gcs'] = numerator/(m+h)

  return row[['user_id', 'created_at', 'muslim', 'gcs']]

def get_values(userids):
    """
    Measures daily GCS for each user and writes the result to `gcs.csv` in `data_path`.

    For every day from Jan 28 2020 to Jan 01 2021 (inclusive) the day's tweet embeddings
    are averaged per user, the two group centroids are computed with calculate_centroids
    and every user is scored with getGCS.

    userids: dataframe with a `user_id` column and the `muslim` flag used for grouping.
    returns: (dates, matH, matM) - the processed days and, per day, the Hindu and the
             Muslim centroid as numpy arrays.

    NOTE: a later cell defines another `get_values(data, start_date)` which replaces
    this function once that cell has run.
    """

    start_date =  pd.to_datetime(datetime.strptime("Jan 28 2020", '%b %d %Y').date()).normalize()
    end_date =  pd.to_datetime(datetime.strptime("Jan 01 2021", '%b %d %Y').date()).normalize()

    # per-user aggregation: mean of every embedding dimension, first value of the identifiers
    dic = {str(x):"mean" for x in range(0,768)}
    dic.update({'user_id':'first', 'created_at' :'first'})

    frames = []
    dates = []
    matH=[]
    matM=[]
    while start_date <= end_date:
        subset = pd.read_csv(f"{data_path}embeddings/tweet_embeddings_{start_date}.csv")
        subset = subset[subset.user_id.isin(userids.user_id)]
        # the group flag is taken from `userids` below, not from the embeddings file
        subset = subset.drop(columns="muslim")
        if len(subset)>0:
          subset = subset.groupby('user_id').agg(dic)
          subset = subset.reset_index(drop=True)
          subset = subset.merge(userids, on = "user_id", how= "left", suffixes= ["", "_rt"])
          subset = subset[[col for col in subset.columns if not col.endswith("_rt")]]

          nh = (subset.muslim == 0).sum()
          nm = (subset.muslim == 1).sum()
          if nh == 0 or nm == 0:
            print("only one group on {} - nh:{} nm:{}".format(start_date, nh, nm)) #not in this data
          (CH, CM) = calculate_centroids(subset)

          sub = subset.apply(lambda row: getGCS(row, CH, CM, nh, nm), axis = 1)
          frames.append(sub)
          dates.append(start_date)
          matH.append(CH.to_numpy())
          matM.append(CM.to_numpy())
        else:
          print("No tweets on {}".format(start_date)) #not in this data
        start_date += timedelta(days=1)
    if not frames:
        raise ValueError("no tweets found for any day in the requested period")
    data = pd.concat(frames)
    # getGCS returns exactly these four columns; asking for anything else raises a KeyError
    data = data[['user_id', 'created_at', 'muslim', 'gcs']]
    data.to_csv(data_path + '/gcs.csv', index=False)
    return (dates, matH, matM)

In [None]:
# Compute the daily GCS (written to gcs.csv by get_values) and export the daily centroids.
userids = pd.read_csv(data_path+"/final_clean_userid.csv").reset_index(drop=True)
dates, matH, matM = get_values(userids)
# One row per processed day: H_0..H_767 hold the Hindu centroid (matH),
# M_0..M_767 the Muslim centroid (matM).
df_centroids = pd.concat([pd.DataFrame(dates, columns =['created_at']),
                          pd.DataFrame(matH, columns = [f'H_{i}' for i in range(0,768)]),
                          pd.DataFrame(matM, columns = [f'M_{i}' for i in range(0,768)])], axis=1)
df_centroids.to_csv(data_path + f'/centroids_gcs.csv', index=False)

## GCS BOW
Code adapted from: https://github.com/ddemszky/framing-twitter


### utils

In [None]:
# Scratch file that is rewritten with the vocabulary of the day being processed.
vocab_file = data_path + 'temp_daily_vocab.txt'
# Receives the union of all daily vocabularies at the end of the daily loop.
full_vocab_file = data_path + 'full_vocab.txt'

In [None]:
# Punctuation to blank out during cleaning: ASCII punctuation plus typographic quotes and dashes, minus '#'.
punct_chars = sorted((set(string.punctuation) | {'’', '‘', '–', '—', '~', '|', '“', '”', '…', "'", "`", '_'}) - set(['#']))
punctuation = ''.join(punct_chars)
replace = re.compile('[%s]' % re.escape(punctuation))
sno = nltk.stem.SnowballStemmer('english')
# The NLTK list is reached through nltk.corpus on purpose: this cell rebinds the name
# `stopwords` (imported from nltk.corpus at the top) to a plain set, so calling
# `stopwords.words(...)` here would fail as soon as the cell is run a second time.
stop_words = nltk.corpus.stopwords.words('english')
with open(data_path+"stopwords.txt", 'r') as stopword_file:
    stopwords = set(stopword_file.read().splitlines())
stopwords = stopwords.union(stop_words) #| set(event_stopwords[event])
print(stopwords)

In [None]:
#Also consider spelling and number regularization
def clean_text(text, keep_stopwords=False, event=None, stem=True):
    text = re.sub("@[A-Za-z0-9_]+","", text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub("&[A-Za-z0-9_]+","", text)#remove patterns like &amp, etc.
    text = re.sub("[^a-zA-Z]"," ", text)
    text = text.lower()
    # eliminate urls
    text = re.sub(r'http\S*|\S*\.com\S*|\S*www\S*', ' ', text)
    # eliminate @mentions
    text = re.sub(r'\s@\S+', ' ', text)
    # substitute all other punctuation with whitespace
    text = replace.sub(' ', text)
    # replace all whitespace with a single space
    text = re.sub('\s+', ' ', text)
    # strip off spaces on either end
    text = text.strip()
    words = [word for word in nltk.word_tokenize(text)]
    if not keep_stopwords:
        words = [w for w in words if w not in stopwords]
    if stem:
        words = [sno.stem(w) for w in words]
    return words

def build_vocab(corpus):
    """Return all unigrams and adjacent-token bigrams of `corpus` (an iterable of token
    lists), most frequent first; terms with equal counts keep first-seen order."""
    tally = Counter()
    for tokens in corpus:
        previous = None
        for token in tokens:
            tally[token] += 1
            if previous is not None:
                tally[previous + ' ' + token] += 1
            previous = token
    ranked = sorted(tally.items(), key=lambda item: item[1], reverse=True)
    return [term for term, _ in ranked]

def get_user_token_counts(tweets, vocab):
    """Build the user x vocabulary count matrix.

    tweets: dataframe with 'user_id' and 'clean_text' (a token list per tweet).
    vocab: mapping from term (unigram or 'w1 w2' bigram) to column index.
    Returns (csr_matrix of shape (n_users, len(vocab)), array of user ids in row order).
    Empty-string tokens are skipped; terms missing from `vocab` are ignored.
    """
    grouped = tweets.groupby('user_id')
    ids, rows, cols, values = [], [], [], []
    for position, (uid, frame) in enumerate(grouped):
        ids.append(uid)
        hits = Counter()
        for tokens in frame['clean_text']:
            previous = None
            for token in tokens:
                if token == '':
                    continue
                if token in vocab:
                    hits[vocab[token]] += 1
                if previous is not None:
                    pair = previous + ' ' + token
                    if pair in vocab:
                        hits[vocab[pair]] += 1
                previous = token
        for term_index, n_hits in hits.items():
            rows.append(position)
            cols.append(term_index)
            values.append(n_hits)
    matrix = sp.csr_matrix((values, (rows, cols)), shape=(len(grouped), len(vocab)))
    return (matrix, np.array(ids))

def split_religion(data):
  """Split `data` on its 'muslim' column; returns (rows with muslim == 0, rows with muslim == 1)."""
  flags = data['muslim']
  non_muslim_rows = data[flags == 0]
  muslim_rows = data[flags == 1]
  return non_muslim_rows, muslim_rows

def get_community_q(community_counts, exclude_user_id = None):
    """Return the token distribution of a community.

    community_counts: user x token count matrix.
    exclude_user_id: row index of a user to leave out, or None to use everybody.
    Returns the column sums (optionally without the excluded row) divided by their total.
    """
    user_sum = community_counts.sum(axis=0)
    # `is not None` matters: row index 0 is a valid user and must be excluded too
    if exclude_user_id is not None:
        excluded_row = community_counts[exclude_user_id, :]
        if sp.issparse(excluded_row):
            excluded_row = excluded_row.toarray()
        user_sum = user_sum - excluded_row
    total_sum = user_sum.sum()
    return user_sum / total_sum

def get_token_user_counts(community_counts):
    """For every token (column) return 1 + the number of users (rows) with a nonzero count."""
    n_tokens = community_counts.shape[1]
    token_columns = sp.find(community_counts)[1]  # column index of every nonzero entry
    users_per_token = np.bincount(token_columns, minlength=n_tokens)
    return np.ones(n_tokens) + users_per_token  # add one smoothing

def get_rho(H_q, M_q):
    """Return M_q / (H_q + M_q), transposed."""
    combined = H_q + M_q
    muslim_share = M_q / combined
    return muslim_share.transpose()

def mutual_information(H_t, M_t, H_not_t, M_not_t, H_no, M_no):
    """Per-token mutual-information score from the smoothed user counts with (`*_t`) and
    without (`*_not_t`) the token in each group; `H_no` / `M_no` are the group sizes.
    The result gets a trailing axis of length one."""
    no_users = H_no + M_no
    all_t = H_t + M_t
    all_not_t = no_users - all_t + 4

    def _cell(count, marginal, group_size):
        # contribution of one cell of the 2x2 (group x token present) table
        return count * np.log2(no_users * (count / (marginal * group_size)))

    total = (_cell(H_t, all_t, H_no) + _cell(H_not_t, all_not_t, H_no)
             + _cell(M_t, all_t, M_no) + _cell(M_not_t, all_not_t, M_no))
    return (1 / no_users * total).transpose()[:, np.newaxis]

def chi_square(H_t, M_t, H_not_t, M_not_t, H_no, M_no):
    """Per-token chi-square score from the smoothed user counts with (`*_t`) and
    without (`*_not_t`) the token in each group; `H_no` / `M_no` are the group sizes.
    The result gets a trailing axis of length one."""
    no_users = H_no + M_no
    with_token = H_t + M_t
    without_token = no_users - with_token + 4
    cross = H_t * M_not_t - H_not_t * M_t
    numerator = no_users * cross ** 2
    denominator = with_token * without_token * (H_t + H_not_t) * (M_t + M_not_t)
    return (numerator / denominator).transpose()[:, np.newaxis]

In [None]:
def calculate_polarization(H_counts, M_counts, leaveout=True):
    """Leave-one-out polarization estimate.

    H_counts, M_counts: sparse user x token count matrices of the two communities
    (same token columns, no all-zero rows).
    leaveout: accepted for interface compatibility; the leave-out estimator is always used.

    Every user's token distribution is scored against the token-level posterior
    (get_rho) that is built with that user removed from their own community.
    Returns (mean of the two community averages, per-user scores for H, per-user scores for M).
    """
    H_no = H_counts.shape[0]
    M_no = M_counts.shape[0]
    H_user_total = np.asarray(H_counts.sum(axis=1)).ravel()
    M_user_total = np.asarray(M_counts.sum(axis=1)).ravel()
    # checked before the totals are used as divisors
    assert np.count_nonzero(H_user_total) == H_no  # make sure there are no zero rows
    assert np.count_nonzero(M_user_total) == M_no  # make sure there are no zero rows

    H_user_distr = (sp.diags(1 / H_user_total)).dot(H_counts)  # get row-wise distributions
    M_user_distr = (sp.diags(1 / M_user_total)).dot(M_counts)

    H_q = get_community_q(H_counts)
    M_q = get_community_q(M_counts)

    # apply measures via leave-out
    H_Leaveouts = []
    for i in range(H_no):
        H_leaveout_q = get_community_q(H_counts, i)
        token_scores_H = 1. - get_rho(H_leaveout_q, M_q)
        # (1 x V) row times (V x 1) scores -> single value
        H_Leaveouts.append(H_user_distr[i, :].dot(token_scores_H)[0, 0])
    M_Leaveouts = []
    for i in range(M_no):
        M_leaveout_q = get_community_q(M_counts, i)
        token_scores_M = get_rho(H_q, M_leaveout_q)
        M_Leaveouts.append(M_user_distr[i, :].dot(token_scores_M)[0, 0])

    M_val = 1 / M_no * sum(M_Leaveouts)
    H_val = 1 / H_no * sum(H_Leaveouts)
    return (1/2 * (H_val + M_val), H_Leaveouts, M_Leaveouts)

def get_values(data, start_date):
    """
    Measure polarization for one day.

    :param data: dataframe with 'user_id', 'muslim' and 'clean_text' (token list per tweet)
    :param start_date: the day being processed; stored in the 'date' column of the result
    :return: dataframe with one row per scored user: 'leaveout_score', 'user_id',
             'date' and 'day_estimate' (the day-level value, repeated on every row)

    The vocabulary is read from the module-level `vocab_file`.
    NOTE: this definition replaces the earlier `get_values(userids)` of the embedding section.
    """

    H_tweets, M_tweets = split_religion(data)  # get community tweets

    # get vocab
    with open(vocab_file, 'r') as f:
        vocab = {w: i for i, w in enumerate(f.read().splitlines())}

    (H_counts, H_user_ids) = get_user_token_counts(H_tweets, vocab)
    (M_counts, M_user_ids) = get_user_token_counts(M_tweets, vocab)

    del H_tweets
    del M_tweets
    gc.collect()

    H_user_len = H_counts.shape[0]
    M_user_len = M_counts.shape[0]

    all_counts = sp.vstack([H_counts, M_counts], format='csr')

    # column index of every nonzero (user, token) entry
    wordcounts = all_counts.nonzero()[1]

    # filter words used by fewer than 2 people (one bincount instead of one scan per token)
    users_per_token = np.bincount(wordcounts, minlength=all_counts.shape[1])
    all_counts = all_counts[:, users_per_token > 1]

    H_counts = all_counts[:H_user_len, :]
    M_counts = all_counts[H_user_len:, :]
    del wordcounts
    del all_counts
    gc.collect()

    # filter users who did not use words from vocab
    H_filter_idx = np.zeros(H_counts.shape[0], dtype=bool)
    H_filter_idx[H_counts.nonzero()[0]] = True
    M_filter_idx = np.zeros(M_counts.shape[0], dtype=bool)
    M_filter_idx[M_counts.nonzero()[0]] = True
    H_counts = H_counts[H_filter_idx, :]
    M_counts = M_counts[M_filter_idx, :]
    H_user_ids = list(H_user_ids[H_filter_idx])
    M_user_ids = list(M_user_ids[M_filter_idx])
    gc.collect()

    (actual_val, H_Leaveouts, M_Leaveouts) = calculate_polarization(H_counts, M_counts)
    df = pd.DataFrame({'leaveout_score': H_Leaveouts + M_Leaveouts,
                       'user_id': H_user_ids + M_user_ids})
    df["date"] = start_date
    df["day_estimate"] = actual_val
    del H_counts
    del M_counts
    gc.collect()
    sys.stdout.flush()
    actual_val = "{:.2f}".format(actual_val)
    # user counts reported here are taken before the empty-user filter above
    total_users = H_user_len + M_user_len
    print(f"Actual value:{actual_val}, Number of users:{total_users}, % of Muslims:{M_user_len*100/total_users}")
    return df

### user-daily aggregate

In [None]:
# Reload the raw (uncleaned) tweet text joined with the tweet-level attributes for the BOW measure.
df =  pd.read_csv(data_path+"tweet_text.csv", index_col=0)
df = df.merge(pd.read_csv(f"{data_path}/tweet_level_data.csv"), on ="id", how="inner")
userid = pd.read_csv(data_path+"final_clean_userid.csv")
# .copy() makes the filtered frame independent, so the assignment below is not a write to a slice.
df = df[df.user_id.isin(userid.user_id)].copy()
# Unparseable timestamps become NaT instead of raising.
df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')

In [None]:
# Daily BOW polarization: clean the day's tweets, build the day's vocabulary, score every
# user with get_values and keep a running checkpoint in estimates.pkl.
start_date =  pd.to_datetime(datetime.strptime("Jan 28 2020", '%b %d %Y').date()).normalize()
end_date =  pd.to_datetime(datetime.strptime("Jan 01 2021", '%b %d %Y').date()).normalize()
full_vocab = set()
# Results are accumulated in memory and the pickle is rewritten from this list, so a
# re-run starts from scratch instead of appending to the rows of a previous run.
daily_estimates = []

while start_date <= end_date:
    # .copy(): a new column is added below, which must not be a write to a slice of df
    daily_data =  df[df.created_at==start_date].copy()

    if len(daily_data)>0 :
      # the context manager releases the worker processes even if cleaning raises
      with Pool(3) as p:
        daily_data['clean_text'] = p.map(clean_text, daily_data['text'])
      daily_vocab = build_vocab(daily_data.clean_text)
      # get_values reads the day's vocabulary back from vocab_file
      with open(vocab_file, 'w') as f:
        f.write('\n'.join(daily_vocab))

      full_vocab.update(daily_vocab)
      daily_estimates.append(get_values(daily_data, start_date))
      # checkpoint after every processed day
      full_data = pd.concat(daily_estimates, ignore_index=True)
      full_data.to_pickle(data_path+"estimates.pkl")

    start_date += timedelta(days=1)
with open(full_vocab_file, 'w') as f:
        # sorted: a set has no defined order, sorting makes the file reproducible
        f.write('\n'.join(sorted(full_vocab)))

In [None]:
# Save BOW GCS to CSV: same rows as estimates.pkl, with 'leaveout_score' exposed as 'bowgcs' (last column)
df = pd.read_pickle(data_path+"estimates.pkl")
df = df.assign(bowgcs=df['leaveout_score']).drop(columns=['leaveout_score'])
df.to_csv(data_path+"bow_gcs.csv", index = False)

# User-event level computations

In [None]:
def geteventDate(EVENT):
  """Return the date of a named event as a normalized (midnight) pandas Timestamp.

  EVENT: one of the event keys listed in the table below.
  Raises ValueError for an unknown event name.
  """
  event_dates = {
      "juntaCurfew": 'Mar 22 2020',
      "tabliqi": 'Mar 31 2020',
      "migrantraildeath": "May 8 2020",
      "Coronil": "Jun 23 2020",
      "exam": "Aug 23 2020",
      "gdpcontracts": "Aug 31 2020",
      "BiharManifesto": "Oct 22 2020",
  }
  if EVENT not in event_dates:
    raise ValueError(f"Unknown event {EVENT!r}; expected one of {sorted(event_dates)}")

  EVENT_DATE = datetime.strptime(event_dates[EVENT], '%b %d %Y').date()
  return pd.to_datetime(EVENT_DATE).normalize()

# Event keys, in the order in which the per-event loops below process them.
EVENTS = [
    "juntaCurfew",
    "tabliqi",
    "migrantraildeath",
    "Coronil",
    "exam",
    "gdpcontracts",
    "BiharManifesto",
]

## Compute Treatment Variable: interact


In [None]:
# Tweets that have a non-missing reply_muslim value, reduced to the columns needed for the treatment variable.
df = pd.read_csv(data_path+"/tweet_level_data.csv")
df = df.loc[~df.reply_muslim.isna(), ["id", "user_id", "created_at", "muslim", "in_reply_to_user_id", "reply_muslim"]]
df = df.astype({"in_reply_to_user_id": int, "muslim": int, "reply_muslim": int})
df["created_at"] = pd.to_datetime(df['created_at'], errors='coerce')

In [None]:
# Treatment variable per event: did the user reply outside their own community before the event?
for event in EVENTS:
  print(event)
  eventdate = geteventDate(event)
  d = df[df.created_at< eventdate]  # replies strictly before the event date

  # per user: number of replies to Muslim users, own group flag, number of replies
  dic = {'reply_muslim': 'sum', 'muslim':'first', 'id':'size'}
  subdata = d.groupby("user_id").agg(dic).reset_index()
  subdata['frac_muslim_reply'] = subdata['reply_muslim']/subdata['id']
  # fraction of replies that went to the other community
  out_group_frac = np.where(subdata['muslim'].astype(bool),
                            1-subdata['frac_muslim_reply'],
                            subdata['frac_muslim_reply'])

  # interact = 1 only when the out-group fraction is strictly above the threshold.
  # With the configured threshold of 0, an `x < threshold` test can never be true and
  # would mark every user - including users with no out-group reply - as interacting.
  subdata['interact'] = (out_group_frac > interact_threshold).astype(int)
  print(f"Total users:{len(subdata)}   interacts:{len(subdata[subdata.interact==1])}   non-interacts:{len(subdata[subdata.interact==0])}")
  print("\n\n")
  subdata.to_csv(data_path+f"/interact_before_{event}.csv")

## Compute Outcome (1): GCS-diff around event window

In [None]:
# Daily per-user GCS from gcs.csv, reduced to the three columns used for the outcome.
df = pd.read_csv(f'{data_path}/gcs.csv')[['user_id',  'created_at', 'gcs']]
df.created_at = pd.to_datetime(df['created_at'], errors='coerce')
# Aggregation applied per user inside each window: keep the id, average the daily GCS.
d = {'user_id':'first', 'gcs':'mean'}

In [None]:
# Outcome (1): per user, mean GCS from the event day onwards minus mean GCS before the event.
for EVENT in EVENTS:
  print(EVENT)
  eventdate = geteventDate(EVENT)
  # positive = days before the event, zero or negative = event day and after
  df["diff_days"] = (eventdate - df['created_at']).dt.days

  in_window = (df.diff_days.abs() <= WINDOW) & (df.diff_days > -WINDOW)
  subdata = df[in_window]

  #avg gcs in post data (event day and the days after it)
  post_data = subdata[subdata.diff_days<=0].groupby('user_id').agg(d).reset_index(drop = True)
  print(f"post users:{len(post_data)}")

  #avg gcs in pre data (days before the event)
  pre_data = subdata[subdata.diff_days>0].groupby('user_id').agg(d).reset_index(drop = True)
  print(f"pre users:{len(pre_data)}")

  # only users observed on both sides of the event survive the inner merge
  subdata = pd.merge(post_data, pre_data, on = ["user_id"], suffixes=('', '_pre'))
  subdata["gcs_diff"] = subdata["gcs"] - subdata["gcs_pre"]
  subdata = subdata.drop(columns=["gcs", "gcs_pre"])
  subdata.to_csv(f"{data_path}/{EVENT}gcsdiff_{WINDOW}.csv")
  print(len(subdata))
  print()

## Combine with pretreatment covariates

In [None]:
def gettweetdata(eventdate):
  """Read tweet_level_data.csv and return the covariate columns of the tweets created in
  the `preTreatmentPeriod` days before `eventdate` (event day excluded), with compact dtypes."""
  columns = ['valence_intensity', 'anger_intensity', 'fear_intensity',
             'sadness_intensity', 'joy_intensity',
             'user_friends_count', 'user_followers_count',
             'retweet_count', 'user_created_at',
             'muslim_score', 'reply', 'muslim', 'user_id', "id",
             'created_at']
  dtypes = {'valence_intensity': np.float32, 'anger_intensity': np.float32,
            'fear_intensity': np.float32, 'sadness_intensity': np.float32,
            'joy_intensity': np.float32, 'muslim_score': np.float32,
            'muslim': np.int8, 'reply': np.int16}

  df = pd.read_csv(f"{data_path}/tweet_level_data.csv")
  for date_column in ('created_at', 'user_created_at'):
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')
  df = df[columns].astype(dtypes)

  window_start = eventdate - timedelta(preTreatmentPeriod)
  return df[(df.created_at >= window_start)&(df.created_at<eventdate)]

# Daily per-user GCS; averaged over the pre-treatment period per event in the next cell.
gcsdata = pd.read_csv(f'{data_path}/gcs.csv')[['user_id',  'created_at', 'gcs']]
gcsdata.created_at = pd.to_datetime(gcsdata['created_at'], errors='coerce')
gcsdata['gcs'] = gcsdata['gcs'].astype(np.float32)
# Per-user aggregation of the pre-treatment tweets: emotion scores, retweet count and reply
# flag are averaged; 'id' with 'size' yields the number of tweets in the period.
dic = {'valence_intensity':"mean", 'anger_intensity':"mean", 'fear_intensity':"mean",
       'sadness_intensity':"mean", 'joy_intensity':"mean",
       'user_friends_count':'last', 'user_followers_count':'last',
       'retweet_count':'mean', 'user_created_at':'first',
       'muslim_score':'first', 'reply':'mean', 'muslim':'first', 'user_id':'first', "id":'size'}

In [None]:
# For every event: join outcome (gcs_diff), treatment (interact) and pre-treatment covariates per user.
for EVENT in EVENTS:
  print(EVENT)
  eventdate = geteventDate(EVENT)

  # load gcs_diff
  gcs_diff_data = pd.read_csv(f"{data_path}/{EVENT}gcsdiff_{WINDOW}.csv", index_col = 0)

  # combine with treatment variable
  interactdata= pd.read_csv(data_path+f"/interact_before_{EVENT}.csv", index_col =0)
  subdata = gcs_diff_data.merge(interactdata, on = "user_id", how ="inner")
  del interactdata, gcs_diff_data

  # combine aggregates pretreatmentPeriod (30 days pre-event)
  predata = gettweetdata(eventdate)
  predata = predata[predata.user_id.isin(subdata.user_id)]
  predata = predata.groupby('user_id').agg(dic)
  # 'id' was aggregated with 'size', i.e. it now holds the number of tweets in the period
  predata['tweet_frequency'] = predata['id']

  predata.reset_index(drop=True, inplace =True)

  # mean daily GCS over the same pre-treatment period
  eventgcsdata = gcsdata[(gcsdata.created_at >= eventdate - timedelta(preTreatmentPeriod))&(gcsdata.created_at<eventdate)]
  eventgcsdata = eventgcsdata[eventgcsdata.user_id.isin(subdata.user_id)]
  eventgcsdata = eventgcsdata.groupby('user_id').agg({"user_id":"first", 'gcs':"mean"})
  eventgcsdata.reset_index(drop=True, inplace =True)
  # on a column-name clash the left-hand (subdata) column is kept and the "_y" copy dropped
  subdata = subdata.merge(eventgcsdata, on = "user_id", how = "inner", suffixes = ['', '_y'])
  subdata = subdata[[col for col in subdata.columns if not col.endswith("_y")]]

  subdata = subdata.merge(predata, on = "user_id", how = "inner", suffixes = ['', '_y'])
  subdata = subdata[[col for col in subdata.columns if not col.endswith("_y")]]
  del subdata['id']
  gc.collect()

  subdata.to_csv(f"{data_path}/causal_data_{EVENT}_{str(WINDOW)}_allcovariates_except_topic.csv")

## Topic modeling

### Utility Functions

In [None]:
wordnet_lemmatizer = WordNetLemmatizer()
# The NLTK list is reached through nltk.corpus: the BOW utilities cell rebinds the name
# `stopwords` to a plain set, after which `stopwords.words(...)` raises AttributeError.
stop_words = nltk.corpus.stopwords.words('english')
# English stop words plus Twitter filler and frequent romanised Hindi function words.
stop_words = set(stop_words + ["tweet", "twitter", "sir", "tweets", "ji", "like", "ki", "ke", "ko", "ka", "hai",
                               "se", "aur", "bhi", "na", "nahi", "let", "say", "yes", "no", "think", "know", "ur",
                               "going", "kya", "guy", "due", "man", "got",
                               'ye', 'bhai', 'kuch','toh', 'ho', 'jo', 'tha', 'hoga', 'mein', 'hi', 'hain', 'kar',
                               'lo', 'ek', 'ya', 'gaya', 'kal', 'koi', 'ab', 'nhi', 'yeh', 'said', 'say', 'use'])


def clean_tweet(text, labeling=True, hide_covid=False):
  """Normalise a tweet for topic labelling.

  text: raw tweet; anything that is not a string (e.g. a missing value) yields "".
  labeling: accepted for interface compatibility; not used.
  hide_covid: when True, the coronavirus terms are blanked out after normalisation.

  Returns the lower-cased, lemmatized text with mentions, URLs, non-letters and the
  module-level `stop_words` removed; virus and vaccine names are collapsed first.
  NOTE: this definition replaces the one-argument `clean_tweet` defined earlier.
  """
  if not isinstance(text, str):
    return ""
  temp = text.lower()
  # raw string: in a normal string literal "\b" is a backspace character, not a word boundary
  temp = re.sub(r"(\bcorona\b)|(\bncov\b)|(covid-19)|(\bcovid\b)" , "coronavirus", temp)
  if hide_covid:
    temp=re.sub("(coronavirus)|(corona)", " ", temp)
  temp = re.sub("(pfizer)|(moderna)|(janseen)|(sinova)|(sinopharm)|(sputnik)|(covishield)|(covaxin)|(coronavac)|(sinovac)|(novavax)","vaccine", temp)
  # the two-word "johnson & johnson" form is tried before the bare "johnson" so that it
  # collapses into a single "vaccine"
  temp = re.sub(r"(astrazeneca)|(johnson\s*(&|and)\s*johnson)|(johnson)|(j&j)|(serum)|(biontech)|(sinopharm)|(covishield)|(covaxin)","vaccine", temp)

  temp = re.sub("@[a-z0-9_]+","", temp)
  temp =  temp.replace("_", " ")
  # decode entities first so that "amp", "quot", ... do not survive as words
  temp = re.sub(r'&amp', '&', temp)
  temp = re.sub(r'&quot', '"', temp)
  temp = re.sub(r'&lt', '<', temp)
  temp = re.sub(r'&gt', '>', temp)
  temp = re.sub(r'http\S+', '', temp)
  # only a-z and spaces remain after this step, punctuation included
  temp = re.sub("[^a-z]"," ", temp)
  temp = re.sub(r'\s+', ' ', temp)
  temp = temp.strip()
  temp = " ".join([wordnet_lemmatizer.lemmatize(word) for word in
                   nltk.word_tokenize(temp) if word not in stop_words])
  return temp


def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """Class-based TF-IDF over `documents` (one concatenated document per topic).

    m: numerator of the idf ratio (the callers pass the total number of tweets).
    Returns (tf_idf array of shape (n_terms, n_documents), fitted CountVectorizer).
    """
    count = CountVectorizer(ngram_range=ngram_range, stop_words="english")
    count.fit(documents)
    term_counts = count.transform(documents).toarray()

    words_per_document = term_counts.sum(axis=1)
    tf = np.divide(term_counts.T, words_per_document)
    corpus_frequency = term_counts.sum(axis=0)#total count of word across corpus
    idf = np.log(np.divide(m, corpus_frequency)).reshape(-1, 1)
    return np.multiply(tf, idf), count

def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=50):
    """Map every topic label in `docs_per_topic.topic` to its `n` highest-scoring terms.

    tf_idf: (n_terms, n_topics) score array; count: object exposing get_feature_names_out().
    Each entry is a list of (term, score formatted with three decimals), best first.
    """
    vocabulary = count.get_feature_names_out()
    topic_labels = list(docs_per_topic.topic)
    scores = tf_idf.T
    ranked = scores.argsort()[:, -n:]
    top_n_words = {}
    for row, label in enumerate(topic_labels):
        best_first = ranked[row][::-1]
        top_n_words[label] = [(vocabulary[j], "{:.3f}".format(scores[row][j])) for j in best_first]
    return top_n_words

def extract_topic_sizes(df):
    """Return a frame with columns 'Topic' and 'Size' (number of non-null `bi_text`
    values per topic), largest topic first."""
    per_topic = df.groupby(['topic'])['bi_text'].count()
    sizes = per_topic.reset_index()
    sizes = sizes.rename({"topic": "Topic", "bi_text": "Size"}, axis='columns')
    return sizes.sort_values("Size", ascending=False)

def bigram_pasting(sentence):
  """Run the space-split tokens of `sentence` through the module-level `phrase_model`
  and join the resulting tokens with single spaces."""
  tokens = sentence.split(" ")
  merged_tokens = phrase_model[tokens]
  return " ".join(merged_tokens)

### data prep for topic modeling

In [None]:
# Tweets with unique text, joined with their creation date and author for topic modelling.
tweets_df = (
    pd.read_csv(data_path+"tweet_text.csv", index_col=0) #read tweet_text.csv
      .drop_duplicates(subset=['text'])
      .merge(pd.read_csv(data_path +"/tweet_level_data.csv"), on ="id", how="inner", suffixes= ["", "_y"])
      [['id','text','created_at','user_id']]
)
tweets_df = tweets_df.drop(columns=[name for name in tweets_df.columns if name.endswith("_y")])
tweets_df["created_at"] = pd.to_datetime(tweets_df['created_at'], errors='coerce')

In [None]:
# Collect, for every event, the tweets (with their embeddings) that the users of the causal data set
# posted between `preTreatmentPeriod` days before the event and `WINDOW` days after it, then write
# the de-duplicated union to tweets_topics_df.csv.
all_tweets = pd.DataFrame()
for EVENT in EVENTS:
  eventdate = geteventDate(EVENT)
  users = pd.read_csv(f"{data_path}/causal_data_{EVENT}_{str(WINDOW)}_allcovariates_except_topic.csv", index_col=0)['user_id']
  tweets = tweets_df[(tweets_df.created_at >= eventdate - timedelta(preTreatmentPeriod))&(tweets_df.created_at<eventdate+timedelta(WINDOW))]
  tweets = tweets[tweets.user_id.isin(users)]

  # load the daily embedding files covering the same period (end date exclusive)
  start_date = eventdate - timedelta(preTreatmentPeriod)
  end_date =  eventdate+timedelta(WINDOW)
  embdata = []
  while start_date < end_date:
    emb = pd.read_csv(f"{data_path}embeddings/tweet_embeddings_{start_date}.csv", index_col=0)
    # the embedding columns are stored as float16 here
    emb = emb.astype({str(col): 'float16' for col in range(0,768)})
    embdata.append(emb)
    start_date += timedelta(days=1)
  embdata = pd.concat(embdata)
  del embdata['muslim']
  embdata = embdata[embdata.user_id.isin(users)]
  # attach embeddings by tweet id; on a column-name clash the `tweets` column is kept
  tweets = tweets.merge(embdata, on ="id", how="inner", suffixes= ["", "_y"])
  tweets = tweets.drop(columns=[i for i in tweets.columns if i.endswith("_y")])
  tweets['text'] = tweets['text'].apply(lambda x: clean_tweet(x, labeling=False))

  tweets = tweets[~tweets.text.isna()]
  if len(all_tweets)>0:
    # event periods can overlap: drop exact duplicates, then repeated cleaned texts
    all_tweets = pd.concat([all_tweets, tweets]).drop_duplicates()
    all_tweets = all_tweets.drop_duplicates(subset=['text'])
  else:
    all_tweets = tweets
  del embdata, users
  gc.collect()

tweets = all_tweets
# tweets whose cleaned text is empty or missing are dropped
tweets = tweets[tweets.text!='']
tweets = tweets[~tweets['text'].isna()]

del all_tweets
gc.collect()
tweets.to_csv(data_path + '/tweets_topics_df.csv')

### Finding elbow

In [None]:
# Keep only the 768 embedding columns as a plain array `x`; the frame itself is released.
tweets= pd.read_csv(data_path + '/tweets_topics_df.csv', index_col=0)
x = tweets[[str(i) for i in range(0,768)]].values
del tweets
gc.collect()

In [None]:
# Fit KMeans for k = 3..10 on the embedding matrix `x`, pickle every fitted model and plot inertia against k.
inertias = []
total_clusters = []
for k in range(3,11,1):
  # Building and fitting the model
  kmeans = KMeans(n_clusters = k, random_state=23)
  kmeans.fit(x)
  # the fitted model is stored so that later cells can load it without refitting
  with open(f'{data_path}kmeans{str(k)}_7events.pickle', 'wb') as handle:
    pickle.dump(kmeans, file=handle,protocol=None, fix_imports=True)
  inertias.append(kmeans.inertia_)
  total_clusters.append(k)
  print(f"{str(k)} done")
plt.plot(total_clusters, inertias, 'bx-')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('Elbow Curve: Inertia vs. Clusters')
plt.show()

### Predict

In [None]:
# Assign every tweet to a cluster with the stored n-cluster model and save the labelled frame.
tweets= pd.read_csv(data_path + '/tweets_topics_df.csv', index_col=0)
n = 7
# NOTE: pickle.load executes code from the file; only load pickles written by this notebook.
with open(f'{data_path}kmeans{str(n)}_7events.pickle', 'rb') as handle:
    kmeans = pickle.load(file=handle)
tweets["topic"] = kmeans.predict(tweets[[str(i) for i in range(0,768)]].values)
tweets.to_csv(data_path+f'/kmeans_{n}_window_{WINDOW}_7events.csv')
# these columns are dropped from the in-memory frame only, after the CSV has been written
del tweets["text"], tweets["user_id"], tweets["created_at"]
gc.collect()

In [None]:
# interpret topic labels: top class-based TF-IDF terms per cluster
ntopics = 7
topic_data = pd.read_csv(data_path+f'/kmeans_{ntopics}_window_{WINDOW}_7events.csv', index_col =0).reset_index(drop=True)
topic_data.set_index(['id'], drop=False, append=False, inplace = True)
# drop the 768 embedding columns, they are not needed for labelling
topic_data = topic_data[[col for col in topic_data.columns if col not in [str(i) for i in range(0,768)]]]
gc.collect()

# hide_covid=True blanks out the coronavirus terms before the labels are derived
topic_data["text"] = topic_data["text"].map(lambda tweet: clean_tweet(tweet, labeling=True, hide_covid= True))
# phrase model used by bigram_pasting (read there as a module-level name)
phrase_model = Phrases([doc.split(" ") for doc in topic_data['text']],
                      connector_words=ENGLISH_CONNECTOR_WORDS, scoring = 'npmi', threshold=-1)
p = Pool(2)
topic_data['bi_text'] = p.map(bigram_pasting, topic_data['text'])
p.close()
p.join()

# join docs: one concatenated document per topic
docs_per_topic = topic_data.groupby(['topic'], as_index = False).agg({'bi_text': ' '.join})
tf_idf, count = c_tf_idf(docs_per_topic.bi_text.values, m=len(topic_data))
top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=50)

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# one column per topic, one row per rank; each cell is a (term, score) pair
topic_label = pd.DataFrame(top_n_words)
topic_label.to_csv(data_path + f'/labels_7topics_7events_window_{WINDOW}.csv', index = 0)

#### Quantitative cluster analysis


In [None]:
# Quantitative check of the clusters: distances between centroids and the mean pairwise
# cosine similarity between the tweets of every pair of clusters.
n=7 # check for n clusters/topics
# NOTE: pickle.load executes code from the file; only load pickles written by this notebook.
with open(f'{data_path}kmeans{str(n)}_7events.pickle', 'rb') as handle:
    kmeans = pickle.load(file=handle)
labels = kmeans.labels_
centroids = kmeans.cluster_centers_
# Calculate the distance between centroids using cosine distance
distances = cdist(centroids, centroids, 'cosine')
print(labels)
print(distances)
print()

# Calculate the distance between centroids using Euclidean distance
distances = cdist(centroids, centroids, 'euclidean')
print(np.around(distances, 2))
print(labels)
print(distances)
print()

# Read the labelled tweets once and keep one embedding array per cluster.
embedding_cols = [str(col) for col in range(0,768)]
tweets = pd.read_csv(data_path+f'/kmeans_{n}_window_{WINDOW}_7events.csv', index_col=0)
tweets = tweets.astype({col: 'float16' for col in embedding_cols})
cluster_vectors = {topic: tweets.loc[tweets['topic'] == topic, embedding_cols].values for topic in range(0,n)}
del tweets
gc.collect()

# DataFrame.append no longer exists in pandas 2.x: collect the one-row frames and concatenate once.
pair_frames = []
for i in range(0,n):
  for j in range(i, n):
    cos = np.mean(cosine_similarity(cluster_vectors[i], cluster_vectors[j]))
    pair_frames.append(pd.DataFrame([[i,j,cos]], columns=['cluster_source', 'cluster_target','mean_cosine']))
    gc.collect()
del cluster_vectors
gc.collect()

data = pd.concat(pair_frames)
data.to_csv(data_path+f'/pairwise_avg_cluster_distances_kmeans_{n}_window_{WINDOW}_7events.csv')
print(data)

### Merge topics after examining labels, clusters

In [None]:
# Collapse the 7 clusters into 4 merged topics; the merged ids are
# 0 = covid, 1 = pol-rel, 2 = china, 3 = economy (every cluster id not listed below).
merge = True
ntopics=7
topic_data = pd.read_csv(data_path+f'/kmeans_{ntopics}_window_{WINDOW}_7events.csv', index_col =0).reset_index(drop=True)
topic_data.set_index(['id'], drop=False, append=False, inplace = True)
topic_data = topic_data[[col for col in topic_data.columns if col not in [str(i) for i in range(0,768)]]]
gc.collect()
if merge==True:
  topic_data.topic = topic_data.topic.map(lambda t: {0: 0, 2: 0, 3: 0, 4: 0, 1: 1, 5: 2}.get(t, 3))

#### label merged version

In [None]:
# Interpret the merged topics: clean the text, paste bigrams, concatenate all
# tweets of a topic into one document and rank terms per topic.
#
# `ntopics` is deliberately NOT rebound in this cell. It is not used here, but it keys
# the file names written and read by later cells (`All_topics_{ntopics}_...`,
# `{EVENT}_topics{ntopics}_...`); the final combine step reads them with the raw
# cluster count (7) plus the `_merged` suffix.
topic_data["text"] = topic_data["text"].map(lambda tweet: clean_tweet(tweet, labeling=True, hide_covid= True))
# NOTE(review): phrase_model is not referenced again in this cell; presumably
# bigram_pasting reads it as a global -- confirm.
phrase_model = Phrases([doc.split(" ") for doc in topic_data['text']],
                      connector_words=ENGLISH_CONNECTOR_WORDS, scoring = 'npmi', threshold=-1)
# The context manager releases the worker processes even if the mapping raises.
with Pool(2) as p:
  topic_data['bi_text'] = p.map(bigram_pasting, topic_data['text'])

# join docs: one space-joined document per topic
docs_per_topic = topic_data.groupby(['topic'], as_index = False).agg({'bi_text': ' '.join})
tf_idf, count = c_tf_idf(docs_per_topic.bi_text.values, m=len(topic_data))
top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=50)

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
topic_label = pd.DataFrame(top_n_words)
topic_label.to_csv(data_path + f'/merged_labels_7to4_kmeans_7events_window_{WINDOW}.csv', index = 0)

#### pivot

In [None]:
# Pivot: one row per (user_id, created_at, id) with one indicator column per topic
# (1 where the tweet carries that topic, 0 elsewhere), then persist the table.
topic_data["topic"] = topic_data["topic"].map(lambda topic_id: "topic_" + str(topic_id))
topic_data["val"] = 1

topic_indicators = pd.pivot(
    topic_data,
    index=['user_id', 'created_at', 'id'],
    columns='topic',
    values='val',
)
# Topics a tweet does not carry come out of the pivot as NaN; make them explicit zeros.
topic_indicators = topic_indicators.fillna(0)

# Flatten the index levels back into ordinary columns.
topic_data = pd.DataFrame(topic_indicators.to_records())
topic_data["created_at"] = pd.to_datetime(topic_data['created_at'], errors='coerce')

pivoted_out_path = f'{data_path}/All_topics_{ntopics}_pivoted_window_{WINDOW}{"_merged" if merge else ""}.csv'
topic_data.to_csv(pivoted_out_path)

## Compute Outcomes (2): Differences in topics and emotions

In [None]:
# Reload the pivoted per-tweet topic indicators from disk and parse the timestamps.
pivoted_path = f'{data_path}/All_topics_{ntopics}_pivoted_window_{WINDOW}{"_merged" if merge else ""}.csv'
topic_data = pd.read_csv(pivoted_path, index_col =0).reset_index(drop=True)
# Unparseable timestamps become NaT instead of raising.
topic_data = topic_data.assign(created_at=pd.to_datetime(topic_data['created_at'], errors='coerce'))
topic_data.columns

In [None]:
# Join the tweet-level emotion scores with the pivoted topic indicators and
# prepare the per-user aggregation spec (`dic`) used when computing outcomes.
emotion_cols = ['valence_intensity', 'anger_intensity',
                'fear_intensity', 'sadness_intensity', 'joy_intensity']
topic_cols = [col for col in topic_data.columns if col.startswith("topic_")]

df = pd.read_csv(f"{data_path}/tweet_level_data.csv")
df = df.merge(topic_data, on='id', how ='inner', suffixes=['', '_y'])
# On a column-name clash keep the tweet-level column and discard topic_data's copy.
df = df.drop(columns=[col for col in df.columns if col.endswith("_y")])
df["created_at"] = pd.to_datetime(df['created_at'], errors='coerce')
# Single cast of every emotion and topic column to float32.
df = df.astype({col: np.float32 for col in emotion_cols + topic_cols})

cols = emotion_cols + list(topic_data.columns)
df = df[cols]

# Per-user aggregation: keep the user id, average everything else.
dic = {'user_id': 'first'}
dic.update({col: "mean" for col in emotion_cols})
dic.update({col: 'mean' for col in topic_cols})
df.columns
#Columns in df:
# 'valence_intensity', 'anger_intensity', 'fear_intensity',
# 'sadness_intensity', 'joy_intensity', 'user_id', 'created_at', 'id',
# 'topic_0', 'topic_1', 'topic_2', 'topic_3'

In [None]:
# Outcome columns: the five emotion intensities followed by every topic indicator.
cols = ['valence_intensity', 'anger_intensity', 'fear_intensity',
        'sadness_intensity', 'joy_intensity']
cols += [col for col in topic_data.columns if col.startswith("topic_")]

In [None]:
# For every event: average each user's emotion/topic values after and before the
# event (within WINDOW days), take post-minus-pre differences for users present in
# both periods, and save one CSV per event.
for EVENT in EVENTS:
  print(EVENT)
  eventdate = geteventDate(EVENT)
  # Whole days from tweet to event (positive = tweet precedes the event), computed
  # on the whole column at once instead of a per-row Python lambda.
  df["diff_days"] = (eventdate - df['created_at']).dt.days

  # Keep tweets with -WINDOW < diff_days <= WINDOW (rows with a missing date drop out).
  in_window = (df.diff_days > -WINDOW) & (df.diff_days <= WINDOW)
  subdata = df[in_window]

  print(f"len data: {len(subdata)}")
  print(f"num users:{len(subdata.user_id.unique())}")

  # NOTE(review): `.days` floors, so diff_days == 0 covers tweets from up to 24h
  # *before* `eventdate`, and those land in the post-event group here, whereas the
  # pre-treatment covariates are selected with `created_at < eventdate` -- confirm
  # this boundary is intended.
  post_data = subdata[subdata.diff_days<=0]
  pre_data = subdata[subdata.diff_days>0]
  print(subdata.columns)

  print(f"post users before aggregating: {len(post_data.user_id.unique())}")
  post_data = post_data.groupby('user_id').agg(dic).reset_index(drop=True)
  print(f"post users after:{len(post_data)}")

  print(f"pre users before aggregating: {len(pre_data.user_id.unique())}")
  pre_data = pre_data.groupby('user_id').agg(dic).reset_index(drop=True)
  print(f"pre users:{len(pre_data)}")

  # Inner join: only users with tweets in both periods get a difference.
  subdata = pd.merge(post_data, pre_data, on = ["user_id"], suffixes=('', '_pre'))

  for col in cols:
    subdata[f"{col}_diff"] = subdata[col] - subdata[f"{col}_pre"]
  # Only user_id and the *_diff columns are written out.
  subdata = subdata.drop(columns=list(cols) + [f"{col}_pre" for col in cols])

  subdata.to_csv(f"{data_path}/{EVENT}_topics{ntopics}_emotions_diff_window_{WINDOW}{'_merged' if merge else ''}.csv")
  print(f"final length data: {len(subdata)}")
  print()

## Combine pretreatment topics, changes in topics and emotions with rest of data

In [None]:
# Settings for the final combine step. `ntopics` and `merge` select files purely by
# name, so they must equal the values in effect when `All_topics_*` and
# `{EVENT}_topics*_emotions_diff_*` were written; otherwise missing or stale files are read.
ntopics = 7
merge = True

# topics
pivoted_path = f'{data_path}/All_topics_{ntopics}_pivoted_window_{WINDOW}{"_merged" if merge else ""}.csv'
topic_data = pd.read_csv(pivoted_path, index_col =0).reset_index(drop=True)
topic_data = topic_data.assign(created_at=pd.to_datetime(topic_data['created_at'], errors='coerce'))

# Per-user aggregation spec: keep the user id, average every topic indicator.
dic = {'user_id': 'first', **{col: 'mean' for col in topic_data.columns if col.startswith("topic_")}}
print(topic_data.columns)

In [None]:
def _merge_on_user(left, right):
  """Inner-join two frames on user_id, keeping the left-hand column on name clashes."""
  merged = left.merge(right, how="inner", on="user_id", suffixes=["", "_y"])
  return merged.drop(columns=[col for col in merged.columns if col.endswith("_y")])


# Build, per event, the final table: covariates + pre-treatment topics + outcome differences.
for EVENT in EVENTS:
  eventdate = geteventDate(EVENT)

  # pre-treatment topics: per-user means over the preTreatmentPeriod days before the event
  period_start = eventdate - timedelta(preTreatmentPeriod)
  in_pre_period = (topic_data.created_at >= period_start) & (topic_data.created_at < eventdate)
  subtopic_data = topic_data[in_pre_period].groupby('user_id').agg(dic).reset_index(drop=True)

  # other pre-treatment covariates, treatment, outcome (change in GCS)
  covariates_path = f"{data_path}/causal_data_{EVENT}_{str(WINDOW)}_allcovariates_except_topic.csv"
  data = _merge_on_user(pd.read_csv(covariates_path, index_col=0), subtopic_data)

  # outcome (2) changes in topics and emotions
  diff_path = f"{data_path}/{EVENT}_topics{ntopics}_emotions_diff_window_{WINDOW}{'_merged' if merge else ''}.csv"
  data = _merge_on_user(data, pd.read_csv(diff_path, index_col=0))

  # Save final event-level data: used for TE estimation
  data.to_csv(f"{data_path}/causal_data_{EVENT}_{str(WINDOW)}_allcovariates_topics-{ntopics}{'_merged' if merge else ''}.csv")