In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; every data and code path used below lives under it.
# force_remount=True is passed so the mount is redone even if one is already active.
drive.mount('/content/drive',force_remount=True)

In [None]:
# Install requirements after mounting drive, then comment out, restart runtime
%%capture
%cd /content/drive/My Drive/covid_tweets/code
!pip install -r requirements.txt

In [None]:
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import pickle
from datetime import datetime
from datetime import timedelta
from scipy.stats import pearsonr
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import nltk
import re
import string
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('punkt')

%matplotlib inline

# Seed NumPy's global random generator so any random draws repeat across runs.
np.random.seed(12)
eps = 1E-14  # small numeric tolerance; not referenced by the cells visible in this notebook section
WINDOW = 7  # window size embedded in the causal-data file names (presumably days around an event -- TODO confirm)
ntopics = 7 #if merged we set merge variable to True but keep ntopics=7
merge=True  # when True, file names built below carry the '_merged' suffix

# Root folder for every CSV this notebook reads or writes.
data_path = "/content/drive/My Drive/covid_tweets/data/"

# Descriptive Statistics: Event-Level Data

In [None]:
def geteventDate(EVENT):
  """Return the date of an event as a midnight-normalized ``pd.Timestamp``.

  Args:
    EVENT: event key as used in the causal-data file names, e.g. "tabliqi".

  Returns:
    ``pd.Timestamp`` of the event day at 00:00.

  Raises:
    ValueError: if ``EVENT`` is not one of the known keys. The previous
      if/elif chain fell through and failed with an UnboundLocalError instead.
  """
  event_dates = {
      "juntaCurfew": 'Mar 22 2020',
      "tabliqi": 'Mar 31 2020',
      "migrantraildeath": "May 8 2020",
      "Coronil": "Jun 23 2020",
      "exam": "Aug 23 2020",
      "gdpcontracts": "Aug 31 2020",
      "BiharManifesto": "Oct 22 2020",
  }
  if EVENT not in event_dates:
    raise ValueError(f"Unknown event {EVENT!r}; expected one of {sorted(event_dates)}")

  event_day = datetime.strptime(event_dates[EVENT], '%b %d %Y').date()
  return pd.to_datetime(event_day).normalize()

# Event keys in chronological order; each one selects a causal-data CSV by name.
EVENTS = [
    "juntaCurfew",
    "tabliqi",
    "migrantraildeath",
    "Coronil",
    "exam",
    "gdpcontracts",
    "BiharManifesto",
]

In [None]:
# compute means
# One accumulator list per reported statistic; get_data() appends one entry per event.
# Every name is bound to its own, separate empty list.
M, NC, MNC, NMNC = [], [], [], []
GCS, GCSHC, GCSMC, GCSHNC, GCSMNC = [], [], [], [], []
t0, t1, t2, t3 = [], [], [], []
valence, fear, sadness, joy, anger = [], [], [], [], []
user_followers_count, user_created_at, user_friends_count = [], [], []
retweet_count, reply, tweet_frequency = [], [], []

def get_data(EVENT, ntopics, WINDOW):
  """Append one event's summary means to the module-level accumulator lists.

  Reads the event's causal-data CSV under ``data_path`` (the file name carries the
  '_merged' suffix when the global ``merge`` is truthy), replaces ``user_created_at``
  with the number of days between that timestamp and the event date, and appends
  one value to every accumulator list. Group percentages are stored as numbers;
  all means are stored as pre-formatted strings. Returns None.
  """
  df = pd.read_csv(f"{data_path}causal_data_{EVENT}_{str(WINDOW)}_allcovariates_topics-{ntopics}{'_merged' if merge else ''}.csv", index_col=0)
  account_created = pd.to_datetime(df['user_created_at'], errors='coerce')
  df['user_created_at'] = (geteventDate(EVENT) - account_created).dt.days

  # Row masks reused by the percentage and GCS statistics below.
  muslim, hindu = df.muslim == 1, df.muslim == 0
  interacting, not_interacting = df.interact == 1, df.interact == 0
  n_rows = len(df)

  # Group shares in percent.
  M.append(sum(muslim) * 100 / n_rows)
  NC.append(sum(interacting) * 100 / n_rows)
  MNC.append(sum(interacting & muslim) * 100 / sum(muslim))
  NMNC.append(sum(interacting & hindu) * 100 / sum(hindu))

  four_decimals = "{:.4f}".format
  two_decimals = "{:.2f}".format

  # Mean GCS overall and per (interact, muslim) cell.
  GCS.append(four_decimals(df.gcs.mean()))
  GCSHC.append(four_decimals(df[not_interacting & hindu].gcs.mean()))
  GCSMC.append(four_decimals(df[not_interacting & muslim].gcs.mean()))
  GCSHNC.append(four_decimals(df[interacting & hindu].gcs.mean()))
  GCSMNC.append(four_decimals(df[interacting & muslim].gcs.mean()))

  # Plain column means, appended in the same order as the report rows.
  for accumulator, column in (
      (t0, "topic_0"),
      (t1, "topic_1"),
      (t2, "topic_2"),
      (t3, "topic_3"),
      (valence, "valence_intensity"),
      (fear, "fear_intensity"),
      (sadness, "sadness_intensity"),
      (joy, "joy_intensity"),
      (anger, "anger_intensity"),
      (user_followers_count, "user_followers_count"),
      (user_created_at, "user_created_at"),
      (user_friends_count, "user_friends_count"),
      (retweet_count, "retweet_count"),
      (reply, "reply"),
      (tweet_frequency, "tweet_frequency"),
  ):
    accumulator.append(two_decimals(df[column].mean()))

# Fill the accumulators, one pass per event.
for EVENT in EVENTS:
  get_data(EVENT, ntopics, WINDOW)

# One row per event and one column per statistic; transposed before saving so
# that each event becomes a column of the CSV.
df = pd.DataFrame({
    "EVENT": EVENTS,
    "Muslim%": M,
    "Interact%": NC,
    "Hindu Interact%": NMNC,
    "Muslim Interact%": MNC,
    "Avg GCS": GCS,
    "Avg GCS Hindu non-Interact": GCSHC,
    "Avg GCS Muslim non-Interact": GCSMC,
    "Avg GCS Hindu Interact": GCSHNC,
    "Avg GCS Muslim Interact": GCSMNC,
    "topic0": t0,
    "topic1": t1,
    "topic2": t2,
    "topic3": t3,
    "valence": valence,
    "fear": fear,
    "sadness": sadness,
    "joy": joy,
    "anger": anger,
    "user_followers_count": user_followers_count,
    "user_created_at": user_created_at,
    "user_friends_count": user_friends_count,
    "retweet_count": retweet_count,
    "reply": reply,
    "tweet_frequency": tweet_frequency,
})
res = df.T
res.to_csv(f"{data_path}descriptive_stats_{ntopics}topics{'_merged' if merge else ''}_{str(WINDOW)}.csv")

In [None]:
# compute standard deviations
# Fresh accumulator lists (one per statistic); each name gets its own empty list.
gcs, gcsHC, gcsMC, gcsHNC, gcsMNC = [], [], [], [], []
t0, t1, t2, t3, t4 = [], [], [], [], []
valence, fear, sadness, joy, anger = [], [], [], [], []
user_followers_count, user_created_at, user_friends_count = [], [], []
retweet_count, reply, tweet_frequency = [], [], []

def get_data(EVENT, ntopics, WINDOW):
  """Append one event's standard deviations to the module-level accumulator lists.

  Reads the event's causal-data CSV under ``data_path``, replaces
  ``user_created_at`` with the number of days between that timestamp and the
  event date, and appends one pre-formatted string per statistic. Returns None.

  The file name now carries the '_merged' suffix when the global ``merge`` is
  truthy, matching the means cell and the output file written after the loop.
  Previously the un-merged file was always read, so the reported standard
  deviations did not describe the same data as the reported means.
  """
  df = pd.read_csv(f"{data_path}causal_data_{EVENT}_{str(WINDOW)}_allcovariates_topics-{ntopics}{'_merged' if merge else ''}.csv", index_col=0)
  df['user_created_at']  = (geteventDate(EVENT) - pd.to_datetime(df['user_created_at'], errors='coerce')).dt.days

  muslim, hindu = df.muslim == 1, df.muslim == 0
  interacting, not_interacting = df.interact == 1, df.interact == 0

  four_decimals = "{:.4f}".format
  two_decimals = "{:.2f}".format

  # GCS spread overall and per (interact, muslim) cell.
  gcs.append(four_decimals(df.gcs.std()))
  gcsHC.append(four_decimals(df[not_interacting & hindu].gcs.std()))
  gcsMC.append(four_decimals(df[not_interacting & muslim].gcs.std()))
  gcsHNC.append(four_decimals(df[interacting & hindu].gcs.std()))
  gcsMNC.append(four_decimals(df[interacting & muslim].gcs.std()))

  # Plain column standard deviations, in the same order as the report rows.
  for accumulator, column in (
      (t0, "topic_0"),
      (t1, "topic_1"),
      (t2, "topic_2"),
      (t3, "topic_3"),
      (valence, "valence_intensity"),
      (fear, "fear_intensity"),
      (sadness, "sadness_intensity"),
      (joy, "joy_intensity"),
      (anger, "anger_intensity"),
      (user_followers_count, "user_followers_count"),
      (user_created_at, "user_created_at"),
      (user_friends_count, "user_friends_count"),
      (retweet_count, "retweet_count"),
      (reply, "reply"),
      (tweet_frequency, "tweet_frequency"),
  ):
    accumulator.append(two_decimals(df[column].std()))

# Fill the accumulators, one pass per event.
for EVENT in EVENTS:
  get_data(EVENT, ntopics, WINDOW)

# One row per event and one column per statistic; transposed before saving so
# that each event becomes a column of the CSV.
df = pd.DataFrame({
    "EVENT": EVENTS,
    "gcs": gcs,
    "gcs Hindu non-Interact": gcsHC,
    "gcs Muslim non-Interact": gcsMC,
    "gcs Hindu Interact": gcsHNC,
    "gcs Muslim Interact": gcsMNC,
    "topic0": t0,
    "topic1": t1,
    "topic2": t2,
    "topic3": t3,
    "valence": valence,
    "fear": fear,
    "sadness": sadness,
    "joy": joy,
    "anger": anger,
    "user_followers_count": user_followers_count,
    "user_created_at": user_created_at,
    "user_friends_count": user_friends_count,
    "retweet_count": retweet_count,
    "reply": reply,
    "tweet_frequency": tweet_frequency,
})
res = df.T
res.to_csv(f"{data_path}descriptive_stats_sd_{ntopics}topics{'_merged' if merge else ''}_{str(WINDOW)}.csv")

# BowGCS vs GCS Polarization over time

In [None]:
def geteventDate(EVENT):
  """Return the date of an event or festival as a midnight-normalized ``pd.Timestamp``.

  This definition replaces any earlier ``geteventDate`` in the notebook and is
  keyed by the display names used in the polarization plot.

  Args:
    EVENT: display name, e.g. "Janata Curfew" or "Diwali".

  Returns:
    ``pd.Timestamp`` of the day at 00:00.

  Raises:
    ValueError: if ``EVENT`` is not a known name. The previous if/elif chain
      fell through and failed with an UnboundLocalError instead.
  """
  event_dates = {
      "Janata Curfew": 'Mar 22 2020',
      "Mahashivaratri": 'Feb 21 2020',
      "Holi": "Mar 10 2020",
      "Tablighi": 'Mar 31 2020',
      "Vaisakhi": "Apr 13 2020",
      "Ramadan": "Apr 23 2020",
      "Migrants Deaths": "May 8 2020",
      "Eid-ul-fitr": 'MAY 24 2020',  # upper-case month is accepted: %b matching ignores case
      "Coronil Launch": "Jun 23 2020",
      "Eid-ul-zuha": "Jul 30 2020",
      "Onam/Ganeshotsav": "Aug 22 2020",
      "Exam Satyagraha": "Aug 23 2020",
      "Muharram": "Aug 29 2020",
      "GDP Contracts": "Aug 31 2020",
      "modibday": "Sep 20 2020",
      "Bihar Manifesto": "Oct 22 2020",
      "Dussehra": "Oct 25 2020",
      "Eid-e-Milad": "Oct 28 2020",
      "Diwali": "Nov 14 2020",
      "Christmas": "Dec 25 2020",
  }
  if EVENT not in event_dates:
    raise ValueError(f"Unknown event {EVENT!r}; expected one of {sorted(event_dates)}")

  # The string is parsed by strptime, so no `format=` is needed when handing the
  # resulting datetime to pandas.
  parsed = datetime.strptime(event_dates[EVENT], '%b %d %Y')
  return pd.to_datetime(parsed).normalize()

# Events drawn as annotated vertical lines on the polarization plot.
EVENTS = [
    "Janata Curfew",
    "Tablighi",
    "Migrants Deaths",
    "Coronil Launch",
    "Exam Satyagraha",
    "GDP Contracts",
    "Bihar Manifesto",
]
# Festivals drawn as a second set of vertical lines on the same plot.
Festivals = [
    "Holi",
    "Vaisakhi",
    "Ramadan",
    "Eid-ul-fitr",
    'Eid-ul-zuha',
    "Eid-e-Milad",
    "Diwali",
    "Christmas",
]

In [None]:
# BOW-based daily estimates: keep one (date, day_estimate) row per distinct pair.
data = pd.read_csv(data_path+"bow_gcs.csv")
data = data[["date", "day_estimate"]]
data.date = pd.to_datetime(data['date'], errors='coerce')
data = data.drop_duplicates()

# GCS scores with their group flag.
df = pd.read_csv(f'{data_path}gcs.csv')[['user_id', 'created_at', 'gcs', "muslim"]]
df.created_at = pd.to_datetime(df['created_at'], errors='coerce')
start_date = pd.Timestamp("2020-01-28")
end_date = pd.Timestamp("2021-01-01")


def _scaled_group_mean(values):
  """Return 1 / n * sum(values) for the n given values, or NaN when there are none.

  The empty case used to raise ZeroDivisionError (builtin sum() of an empty
  Series is the Python int 0), which aborted the whole cell on any day that
  had no rows at all.
  """
  n = len(values)
  if n == 0:
    return np.nan
  return 1 / n * sum(values)


# One record per calendar day in [start_date, end_date]; rows are matched by
# exact equality between created_at and the day's midnight timestamp.
d = []
for day in pd.date_range(start_date, end_date):
    daily_data = df[df.created_at == day]
    M_val = _scaled_group_mean(daily_data[daily_data.muslim == 1].gcs)
    H_val = _scaled_group_mean(daily_data[daily_data.muslim == 0].gcs)
    # NOTE(review): the factor 12 is kept exactly as written; it looks like it may
    # have been meant as 1/2 (the average of the two group means) -- please confirm.
    # The Pearson correlation computed later is unaffected by this scale factor.
    pi = 12 * (H_val + M_val)
    d.append({'date': day, 'day_estimate_gcs': pi})

df = pd.DataFrame(d, columns=['date', 'day_estimate_gcs'])
df = df.merge(data, how = "inner", on = "date", suffixes = ['', "_bow"])
df["day_estimate_bow"] = df["day_estimate"]
del df["day_estimate"]
# reset_index() keeps the old positional index as an extra 'index' column,
# so the saved file has: index, date, day_estimate_gcs, day_estimate_bow.
df = df.reset_index()
df.to_csv(f'{data_path}daily_polarization.csv', index  = False)

In [None]:
# Make sure `date` is datetime-typed, then correlate the two daily series.
df = df.assign(date=pd.to_datetime(df['date'], format = '%Y-%m-%d'))
pearsonr(df.day_estimate_gcs, df.day_estimate_bow)

In [None]:
# Tweet-level data; created_at is parsed so tweets can be counted per day and group.
tweets = pd.read_csv(f"{data_path}tweet_level_data.csv")
tweets["created_at"] = pd.to_datetime(tweets["created_at"], errors="coerce")

In [None]:
def _ema(series):
  """Exponential moving average with span=7 and adjust=False, applied to every series in this cell."""
  return series.ewm(span=7, adjust=False).mean()


def _tweets_per_day(dates, in_group):
  """Count, for each entry of `dates`, the rows of `tweets` selected by `in_group`
  whose created_at equals that entry exactly.

  The counts are tabulated once with value_counts() and then looked up per date,
  instead of rescanning the whole `tweets` frame for every single date.
  Dates with no matching tweet get 0.
  """
  per_timestamp = tweets.loc[in_group, "created_at"].value_counts()
  return dates.map(per_timestamp).fillna(0).astype(int)


# Smoothed polarization series.
df["day_estimate_gcs_ema"] = _ema(df["day_estimate_gcs"])
df["day_estimate_bow_ema"] = _ema(df["day_estimate_bow"])

# Daily tweet volume per group (muslim == 0 / muslim == 1) and its smoothed version.
df["nHtweets"] = _tweets_per_day(df["date"], tweets.muslim == 0)
df["nHtweets_ema"] = _ema(df["nHtweets"])

df["nMtweets"] = _tweets_per_day(df["date"], tweets.muslim == 1)
df["nMtweets_ema"] = _ema(df["nMtweets"])

# Index by date so the plotting cell can use df.index as the x-axis.
df = df.set_index('date')

In [None]:
# Overlay the smoothed polarization series and the smoothed tweet volumes on one
# figure with four y-axes, and mark events and festivals with vertical lines.
# Dict literals are used throughout (instead of dict(...)) so the cell does not
# depend on the builtin name `dict` being intact in the kernel.
color = "darkviolet"
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Polarization traces: contextualized on the primary y-axis, BOW on y2.
fig.add_trace(
    go.Scatter(x=df.index, y=df.day_estimate_gcs_ema, name="Contextualized Polarization"),
)
fig.add_trace(
    go.Scatter(x=df.index, y=df.day_estimate_bow_ema, name="BOW Polarization",
               yaxis="y2"))

# Tweet-volume traces as thin dotted lines on y3 and y4.
fig.add_trace(go.Scatter(x=df.index, y=df.nHtweets_ema, name="#non-Muslim Tweets",
                         marker={"color": color},
                         line={"color": color, "width": 1, "dash": "dot"},
                         yaxis="y3"))
fig.add_trace(go.Scatter(x=df.index, y=df.nMtweets_ema, name="#Muslim Tweets",
                         marker={"color": 'darkorange'},
                         line={"color": 'darkorange', "width": 1, "dash": "dot"},
                         yaxis="y4"))

fig.update_xaxes(title_text="Dates")

# Vertical lines are positioned with epoch milliseconds (timestamp() * 1000).
for EVENT in EVENTS:
  xc = geteventDate(EVENT).timestamp()* 1000
  fig.add_vline(x= xc, line_width=2, line_dash="dash", line_color="green",
                annotation_text=f"<b>{EVENT}</b>",
                annotation_position="top left",
                annotation_textangle = 270)

# Legend entry for the event lines: an extra trace drawn at the x position of
# the last event line, spanning the range of the contextualized series.
dmax = df[["day_estimate_gcs_ema"]].values.max()
dmin = df[["day_estimate_gcs_ema"]].values.min()
fig.add_trace(go.Scatter(x=[xc,xc],
                         y=[dmin,dmax],
                         mode='lines',
                         line={"color": 'green', "width": 2, "dash": 'dash'},
                         name='Covid Events'))

for EVENT in Festivals:
  xc = geteventDate(EVENT).timestamp() * 1000
  fig.add_vline(x= xc, line_width=2, line_dash="dash", line_color="gold",
                annotation_text=f"{EVENT}",
                annotation_position="top left",
                annotation_textangle = 270)

# Legend entry for the festival lines, built the same way from the last festival.
fig.add_trace(go.Scatter(x=[xc,xc],
                         y=[dmin,dmax],
                         mode='lines',
                         line={"color": 'gold', "width": 2, "dash": 'dash'},
                         name='Festivals'))

# Horizontal legend below the plot, white backgrounds.
fig.update_layout(
    legend={
        "yanchor": "top",
        "y": -0.15,
        "xanchor": "right",
        "x": 0.87,
        "orientation": "h",
    },
    plot_bgcolor=  '#FFFFFF',
    paper_bgcolor= '#FFFFFF',
)

# Axis layout. Axis titles use title.text / title.font; the older `titlefont`
# attribute is deprecated in plotly.
fig.update_layout(
    # Narrow the x-axis domain to leave room on both sides for the free-floating axes.
    xaxis={"domain": [0.1, .92]},

    # Primary axis: contextualized polarization.
    yaxis={
        "title": {"text": "Contextualized Polarization", "font": {"color": "#0000ff"}},
        "tickformat": '.3f',
        "showline": True, "linewidth": 1, "linecolor": 'black',
        "tickfont": {"color": "#0000ff"},
    },

    # BOW polarization: overlaid on y, not attached to the x-axis ("free"),
    # placed on the left at the given paper position.
    yaxis2={
        "title": {"text": "BOW Polarization", "font": {"color": "#FF0000"}},
        "tickfont": {"color": "#FF0000"},
        "tickformat": '.2f',
        "anchor": "free",
        "overlaying": "y",
        "side": "left",
        "position": 0.025,
    },

    # Non-Muslim tweet volume: overlaid on y, anchored to the x-axis, on the right.
    yaxis3={
        "title": {"text": "#non-Muslim Tweets", "font": {"color": color}},
        "showline": True, "linewidth": 1, "linecolor": 'black',
        "tickfont": {"color": color},
        "anchor": "x",
        "overlaying": "y",
        "side": "right",
    },

    # Muslim tweet volume: overlaid on y, free-floating on the right at the given paper position.
    yaxis4={
        "title": {"text": "#Muslim Tweets", "font": {"color": "darkorange"}},
        "tickfont": {"color": "darkorange"},
        "anchor": "free",
        "overlaying": "y",
        "side": "right",
        "position": .975,
    },
)
fig.update_xaxes(showline=True, linewidth=1, linecolor='black')
fig.show()

# Check gcs vs length

In [None]:
def _without_del_columns(frame):
  """Return `frame` restricted to the columns whose name does not end in '_del'."""
  kept = [col for col in frame.columns if not col.endswith("_del")]
  return frame[kept]


# GCS scores, keyed by (user_id, created_at).
df_gcs = pd.read_csv(f'{data_path}gcs.csv')
df_gcs['created_at'] = pd.to_datetime(df_gcs['created_at'])

# BOW scores: expose `date` as a datetime `created_at` column so both tables share the key.
df_bow = pd.read_csv(data_path+"bow_gcs.csv")
df_bow["created_at"] = pd.to_datetime(df_bow["date"])
df_bow = df_bow.drop(columns="date")

df = df_gcs.merge(df_bow, on=["user_id", "created_at"], how="left", suffixes=["", "_bow"])

# Tweet text joined with tweet-level data; columns duplicated by the join get the
# '_del' suffix and are dropped right afterwards.
tweet_text = pd.read_csv(f"{data_path}tweet_text.csv")
tweet_level = pd.read_csv(f"{data_path}tweet_level_data.csv")
tweets = _without_del_columns(
    tweet_text.merge(tweet_level, on="id", how="inner", suffixes=('', '_del'))
)
tweets.created_at = pd.to_datetime(tweets['created_at'], errors='coerce')

# Keep only the users listed in the cleaned user-id file and attach its columns.
userids = pd.read_csv(f"{data_path}final_clean_userid.csv")
listed_users = tweets[tweets.user_id.isin(userids.user_id)]
tweets = _without_del_columns(
    listed_users.merge(userids, on="user_id", how="inner", suffixes=["", "_del"])
)

# Attach the tweets to the score rows; score rows without a tweet are kept (left join).
df = _without_del_columns(
    df.reset_index().merge(tweets.reset_index(), how="left",
                           on=["user_id", "created_at"], suffixes=["", "_del"])
)

In [None]:
def find_len(text):
  """Return the number of NLTK word tokens in a tweet, ignoring @mentions and URLs.

  Args:
    text: tweet text. Anything that is not a ``str`` (for example the NaN that
      a left merge leaves for rows without a tweet) is treated as missing.

  Returns:
    The token count as an int, or NaN for missing text so that later averages
    skip the row. Previously a non-string value made ``re.sub`` raise TypeError.
  """
  if not isinstance(text, str):
    return float("nan")
  text = re.sub(r"@[A-Za-z0-9_]+", "", text)  # drop @mentions
  text = re.sub(r'http\S+', '', text)         # drop URLs
  # Raw string: '\s' inside a normal string literal is an invalid escape sequence.
  text = re.sub(r'\s+', ' ', text)            # collapse whitespace runs
  return len(nltk.word_tokenize(text.strip()))

# Token count of every row's text, stored next to the scores.
df["tweet_length"] = df["text"].map(find_len)

In [None]:
# Aggregation per (user_id, created_at) group: how each column is reduced.
# The mapping was previously stored in a variable named `dict`, which replaced
# the builtin in the kernel and made every later dict(...) call fail (for
# example when the plotting cell is re-run after this one).
agg_spec = {'muslim':"first", 'gcs':"last", 'bowgcs':"last",
            'reply':"mean", 'tweet_length':"mean",
            'user_friends_count':'last', 'user_followers_count':'last',
            'retweet_count':"mean"}
data = df.groupby(["user_id", "created_at"]).agg(agg_spec)
data.to_csv(f"{data_path}gcs_avg_tweet_len_user-day.csv")

# Event Selection

In [None]:
def geteventDate(EVENT):
  """Return the date of a candidate event as a midnight-normalized ``pd.Timestamp``.

  This definition replaces any earlier ``geteventDate`` in the notebook.

  The lookup ignores whitespace and letter case, so "Janata Curfew",
  "JanataCurfew" and "janatacurfew" all resolve to the same entry; both
  "Migrants Deaths" and "Migrant Deaths" are accepted. Every key the original
  if/elif chain recognised still resolves to the same date.

  Args:
    EVENT: event name.

  Returns:
    ``pd.Timestamp`` of the event day at 00:00.

  Raises:
    ValueError: if ``EVENT`` is not a known event. The previous if/elif chain
      fell through and failed with an UnboundLocalError instead.
  """
  event_dates = {
      "delhiriots": 'Feb 23 2020',
      "maoistattack": 'Feb 23 2020',
      "Janata Curfew": 'Mar 22 2020',
      "firstLock": 'Mar 24 2020',
      "Tablighi": 'Mar 31 2020',
      "Palghar Mob Lynching": 'Apr 16 2020',
      "Ramadan": "Apr 23 2020",
      "Migrants Deaths": "May 8 2020",
      "Migrant Deaths": "May 8 2020",  # alias: spelling used by the event-string lookup
      "amphan": "May 15 2020",
      "vizhaggas": "May 7 2020",
      "idulfitr": 'MAY 24 2020',  # upper-case month is accepted: %b matching ignores case
      "baghjan": "May 27 2020",
      "nisarg": "Jun 2 2020",
      "3agrordinance": "Jun 5 2020",
      "Sushant Singh case": "Jun 14 2020",
      "China Border Skirmish": "Jun 15 2020",
      "Coronil Launch": "Jun 23 2020",
      "indiaunsc": "Jun 17 2020",
      "banchineseapp": "Jun 29 2020",
      "vikasdubey": "Jul 3 2020",
      "Rail Suspension": "Jun 25 2020",
      "pulwama": "Jul 5 2020",
      "kargilearthquake": "Jul 5 2020",
      "assamflood": "Jul 21 2020",
      "nep": "Jul 29 2020",
      "idulzuha": "Jul 30 2020",
      "ramtemple": "Aug 5 2020",
      "airindiacrash": "Aug 7 2020",
      "farmerprotest": "Aug 9 2020",
      "vijaywadafire": "Aug 9 2020",
      "BLRiot": "Aug 11 2020",
      "Exam Satyagraha": "Aug 23 2020",
      "GDP Contraction": "Aug 31 2020",
      "chineseapps_inc_pubg": "Sep 3 2020",
      "farmbill": "Sep 14 2020",
      "bharatband": "Sep 25 2020",
      "babriaquit": "Sep 30 2020",
      "journalistarrest": "Oct 5 2020",
      "IPL": "Sep 19 2020",
      "farmbillrajyasabha": "Sep 20 2020",
      "modibday": "Sep 17 2020",
      "hathrasvictimdies": "Sep 29 2020",
      "Bihar Manifesto": "Oct 22 2020",
      "Diwali": "Nov 14 2020",
      "lovejihadyogi": "Oct 31 2020",
      "Love Jihad Law": "Nov 28 2020",
      "Bengal": "Dec 20 2020",
  }

  def _normalize(name):
    # Remove all whitespace and lower-case, so spaced and unspaced spellings match.
    return "".join(str(name).split()).lower()

  by_normalized_name = {_normalize(name): date for name, date in event_dates.items()}
  key = _normalize(EVENT)
  if key not in by_normalized_name:
    raise ValueError(f"Unknown event {EVENT!r}; expected one of {sorted(event_dates)}")

  event_day = datetime.strptime(by_normalized_name[key], '%b %d %Y').date()
  return pd.to_datetime(event_day).normalize()

In [None]:
def get_event_string(EVENT):
  """Return the regular expression used to find tweets about a candidate event.

  The patterns are written in lower case; the caller is expected to match
  them against lower-cased tweet text.

  The lookup ignores whitespace and letter case, so "Janata Curfew",
  "JanataCurfew" and "janatacurfew" all resolve to the same pattern; both
  "Migrant Deaths" and "Migrants Deaths" are accepted. Every key the original
  if/elif chain recognised still resolves to the same pattern.

  Args:
    EVENT: event name.

  Returns:
    The regex pattern as a ``str``.

  Raises:
    ValueError: if ``EVENT`` has no pattern. The previous if/elif chain fell
      through and failed with an UnboundLocalError instead.
  """
  farm_law = r"((?=.*ordinance)|(?=.*act)|(?=.*bill))((?=.*farm)|(?=.*agro)|(?=.*agricultur))|(krishi)|(kis[a]+n)"
  farm_protest = r"((?=.*ordinance)|(?=.*act)|(?=.*bill)|(?=.*protest))((?=.*farm)|(?=.*agro)|(?=.*agricultur))|(krishi)|(kis[a]+n)"
  love_jihad = r"(lovejiha)|(love jiha)|(love-jiha)"
  migrant = r"(\bmigrant)"

  patterns = {
      "delhiriots": r'(riot)|(communal)|(\bcaa\b)|(nrc)|(npr)|(shaheen)|(jaffraba)|(jafraba)|(delhipolice)|(delhi police)|(?=.*protest)(?=.*delhi)',
      "maoistattack": r'(maoist)|(sukma)|(chhattisgarh)|(chattisgarh)|(chateesgarh)|(martyr)',
      "Janata Curfew": r'(janata)|(curfew)|(janta)|(junta)',
      "Tablighi": r'(tabliqi)|(tablighi)|(jamat)|(jamaat)|(coronajihad)',
      "Palghar Mob Lynching": r'(palghar)|( mob )|(#mob )|(lynching)|(lynched)',
      "Ramadan": r"(ramdan)|(ramzan)|(ramadan)|(mubarak)",
      "Migrant Deaths": migrant,
      "Migrants Deaths": migrant,  # alias: spelling used by the event-date lookup
      "amphan": r"(amphan)|(cyclone)",
      "idulfitr": r'(ul-fitr)|(al-fitr)|(\beid\b)|(#eid)|(idul)|(fitr)|(mubarak)',
      "idulzuha": r'((?=.*ul)|(?=.*al))((?=.*zuha)|(?=.*adha))|(bakr)|(\beid\b)|(#eid)|(mubarak)|((?=.*slaughter)|(?=.*kill)|(?=.*cruel)|(?=.*sacrific))((?=.*animal)|(?=.*goat)|(?=.*lamb))',
      "nisarg": r"(cyclone)|(nisarg)",
      "vizhaggas": r"((?=.*vizhag)|(?=.*vizag)|(?=.*vishakhap)|(?=.*visakhapa)|(?=.*gas))(?=.*leak)|(?=.*leak)((?=.*lgpolymer)|(?=.*lg polymer))",
      "baghjan": r"(baghjan)|(leak)|(oil( )?india)",
      "3agrordinance": farm_law,
      "Sushant Singh case": r"(sushant singh)|(sushantsingh)|(sushantsinghrajput)",
      "indiaunsc": r"(?=.*india)(?=.*unsc)|(indiainunsc)|(unsc)|(?=.*member)(?=.* un )|(?=.*un)(?=.*security council)",
      "banchineseapp": r"(?=.*chinese)(?=.*ban)|(chinese app)|(chinese aap)|(tiktok)|(tik-tok)|(digitalairstrike)",
      "chineseapps_inc_pubg": r"(?=.*chinese)(?=.*ban)|(?=.*app)(?=.*ban)|(pubg)|(chinese app)|(chinese aap)|(digitalairstrike)",
      "vikasdubey": r"(dubey)",
      "kargilearthquake": r"(kargil)|(earthquake)",
      "assamflood": r"(flood)|(brahmaputra)|(assam)",
      "nep": r"(education policy)|(nep2020)|(\bnep\b)",
      "airindiacrash": r"(airindia)|(crash)",
      "vijaywadafire": r"(vijaywada)|(vijayawada)|(swarnahotel)|(swarna hotel)|((?=.*care facility)(?=.*fire))",
      "China Border Skirmish": r"((?=.*border)|(?=.*skirmish)|(?=.*melee))(?=.*china)|((gal)[vw](an))|((martyr)|(soldier)|(troops))|(gogra)|(chinese aggression)|(boycott( )?china)",
      "Coronil Launch": r"(ayurved)|(ayush)|(ramdev)|(coronil)|(patanjali)",
      "Rail Suspension": r"(railway)|(railband)",
      "pulwama": r"(pulwama)",
      "ramtemple": r"((ram )(temple|mandir))|(ram(temple|mandir))|(ayodhya)|(babri)|(jai(shri|sri|shree)ram)|(jai (shri|shree|sri) ram)|(babar)|(?=.*demolish)(?=.*masjid)",
      "farmerprotest": farm_protest,
      "BLRiot": r"(prophet)|(?=.*riot)(?=.*b(e|a)ng[a]?l((ore)|(uru)))",
      "Exam Satyagraha": r"(exam)|(student)",
      "GDP Contraction": r"(gdp)|(economy)|(unemployment)",
      "farmbill": farm_protest,
      "bharatband": r"(bharat band)|(bharatband)|((?=.*ordinance)|(?=.*act)|(?=.*bill)|(?=.*protest))((?=.*farm)|(?=.*agro)|(?=.*agricultur))|(krishi)|(kis[a]+n)",
      "babriaquit": r"(babri)",
      "hathrasvictimdies": r"(manishavalmiki)|(manisha valmiki)|(hathras)|(dalit)|(rape)",
      "journalistarrest": r"(kappan)|(siddique)|(hathras)|(dalit)|(rape)|(manishavalmiki)|(manisha valmiki)",
      "IPL": r"(cricket)|(ipl)",
      "farmbillrajyasabha": farm_law,
      "modibday": r"((?=.*unemployment)|(?=.*b.?day)|(?=.*birthday))(?=.*modi)",
      "Bihar Manifesto": r"(?=.*bihar)((?=.*election)|(?=.*manifesto)|(?=.*vote)|(?=.*vaccine)|(?=.*bjp)|(?=.*modi))|((?=.*vote)|(?=.*bjp)|(?=.*modi))(?=.*vaccine)",
      "Diwali": r"(diwali)|(deepavali)|(deepawali)",
      "lovejihadyogi": love_jihad,
      "Love Jihad Law": love_jihad,
      "Bengal": r"(bengal)|(rally)|(rallies)|(election)",
  }

  def _normalize(name):
    # Remove all whitespace and lower-case, so spaced and unspaced spellings match.
    return "".join(str(name).split()).lower()

  by_normalized_name = {_normalize(name): pattern for name, pattern in patterns.items()}
  key = _normalize(EVENT)
  if key not in by_normalized_name:
    raise ValueError(f"No search pattern defined for event {EVENT!r}; expected one of {sorted(patterns)}")
  return by_normalized_name[key]


In [None]:
# Candidate events screened in the event-selection loop.
# Multi-word names are written here without spaces (e.g. "JanataCurfew").
EVENTS = [
    "delhiriots",
    "maoistattack",
    "JanataCurfew",
    "Tablighi",
    "PalgharMobLynching",
    "Ramadan",
    "MigrantDeaths",
    "amphan",
    "idulfitr",
    "idulzuha",
    "nisarg",
    "vizhaggas",
    "baghjan",
    "3agrordinance",
    "SushantSinghcase",
    "indiaunsc",
    "banchineseapp",
    "chineseapps_inc_pubg",
    "vikasdubey",
    "kargilearthquake",
    "assamflood",
    "nep",
    "airindiacrash",
    "vijaywadafire",
    "ChinaBorderSkirmish",
    "CoronilLaunch",
    "RailSuspension",
    "pulwama",
    "ramtemple",
    "farmerprotest",
    "BLRiot",
    "ExamSatyagraha",
    "GDPContraction",
    "farmbill",
    "bharatband",
    "babriaquit",
    "hathrasvictimdies",
    "journalistarrest",
    "IPL",
    "farmbillrajyasabha",
    "modibday",
    "BiharManifesto",
    "Diwali",
    "lovejihadyogi",
    "LoveJihadLaw",
    "Bengal",
]

In [None]:
# Tweet text joined with tweet-level data. Columns duplicated by a join get the
# '_del' suffix and are dropped right afterwards.
df = pd.read_csv(f"{data_path}tweet_text.csv")
df = df.merge(pd.read_csv(f"{data_path}tweet_level_data.csv"), on ="id", how="inner",
                      suffixes=('', '_del'))
# Fixed: the column filter and the date parsing used to read from the unrelated
# `tweets` frame left over from an earlier cell instead of from `df` itself.
df = df[[col for col in df.columns if  not col.endswith("_del")]]
df["created_at"] = pd.to_datetime(df['created_at'], errors='coerce')

# Keep only the users listed in the cleaned user-id file and attach its columns.
userids = pd.read_csv(f"{data_path}final_clean_userid.csv")
df = df[df.user_id.isin(userids.user_id)].merge(userids, on = "user_id", how = "inner", suffixes=["", "_del"])
df = df[[col for col in df.columns if  not col.endswith("_del")]]

# Fixed: the frame used to be left-merged with itself on (user_id, created_at).
# That added no columns (all of them were dropped as '_del') but repeated every
# row once per other row sharing the same key, inflating the per-event counts.
df = df.reset_index(drop=True)

# The event patterns are lower case, so the text is lower-cased before matching.
df["text"] = df["text"].str.lower()

In [None]:
# For every candidate event, count the tweets that match its pattern within the
# 7 days starting on the event date, in total and per group.
for EVENT in EVENTS:
  print(EVENT)
  eventdate = geteventDate(EVENT)
  eventstring = get_event_string(EVENT)

  on_or_after_event = df.created_at >= eventdate
  before_window_end = df.created_at < eventdate + timedelta(7)
  mentions_event = df.text.str.contains(eventstring)
  d = df[on_or_after_event & before_window_end & mentions_event]

  output = f"\n{EVENT} {eventdate} {eventstring} All:{len(d)} Hindu tweets on event day: {sum(d.muslim==0)} muslim tweets on event day: {sum(d.muslim==1)} "
  print(output)