In [3]:
import pandas as pd
import numpy as np
import json, os, glob, re

In [4]:
# Load the raw protest table; later cells read its Location, Tags, Date and Attendees columns.
all_protests = pd.read_csv(filepath_or_buffer='data/raw/blm/protests.csv')

In [5]:
def extract_state(location):
    """Return the upper-cased two-letter state code that ends a location string.

    The location is split on ``', '`` and the last piece is taken as the
    state, e.g. ``"Austin, TX"`` gives ``"TX"``.

    Args:
        location: Location text. Non-string values (such as the float NaN
            pandas uses for an empty cell) are accepted.

    Returns:
        The last piece upper-cased when it is exactly two characters long;
        ``None`` otherwise, including for non-string input.
    """
    # An empty cell arrives as float NaN, which has no .split(); treat it as unknown.
    if not isinstance(location, str):
        return None
    # str.split always yields at least one piece, so only the last piece's length matters.
    last_piece = location.split(', ')[-1]
    if len(last_piece) != 2:
        return None
    return last_piece.upper()
# One state code (or None) per row, derived from the free-text Location column.
all_protests['state'] = list(map(extract_state, all_protests['Location']))

def extract_city(location):
    """Return the city part of a location string that ends in a state code.

    The location is split on ``', '``; the city is the piece just before the
    final two-character piece, e.g. ``"Austin, TX"`` gives ``"Austin"``.

    Args:
        location: Location text. Non-string values (such as the float NaN
            pandas uses for an empty cell) are accepted.

    Returns:
        The second-to-last piece when there are at least two pieces and the
        last one is exactly two characters long; ``None`` otherwise,
        including for non-string input.
    """
    # An empty cell arrives as float NaN, which has no .split(); treat it as unknown.
    if not isinstance(location, str):
        return None
    pieces = location.split(', ')
    # A city is only trusted when it is followed by something shaped like a state code.
    if len(pieces) < 2 or len(pieces[-1]) != 2:
        return None
    return pieces[-2]
# One city name (or None) per row, derived from the free-text Location column.
all_protests['city'] = list(map(extract_city, all_protests['Location']))

In [6]:
# Lower-cased tag values that mark a protest as "pro-victim" for this analysis.
pro_victim = set([
    'for racial justice',
    'for criminal justice',
    'for criminal justice reform',
    'police',
])

def tag_split(tags):
    """Split a ``'; '``-separated tag string into a set of lower-cased tags.

    Args:
        tags: Tag text such as ``"For racial justice; Police"``. Non-string
            values (such as the float NaN pandas uses for an empty cell) are
            accepted.

    Returns:
        A set with one lower-cased entry per tag; an empty set for
        non-string input.
    """
    # An empty cell arrives as float NaN, which has no .split(); it carries no tags.
    if not isinstance(tags, str):
        return set()
    return {tag.lower() for tag in tags.split('; ')}

# Keep the protests that carry at least one pro-victim tag; .copy() detaches the result from all_protests.
has_pro_victim_tag = [not tag_split(tags).isdisjoint(pro_victim) for tags in all_protests['Tags'].values]
pro_vic_protests = all_protests[has_pro_victim_tag].copy()

In [8]:
# Attendance per protest with missing counts imputed: first from the mean of the
# protests in the same city, then from the mean of the protests in the same state.
# NOTE(review): `att` is only built here; nothing visible in this notebook writes it
# back to pro_vic_protests, so later cells still use the raw 'Attendees' column.
# Confirm whether the imputation was meant to be applied.
# NOTE(review): cities are matched by name alone, so same-named cities in different
# states share one mean.

# Compute every group's mean once instead of re-filtering the whole frame per row.
# groupby skips missing keys, so a row without a city/state simply finds no mean.
city_mean = {city: counts.mean() for city, counts in pro_vic_protests.groupby('city')['Attendees']}
state_mean = {state: counts.mean() for state, counts in pro_vic_protests.groupby('state')['Attendees']}

att = []
for city, state, attendees in zip(pro_vic_protests['city'],
                                  pro_vic_protests['state'],
                                  pro_vic_protests['Attendees']):
    if not np.isnan(attendees):
        att.append(attendees)
        continue
    estimate = city_mean.get(city, np.nan)
    if np.isnan(estimate):
        estimate = state_mean.get(state, np.nan)
    if np.isnan(estimate):
        # No attendance figure anywhere in this city or state: report it and keep the gap.
        print(city, state)
    att.append(estimate)

Hagåtña GU
San Juan PR


In [9]:
def to_timeseries(df, method=None):
    """Re-index a date-indexed frame onto a gap-free daily index.

    Args:
        df: DataFrame whose index holds dates in any form ``pd.to_datetime``
            accepts. It is not modified.
        method: Fill method forwarded to ``DataFrame.reindex`` (for example
            ``'ffill'``). With ``None`` the inserted days start out empty.

    Returns:
        A new DataFrame with one row per day from the earliest to the latest
        date in ``df``; values still missing after re-indexing are set to 0.
    """
    # Work on a copy: assigning to df.index would rewrite the caller's frame.
    daily = df.copy()
    daily.index = pd.to_datetime(daily.index)
    every_day = pd.period_range(daily.index.min(), daily.index.max()).to_timestamp()
    return daily.reindex(every_day, method=method).fillna(value=0)

In [10]:
# Total attendance per calendar day.
# numeric_only=True keeps text columns (Location, Tags, ...) out of the sum; recent
# pandas versions no longer drop them silently and would concatenate them or raise.
protests = pro_vic_protests.groupby('Date').sum(numeric_only=True)
# Days without any protest are inserted with a total of 0.
protests = to_timeseries(protests)
# log(1 + x) keeps zero-attendance days finite; log1p evaluates it directly.
protests['LogAttendees'] = np.log1p(protests['Attendees'])

In [11]:
# The 'found.*' columns that are turned into 0/1 indicators below.
frame_columns = [
    'found.mental_illness', 'found.criminal_record', 'found.fleeing',
    'found.video', 'found.age', 'found.gender', 'found.unarmed', 'found.armed',
    'found.race', 'found.official_report', 'found.interview', 'found.attack',
    'found.systemic', 'found.legal_language',
]

shootings_binary = pd.read_csv('data/prepared/shootings/shooting_frames.csv')
shootings_binary['date'] = pd.to_datetime(shootings_binary['date'])

# 1 where the stored value compares below +inf, 0 otherwise (inf and NaN both compare False).
# NOTE(review): presumably inf/NaN encode "frame not found" in the prepared file; confirm.
is_finite_value = np.less(shootings_binary[frame_columns].to_numpy(), np.inf)
shootings_binary[frame_columns] = is_finite_value.astype(int)

In [12]:
# Per-date mean of the 0/1 frame indicators, i.e. the share of that date's rows
# in which each frame was found.
# The columns are selected before aggregating so that mean() never touches the
# non-numeric columns, which recent pandas versions refuse to average.
# NOTE(review): a later cell rebuilds dated_binary from shootings_binary (with a
# rolling sum) before it is analysed, so this version is superseded.
dated_binary = shootings_binary.groupby('date')[[
    'found.mental_illness', 'found.criminal_record', 'found.fleeing',
    'found.video', 'found.age', 'found.gender', 'found.unarmed', 'found.armed',
    'found.race', 'found.official_report', 'found.interview', 'found.attack',
    'found.systemic', 'found.legal_language',
]].mean()

In [13]:
# Expand to one row per calendar day; dates without any row become all-zero.
dated_binary = to_timeseries(df=dated_binary)

In [14]:
# Per-date mean of the 0/1 frame indicators, summed over a trailing window of 15 rows.
# The columns are selected before aggregating so that mean() never touches the
# non-numeric columns, which recent pandas versions refuse to average.
# NOTE(review): the window is applied before the daily re-index, so it spans the
# 15 most recent dates present in the data rather than 15 calendar days, and any
# date inserted afterwards by to_timeseries gets 0 instead of a carried-over sum.
# Confirm this is the intended smoothing.
dated_binary = (
    shootings_binary.groupby('date')[[
        'found.mental_illness', 'found.criminal_record', 'found.fleeing',
        'found.video', 'found.age', 'found.gender', 'found.unarmed', 'found.armed',
        'found.race', 'found.official_report', 'found.interview', 'found.attack',
        'found.systemic', 'found.legal_language',
    ]]
    .mean()
    .rolling(15, min_periods=1)
    .sum()
)
dated_binary = to_timeseries(dated_binary)

In [15]:
# Trim both daily series to the date window they share so they line up row for row.
# Clipping only the start of one and the end of the other leaves them misaligned
# whenever protests start earlier or the shooting data ends later.
window_start = max(dated_binary.index.min(), protests.index.min())
window_end = min(dated_binary.index.max(), protests.index.max())
dated_binary = dated_binary[(dated_binary.index >= window_start) & (dated_binary.index <= window_end)]
protests = protests[(protests.index >= window_start) & (protests.index <= window_end)]

In [16]:
# Sanity check: both daily series should have the same number of rows after trimming.
dated_binary.shape[0], protests.shape[0]

(1328, 1328)

In [17]:
# Last day covered by the frame series.
dated_binary.index.max()

Timestamp('2020-09-04 00:00:00', freq='D')

In [18]:
from statsmodels.tsa.stattools import grangercausalitytests
from scipy.stats import pearsonr
from statsmodels.tsa.stattools import adfuller

In [19]:
# Augmented Dickey-Fuller test on daily attendance (null hypothesis: the series has a unit root).
# In the recorded run the statistic (-8.84) is below the 1% critical value (-3.44),
# so the null is rejected and Attendees is treated as stationary.
adfuller(x=protests['Attendees'])

(-8.842570042134522,
 1.653243575182644e-14,
 8,
 1319,
 {'1%': -3.4353174541055567,
  '5%': -2.863733732389869,
  '10%': -2.5679379527245407},
 28988.890762563275)

In [20]:
# Augmented Dickey-Fuller test on found.race (null hypothesis: the series has a unit root).
# In the recorded run the statistic (-6.58) is below the 1% critical value (-3.44),
# so the null is rejected and found.race is treated as stationary.
adfuller(x=dated_binary['found.race'])

(-6.580959525265646,
 7.516533482886827e-09,
 14,
 1313,
 {'1%': -3.435340198430096,
  '5%': -2.863743768394356,
  '10%': -2.567943297315671},
 1755.5661801695403)

In [21]:
# Augmented Dickey-Fuller test on found.unarmed (null hypothesis: the series has a unit root).
# In the recorded run the statistic (-5.35) is below the 1% critical value (-3.44),
# so the null is rejected and found.unarmed is treated as stationary.
adfuller(x=dated_binary['found.unarmed'])

(-5.349317151185906,
 4.324387032896597e-06,
 18,
 1309,
 {'1%': -3.4353554773774553,
  '5%': -2.86375051023515,
  '10%': -2.5679468876340037},
 -750.1322640298381)

In [22]:
# Augmented Dickey-Fuller test on found.systemic (null hypothesis: the series has a unit root).
# In the recorded run the statistic (-4.51) is below the 1% critical value (-3.44),
# so the null is rejected and found.systemic is treated as stationary.
adfuller(x=dated_binary['found.systemic'])

(-4.5050907216164084,
 0.0001925167113871787,
 7,
 1320,
 {'1%': -3.4353136835264664,
  '5%': -2.863732068602694,
  '10%': -2.5679370666896233},
 2136.768293020318)

In [24]:
# Correlation and Granger test between the found.race series and protest attendance.
# grangercausalitytests asks whether the SECOND column helps predict the FIRST, so
# with columns [found.race, Attendees] the tested direction is Attendees -> found.race.
consider = dated_binary[['found.race']].join(protests[['Attendees']])
print(pearsonr(consider['Attendees'].to_numpy(), consider['found.race'].to_numpy()))
grangercausalitytests(consider, maxlag=2, addconst=True, verbose=True)

(0.1137703823424304, 3.244562749237413e-05)

Granger Causality
number of lags (no zero) 1
ssr based F test:         F=7.7521  , p=0.0054  , df_denom=1324, df_num=1
ssr based chi2 test:   chi2=7.7696  , p=0.0053  , df=1
likelihood ratio test: chi2=7.7470  , p=0.0054  , df=1
parameter F test:         F=7.7521  , p=0.0054  , df_denom=1324, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=3.7594  , p=0.0235  , df_denom=1321, df_num=2
ssr based chi2 test:   chi2=7.5472  , p=0.0230  , df=2
likelihood ratio test: chi2=7.5258  , p=0.0232  , df=2
parameter F test:         F=3.7594  , p=0.0235  , df_denom=1321, df_num=2


{1: ({'ssr_ftest': (7.75207974443201, 0.005441442804637791, 1324.0, 1),
   'ssr_chi2test': (7.769644879804591, 0.00531314934981986, 1),
   'lrtest': (7.746987466220844, 0.0053802215798856535, 1),
   'params_ftest': (7.752079744432018, 0.005441442804637791, 1324.0, 1.0)},
  [<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f72920fa750>,
   <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f72920cff50>,
   array([[0., 1., 0.]])]),
 2: ({'ssr_ftest': (3.7593666684525084, 0.02354813862068362, 1321.0, 2),
   'ssr_chi2test': (7.547191827960677, 0.02296931891056697, 2),
   'lrtest': (7.525794810698017, 0.023216375579675902, 2),
   'params_ftest': (3.7593666684050135, 0.023548138621792697, 1321.0, 2.0)},
  [<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f7292101090>,
   <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f7292101b50>,
   array([[0., 0., 1., 0., 0.],
          [0., 0., 0., 1., 0.]])])}

In [25]:
# Correlation and Granger test between the found.unarmed series and protest attendance.
# grangercausalitytests asks whether the SECOND column helps predict the FIRST, so
# with columns [found.unarmed, Attendees] the tested direction is Attendees -> found.unarmed.
# NOTE(review): the same code appears again in a later cell with a different recorded
# output; decide which run is authoritative.
consider = dated_binary[['found.unarmed']].join(protests[['Attendees']])
print(pearsonr(consider['Attendees'].to_numpy(), consider['found.unarmed'].to_numpy()))
grangercausalitytests(consider, maxlag=2, addconst=True, verbose=True)

(0.011131870554487121, 0.6852605175304168)

Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.6396  , p=0.4240  , df_denom=1324, df_num=1
ssr based chi2 test:   chi2=0.6410  , p=0.4233  , df=1
likelihood ratio test: chi2=0.6409  , p=0.4234  , df=1
parameter F test:         F=0.6396  , p=0.4240  , df_denom=1324, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.9531  , p=0.3858  , df_denom=1321, df_num=2
ssr based chi2 test:   chi2=1.9133  , p=0.3842  , df=2
likelihood ratio test: chi2=1.9119  , p=0.3844  , df=2
parameter F test:         F=0.9531  , p=0.3858  , df_denom=1321, df_num=2


{1: ({'ssr_ftest': (0.6395657833142451, 0.4240115316851255, 1324.0, 1),
   'ssr_chi2test': (0.6410149504969813, 0.4233435077223783, 1),
   'lrtest': (0.6408601773664486, 0.4233994860536281, 1),
   'params_ftest': (0.6395657833146746, 0.4240115316849655, 1324.0, 1.0)},
  [<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f72920faf50>,
   <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f72920fabd0>,
   array([[0., 1., 0.]])]),
 2: ({'ssr_ftest': (0.9530533349257138, 0.38582692055829704, 1321.0, 2),
   'ssr_chi2test': (1.9133213052407214, 0.3841736355743226, 2),
   'lrtest': (1.9119422401154225, 0.38443862715542515, 2),
   'params_ftest': (0.9530533350723729, 0.3858269205018, 1321.0, 2.0)},
  [<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f72920e27d0>,
   <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f734da26d90>,
   array([[0., 0., 1., 0., 0.],
          [0., 0., 0., 1., 0.]])])}

In [42]:
# Correlation and Granger test between the found.unarmed series and protest attendance.
# grangercausalitytests asks whether the SECOND column helps predict the FIRST, so
# with columns [found.unarmed, Attendees] the tested direction is Attendees -> found.unarmed.
# NOTE(review): this repeats an earlier cell's code, yet the recorded outputs differ
# (Pearson r 0.011 there, 0.073 here). The inputs presumably differed between the
# two runs; re-run from a fresh kernel and keep a single copy.
consider = dated_binary[['found.unarmed']].join(protests[['Attendees']])
print(pearsonr(consider['Attendees'].to_numpy(), consider['found.unarmed'].to_numpy()))
grangercausalitytests(consider, maxlag=2, addconst=True, verbose=True)

(0.07344779192739316, 0.00741380770882648)

Granger Causality
number of lags (no zero) 1
ssr based F test:         F=1.9334  , p=0.1646  , df_denom=1324, df_num=1
ssr based chi2 test:   chi2=1.9378  , p=0.1639  , df=1
likelihood ratio test: chi2=1.9364  , p=0.1641  , df=1
parameter F test:         F=1.9334  , p=0.1646  , df_denom=1324, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=6.0475  , p=0.0024  , df_denom=1321, df_num=2
ssr based chi2 test:   chi2=12.1407 , p=0.0023  , df=2
likelihood ratio test: chi2=12.0855 , p=0.0024  , df=2
parameter F test:         F=6.0475  , p=0.0024  , df_denom=1321, df_num=2


{1: ({'ssr_ftest': (1.9334462626153535, 0.1646155749416249, 1324.0, 1),
   'ssr_chi2test': (1.9378271831499805, 0.16390464948639444, 1),
   'lrtest': (1.9364136479625813, 0.16405846426071224, 1),
   'params_ftest': (1.9334462626151125, 0.16461557494172074, 1324.0, 1.0)},
  [<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f7590b2b750>,
   <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f7591774750>,
   array([[0., 1., 0.]])]),
 2: ({'ssr_ftest': (6.047458725224418, 0.002429811354468975, 1321.0, 2),
   'ssr_chi2test': (12.140696850336985, 0.002310368079958846, 2),
   'lrtest': (12.085454408153964, 0.002375072764983717, 2),
   'params_ftest': (6.047458725384525, 0.002429811354083257, 1321.0, 2.0)},
  [<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f75909a2190>,
   <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f75909a2050>,
   array([[0., 0., 1., 0., 0.],
          [0., 0., 0., 1., 0.]])])}

In [45]:
# Correlation and Granger test between protest attendance and the found.systemic series.
# grangercausalitytests asks whether the SECOND column helps predict the FIRST, so
# with columns [Attendees, found.systemic] the tested direction is found.systemic -> Attendees.
# NOTE(review): this cell's execution count (45) is higher than the next cell's (44),
# so the notebook was not run top to bottom.
consider = protests[['Attendees']].join(dated_binary[['found.systemic']])
print(pearsonr(consider['Attendees'].to_numpy(), consider['found.systemic'].to_numpy()))
grangercausalitytests(consider, maxlag=2, addconst=True, verbose=True)

(0.08819928617398119, 0.001293671168171001)

Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.1308  , p=0.7177  , df_denom=1324, df_num=1
ssr based chi2 test:   chi2=0.1311  , p=0.7173  , df=1
likelihood ratio test: chi2=0.1311  , p=0.7173  , df=1
parameter F test:         F=0.1308  , p=0.7177  , df_denom=1324, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=2.6333  , p=0.0722  , df_denom=1321, df_num=2
ssr based chi2 test:   chi2=5.2866  , p=0.0711  , df=2
likelihood ratio test: chi2=5.2761  , p=0.0715  , df=2
parameter F test:         F=2.6333  , p=0.0722  , df_denom=1321, df_num=2


{1: ({'ssr_ftest': (0.13077155377489968, 0.7176911038646188, 1324.0, 1),
   'ssr_chi2test': (0.13106786394206335, 0.7173274003807627, 1),
   'lrtest': (0.13106139157389407, 0.7173340802845576, 1),
   'params_ftest': (0.13077155377535343, 0.7176911038641633, 1324.0, 1.0)},
  [<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f75909a2390>,
   <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f75909a2410>,
   array([[0., 1., 0.]])]),
 2: ({'ssr_ftest': (2.63331580759582, 0.07221695709376003, 1321.0, 2),
   'ssr_chi2test': (5.286565875657923, 0.07112737907905102, 2),
   'lrtest': (5.276055424026708, 0.07150215341714806, 2),
   'params_ftest': (2.633315807595703, 0.07221695709376832, 1321.0, 2.0)},
  [<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f759092a750>,
   <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f759092a4d0>,
   array([[0., 0., 1., 0., 0.],
          [0., 0., 0., 1., 0.]])])}

In [44]:
# Correlation and Granger test between the found.systemic series and protest attendance.
# grangercausalitytests asks whether the SECOND column helps predict the FIRST, so
# with columns [found.systemic, Attendees] the tested direction is Attendees -> found.systemic.
consider = dated_binary[['found.systemic']].join(protests[['Attendees']])
print(pearsonr(consider['Attendees'].to_numpy(), consider['found.systemic'].to_numpy()))
grangercausalitytests(consider, maxlag=2, addconst=True, verbose=True)

(0.08819928617398119, 0.001293671168171001)

Granger Causality
number of lags (no zero) 1
ssr based F test:         F=3.0666  , p=0.0801  , df_denom=1324, df_num=1
ssr based chi2 test:   chi2=3.0736  , p=0.0796  , df=1
likelihood ratio test: chi2=3.0700  , p=0.0797  , df=1
parameter F test:         F=3.0666  , p=0.0801  , df_denom=1324, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=2.8196  , p=0.0600  , df_denom=1321, df_num=2
ssr based chi2 test:   chi2=5.6605  , p=0.0590  , df=2
likelihood ratio test: chi2=5.6485  , p=0.0594  , df=2
parameter F test:         F=2.8196  , p=0.0600  , df_denom=1321, df_num=2


{1: ({'ssr_ftest': (3.066624720296011, 0.0801467929582978, 1324.0, 1),
   'ssr_chi2test': (3.0735732657347485, 0.07957438634762673, 1),
   'lrtest': (3.0700192752201474, 0.07974852619256517, 1),
   'params_ftest': (3.0666247202962564, 0.08014679295827701, 1324.0, 1.0)},
  [<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f75909750d0>,
   <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f75909a2990>,
   array([[0., 1., 0.]])]),
 2: ({'ssr_ftest': (2.819601027297349, 0.0599886549587093, 1321.0, 2),
   'ssr_chi2test': (5.66054649840467, 0.05899673065715559, 2),
   'lrtest': (5.648498651846239, 0.059353195015394516, 2),
   'params_ftest': (2.8196010278090626, 0.05998865492814545, 1321.0, 2.0)},
  [<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f7590938610>,
   <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f7590938e50>,
   array([[0., 0., 1., 0., 0.],
          [0., 0., 0., 1., 0.]])])}

In [34]:
# Reload the shooting frame table for a manual export.
# NOTE(review): this path starts with 'data-prepared/' while the earlier load of the
# same file name uses 'data/prepared/'; confirm which directory is current.
j = pd.read_csv(filepath_or_buffer='data-prepared/shootings/shooting_frames.csv')

In [39]:
# Write the rows whose 'leaning' is 0 or 2 to TEMP.csv, shuffled with a fixed seed
# so the order is reproducible, keeping only the columns listed below.
leaning_is_0_or_2 = (j['leaning'] == 0) | (j['leaning'] == 2)
export_columns = ['id', 'page_num', 'url', 'leaning']
j.loc[leaning_is_0_or_2, export_columns].sample(frac=1, random_state=0).to_csv('TEMP.csv', index=False)