In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Load the raw Goldstein-scale event table from the working directory.
raw_events_path = 'goldstein_index_1990_to_2022_raw.csv'
df = pd.read_csv(raw_events_path)

In [3]:
# Preview the raw events table (the rendered output abbreviates the middle rows).
df

Unnamed: 0,SQLDATE,Actor1Code,Actor2Code,GoldsteinScale,ActionGeo_Lat,ActionGeo_Long,year,pg_lat,pg_long
0,1990-01-01,BUS,COP,-4.4,51.4500,5.46667,1990,51.25,5.25
1,1990-01-01,BUS,JUD,3.0,42.2373,-71.53140,1990,42.25,-71.75
2,1990-01-01,BUS,MIL,-9.0,0.0000,0.00000,1990,0.25,0.25
3,1990-01-01,BUS,REB,6.0,0.0000,0.00000,1990,0.25,0.25
4,1990-01-01,BUS,REB,7.4,0.0000,0.00000,1990,0.25,0.25
...,...,...,...,...,...,...,...,...,...
91973757,2022-09-30,MIL,CVL,1.0,38.8990,-77.04220,2022,38.75,-77.25
91973758,2022-09-30,MIL,CVL,1.0,56.0000,10.00000,2022,56.25,10.25
91973759,2022-09-30,OPP,EDU,7.0,32.0000,53.00000,2022,32.25,53.25
91973760,2022-09-30,OPP,GOV,-2.0,25.3833,83.01670,2022,25.25,83.25


In [4]:
# Restrict the events to the actor pairings of interest. A row is kept only
# when BOTH conditions hold:
#   - Actor1Code is one of LAB, CVL, OPP, INS, REB, DEV, ELI, ENV, HRI, AGR
#   - Actor2Code is one of BUS, MNC, GOV, JUD, LEG, IGO, ELI
actor1_keep = ['LAB', 'CVL', 'OPP', 'INS', 'REB', 'DEV', 'ELI', 'ENV', 'HRI', 'AGR']
actor2_keep = ['BUS', 'MNC', 'GOV', 'JUD', 'LEG', 'IGO', 'ELI']

# One combined boolean mask selects the same rows (in the same order) as
# applying the two filters one after the other.
keep_rows = df['Actor1Code'].isin(actor1_keep) & df['Actor2Code'].isin(actor2_keep)
df = df[keep_rows]

In [5]:
# Inspect the events that remain after the Actor1Code / Actor2Code filters.
df

Unnamed: 0,SQLDATE,Actor1Code,Actor2Code,GoldsteinScale,ActionGeo_Lat,ActionGeo_Long,year,pg_lat,pg_long
98,1990-01-02,CVL,GOV,7.0,31.7667,35.2333,1990,31.75,35.25
99,1990-01-02,CVL,GOV,7.0,31.6667,35.2500,1990,31.75,35.25
140,1990-01-02,LAB,GOV,0.0,33.8667,35.5000,1990,33.75,35.25
176,1990-01-02,OPP,GOV,1.9,42.8333,12.8333,1990,42.75,12.75
177,1990-01-02,OPP,GOV,-2.0,42.6833,23.3167,1990,42.75,23.25
...,...,...,...,...,...,...,...,...,...
91973730,2022-09-30,OPP,GOV,-6.5,60.0000,100.0000,2022,60.25,100.25
91973731,2022-09-30,OPP,GOV,-5.0,20.0000,77.0000,2022,20.25,77.25
91973732,2022-09-30,OPP,GOV,-10.0,49.0000,32.0000,2022,49.25,32.25
91973756,2022-09-30,LAB,BUS,2.8,27.9659,-82.8001,2022,27.75,-82.75


In [6]:
# Collapse related actor codes into combined groups:
#   Actor1Code: INS, REB       -> INS_REB
#   Actor2Code: BUS, MNC       -> BUS_MNC
#   Actor2Code: GOV, JUD, LEG  -> GOV_JUD_LEG
# Codes that are not listed are left unchanged.
actor1_groups = {'INS': 'INS_REB', 'REB': 'INS_REB'}
actor2_groups = {
    'BUS': 'BUS_MNC', 'MNC': 'BUS_MNC',
    'GOV': 'GOV_JUD_LEG', 'JUD': 'GOV_JUD_LEG', 'LEG': 'GOV_JUD_LEG',
}

# df is the result of boolean filtering, so writing into it with
# df[col] = ... can raise SettingWithCopyWarning. assign() returns a new
# frame with the recoded columns instead of writing into the selection.
df = df.assign(
    Actor1Code=df['Actor1Code'].replace(actor1_groups),
    Actor2Code=df['Actor2Code'].replace(actor2_groups),
)

In [7]:
# Inspect the events after the actor codes have been merged into combined groups.
df

Unnamed: 0,SQLDATE,Actor1Code,Actor2Code,GoldsteinScale,ActionGeo_Lat,ActionGeo_Long,year,pg_lat,pg_long
98,1990-01-02,CVL,GOV_JUD_LEG,7.0,31.7667,35.2333,1990,31.75,35.25
99,1990-01-02,CVL,GOV_JUD_LEG,7.0,31.6667,35.2500,1990,31.75,35.25
140,1990-01-02,LAB,GOV_JUD_LEG,0.0,33.8667,35.5000,1990,33.75,35.25
176,1990-01-02,OPP,GOV_JUD_LEG,1.9,42.8333,12.8333,1990,42.75,12.75
177,1990-01-02,OPP,GOV_JUD_LEG,-2.0,42.6833,23.3167,1990,42.75,23.25
...,...,...,...,...,...,...,...,...,...
91973730,2022-09-30,OPP,GOV_JUD_LEG,-6.5,60.0000,100.0000,2022,60.25,100.25
91973731,2022-09-30,OPP,GOV_JUD_LEG,-5.0,20.0000,77.0000,2022,20.25,77.25
91973732,2022-09-30,OPP,GOV_JUD_LEG,-10.0,49.0000,32.0000,2022,49.25,32.25
91973756,2022-09-30,LAB,BUS_MNC,2.8,27.9659,-82.8001,2022,27.75,-82.75


In [8]:
# Split the events into one frame per Actor2Code group.
actor2_code = df['Actor2Code']

df_BUS_MNC = df[actor2_code.eq('BUS_MNC')]
df_GOV_JUD_LEG = df[actor2_code.eq('GOV_JUD_LEG')]
df_IGO = df[actor2_code.eq('IGO')]
df_ELI = df[actor2_code.eq('ELI')]

In [9]:
def _pivot_mean_goldstein(events):
    """Average GoldsteinScale per (year, pg_lat, pg_long) cell and Actor1Code.

    Parameters
    ----------
    events : pandas.DataFrame
        Event rows with the columns 'year', 'pg_lat', 'pg_long',
        'Actor1Code' and 'GoldsteinScale'.

    Returns
    -------
    pandas.DataFrame
        One row per (year, pg_lat, pg_long) combination present in `events`,
        with those three keys as ordinary columns followed by one
        'mean_<Actor1Code>' column per actor code. A cell/actor combination
        with no events is NaN.
    """
    pivot = events.pivot_table(
        index=['year', 'pg_lat', 'pg_long'],
        columns='Actor1Code',
        values='GoldsteinScale',
        aggfunc='mean',
    )
    # A single aggregation function yields flat, actor-code column labels, so
    # the 'mean_' prefix can be added directly instead of flattening a
    # two-level header and stripping stray underscores afterwards.
    pivot.columns = ['mean_' + code for code in pivot.columns]
    # Turn the (year, pg_lat, pg_long) index back into regular columns.
    return pivot.reset_index()


# One pivot per Actor2Code group; the same recipe is applied to each frame.
df_BUS_MNC_pivot = _pivot_mean_goldstein(df_BUS_MNC)
df_GOV_JUD_LEG_pivot = _pivot_mean_goldstein(df_GOV_JUD_LEG)
df_IGO_pivot = _pivot_mean_goldstein(df_IGO)
df_ELI_pivot = _pivot_mean_goldstein(df_ELI)

In [10]:
# Inspect the BUS_MNC pivot: one row per (year, pg_lat, pg_long) with a mean_<Actor1Code> column per actor code.
df_BUS_MNC_pivot

Unnamed: 0,year,pg_lat,pg_long,mean_AGR,mean_CVL,mean_ELI,mean_ENV,mean_HRI,mean_INS_REB,mean_LAB,mean_OPP
0,1990,-41.25,174.75,,,,,,,1.9,
1,1990,-34.75,-58.75,,,,,,,1.5,
2,1990,-31.75,115.75,,,,,,,,-10.0
3,1990,-27.75,26.75,,-8.000000,,,,,,
4,1990,-26.25,27.75,,,,,,,-5.0,
...,...,...,...,...,...,...,...,...,...,...,...
66068,2022,70.25,-68.75,,,,,,,0.4,
66069,2022,71.75,-98.25,-6.5,,,,,,,
66070,2022,72.25,-40.75,,4.942857,,,,,,
66071,2022,74.75,-91.75,,,,,,,3.4,


In [11]:
def _tag_with_actor2_group(pivot, group):
    """Rename 'mean_<code>' columns to '<code>_<group>'; leave other columns as-is.

    Parameters
    ----------
    pivot : pandas.DataFrame
        Pivot whose value columns are named 'mean_<Actor1Code>'.
    group : str
        Actor2Code group label appended to every value column.

    Returns
    -------
    pandas.DataFrame
        A new frame with renamed value columns. Columns that do not start
        with 'mean_' (year, pg_lat, pg_long) keep their names, so no
        rename-back step is needed and running this cell a second time does
        not append the suffix twice.
    """
    prefix = 'mean_'

    def _retitle(column):
        if column.startswith(prefix):
            return column[len(prefix):] + '_' + group
        return column

    return pivot.rename(columns=_retitle)


# Remove the mean_ prefix and append the Actor2Code group name of each frame.
df_BUS_MNC_pivot = _tag_with_actor2_group(df_BUS_MNC_pivot, 'BUS_MNC')
df_GOV_JUD_LEG_pivot = _tag_with_actor2_group(df_GOV_JUD_LEG_pivot, 'GOV_JUD_LEG')
df_IGO_pivot = _tag_with_actor2_group(df_IGO_pivot, 'IGO')
df_ELI_pivot = _tag_with_actor2_group(df_ELI_pivot, 'ELI')

In [12]:
# Read in the PRIO GRID spine file (columns gid, lon, lat in the output below).
prio_grid = pd.read_csv('../PRIO GRID spine.csv')

# Create date_df with a year column containing 1990 to 2022.
# The constant 'key' column exists only so the two frames can be cross-joined.
date_df = pd.DataFrame({'year': range(1990, 2023), 'key': 1})
prio_grid['key'] = 1

# Every year paired with every grid cell. The helper key is removed by name:
# passing the axis positionally (drop("key", 1)) is rejected by pandas >= 2.0.
base_df = pd.merge(date_df, prio_grid, on='key').drop(columns='key')
base_df

Unnamed: 0,year,gid,lon,lat
0,1990,49182,-69.25,-55.75
1,1990,49183,-68.75,-55.75
2,1990,49184,-68.25,-55.75
3,1990,49185,-67.75,-55.75
4,1990,49186,-67.25,-55.75
...,...,...,...,...
2138989,2022,249340,-70.25,83.25
2138990,2022,249341,-69.75,83.25
2138991,2022,249342,-69.25,83.25
2138992,2022,249343,-68.75,83.25


In [13]:
def _attach_to_grid(pivot):
    """Left-join one pivot onto base_df so every grid cell and year is present.

    Parameters
    ----------
    pivot : pandas.DataFrame
        Pivot keyed by 'year', 'pg_lat' and 'pg_long'.

    Returns
    -------
    pandas.DataFrame
        All rows of base_df plus the pivot's value columns (NaN where the
        pivot has no matching row); the pivot's own coordinate columns are
        dropped after the join.
    """
    # NOTE(review): the join keys are float coordinates, so it relies on
    # pg_lat/pg_long matching the spine's lat/lon values exactly.
    joined = pd.merge(
        base_df,
        pivot,
        left_on=['lat', 'lon', 'year'],
        right_on=['pg_lat', 'pg_long', 'year'],
        how='left',
    )
    # Drop by keyword: the positional axis form drop([...], 1) is rejected by
    # pandas >= 2.0.
    return joined.drop(columns=['pg_lat', 'pg_long'])


# Merge all dataframes with base_df to create one dataframe with all years, latitudes and longitudes
df_BUS_MNC_pivot = _attach_to_grid(df_BUS_MNC_pivot)
df_GOV_JUD_LEG_pivot = _attach_to_grid(df_GOV_JUD_LEG_pivot)
df_IGO_pivot = _attach_to_grid(df_IGO_pivot)
df_ELI_pivot = _attach_to_grid(df_ELI_pivot)

# Merge all dataframes into one, matching rows on the shared grid/year keys.
grid_keys = ['lat', 'lon', 'year', 'gid']
df = df_BUS_MNC_pivot
for group_frame in (df_GOV_JUD_LEG_pivot, df_IGO_pivot, df_ELI_pivot):
    df = pd.merge(df, group_frame, on=grid_keys, how='outer')

In [14]:
# Drop the lon and lat columns and rename gid to PRIO GRID.
# Columns are dropped by keyword because the positional axis form
# drop([...], 1) is rejected by pandas >= 2.0; rename() returns a new frame
# rather than modifying df in place.
df = (
    df
    .drop(columns=['lon', 'lat'])
    .rename(columns={'gid': 'PRIO GRID'})
)

In [15]:
# Inspect the final table: one row per year and PRIO GRID cell, with one column per Actor1Code / Actor2Code group pair.
df

Unnamed: 0,year,PRIO GRID,AGR_BUS_MNC,CVL_BUS_MNC,ELI_BUS_MNC,ENV_BUS_MNC,HRI_BUS_MNC,INS_REB_BUS_MNC,LAB_BUS_MNC,OPP_BUS_MNC,...,LAB_IGO,OPP_IGO,AGR_ELI,CVL_ELI,ELI_ELI,ENV_ELI,HRI_ELI,INS_REB_ELI,LAB_ELI,OPP_ELI
0,1990,49182,,,,,,,,,...,,,,,,,,,,
1,1990,49183,,,,,,,,,...,,,,,,,,,,
2,1990,49184,,,,,,,,,...,,,,,,,,,,
3,1990,49185,,,,,,,,,...,,,,,,,,,,
4,1990,49186,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2138989,2022,249340,,,,,,,,,...,,,,,,,,,,
2138990,2022,249341,,,,,,,,,...,,,,,,,,,,
2138991,2022,249342,,,,,,,,,...,,,,,,,,,,
2138992,2022,249343,,,,,,,,,...,,,,,,,,,,


In [16]:
# Write the final table to CSV without the row index.
output_csv_path = 'task_20_gdelt_events_specific_groups_yearly_1990_2022.csv'
df.to_csv(output_csv_path, index=False)