In [1]:
# use geo_env_2022

import os
import numpy as np
import pandas as pd # original 1.2.3
import geopandas as gpd
from shapely.geometry import Point
import pickle
import urllib.request

In [2]:
def get_prio_shape():

    location = '/home/simon/Documents/Bodies/data/PRIO'
    path_prio = location + '/priogrid_shapefiles.zip'

    if os.path.isfile(path_prio) == True:
        
        print('File already downloaded')
        prio_grid = gpd.read_file('zip://' + path_prio)

    else:
        print('Beginning file download PRIO...')
        url_prio = 'http://file.prio.no/ReplicationData/PRIO-GRID/priogrid_shapefiles.zip'

        urllib.request.urlretrieve(url_prio, path_prio)
        prio_grid = gpd.read_file('zip://' + path_prio)

    return prio_grid


def get_gwno():

    location = '/home/simon/Documents/Bodies/data/PRIO'
    #path_gwno = location + '/PRIO-GRID Yearly Variables for 2003-2009 - 2022-06-16.csv' #https://grid.prio.org/#/download # need to figrue out the API
    path_gwno = location + '/PRIO-GRID Yearly Variables for 1989-2014 - 2022-06-16.csv' #https://grid.prio.org/#/download # need to figrue out the API

    # why not just go 1989 - 2019 like ucdp...

    gwno = pd.read_csv(path_gwno)

    return gwno


def get_ucdp():
    location = '/home/simon/Documents/Bodies/data/UCDP' 
    path_ucdp = location + "/ged201-csv.zip"
    
    if os.path.isfile(path_ucdp) == True:
        print('file already downloaded')
        ucdp = pd.read_csv(path_ucdp)


    else: 
        print('Beginning file download UCDP...')

        url_ucdp = 'https://ucdp.uu.se/downloads/ged/ged201-csv.zip'
    
        urllib.request.urlretrieve(url_ucdp, path_ucdp)
        ucdp = pd.read_csv(path_ucdp)

    return ucdp


In [3]:
# something like get_gwno for the other stuff Jacob want

In [4]:

def add_months(ucdp, world_grid):

    diff = ucdp['year'].max() - world_grid['year'].max()

    subset_list = []

    for i in np.arange(1, diff+1, 1):

        subset = world_grid[world_grid['year'] == world_grid['year'].max()].copy()
        subset['year'] = world_grid['year'].max() + i

        subset_list.append(subset)

    new_years = pd.concat(subset_list)
    world_grid_all_years = pd.concat([world_grid, new_years])

    month = [str(i).zfill(2) for i in np.arange(1,13,1)]
    world_grid_all_years.loc[:,'month'] = world_grid_all_years.apply(lambda _: month, axis=1)
    world_grid_all_months = world_grid_all_years.sort_values('year').explode('month').copy()
    world_grid_all_months['year_months_start'] =  world_grid_all_months['year'].astype(str) + '-' +  world_grid_all_months['month'].astype(str)

    year_months = sorted(world_grid_all_months['year_months_start'].unique())
    ts = len(year_months)
    month_ids = np.arange(109, ts + 109, 1)
    month_id_dict = dict(zip(year_months,month_ids))
    month_df = pd.DataFrame({'year_months_start' : year_months, 'month_id': month_ids})
    world_grid_all_months_id = world_grid_all_months.merge(month_df, how = 'left', on = 'year_months_start')

    return world_grid_all_months_id

In [5]:

def prio_ucdp_merge(ucdp, world_grid_all_months):
    ucdp_tmp1 = ucdp.copy()

    ucdp_tmp1['year_months_start'] = ucdp_tmp1['date_start'].str.slice(start = 0, stop = 7) # Date YYYY-MM-DD
    ucdp_tmp1['year_months_end'] = ucdp_tmp1['date_start'].str.slice(start = 0, stop = 7) # Date YYYY-MM-DD


    mask1 = (ucdp_tmp1['year'] != ucdp_tmp1['year_months_start'].str.slice(start = 0, stop = 4).astype(int))
    mask2 = (ucdp_tmp1['year'] != ucdp_tmp1['year_months_end'].str.slice(start = 0, stop = 4).astype(int))

    # correction. Note that end and start year for the four entries that is corrected is the same.
    ucdp_tmp1.loc[mask1 | mask2, 'year'] = ucdp_tmp1.loc[mask1 | mask2,'year_months_start'].str.slice(start = 0, stop = 4).astype(int)

    feature_list = ['deaths_a','deaths_b', 'deaths_civilians', 'deaths_unknown','best', 'high', 'low']

    ucdp_monthly_unit = ucdp_tmp1.groupby(['year_months_start','year', 'priogrid_gid']).sum()[feature_list].reset_index()
    ucdp_monthly_unit.rename(columns={'priogrid_gid':'gid'}, inplace=True)

    ucdp_monthly_unit['log_best'] = np.log(ucdp_monthly_unit['best'] +1)
    ucdp_monthly_unit['log_low'] = np.log(ucdp_monthly_unit['low'] +1)
    ucdp_monthly_unit['log_high'] = np.log(ucdp_monthly_unit['high'] +1)

    prio_ucdp_df = world_grid_all_months.merge(ucdp_monthly_unit, how = 'left', on = ['gid', 'year_months_start', 'year'])
    prio_ucdp_df.fillna(0, inplace=True)

    return prio_ucdp_df


In [6]:
prio_grid = get_prio_shape()
gwno = get_gwno()
ucdp = get_ucdp()

world_grid = prio_grid.merge(gwno, how = 'right', on = 'gid') # if you just merge this on outer I think you get the full grid needed for R-UNET
world_grid_all_months = add_months(ucdp, world_grid)
prio_ucdp = prio_ucdp_merge(ucdp, world_grid_all_months)

File already downloaded
file already downloaded


  ucdp = pd.read_csv(path_ucdp)


In [7]:
data_dir = '/home/simon/Documents/Bodies/data/OD_dataframes_compiled/'

# with open(f'{data_dir}bodies_df_fatal.pkl', 'rb') as file:
#     bodies_df_fatal = pickle.load(file)

with open(f'{data_dir}df_ucdp_prio.pkl', 'rb') as file:
    old_df = pickle.load(file)

In [13]:
prio_ucdp.tail()

Unnamed: 0,gid,xcoord,ycoord,col,row,geometry,year,gwno,month,year_months_start,...,deaths_a,deaths_b,deaths_civilians,deaths_unknown,best,high,low,log_best,log_low,log_high
24112291,249344,-68.25,83.25,224,347,"POLYGON ((-68.50000 83.00000, -68.50000 83.500...",2019,20.0,8,2019-08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24112292,249344,-68.25,83.25,224,347,"POLYGON ((-68.50000 83.00000, -68.50000 83.500...",2019,20.0,9,2019-09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24112293,249344,-68.25,83.25,224,347,"POLYGON ((-68.50000 83.00000, -68.50000 83.500...",2019,20.0,10,2019-10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24112294,249344,-68.25,83.25,224,347,"POLYGON ((-68.50000 83.00000, -68.50000 83.500...",2019,20.0,11,2019-11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24112295,249344,-68.25,83.25,224,347,"POLYGON ((-68.50000 83.00000, -68.50000 83.500...",2019,20.0,12,2019-12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
for i in prio_ucdp.columns:
    print(i)

gid
xcoord
ycoord
col
row
geometry
year
gwno
month
year_months_start
month_id
deaths_a
deaths_b
deaths_civilians
deaths_unknown
best
high
low
log_best
log_low
log_high


In [14]:
old_df.tail()

Unnamed: 0,gid,xcoord,ycoord,col,row,year,gwno,month,year_months_start,month_id,best,low,high,log_best,log_low,log_high
24112291,249344,-68.25,83.25,224,347,2019,20.0,8,2019-08,476,0.0,0.0,0.0,0.0,0.0,0.0
24112292,249344,-68.25,83.25,224,347,2019,20.0,9,2019-09,477,0.0,0.0,0.0,0.0,0.0,0.0
24112293,249344,-68.25,83.25,224,347,2019,20.0,10,2019-10,478,0.0,0.0,0.0,0.0,0.0,0.0
24112294,249344,-68.25,83.25,224,347,2019,20.0,11,2019-11,479,0.0,0.0,0.0,0.0,0.0,0.0
24112295,249344,-68.25,83.25,224,347,2019,20.0,12,2019-12,480,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
for i in old_df.columns:
    print(i)

gid
xcoord
ycoord
col
row
year
gwno
month
year_months_start
month_id
best
low
high
log_best
log_low
log_high


In [None]:
def compile_combined_df():

    prio_grid = get_prio_shape()
    gwno = get_gwno()
    ucdp = get_ucdp()

    world_grid = prio_grid.merge(gwno, how = 'right', on = 'gid') # if you just merge this on outer I think you get the full grid needed for R-UNET
    world_grid_all_months = add_months(ucdp, world_grid)
    prio_ucdp = prio_ucdp_merge(ucdp, world_grid_all_months)

    data_dir = '/home/simon/Documents/Bodies/data/OD_dataframes_compiled/'

    with open(f'{data_dir}g_df_ucdp_prio.pkl', 'wb') as file:
        pickle.dump(prio_ucdp, file)

    prio_ucdp_pd = pd.DataFrame(prio_ucdp.drop(columns= 'geometry').copy())

    with open(f'{data_dir}df_ucdp_prio.pkl', 'wb') as file:
        pickle.dump(prio_ucdp_pd, file)


if __name__ == "__main__":
    compile_combined_df()


In [12]:
ucdp_tmp1 = ucdp.copy()

ucdp_tmp1['year_months_start'] = ucdp_tmp1['date_start'].str.slice(start = 0, stop = 7) # Date YYYY-MM-DD
ucdp_tmp1['year_months_end'] = ucdp_tmp1['date_start'].str.slice(start = 0, stop = 7) # Date YYYY-MM-DD


mask1 = (ucdp_tmp1['year'] != ucdp_tmp1['year_months_start'].str.slice(start = 0, stop = 4).astype(int))
mask2 = (ucdp_tmp1['year'] != ucdp_tmp1['year_months_end'].str.slice(start = 0, stop = 4).astype(int))

# correction. Note that end and start year for the four entries that is corrected is the same.
ucdp_tmp1.loc[mask1 | mask2, 'year'] = ucdp_tmp1.loc[mask1 | mask2,'year_months_start'].str.slice(start = 0, stop = 4).astype(int)



#ucdp_monthly_unit = ucdp_tmp1.groupby(['year_months_start','year', 'priogrid_gid']).sum()[['best','low','high']].reset_index()

sum_list = ['deaths_a','deaths_b', 'deaths_civilians', 'deaths_unknown','best', 'high', 'low']
not_sum_list = [i for i in ucdp_tmp1.columns if i not in sum_list]
ucdp_monthly_unit_sum = ucdp_tmp1.groupby(['year_months_start','year', 'priogrid_gid']).sum()[sum_list].reset_index()
ucdp_monthly_unit_mean = ucdp_tmp1.groupby(['year_months_start','year', 'priogrid_gid']).mean().reset_index()
ucdp_monthly_unit_mean.drop(columns=sum_list, inplace= True)

In [15]:
ucdp_monthly_unit_sum.merge(ucdp_tmp1, how= 'left', on = )

Unnamed: 0,id,relid,year,active_year,code_status,type_of_violence,conflict_dset_id,conflict_new_id,conflict_name,dyad_dset_id,...,deaths_b,deaths_civilians,deaths_unknown,best,high,low,gwnoa,gwnob,year_months_start,year_months_end
0,244657,IRQ-2017-1-524-322,2017,1,Clear,1,259,259,Iraq: Government,524,...,4,0,2,6,6,6,645,,2017-07,2017-07
1,132140,AFG-1989-1-411-2,1989,1,Clear,1,333,333,Afghanistan: Government,724,...,0,0,0,6,6,6,700,,1989-01,1989-01
2,130364,AFG-1989-1-411-37,1989,1,Clear,1,333,333,Afghanistan: Government,724,...,0,0,4,4,4,0,700,,1989-01,1989-01
3,130359,AFG-1989-1-411-4,1989,1,Clear,1,333,333,Afghanistan: Government,724,...,0,0,600,600,600,600,700,,1989-01,1989-01
4,133883,AFG-1989-1-411-39,1989,1,Clear,1,333,333,Afghanistan: Government,724,...,0,0,2,2,2,0,700,,1989-01,1989-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225380,15525,MZM-1989-3-1347-4,1989,1,Clear,3,498,562,Renamo - Civilians,498,...,0,9,0,9,9,9,,,1989-03,1989-03
225381,15524,MZM-1989-3-1347-16,1989,1,Clear,3,498,562,Renamo - Civilians,498,...,0,9,0,9,9,9,,,1989-07,1989-07
225382,15245,MZM-1990-3-1347-18,1990,1,Clear,3,498,562,Renamo - Civilians,498,...,0,7,0,7,7,7,,,1990-06,1990-06
225383,15516,MZM-1990-3-1347-28,1990,1,Clear,3,498,562,Renamo - Civilians,498,...,0,1,0,1,1,1,,,1990-11,1990-11


In [16]:
ucdp_tmp1

Unnamed: 0,id,relid,year,active_year,code_status,type_of_violence,conflict_dset_id,conflict_new_id,conflict_name,dyad_dset_id,...,deaths_b,deaths_civilians,deaths_unknown,best,high,low,gwnoa,gwnob,year_months_start,year_months_end
0,244657,IRQ-2017-1-524-322,2017,1,Clear,1,259,259,Iraq: Government,524,...,4,0,2,6,6,6,645,,2017-07,2017-07
1,132140,AFG-1989-1-411-2,1989,1,Clear,1,333,333,Afghanistan: Government,724,...,0,0,0,6,6,6,700,,1989-01,1989-01
2,130364,AFG-1989-1-411-37,1989,1,Clear,1,333,333,Afghanistan: Government,724,...,0,0,4,4,4,0,700,,1989-01,1989-01
3,130359,AFG-1989-1-411-4,1989,1,Clear,1,333,333,Afghanistan: Government,724,...,0,0,600,600,600,600,700,,1989-01,1989-01
4,133883,AFG-1989-1-411-39,1989,1,Clear,1,333,333,Afghanistan: Government,724,...,0,0,2,2,2,0,700,,1989-01,1989-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225380,15525,MZM-1989-3-1347-4,1989,1,Clear,3,498,562,Renamo - Civilians,498,...,0,9,0,9,9,9,,,1989-03,1989-03
225381,15524,MZM-1989-3-1347-16,1989,1,Clear,3,498,562,Renamo - Civilians,498,...,0,9,0,9,9,9,,,1989-07,1989-07
225382,15245,MZM-1990-3-1347-18,1990,1,Clear,3,498,562,Renamo - Civilians,498,...,0,7,0,7,7,7,,,1990-06,1990-06
225383,15516,MZM-1990-3-1347-28,1990,1,Clear,3,498,562,Renamo - Civilians,498,...,0,1,0,1,1,1,,,1990-11,1990-11


In [14]:
ucdp_monthly_unit_sum

Unnamed: 0,year_months_start,year,priogrid_gid,deaths_a,deaths_b,deaths_civilians,deaths_unknown,best,high,low
0,1989-01,1989,86102,0,1,0,0,1,1,1
1,1989-01,1989,86821,2,1,2,4,9,10,9
2,1989-01,1989,86822,6,1,2,1,10,10,10
3,1989-01,1989,86823,1,0,1,0,2,2,2
4,1989-01,1989,87542,0,1,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...
82087,2019-12,2019,191970,1,2,0,0,3,3,3
82088,2019-12,2019,198436,4,0,1,0,5,5,5
82089,2019-12,2019,199157,1,0,0,0,1,1,1
82090,2019-12,2019,199158,3,0,0,0,3,3,0


In [8]:
ucdp_monthly_unit_sum

Unnamed: 0,year_months_start,year,priogrid_gid,deaths_a,deaths_b,deaths_civilians,deaths_unknown,best,high,low
0,1989-01,1989,86102,0,1,0,0,1,1,1
1,1989-01,1989,86821,2,1,2,4,9,10,9
2,1989-01,1989,86822,6,1,2,1,10,10,10
3,1989-01,1989,86823,1,0,1,0,2,2,2
4,1989-01,1989,87542,0,1,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...
82087,2019-12,2019,191970,1,2,0,0,3,3,3
82088,2019-12,2019,198436,4,0,1,0,5,5,5
82089,2019-12,2019,199157,1,0,0,0,1,1,1
82090,2019-12,2019,199158,3,0,0,0,3,3,0


In [9]:
ucdp_monthly_unit_mean

Unnamed: 0,year_months_start,year,priogrid_gid,id,active_year,type_of_violence,conflict_dset_id,conflict_new_id,dyad_dset_id,dyad_new_id,...,side_b_dset_id,side_b_new_id,number_of_sources,where_prec,latitude,longitude,country_id,event_clarity,date_prec,gwnob
0,1989-01,1989,86102,18273.000000,1.0,2.0,5451.0,4841.0,5451.0,5451.0,...,983.0,983.0,-1.0,1.0,-30.146565,30.660253,560.0,1.0,1.0,
1,1989-01,1989,86821,15798.000000,0.9,2.1,5017.2,4547.4,5017.2,5144.6,...,1739.4,739.6,-1.0,1.0,-29.654428,30.335265,560.0,1.0,2.6,
2,1989-01,1989,86822,16151.000000,0.9,2.1,5017.2,4547.4,5017.2,5144.6,...,1739.4,739.6,-1.0,1.1,-29.811026,30.726721,560.0,1.0,3.2,
3,1989-01,1989,86823,13822.500000,1.0,2.0,5450.5,4840.5,5450.5,5450.5,...,801.5,801.5,-1.0,1.5,-29.857896,31.029198,560.0,1.0,3.0,
4,1989-01,1989,87542,15959.000000,1.0,2.0,5451.0,4841.0,5451.0,5451.0,...,983.0,983.0,-1.0,2.0,-29.064155,30.592786,560.0,1.0,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82087,2019-12,2019,191970,324678.000000,1.0,1.0,13588.0,13588.0,14619.0,14619.0,...,234.0,234.0,3.0,1.0,43.166439,44.810242,365.0,1.0,1.0,
82088,2019-12,2019,198436,324609.000000,1.0,1.0,13306.0,13306.0,15100.0,15100.0,...,6711.0,6711.0,1.5,1.5,47.790385,37.641244,369.0,1.0,1.0,
82089,2019-12,2019,199157,324720.000000,1.0,1.0,13306.0,13306.0,15100.0,15100.0,...,6711.0,6711.0,1.0,1.0,48.435297,38.220482,369.0,1.0,1.0,
82090,2019-12,2019,199158,324723.333333,1.0,1.0,13306.0,13306.0,15100.0,15100.0,...,6711.0,6711.0,1.0,5.0,48.150000,38.650000,369.0,1.0,1.0,


In [1]:
ucdp_tmp1 = ucdp.copy()

ucdp_tmp1['year_months_start'] = ucdp_tmp1['date_start'].str.slice(start = 0, stop = 7) # Date YYYY-MM-DD
ucdp_tmp1['year_months_end'] = ucdp_tmp1['date_start'].str.slice(start = 0, stop = 7) # Date YYYY-MM-DD


mask1 = (ucdp_tmp1['year'] != ucdp_tmp1['year_months_start'].str.slice(start = 0, stop = 4).astype(int))
mask2 = (ucdp_tmp1['year'] != ucdp_tmp1['year_months_end'].str.slice(start = 0, stop = 4).astype(int))

# correction. Note that end and start year for the four entries that is corrected is the same.
ucdp_tmp1.loc[mask1 | mask2, 'year'] = ucdp_tmp1.loc[mask1 | mask2,'year_months_start'].str.slice(start = 0, stop = 4).astype(int)



#ucdp_monthly_unit = ucdp_tmp1.groupby(['year_months_start','year', 'priogrid_gid']).sum()[['best','low','high']].reset_index()

sum_list = ['deaths_a','deaths_b', 'deaths_civilians', 'deaths_unknown','best', 'high', 'low']
not_sum_list = [i for i in ucdp_tmp1.columns if i not in sum_list]
ucdp_monthly_unit_sum = ucdp_tmp1.groupby(['year_months_start','year', 'priogrid_gid']).sum()[sum_list].reset_index()
ucdp_monthly_unit_mean = ucdp_tmp1.groupby(['year_months_start','year', 'priogrid_gid']).mean().reset_index()
ucdp_monthly_unit_mean.drop(columns=sum_list, inplace= True)

ucdp_monthly_unit = ucdp_monthly_unit_sum.merge(ucdp_monthly_unit_mean, how = 'outer', on = ['year_months_start','year', 'priogrid_gid'])

ucdp_monthly_unit.rename(columns={'priogrid_gid':'gid'}, inplace=True)

ucdp_monthly_unit['log_best'] = np.log(ucdp_monthly_unit['best'] +1)
ucdp_monthly_unit['log_low'] = np.log(ucdp_monthly_unit['low'] +1)
ucdp_monthly_unit['log_high'] = np.log(ucdp_monthly_unit['high'] +1)

prio_ucdp_df = world_grid_all_months.merge(ucdp_monthly_unit, how = 'left', on = ['gid', 'year_months_start', 'year'])
prio_ucdp_df.fillna(0, inplace=True)

NameError: name 'ucdp' is not defined

In [None]:
prio_ucdp_df.shape

In [19]:
prio_ucdp_df.shape

(24112296, 17)

File already downloaded
file already downloaded


  ucdp = pd.read_csv(path_ucdp)


In [None]:
def compile_combined_df():

    prio_grid = get_prio_shape()
    gwno = get_gwno()
    ucdp = get_ucdp()

    world_grid = prio_grid.merge(gwno, how = 'right', on = 'gid') # if you just merge this on outer I think you get the full grid needed for R-UNET
    world_grid_all_months = add_months(ucdp, world_grid)
    prio_ucdp = prio_ucdp_merge(ucdp, world_grid_all_months)

    data_dir = '/home/simon/Documents/Bodies/data/OD_dataframes_compiled/'

    with open(f'{data_dir}g_df_ucdp_prio.pkl', 'wb') as file:
        pickle.dump(prio_ucdp, file)

    prio_ucdp_pd = pd.DataFrame(prio_ucdp.drop(columns= 'geometry').copy())

    with open(f'{data_dir}df_ucdp_prio.pkl', 'wb') as file:
        pickle.dump(prio_ucdp_pd, file)


if __name__ == "__main__":
    compile_combined_df()
