In [1]:
import os
import xarray as xr
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from tqdm import tqdm

# Set global variables
PROJECT = r"Z:\Laboral\World Bank\Paper - Child mortality and Climate Shocks"
OUTPUTS = rf"{PROJECT}\Outputs"
DATA = rf"{PROJECT}\Data"
DATA_IN = rf"{DATA}\Data_in"
DATA_PROC = rf"{DATA}\Data_proc"
DATA_OUT = rf"{DATA}\Data_out"
ERA5_DATA = rf"Z:\WB Data\ERA5 Reanalysis"

In [4]:
df = pd.read_stata(r"Z:\Laboral\World Bank\Paper - Child mortality and Climate Shocks\Data\Data_in\DHS\DHSBirthsGlobalAnalysis_04172024.dta")

In [22]:
df

Unnamed: 0,v000,v001,v002,v003,v007,v008,country_code,ID_HH,ID_R,ID_CB,...,d_weatlh_ind_3,d_weatlh_ind_4,d_weatlh_ind_5,hhsize,chu5size,hhh_age,interview_year,interview_month,interview_day,interview_date
0,AL5,36,13.0,2,2008,1307,AL,AL5-36-13-2,AL5-36-13-2,AL5-36-13-2-36-2,...,1.0,0.0,0.0,4.0,0,39.0,2008,11,1,2008-11-01
1,AL5,202,17.0,2,2009,1310,AL,AL5-202-17-2,AL5-202-17-2,AL5-202-17-2-202-2,...,0.0,0.0,0.0,5.0,0,42.0,2009,2,1,2009-02-01
2,AL5,311,9.0,9,2008,1307,AL,AL5-311-9-9,AL5-311-9-9,AL5-311-9-9-311-1,...,0.0,0.0,0.0,11.0,0,73.0,2008,11,1,2008-11-01
3,AL5,68,8.0,2,2009,1310,AL,AL5-68-8-2,AL5-68-8-2,AL5-68-8-2-68-1,...,0.0,0.0,0.0,4.0,0,36.0,2009,2,1,2009-02-01
4,AL5,341,6.0,4,2008,1307,AL,AL5-341-6-4,AL5-341-6-4,AL5-341-6-4-341-1,...,1.0,0.0,0.0,5.0,1,66.0,2008,11,1,2008-11-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5259465,ZW7,10,12.0,1,2015,1390,ZW,ZW7-10-12-1,ZW7-10-12-1,ZW7-10-12-1-10-1,...,0.0,0.0,1.0,5.0,2,31.0,2015,10,1,2015-10-01
5259466,ZW7,88,9.0,2,2015,1390,ZW,ZW7-88-9-2,ZW7-88-9-2,ZW7-88-9-2-88-2,...,0.0,0.0,1.0,6.0,0,46.0,2015,10,1,2015-10-01
5259467,ZW7,396,23.0,1,2015,1390,ZW,ZW7-396-23-1,ZW7-396-23-1,ZW7-396-23-1-396-2,...,0.0,1.0,0.0,4.0,1,38.0,2015,10,1,2015-10-01
5259468,ZW7,147,14.0,3,2015,1392,ZW,ZW7-147-14-3,ZW7-147-14-3,ZW7-147-14-3-147-1,...,0.0,0.0,1.0,4.0,0,68.0,2015,12,1,2015-12-01


In [63]:
df = df.dropna(subset=["v008", "chb_year", "chb_month"], how="any")

# Date of interview
df["year"] = 1900+(df["v008"]-1)//12
df["month"] = df["v008"]-12*(df["year"]-1900)
df["day"] = 1
df["interview_date"]  = pd.to_datetime(df[["year", "month", "day"]], dayfirst=False)
df["interview_month"] = df["month"]
df["interview_day"]   = df["day"]
df = df.drop(columns=["year", "month", "day"])

# Date of birth
df["year"] = df["chb_year"].astype(int)
df["month"] = df["chb_month"].astype(int)
df["day"] = 15
df["birth_date"]  = pd.to_datetime(df[["year", "month", "day"]], dayfirst=False)
df = df.drop(columns=["year", "month", "day"])

# Number of days from interview
df["days_from_interview"] = df["interview_date"] - df["birth_date"]

# excluir del análisis a aquellos niños que nacieron 12 meses alrededor de la fecha de la encuesta y no más allá de 10 y 15 años del momento de la encuesta. 
# PREGUNTA PARA PAULA: ¿ella ya hizo el filtro de 15 años y 30 dias?
df["last_15_years"] = (df["days_from_interview"] > np.timedelta64(30, 'D')) & (df["days_from_interview"] < np.timedelta64(15*365, 'D'))
df["last_10_years"] = (df["days_from_interview"] > np.timedelta64(30, 'D')) & (df["days_from_interview"] < np.timedelta64(10*365, 'D'))
df["since_2003"] = (df["interview_year"]>=2003)
df = df[df["last_15_years"] == True]

(5168071, 91)

In [65]:
df["interview_year"]

0          2008
1          2009
2          2008
3          2009
4          2008
           ... 
5259465    2015
5259466    2015
5259467    2015
5259468    2015
5259469    2015
Name: interview_year, Length: 5168071, dtype: int16

In [47]:
df

Unnamed: 0,v000,v001,v002,v003,v007,v008,country_code,ID_HH,ID_R,ID_CB,...,d_weatlh_ind_4,d_weatlh_ind_5,hhsize,chu5size,hhh_age,interview_year,interview_month,interview_day,interview_date,birth_date
0,AL5,36,13.0,2,2008,1307,AL,AL5-36-13-2,AL5-36-13-2,AL5-36-13-2-36-2,...,0.0,0.0,4.0,0,39.0,2008,11,1,2008-11-01,1996-06-15
1,AL5,202,17.0,2,2009,1310,AL,AL5-202-17-2,AL5-202-17-2,AL5-202-17-2-202-2,...,0.0,0.0,5.0,0,42.0,2009,2,1,2009-02-01,1997-12-15
2,AL5,311,9.0,9,2008,1307,AL,AL5-311-9-9,AL5-311-9-9,AL5-311-9-9-311-1,...,0.0,0.0,11.0,0,73.0,2008,11,1,2008-11-01,2002-10-15
3,AL5,68,8.0,2,2009,1310,AL,AL5-68-8-2,AL5-68-8-2,AL5-68-8-2-68-1,...,0.0,0.0,4.0,0,36.0,2009,2,1,2009-02-01,2002-03-15
4,AL5,341,6.0,4,2008,1307,AL,AL5-341-6-4,AL5-341-6-4,AL5-341-6-4-341-1,...,0.0,0.0,5.0,1,66.0,2008,11,1,2008-11-01,2003-08-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5259465,ZW7,10,12.0,1,2015,1390,ZW,ZW7-10-12-1,ZW7-10-12-1,ZW7-10-12-1-10-1,...,0.0,1.0,5.0,2,31.0,2015,10,1,2015-10-01,2010-01-15
5259466,ZW7,88,9.0,2,2015,1390,ZW,ZW7-88-9-2,ZW7-88-9-2,ZW7-88-9-2-88-2,...,0.0,1.0,6.0,0,46.0,2015,10,1,2015-10-01,2004-01-15
5259467,ZW7,396,23.0,1,2015,1390,ZW,ZW7-396-23-1,ZW7-396-23-1,ZW7-396-23-1-396-2,...,1.0,0.0,4.0,1,38.0,2015,10,1,2015-10-01,2001-06-15
5259468,ZW7,147,14.0,3,2015,1392,ZW,ZW7-147-14-3,ZW7-147-14-3,ZW7-147-14-3-147-1,...,0.0,1.0,4.0,0,68.0,2015,12,1,2015-12-01,2007-12-15


In [24]:
df[["year", "month", "day"]]

Unnamed: 0,year,month,day
0,1996.0,6.0,1
1,1997.0,12.0,1
2,2002.0,10.0,1
3,2002.0,3.0,1
4,2003.0,8.0,1
...,...,...,...
5259465,2010.0,1.0,1
5259466,2004.0,1.0,1
5259467,2001.0,6.0,1
5259468,2007.0,12.0,1


In [None]:
gen interview_year = v007
gen interview_month = v008-12*(interview_year-1900)

In [5]:
files = os.listdir(ERA5_DATA)
datasets = []
for file in tqdm(files):
    ds = xr.open_dataset(os.path.join(ERA5_DATA, file))
    datasets += [ds]
precipitation = xr.concat(datasets, dim="time")
# precipitation.to_netcdf(os.path.join(DATA_OUT, "ERA5_monthly_1970-2021.nc"))

  0%|          | 0/13 [00:00<?, ?it/s]

 85%|████████▍ | 11/13 [00:00<00:00, 96.49it/s]


ValueError: did not find a match in any of xarray's currently installed IO backends ['netcdf4', 'h5netcdf', 'scipy', 'cfgrib', 'ee', 'gini', 'rasterio', 'zarr']. Consider explicitly selecting one of the installed engines via the ``engine`` parameter, or installing additional IO dependencies, see:
https://docs.xarray.dev/en/stable/getting-started-guide/installing.html
https://docs.xarray.dev/en/stable/user-guide/io.html

In [2]:
xr.open_dataset(r"Z:\Laboral\World Bank\Paper - Child mortality and Climate Shocks\Data\Data_out\Climate_shocks_v3_spi.nc")

Cannot find the ecCodes library


In [45]:
import xarray as xr

climate_data = xr.open_dataset(r"Z:\Laboral\World Bank\Paper - Child mortality and Climate Shocks\Data\Data_out\Climate_shocks_v3_spi.nc")
climate_data

In [57]:
def get_climate_shock(from_date, to_date, lat, lon):
    if pd.isna(from_date):
        return np.nan

    # Filter point
    point_data = climate_data.isel(time=slice(from_date, to_date)).sel(
        lat=lat, lon=lon, method="nearest"
    )

    # Get position of original data
    lat = point_data.lat.item()
    lon = point_data.lon.item()

    # Filter by time
    inutero_q1 = point_data.isel(time=slice(0, 3))
    inutero_q2 = point_data.isel(time=slice(3, 6))
    inutero_q3 = point_data.isel(time=slice(6, 9))
    born_1m = point_data.isel(time=slice(9, 10))
    born_2to3m = point_data.isel(time=slice(10, 12))
    born_3to6m = point_data.isel(time=slice(12, 15))
    born_6to12m = point_data.isel(time=slice(15, 21))

    out_vars = [
        lat,
        lon,
    ]
    for spi in [
        "spi1",
        "spi3",
        "spi6",
        "spi9",
        "spi12",
    ]:
        # Compute mean values for SPI
        inutero_q1_mean = inutero_q1[spi].mean().item()
        inutero_q2_mean = inutero_q2[spi].mean().item()
        inutero_q3_mean = inutero_q3[spi].mean().item()
        born_1m_mean = born_1m[spi].mean().item()
        born_2to3m_mean = born_2to3m[spi].mean().item()
        born_3to6m_mean = born_3to6m[spi].mean().item()
        born_6to12m_mean = born_6to12m[spi].mean().item()

        out_vars_this_spi = [
            inutero_q1_mean,
            inutero_q2_mean,
            inutero_q3_mean,
            born_1m_mean,
            born_2to3m_mean,
            born_3to6m_mean,
            born_6to12m_mean,
        ]
        out_vars += out_vars_this_spi

    # Compute mean values for temperature
    inutero_q1_temp_mean = inutero_q1["t2m"].mean().item()
    inutero_q2_temp_mean = inutero_q2["t2m"].mean().item()
    inutero_q3_temp_mean = inutero_q3["t2m"].mean().item()
    born_1m_temp_mean = born_1m["t2m"].mean().item()
    born_2to3m_temp_mean = born_2to3m["t2m"].mean().item()
    born_3to6m_temp_mean = born_3to6m["t2m"].mean().item()
    born_6to12m_temp_mean = born_6to12m["t2m"].mean().item()

    out_vars_temp = [
        inutero_q1_temp_mean,
        inutero_q2_temp_mean,
        inutero_q3_temp_mean,
        born_1m_temp_mean,
        born_2to3m_temp_mean,
        born_3to6m_temp_mean,
        born_6to12m_temp_mean,
    ]
    out_vars += out_vars_temp

    return out_vars


In [63]:
def get_climate_shock2(from_date, to_date, lat, lon):
    if pd.isna(from_date):
        return np.nan

    # Filter point
    point_data = climate_data.isel(time=slice(from_date, to_date)).sel(
        lat=lat, lon=lon, method="nearest"
    )

    # Get position of original data
    lat = point_data.lat.item()
    lon = point_data.lon.item()

    # Filter by time
    inutero_q1 = point_data.isel(time=slice(0, 3))
    inutero_q2 = point_data.isel(time=slice(3, 6))
    inutero_q3 = point_data.isel(time=slice(6, 9))
    born_1m = point_data.isel(time=slice(9, 10))
    born_2to3m = point_data.isel(time=slice(10, 12))
    born_3to6m = point_data.isel(time=slice(12, 15))
    born_6to12m = point_data.isel(time=slice(15, 21))

    out_vars = [
        lat,
        lon,
    ]
    # Compute mean values for SPI
    inutero_q1_mean = inutero_q1.mean()
    inutero_q2_mean = inutero_q2.mean()
    inutero_q3_mean = inutero_q3.mean()
    born_1m_mean = born_1m.mean()
    born_2to3m_mean = born_2to3m.mean()
    born_3to6m_mean = born_3to6m.mean()
    born_6to12m_mean = born_6to12m.mean()

    return out_vars


In [72]:
xr.open_dataset(r"Z:\Downloads\wbgt_daily_mean_2000_2021\wbgtmean_2000_daily_ERA5.nc")

In [31]:
%%timeit -n 1000
ds.sel(lat=0, lon=-50).isel(time=20).mean()

AttributeError: 'Dataset' object has no attribute 'item'

In [28]:
%%timeit
selection = ds.sel(lat=0, lon=-50).isel(time=20)
for spi in ["spi1", "spi3", "spi6", "spi9", "spi12", "t2m"]:
    value = selection[spi].mean().item()

3.7 ms ± 280 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [18]:
from metpy.calc import wet_bulb_temperature
from metpy.units import units
# wet_bulb_temperature(993 * units.hPa, 32 * units.degC, 15 * units.degC)
test = precipitation.isel(time=0, longitude=slice(0, 10), latitude=slice(0, 10))
wet_bulb_temperature(test.sp * units.Pa, test.t2m * units.degK, test.d2m * units.degK)

0,1
Magnitude,[[243.86199125170205 243.86199125170205 243.86199125170205  243.86199125170205 243.86199125170205 243.86199125170205  243.86199125170205 243.86199125170205 243.86199125170205  243.86199125170205]  [243.6117604240331 243.61346635191708 243.61354786469275  243.61507029223756 243.6151512430659 243.61670527642937  243.61678678287296 243.6198521965023 243.6200151548014  243.62155338663777]  [243.414066107513 243.415690913106 243.4203874384892 243.42218124597454  243.42379550096592 243.42539977473652 243.42703038791188  243.43025361019755 243.43187894674728 243.4335200858608]  [243.29211328937996 243.29527811474316 243.29705459854128  243.3002889539493 243.30362625184299 243.30685060177817  243.31002255727273 243.31333813451704 243.31657770157528  243.31836482125513]  [243.3559650968821 243.35922037335735 243.36408511993412  243.3673245830969 243.37218929008802 243.37397616028431  243.37728774481755 243.38062673506997 243.38540282146064  243.3887259827667]  [243.39953892287707 243.40448305044598 243.41096649202336  243.41421660865944 243.4206894730107 243.42564926077816  243.43051278602434 243.43547201426142 243.44039970017394  243.44527417440412]  [243.35741032033303 243.36552455973754 243.37209294553307  243.3817467800119 243.3898552765411 243.39787831998567  243.40445116049162 243.41255394790713 243.42221212934786  243.4303155371548]  [243.27164584972002 243.27822448480075 243.28788520512916  243.29598953789667 243.30580903543176 243.31391214989134  243.32355682577725 243.33012556475296 243.3398494945804  243.34950449499243]  [243.3355623538883 243.34058586830022 243.34715478347113  243.3551850132749 243.3601977976853 243.36677736032587  243.37172068952026 243.37828484732728 243.38324412236037  243.3913422590849]  [243.55651787033412 243.56316392291004 243.5712651228909  243.5793873469836 243.58748899743398 243.59406700475708  243.6021730094808 243.61019032195603 243.61521195963377  243.62334575201504]]
Units,kelvin


## Function to assign climate shocks from a date and a point

In [None]:
climate_data = xr.open_dataset(rf"{DATA_OUT}/Climate_shocks_v2_previous_months.nc")
dates = climate_data.time.values

In [None]:
def get_climate_shock(from_date, to_date, lat, lon):
    if pd.isna(from_date):
        return np.nan
    
    # Filter point    
    point_data = climate_data.sel(time=slice(from_date, to_date)).sel(lat=lat, lon=lon, method='nearest')
    
    # Get position of original data
    lat = point_data.lat.item()
    lon = point_data.lon.item()

    # Filter by time
    inutero_q1   = point_data.isel(time=slice(0,3))
    inutero_q2   = point_data.isel(time=slice(3,6))
    inutero_q3   = point_data.isel(time=slice(6,9))
    born_1m      = point_data.isel(time=slice(9,10))
    born_2to3m  = point_data.isel(time=slice(10,12))
    born_3to6m  = point_data.isel(time=slice(12,15))
    born_6to12m  = point_data.isel(time=slice(15,21))

    out_vars = [lat, lon, ]
    for prec in ["standarized_precipitation", "standarized_precipitation_3", "standarized_precipitation_6", "standarized_precipitation_12"]:
        # Compute min and max values for both variables
        inutero_q1_mean   = inutero_q1[prec].mean().item()
        inutero_q2_mean   = inutero_q2[prec].mean().item()
        inutero_q3_mean   = inutero_q3[prec].mean().item()
        born_1m_mean      = born_1m[prec].mean().item()
        born_2to3m_mean  = born_2to3m[prec].mean().item()
        born_3to6m_mean  = born_3to6m[prec].mean().item()
        born_6to12m_mean  = born_6to12m[prec].mean().item()

        out_vars_this_prec = [inutero_q1_mean, inutero_q2_mean, inutero_q3_mean, born_1m_mean, born_2to3m_mean, born_3to6m_mean, born_6to12m_mean]
        out_vars += out_vars_this_prec

    return out_vars    


In [None]:
def get_climate_shock_old(from_date, to_date, lat, lon):
    if pd.isna(from_date):
        return np.nan
    
    # Filter point    
    point_data = climate_data.sel(time=slice(from_date, to_date)).sel(lat=lat, lon=lon, method='nearest')
    
    # Get max and min values for standarized precipitation
    max_prec = point_data["standarized_precipitation"].max().item()
    min_prec = point_data["standarized_precipitation"].min().item()
    max_prec_m = point_data["standarized_precipitation_m"].max().item()
    min_prec_m = point_data["standarized_precipitation_m"].min().item()
    
    # Get position of original data
    lat = point_data.lat.item()
    lon = point_data.lon.item()
    
    return lat, lon, max_prec, min_prec, max_prec_m, min_prec_m

In [None]:
climate_data.isel(time=slice(100, 110), lat=-50, lon=120)["standarized_precipitation"].mean().item()

In [None]:
%load_ext line_profiler
def get_climate_shock_prof():
    date = np.random.choice(dates[12:-12])
    from_date, to_date = date + pd.DateOffset(months=-9), date + pd.DateOffset(years=1)
    lat, lon = np.random.uniform(-90, 90), np.random.uniform(-180, 180)
    
    get_climate_shock(from_date, to_date, lat, lon)    
    return

In [None]:
%lprun -f get_climate_shock_prof get_climate_shock_prof()

In [None]:
%timeit get_climate_shock_prof()

## Open DHS data and add the shock data to the dataframe

In [None]:
df = pd.read_stata(rf"{DATA_IN}/DHS/DHSBirthsGlobalAnalysis_04172024.dta")
df['ID'] = df.index

### Create dates variables:
We considered a 

In [None]:
# Create datetime object from year and month
df["day"] = 1
df["month"] = df["chb_month"].astype(float)
df["year"] = df["chb_year"].astype(float)
df["birthdate"] = pd.to_datetime(df[["year", "month","day"]]).to_numpy()

# Maximum range of dates
df["from_date"] = df["birthdate"] + pd.DateOffset(months=-9) # From in utero (9 months before birth) 
df["to_date"] = df["birthdate"] + pd.DateOffset(years=1) # To the first year of life

# Filter children from_date greater than 1990 (we only have climate data from 1990)
df = df[df["from_date"] > "1990-01-01"]

# # Construct deathdate variable
# df["deathdate"] = df[df["child_agedeath"]<=12].progress_apply(lambda x: x["birthdate"] + pd.DateOffset(months=x["child_agedeath"]), axis=1)

# # Replace to_date with deathdate if the child died
# df["to_date"] = np.where((df["child_agedeath"]<=12) & (df["deathdate"]<df["to_date"]), df["deathdate"], df["to_date"])

# Filter children to_date smalle than 2021 (we only have climate data to 2020)
df = df[df["to_date"] < "2021-01-01"]

In [None]:
coords_cols = ["lat_climate", "lon_climate"]
prec_cols = ["prec_inutero_q1", "prec_inutero_q2", "prec_inutero_q3", "prec_born_1m", "prec_born_2to3m", "prec_born_3to6m", "prec_born_6to12m"]
prec_3_cols = ["prec_3_inutero_q1", "prec_3_inutero_q2", "prec_3_inutero_q3", "prec_3_born_1m", "prec_3_born_2to3m", "prec_3_born_3to6m", "prec_3_born_6to12m"]
prec_6_cols = ["prec_6_inutero_q1", "prec_6_inutero_q2", "prec_6_inutero_q3", "prec_6_born_1m", "prec_6_born_2to3m", "prec_6_born_3to6m", "prec_6_born_6to12m"]
prec_12_cols = ["prec_12_inutero_q1", "prec_12_inutero_q2", "prec_12_inutero_q3", "prec_12_born_1m", "prec_12_born_2to3m", "prec_12_born_3to6m", "prec_12_born_6to12m"]
all_cols = coords_cols + prec_cols + prec_3_cols + prec_6_cols + prec_12_cols

for n in tqdm(range(0, df.ID.max(), 10_000)):
    if os.path.exists(rf"{DATA_PROC}/births_climate_{n}.csv"):
        print(f"births_climate_{n}.csv exists, moving to next iteration")
        continue
    chunk = df.loc[(df.ID >= n) & (df.ID < n+10_000), ['ID', 'from_date', 'to_date', 'LATNUM', 'LONGNUM']].copy()
    if chunk.shape[0]==0:
        continue
    climate_results = chunk[['from_date', 'to_date', 'LATNUM', 'LONGNUM']].apply(lambda s: get_climate_shock(s['from_date'], s['to_date'], s['LATNUM'], s['LONGNUM']), axis=1)
    climate_results = climate_results.apply(pd.Series)
    climate_results.columns = all_cols
    climate_results["ID"] = chunk["ID"]
    climate_results.to_csv(rf"{DATA_PROC}/births_climate_{n}.csv")
    
# df[all_cols] = climate_data

In [None]:
# # For debugging
# chunk[['from_date', 'to_date', 'LATNUM', 'LONGNUM']].progress_apply(lambda s: get_climate_shock(s['from_date'], s['to_date'], s['LATNUM'], s['LONGNUM']), axis=1)

In [None]:
files = os.listdir(rf"{DATA_PROC}") 
files = [f for f in files if f.startswith("births_climate_")]
data = []
for file in tqdm(files):
    df = pd.read_csv(rf"{DATA_PROC}/{file}")
    data += [df]
df = pd.concat(data)

In [None]:
df = df.drop(columns="Unnamed: 0")
df.to_stata(rf"{DATA_PROC}\ClimateShocks_assigned.dta")

# Example Plots

In [None]:
import matplotlib.pyplot as plt
pos = np.random.randint(0, 500000)
pos = 428380      
filtered2 = climate_data.standarized_precipitation.sel(lat=df.at[pos, "LATNUM"], lon=df.at[pos, "LONGNUM"], method="nearest")
filtered2.plot(figsize=(12, 2))

plt.axhline(2, color="red")
plt.axhline(-2, color="red")



In [None]:
climate

In [None]:
import gc
import pandas as pd
import warnings

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

climate = pd.read_stata(rf"{DATA_PROC}\ClimateShocks_assigned.dta")

dhs = pd.read_stata(rf"{DATA_IN}/DHS/DHSBirthsGlobalAnalysis_04172024.dta")
dhs["ID"] = dhs.index

merged = dhs.merge(climate, on="ID")
dhs = 0
climate = 0


In [None]:
# merged = merged.head(10_000)
merged["lon_climate_2"] = merged["lon_climate"].round(0) 
merged["lon_climate_3"] = merged["lon_climate_2"] - merged["lon_climate_2"].astype(int) % 2 
merged["lat_climate_2"] = merged["lat_climate"].round(0)
merged["lat_climate_3"] = merged["lat_climate_2"] - merged["lat_climate_2"].astype(int) % 2 

merged["ID_cell3"] = merged["lon_climate_3"].astype(str) + "_" + merged["lat_climate_3"].astype(str)
one_hot = pd.get_dummies(merged["ID_cell3"], prefix='ID_cell3')
years_interaction = one_hot.multiply(merged["chb_year"], axis="index")
years_interaction.columns = [f"years_{col}" for col in years_interaction.columns]
months_interaction = one_hot.multiply(merged["chb_year"], axis="index")
months_interaction.columns = [f"months_{col}" for col in months_interaction.columns]
gc.collect()

merged[one_hot.columns] = one_hot
merged[years_interaction.columns] = years_interaction
merged[months_interaction.columns] = months_interaction
gc.collect()

In [None]:
merged[one_hot.columns] = one_hot
merged[years_interaction.columns] = years_interaction
merged[months_interaction.columns] = months_interaction

In [None]:
pd.concat([merged, one_hot])

In [None]:
one_hot.memory_usage().sum() * 1e-9

In [2]:
import pandas as pd
df = pd.read_stata(rf"{DATA_OUT}\DHSBirthsGlobal&ClimateShocks.dta")

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  df = pd.read_stata(rf"{DATA_OUT}\DHSBirthsGlobal&ClimateShocks.dta")


In [22]:
!py -m pip install fastparquet

Collecting fastparquet
  Obtaining dependency information for fastparquet from https://files.pythonhosted.org/packages/2d/58/c579cbdfa257e93f9f6c04a6ec620a42dcd361d70dbc09325a61b7d018b0/fastparquet-2024.2.0-cp311-cp311-win_amd64.whl.metadata
  Downloading fastparquet-2024.2.0-cp311-cp311-win_amd64.whl.metadata (4.2 kB)
Collecting cramjam>=2.3 (from fastparquet)
  Obtaining dependency information for cramjam>=2.3 from https://files.pythonhosted.org/packages/08/32/1f90bee4b86d1b92fb76c26f11db07b5bce7db842fb1cc912ed7f045b696/cramjam-2.8.3-cp311-none-win_amd64.whl.metadata
  Downloading cramjam-2.8.3-cp311-none-win_amd64.whl.metadata (4.3 kB)
Downloading fastparquet-2024.2.0-cp311-cp311-win_amd64.whl (670 kB)
   ---------------------------------------- 0.0/670.5 kB ? eta -:--:--
   ----------------- ---------------------- 286.7/670.5 kB 5.9 MB/s eta 0:00:01
   ---------------------------------------  665.6/670.5 kB 8.3 MB/s eta 0:00:01
   ---------------------------------------- 670.5/670.


[notice] A new release of pip is available: 23.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [24]:
df.to_csv(rf"{DATA_OUT}\DHSBirthsGlobal&ClimateShocks.csv")

In [1]:
import pandas as pd
pd.read_excel(r"Z:\Downloads\serie2016.xls")

Unnamed: 0.1,Unnamed: 0,"RECAUDACION POR IMPUESTO, RECURSOS DE LA SEGURIDAD SOCIAL Y ADUANAS",Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,,AÑO 2016,,,,,,,,,,,,,
1,,Miles de pesos,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,
3,,CONCEPTO,ENERO,FEBRERO,MARZO,ABRIL,MAYO,JUNIO,JULIO,AGOSTO,SEPTIEMBRE,OCTUBRE,NOVIEMBRE,DICIEMBRE,TOTAL
4,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,,"5/ Incluye Decretos N° 93/00, 963/95, 1053/96,...",,,,,,,,,,,,,
74,,6/ No incluye recaudación por Impuestos a las ...,,,,,,,,,,,,,
75,,"7/ Netos de Tasas aduaneras, Otras rec. aduane...",,,,,,,,,,,,,
76,,,,,,,,,,,,,,,
