# WB

In [1]:
import gc
import test_tools
import numpy as np
import pandas as pd
import geopandas as gpd
from procesa_bases import load_WB_country_data

# Project locations.
# Every Windows path is a raw string: in a normal literal "\W" and "\C" are
# invalid escape sequences, which Python reports with a warning.
PATH = r"D:\World Bank\CLIENT v2"
DATA_RAW = rf"{PATH}\Data\Data_raw"
DATA_PROC = rf"{PATH}\Data\Data_proc"
DATA_OUT = rf"{PATH}\Data\Data_out"
GPW_PATH = r"D:\Datasets\Gridded Population of the World"

  PATH = "D:\World Bank\CLIENT v2"
  PATH = "D:\World Bank\CLIENT v2"


## Generate map with zone labels (adm0, adm1, adm2)

In [None]:
# Key and name columns shared by the ID dataset and the raw WB adm2 dataset
adm_code_cols = ["adm0_code", "adm1_code", "adm2_code"]
adm_name_cols = ["adm0_name", "adm1_name", "adm2_name"]

# ID dataset: lower-case the column names, keep "ID" capitalised, drop "objectid"
gdf = gpd.read_feather(r"D:\World Bank\CLIENT v2\Data\Data_proc\WB_country_IDs.feather")
gdf.columns = gdf.columns.str.lower()
gdf = gdf.rename(columns={"id":"ID"}).drop(columns="objectid")

# Raw WB adm2 dataset: source of the administrative names
gdf_raw = load_WB_country_data()
gdf_raw.columns = gdf_raw.columns.str.lower()
gdf_raw = gdf_raw[adm_code_cols + adm_name_cols + ["geometry"]]
assert not gdf_raw.duplicated(subset=adm_code_cols).any(), "There are duplicated entries in the raw dataset!!"

# Outer merge with an indicator: every code triple must be present on both sides
gdf = gdf.merge(
    gdf_raw.drop(columns="geometry"),
    how="outer",
    on=adm_code_cols,
    indicator=True,
    validate="1:1",
)
assert gdf["_merge"].eq("both").all(), "There are problems with the merge!!"
gdf = gdf.drop(columns="_merge")

# Export for the webpage without the ID column
gdf.drop(columns="ID").to_csv(r"D:\World Bank\CLIENT v2\Data\Data_out\for webpage\WB_map.csv", index=False)

In [43]:
# Cast the ID and admin codes to categorical dtype (more memory efficient once
# the dataset is expanded), then index the frame by ID and drop the name columns.
# NOTE: this cell is not re-runnable on its own: after the first run "ID" is the
# index and the name columns are gone.
for code_col in ("ID", "adm0_code", "adm1_code", "adm2_code"):
    gdf[code_col] = gdf[code_col].astype("category")

gdf = gdf.set_index("ID").drop(columns=["adm0_name", "adm1_name", "adm2_name"])

## Generate climate shock data

In [5]:
# Reload the raw WB adm2 shapefile, keeping only the admin codes and names
# (adm0/adm1/adm2 "_code" columns first, then the "_name" columns).
# NOTE(review): gdf_raw is not referenced again in the cells below; confirm it is still needed.
gdf_raw = gpd.read_file(r"D:\World Bank\CLIENT v2\Data\Data_raw\world_bank_adm2\world_bank_adm2.shp")
gdf_raw.columns = gdf_raw.columns.str.lower()
gdf_raw = gdf_raw[[f"adm{level}_{field}" for field in ("code", "name") for level in (0, 1, 2)]]

In [44]:
import pandas as pd
import itertools


def expand_dataset(df, gdf):
    """Expand ``df`` to the full grid of region x year x variable x threshold x measure.

    Parameters
    ----------
    df : pandas.DataFrame
        Data indexed by ("ID", "year", "variable", "threshold", "measure").
        The "year", "variable", "threshold" and "measure" index levels must be
        categorical, because their ``.categories`` define the grid.
    gdf : DataFrame with a categorical index holding the region IDs and a
        "geometry" column. Its remaining columns are attached to every grid row.

    Returns
    -------
    The expanded frame (index reset, "ID" dropped) after it has been passed
    through ``test_tools.assert_correct_admcodes``. Grid rows without a
    match in ``df`` carry missing values in the columns coming from ``df``.
    """
    # Categories of every dimension; the region IDs come from the index of gdf
    categories = {
        level: df.index.get_level_values(level).categories
        for level in ("year", "variable", "threshold", "measure")
    }
    categories["ID"] = gdf.index.categories

    # One single-column categorical frame per dimension, listed in cross-merge order
    frames = [
        pd.DataFrame({level: categories[level]}, dtype="category")
        for level in ("year", "variable", "ID", "threshold", "measure")
    ]

    # Build the cartesian product one dimension at a time
    grid = frames[0]
    for frame in frames[1:]:
        grid = grid.merge(frame, how="cross")
    grid = grid.set_index(["ID", "year", "variable", "threshold", "measure"])

    # Attach the non-geometry columns of gdf (the admin codes) to every grid row
    region_columns = gdf.drop(columns=["geometry"])
    grid = grid.join(region_columns, how="left", on="ID", validate="m:1")

    # Bring in the observed data; columns already present in the grid get a "_y" suffix
    expanded = grid.join(df, how="left", validate="1:1", rsuffix="_y")
    expanded = expanded.reset_index().drop(columns="ID")

    return test_tools.assert_correct_admcodes(expanded)

In [None]:
from importlib import reload

## Shock files generated by Nico (original note: "Generados Nico")
# For every shock: load the long CSV, attach the admin codes from gdf, expand it
# to the full grid with expand_dataset, validate it and export it for the webpage.
#
# Explicit dtypes keep the load cheap. ID is read as int64 and cast to category
# afterwards. The adm0/adm1/adm2 codes are deliberately not read from the CSV;
# they are taken from gdf further down.
dtypes = {"year": np.int16, "variable":"category", "threshold":"category", "area_affected":np.float32, "population_affected":np.float32, "ID":np.int64}

for shock in ["floods", "drought", "hurricanes", "intenserain", "heatwaves", "coldwaves"]:
    print(shock)
    df = pd.read_csv(
        rf"D:\World Bank\CLIENT v2\Data\Data_out\WB_{shock}_long.csv",
        dtype=dtypes, 
        usecols=dtypes.keys(),
    )
        
    # Set ID to categorical dtype (done after loading as int so it matches the categories of gdf)
    df["ID"] = df["ID"].astype("category")
    
    # Stack the two measure columns (area_affected, population_affected) into
    # "measure" / "value" rows
    df = df.melt(id_vars=["ID", "year", "variable", "threshold"], var_name="measure", value_name="value")

    # Categorical dtypes and an ID index make the joins below faster
    df["measure"] = df["measure"].astype("category")
    df["year"] = df["year"].astype("category")
    df = df.set_index(["ID"])    
    
    # Add adm0, adm1 and adm2 codes: gdf is indexed by ID, the inner join keeps
    # only IDs present on both sides
    df = gdf.drop(columns=["geometry"]).join(df, on=["ID"], how="inner", validate="1:m")
    df = df.reset_index()

    # Set the index used by expand_dataset.
    #   A categorical column with zero categories is first recast to float, filled
    #   with 0 and turned back into a category - presumably so that expand_dataset
    #   does not build an empty cross product (TODO confirm).
    index = ["ID", "year", "variable", "threshold", "measure"]
    for col in index:
        if (df[col].dtype == "category"):
            if (df[col].cat.categories.shape[0]==0):
                df[col] = df[col].astype(float).fillna(0)
                df[col] = df[col].astype("category")

    df = df.set_index(index)
    df = expand_dataset(df, gdf)

    # Test the output
    test_tools.assert_correct_colnames(df)
    test_tools.assert_correct_shape(df, gdf)

    # Export
    df.to_csv(rf"D:\World Bank\CLIENT v2\Data\Data_out\for webpage\WB_{shock}.csv", index=False)

    # Release the expanded frame before loading the next shock
    df = None
    gc.collect()    

In [None]:
# Sanity check: validate every exported shock file against the exported map.
# NOTE: this rebinds `gdf` to the plain CSV version of the map, so the
# ID-indexed `gdf` built above is no longer available after this cell.
from importlib import reload

reload(test_tools)
gdf = pd.read_csv(r"D:\World Bank\CLIENT v2\Data\Data_out\for webpage\WB_map.csv")

for shock in ("floods", "drought", "hurricanes", "intenserain", "heatwaves", "coldwaves"):
    print(f"Verifying {shock}")
    df = pd.read_csv(rf"D:\World Bank\CLIENT v2\Data\Data_out\for webpage\WB_{shock}.csv")
    test_tools.validate_dataset_merge(df, gdf, dataset_name="climate")

# IPUMS

In [2]:
import pandas as pd
import geopandas as gpd

# IPUMS polygons: drop the ID, rename the IPUMS keys to adm0/adm1/adm2 and add
# placeholder name columns.
gdf_full = (
    gpd.read_feather(r"D:\World Bank\CLIENT v2\Data\Data_proc\IPUMS_country_IDs.feather")
    .drop(columns=["ID"])
    .rename(columns={"CNTRY_CODE":"adm0", "GEOLEVEL1":"adm1", "GEOLEVEL2":"adm2"})
    .assign(
        adm0_name="To be filled",
        adm1_name="To be filled",
        adm2_name="To be filled",
    )
)
ids = ["adm0", "adm2"]


## National

In [None]:
import os

# Build the national HC dataset: read every HC_national_data*.csv file, collapse
# the s3*/s4* component columns into single "s3"/"s4" columns, and export the
# result for the webpage.
path = r"D:\World Bank\CLIENT v2\Data\Data_out\HC Treatment Complete"

# Sorted so the concatenation order (and the exported row order) is deterministic
files = sorted(f for f in os.listdir(path) if "HC_national_data" in f and f.endswith(".csv"))
assert files, f"No HC_national_data CSV files found in {path}"

s3cols = ["s3a", "s3b", "s3c", "s3d", "s3f"]
s4cols = ["s4a", "s4b", "s4c"]

dfs = []
for file in files:
    df = pd.read_csv(rf"{path}\{file}")

    for target, cols in (("s3", s3cols), ("s4", s4cols)):
        # At most one component may be set per row, otherwise the coalesce below
        # would silently pick the first one. Checked once per file, and the
        # message shows the rows that break the rule for THIS group of columns.
        n_filled = df[cols].notna().sum(axis=1)
        assert (n_filled <= 1).all(), (
            f"{file}: more than one of {cols} is set in the same row:\n{df[n_filled > 1]}"
        )

        # Coalesce the component columns into a single column
        df[target] = pd.NA
        for col in cols:
            df[target] = df[target].fillna(df[col])

    dfs.append(df)

# ignore_index avoids duplicate row labels, which the .loc assignment below relies on
df = pd.concat(dfs, ignore_index=True)
all_missing = df.columns[df.isna().all()].tolist()
assert not all_missing, f"Columns with no data at all: {all_missing}"

# Keep and order the output variables; this also discards the s3*/s4* component columns
df = df[["adm0", "s1", "s2", "s3", "s4", "s5", "outcome", "new", "v"]]
# NOTE(review): "status" is not selected above, so no "treatment" column is
# exported here - confirm that is intended.
df = df.rename(columns={"new":"time", "v": "value"})

# Hurricane thresholds are divided by 100 - presumably to put them on the same
# scale as the other shocks (TODO confirm)
is_hurricane = df["s1"] == "Hurricane"
df.loc[is_hurricane, "s5"] = df.loc[is_hurricane, "s5"] / 100

# Keep only the countries present in the IPUMS polygons
df = df.merge(gdf_full[["adm0"]].drop_duplicates(), on=["adm0"], validate="m:1")
print(f"Hay datos de {df.adm0.unique().size} países")
df.to_csv(r"D:\World Bank\CLIENT v2\Data\Data_out\for webpage\HC_national_data.csv", index=False)

In [8]:
# Display labels for the selectors s1-s5.
# For s3 and s4 the label depends on the shock, so those entries map a shock
# name to its label.
labels = dict(
    s1="Shock",
    s2="Weight",
    s3={
        "Cold wave": "Temperature <0 °C",
        "Heat wave": "Degrees (°C)",
        "Drought": "Drought indicator",
        "Intense rain": "Number of days",
        "Hurricane": "Category",
    },
    s4={
        "Cold wave": "Standard Deviations from historical mean",
        "Heat wave": "Standard Deviations from historical mean",
        "Drought": "Standard Deviations from historical mean",
        "Intense rain": "Rainfall (mm)",
        # FIXME: convert to km
        "Hurricane": "Distance from center of the storm (degrees)",
    },
    s5=r"Threshold (% affected)",
)

## ADM2

In [52]:
# Show the list of input files.
# NOTE(review): `files` is assigned by the loading cells (the ADM2 one is further
# down), so this cell only works after one of them has run - hidden state.
files

['HC_geodata_category.csv',
 'HC_geodata_cwa.csv',
 'HC_geodata_flooded.csv',
 'HC_geodata_hwa.csv',
 'HC_geodata_ia.csv',
 'HC_geodata_sp.csv']

In [None]:
import os
from tqdm import tqdm 

# Build the ADM2 HC dataset: read every HC_geodata*.csv file, collapse the
# s3*/s4* component columns into single "s3"/"s4" columns, and export the
# result for the webpage.
path = r"D:\World Bank\CLIENT v2\Data\Data_out\HC Treatment Complete"

# Sorted so the concatenation order (and the exported row order) is deterministic
files = sorted(f for f in os.listdir(path) if "HC_geodata" in f and f.endswith(".csv"))
assert files, f"No HC_geodata CSV files found in {path}"

s3cols = ["s3a", "s3b", "s3c", "s3d", "s3f"]
s4cols = ["s4a", "s4b", "s4c"]

dfs = []
for file in tqdm(files):
    df = pd.read_csv(rf"{path}\{file}")

    for target, cols in (("s3", s3cols), ("s4", s4cols)):
        # At most one component may be set per row, otherwise the coalesce below
        # would silently pick the first one. Checked once per file, and the
        # message shows the rows that break the rule for THIS group of columns.
        n_filled = df[cols].notna().sum(axis=1)
        assert (n_filled <= 1).all(), (
            f"{file}: more than one of {cols} is set in the same row:\n{df[n_filled > 1]}"
        )

        # Coalesce the component columns into a single column
        df[target] = pd.NA
        for col in cols:
            df[target] = df[target].fillna(df[col])

    dfs.append(df)

# ignore_index avoids duplicate row labels, which the .loc assignment below relies on
df = pd.concat(dfs, ignore_index=True)
all_missing = df.columns[df.isna().all()].tolist()
assert not all_missing, f"Columns with no data at all: {all_missing}"

# Keep and order the output variables; this also discards the s3*/s4* component columns
df = df[["adm0", "adm1", "adm2", "s1", "s2", "s3", "s4", "s5", "outcome", "status", "diftime"]]
df = df.rename(columns={"status":"treatment_sub", "diftime":"diff"})

# Hurricane thresholds are divided by 100 - presumably to put them on the same
# scale as the other shocks (TODO confirm)
is_hurricane = df["s1"] == "Hurricane"
df.loc[is_hurricane, "s5"] = df.loc[is_hurricane, "s5"] / 100

# Keep only the rows whose (adm0, adm2) exists in the IPUMS polygons. The result
# has to be assigned back, otherwise the filter never reaches the export.
df = df.merge(gdf_full[["adm0", "adm2"]], on=["adm0", "adm2"], validate="m:1", how="inner")
print(f"Hay datos de {df.adm0.unique().size} países")

df.to_csv(r"D:\World Bank\CLIENT v2\Data\Data_out\for webpage\HC_geo_data.csv", index=False)

In [49]:
# Sanity check: validate the exported ADM2 data against the exported map.
# NOTE(review): the output recorded for this cell ends in a MergeError (merge
# keys not unique in the right dataset) - the check was failing when last run.
from importlib import reload

reload(test_tools)
gdf, df = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"D:\World Bank\CLIENT v2\Data\Data_out\for webpage\HC_geo_map.csv",
        r"D:\World Bank\CLIENT v2\Data\Data_out\for webpage\HC_geo_data.csv",
    )
)
test_tools.validate_hc_merge(df, gdf)

  df = pd.read_csv(r"D:\World Bank\CLIENT v2\Data\Data_out\for webpage\HC_geo_data.csv")


Checking groups:   0%|          | 0/13650 [00:00<?, ?it/s]

Number of polygons without data: 13117
Number of polygons without data: 13068
Number of polygons without data: 12933
Number of polygons without data: 13019
Number of polygons without data: 13069
Number of polygons without data: 13085
Number of polygons without data: 13971
Number of polygons without data: 13331
Number of polygons without data: 13521
Number of polygons without data: 13472
Number of polygons without data: 13200
Number of polygons without data: 13280
Number of polygons without data: 13244
Number of polygons without data: 13284
Number of polygons without data: 13215
Number of polygons without data: 13238
Number of polygons without data: 13350
Number of polygons without data: 13298
Number of polygons without data: 13327


MergeError: Merge keys are not unique in right dataset; not a one-to-one merge

In [21]:
# For one fixed hurricane scenario, check country by country that every data
# row has a polygon, and print how many polygons have no data.
# NOTE(review): the query uses the s3f / s4c component columns, which the ADM2
# export cell removes from df - confirm which `df` this cell is meant to run on.
scenario = "s1=='Hurricane' and s2=='Area' and s3f==3 and s4c==10 and s5==0"

for code in tqdm(df.adm0.unique()):
    arg_df = df.query(f"adm0=={code} and {scenario}")
    arg_gdf = gdf_full.query(f"adm0=={code}")
    merged = arg_gdf.merge(
        arg_df,
        on=["adm0", "adm1", "adm2"],
        how="outer",
        indicator=True,
        validate="1:1",
    )

    merge_status = merged["_merge"]
    # Data without a polygon is never acceptable
    assert (merge_status != "right_only").all()

    # Polygons without data are only reported
    n_unmatched = (merge_status != "both").sum()
    if n_unmatched:
        print(n_unmatched)

  5%|▍         | 4/87 [00:00<00:16,  5.14it/s]

1


  7%|▋         | 6/87 [00:01<00:16,  4.92it/s]

1


 13%|█▎        | 11/87 [00:01<00:11,  6.61it/s]

2


 15%|█▍        | 13/87 [00:02<00:12,  5.90it/s]

1
1


 23%|██▎       | 20/87 [00:03<00:09,  6.74it/s]

1


 29%|██▊       | 25/87 [00:04<00:09,  6.44it/s]

8


 32%|███▏      | 28/87 [00:04<00:08,  6.65it/s]

1


 36%|███▌      | 31/87 [00:05<00:10,  5.44it/s]

2


 43%|████▎     | 37/87 [00:06<00:08,  5.87it/s]

1
1


 49%|████▉     | 43/87 [00:07<00:06,  6.73it/s]

4


 53%|█████▎    | 46/87 [00:07<00:06,  5.86it/s]

3


 56%|█████▋    | 49/87 [00:08<00:06,  5.71it/s]

1


 59%|█████▊    | 51/87 [00:08<00:06,  5.97it/s]

1
1


 63%|██████▎   | 55/87 [00:09<00:05,  5.50it/s]

1
1


 70%|███████   | 61/87 [00:10<00:04,  5.78it/s]

2


 76%|███████▌  | 66/87 [00:11<00:03,  6.44it/s]

1


 86%|████████▌ | 75/87 [00:12<00:01,  6.84it/s]

1


 91%|█████████ | 79/87 [00:13<00:01,  6.33it/s]

1


 93%|█████████▎| 81/87 [00:13<00:00,  6.16it/s]

1


 94%|█████████▍| 82/87 [00:13<00:00,  6.15it/s]

1


 95%|█████████▌| 83/87 [00:14<00:00,  4.96it/s]

1


100%|██████████| 87/87 [00:14<00:00,  5.87it/s]


## Map

In [96]:
import pandas as pd
import geopandas as gpd
import procesa_bases 

# NOTE: this rebinds gdf_full WITHOUT the adm0/adm1/adm2 renames applied in the
# IPUMS section, so afterwards it carries the original IPUMS column names.
gdf_full = gpd.read_feather(
    r"D:\World Bank\CLIENT v2\Data\Data_proc\IPUMS_country_IDs.feather"
).drop(columns=["ID"])

WB_country = procesa_bases.load_WB_country_data()
IPUMS_country = procesa_bases.load_IPUMS_country_data(WB_country, keep_name=True)
IPUMS_country = IPUMS_country.clip(WB_country.total_bounds)

# Every polygon must be present in both the ID dataset and the IPUMS map
assert (
    gdf_full.merge(
        IPUMS_country,
        on=["GEOLEVEL1", "GEOLEVEL2", "CNTRY_CODE"],
        how="outer",
        indicator=True,
        validate="1:1",
    )["_merge"]
    .eq("both")
    .all()
)

# Rename the columns to the format used by the webpage and drop the ID
IPUMS_country = IPUMS_country.rename(
    columns={
        "CNTRY_CODE": "adm0",
        "GEOLEVEL1": "adm1",
        "GEOLEVEL2": "adm2",
        "CNTRY_NAME": "adm0_name",
        "ADMIN_NAME": "adm2_name",
    }
).drop(columns="ID")

In [108]:
# Export the map for the webpage
IPUMS_country.to_csv(
    path_or_buf=r"D:\World Bank\CLIENT v2\Data\Data_out\for webpage\HC_geo_map.csv",
    index=False,
)

## Export Labels

In [13]:
# Load the selector labels spreadsheet (note: this rebinds `df`)
df = pd.read_excel(io=r"D:\World Bank\CLIENT v2\Data\Data_raw\button_labels.xlsx")

In [14]:
# Export the labels as CSV for the webpage
df.to_csv(
    path_or_buf=r"D:\World Bank\CLIENT v2\Data\Data_out\for webpage\selector_labels.csv",
    index=False,
)

## Check consistency

In [None]:
from shapely import wkt

# Load the exported map and the exported ADM2 data to check that they merge correctly.
# NOTE(review): this reads IPUMS_map.csv, while the map cell of this notebook
# writes HC_geo_map.csv - confirm the file name is still current.
gdf = gpd.GeoDataFrame(
    pd.read_csv(r"D:\World Bank\CLIENT v2\Data\Data_out\for webpage\IPUMS_map.csv").assign(
        # The CSV stores geometries as WKT text; parse them back into shapes
        geometry=lambda frame: frame["geometry"].apply(wkt.loads)
    ),
    crs="epsg:4326",
)

df_adm2 = pd.read_csv(r"D:\World Bank\CLIENT v2\Data\Data_out\for webpage\HC_geo_data.csv")

In [None]:
# How many rows match on both sides vs. only on the map / only in the data
gdf.merge(
    df_adm2,
    how="outer",
    on=["adm0", "adm2"],
    validate="1:m",
    indicator=True,
)["_merge"].value_counts()

In [None]:
# Outer-merge the IPUMS polygons with the exported ADM2 data and tabulate, per
# country, the keys that are missing on one side.
# Uses df_adm2 (loaded above for this check): in top-to-bottom order `df` holds
# the selector labels spreadsheet at this point, not the ADM2 data.
merged = gdf_full[["CNTRY_CODE", "GEOLEVEL2", "geometry"]].merge(
    df_adm2,
    left_on=["CNTRY_CODE", "GEOLEVEL2"],
    right_on=["adm0", "adm2"],
    how="outer",
    indicator=True,
)
# NOTE(review): these country codes are excluded from the check; the reason is
# not documented here - confirm.
merged = merged[~merged.CNTRY_CODE.isin([231,276,356,368,376,504,566,586,662])]

unmatched = merged[merged._merge != "both"]
pd.crosstab(unmatched.CNTRY_CODE, unmatched._merge)

In [None]:
# Palestine is missing because it only has "after" data. The rest: 10/10.

import folium

# Interactive map of the polygons that did not match on both sides (one per country/GEOLEVEL2)
m = (
    merged[merged["_merge"] != "both"]
    .drop_duplicates(subset=["CNTRY_CODE", "GEOLEVEL2"])
    .explore()
)

# Add a layer control to the map
folium.LayerControl().add_to(m)

m