## Fix IPUMS conflicting shapes

In the IPUMS shapes there are two kinds of errors:
1) Conflicting territories, which are reported as being part of two different countries.
2) Conflicting geometries, where, for some reason, two geometries of a single country overlap, making it ambiguous which one the data comes from.

For the first point, we use the WB official boundaries. For the second, we remove the wrong duplicate (remove PR from USA, remove small polygon for Laos).   

In [None]:
import pandas as pd
import geopandas as gpd

# World Bank administrative boundaries (world_bank_adm2 shapefile); used in the
# cells below as the reference geometry for the Israel and Palestine fixes.
wb = gpd.read_file(r"D:\World Bank\CLIENT v2\Data\Data_raw\world_bank_adm2\world_bank_adm2.shp")
# NOTE(review): `israel` is not referenced again in the visible cells; the same
# shapefile is re-read as `israel_ipums` in the next cell — confirm it is needed.
israel = gpd.read_file(r"D:\World Bank\CLIENT v2\Data\Data_raw\IPUMS Fixed\geo2_il1972_1995.shp")

In [None]:
## ISRAEL
# Load the IPUMS level-2 shapes for Israel and replace every polygon with its
# centroid, so the spatial join below matches points against WB polygons.
israel_ipums = gpd.read_file(r"D:\World Bank\CLIENT v2\Data\Data_raw\IPUMS Fixed\geo2_il1972_1995.shp")
israel_ipums["geometry"] = israel_ipums.centroid
israel_ipums = israel_ipums.set_crs(epsg=4326)

# World Bank polygons for Israel, keeping only the admin codes and the geometry.
israel_WB = wb.loc[wb.ADM0_NAME == "Israel", ["ADM1_CODE", "ADM2_CODE", "geometry"]]

# Build the fix table (IPUMS GEOLEVEL2 -> new code) and write it as a Stata file.
out = (
    israel_WB.sjoin(israel_ipums)[["GEOLEVEL2", "ADM1_CODE"]]
    .rename(columns={"GEOLEVEL2": "geolev2", "ADM1_CODE": "geolev1"})
)
# New code = first six characters of the IPUMS code followed by the WB ADM1
# code with its first character removed.
geolev2_prefix = out["geolev2"].str[:6].astype(str)
adm1_suffix = out["geolev1"].astype(str).str[1:].astype(str)
out["geolev2_new"] = geolev2_prefix + adm1_suffix
out = out.astype(int).drop(columns="geolev1")
out.set_index("geolev2").to_stata(r"D:\World Bank\CLIENT v2\Data\Data_proc\fixes\fix_israel_geo2_adm1.dta")

# Show the data: WB polygons with the IPUMS centroids drawn on top in red.
m = israel_WB.explore()
israel_ipums.explore(m=m, color="red")

In [None]:
import folium

## PALESTINE
palestine_ipums = gpd.read_file(r"D:\World Bank\CLIENT v2\Data\Data_raw\IPUMS Fixed\geo1_ps1997_2017.shp")
# Work on a centroid-only copy; the original polygons stay in `palestine_ipums`.
pal_ipums = palestine_ipums.copy()
pal_ipums["geometry"] = pal_ipums.centroid
pal_ipums = pal_ipums.set_crs(epsg=4326)

# World Bank polygons for "West Bank and Gaza", admin codes and geometry only.
pal_WB = wb.loc[wb.ADM0_NAME == "West Bank and Gaza", ["ADM1_CODE", "ADM2_CODE", "geometry"]]

# Build the fix table (IPUMS GEOLEVEL1 -> WB ADM1_CODE) and write it as a Stata
# file. The WB code is carried over unchanged as the new level-1 code.
out = (
    pal_WB.sjoin(pal_ipums)[["GEOLEVEL1", "ADM1_CODE"]]
    .rename(columns={"GEOLEVEL1": "geolev1", "ADM1_CODE": "geolev1_new"})
    .astype(int)
)
out.set_index("geolev1").to_stata(r"D:\World Bank\CLIENT v2\Data\Data_proc\fixes\fix_palestine_geo1_adm1.dta")

# Show the data: WB polygons with the IPUMS centroids drawn on top in red.
m = pal_WB.explore()
# palestine_ipums.explore(m=m, color="orange")
pal_ipums.explore(m=m, color="red")

folium.LayerControl().add_to(m)  # use folium to add layer control
m


In [None]:
# Visualize overlaps
# Reload the WB boundaries and shrink every polygon with a small negative
# buffer (presumably so polygons that only share a border are not reported).
# NOTE: `wb` keeps the shrunken geometries until it is read from disk again.
wb = gpd.read_file(r"D:\World Bank\CLIENT v2\Data\Data_raw\world_bank_adm2\world_bank_adm2.shp")
wb.geometry = wb.buffer(-0.0001)

# Spatially join the layer with itself to find intersecting polygons, then
# discard the rows where a polygon was matched with itself.
overlaps = wb.sjoin(wb, how="inner", predicate="intersects")
is_other_polygon = overlaps["OBJECTID_left"].ne(overlaps["OBJECTID_right"])
overlaps = overlaps[is_other_polygon]

# Display the overlapping rows
overlaps


# Fix problematic international boundaries

In [None]:
import pandas as pd
import geopandas as gpd

# Re-read the World Bank boundaries from disk. An earlier cell overwrote
# `wb.geometry` with negatively buffered polygons; this restores the originals.
wb = gpd.read_file(r"D:\World Bank\CLIENT v2\Data\Data_raw\world_bank_adm2\world_bank_adm2.shp")

In [None]:
import shapely

# Load the full IPUMS table; its "geometry" column holds WKT text.
ipums = pd.read_parquet(r"D:\World Bank\CLIENT v2\Data\Data_proc\IPUMS_full.parquet")
# Parse every WKT string in one vectorized call (instead of a per-row Python
# lambda); the result keeps the index of the input column.
ipums["geometry"] = gpd.GeoSeries.from_wkt(ipums["geometry"])
ipums = gpd.GeoDataFrame(ipums, geometry='geometry')
ipums = ipums.set_crs(epsg=4326)


In [None]:
import matplotlib.pyplot as plt
import warnings

def fix_IPUMS_conflicting_international_boundaries(wb, ipums, export_maps=False, min_area=0.0001):
    """Fix IPUMS conflicting international boundaries by clipping to the World Bank boundaries.

    For every country listed in ``countries_to_clip``, the IPUMS rows whose
    ``CNTRY_CODE`` matches are clipped with the WB rows whose ``ADM0_CODE``
    matches. Rows whose clipped geometry has an area of ``min_area`` or less
    (or that fall completely outside the WB boundary) are removed. Finally,
    rows of countries "376" and "275" whose ``GEOLEVEL2`` ends in 97, 98 or 99
    are removed.

    The input ``ipums`` is not modified; a new GeoDataFrame is returned.
    Assumes ``ipums`` has a unique index (rows are matched back by index label).

    Parameters:
    wb (GeoDataFrame): World Bank boundaries
    ipums (GeoDataFrame): IPUMS boundaries
    export_maps (bool): if True, save one before/after PNG per country under
        ``{DATA_PROC}/fixes/`` (``DATA_PROC`` must exist at module level)
    min_area (float): area threshold, in the units of the layer's CRS; clipped
        geometries not larger than this are treated as residual slivers

    Returns:
    GeoDataFrame: IPUMS boundaries with the conflicting boundaries clipped to the World Bank boundaries
    """
    countries_to_clip = {
        # Countries with conflicting boundaries
        "Marruecos": {"WB": 169, "IPUMS": "504"},
        "South Sudan": {"WB": 74, "IPUMS": "728"},
        "Sudan": {"WB": 6, "IPUMS": "729"},
        "Egypt": {"WB": 40765, "IPUMS": "818"},
        "Kenya": {"WB": 133, "IPUMS": "404"},
        "Russia": {"WB": 204, "IPUMS": "643"},
        "India": {"WB": 115, "IPUMS": "356"},
        "China": {"WB": 147295, "IPUMS": "156"},
        "Kyrghyzstan": {"WB": 138, "IPUMS": "417"},
    }

    # Work on a copy so the caller's GeoDataFrame is left untouched.
    ipums = ipums.copy()
    geometry_column = ipums.geometry.name

    # All warnings raised by the geometry operations below are silenced.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for country, codes in countries_to_clip.items():
            print(country)
            wbcode = codes["WB"]
            ipumscode = codes["IPUMS"]

            # Clip IPUMS using WB
            country_rows = ipums[ipums.CNTRY_CODE == ipumscode]
            clipped = country_rows.clip(wb[wb.ADM0_CODE == wbcode])

            if export_maps:
                # Plot the original shapes with the clipped outline in red
                fig, ax = plt.subplots(figsize=(10, 10))
                country_rows.plot(ax=ax)
                clipped.plot(ax=ax, facecolor="none", edgecolor="red")
                plt.savefig(f"{DATA_PROC}/fixes/{country}.png")
                # Close the figure so figures do not pile up across countries.
                plt.close(fig)

            # remove areas with residual geometry
            clipped = clipped[clipped.geometry.area > min_area]
            print("Se eliminaron ", len(country_rows) - len(clipped), " registros")

            # Update the working dataframe. Clipping only alters the geometry,
            # so write the new geometry for the surviving rows and really drop
            # the others. (Assigning `clipped` to the whole country slice would
            # leave the discarded rows in place as all-NaN records.)
            removed_index = country_rows.index.difference(clipped.index)
            ipums.loc[clipped.index, geometry_column] = clipped.geometry
            ipums = ipums.drop(index=removed_index)

        # Remove unwanted shapes from Israel & Palestine
        ipums = ipums[~(ipums["GEOLEVEL2"].astype(str).str[-2:].isin(["97", "98", "99"]) & ipums["CNTRY_CODE"].isin(["376", "275"]))]

    return ipums

In [None]:
# Apply the boundary fix before saving: the output name ("..._clipped") implies
# clipped data, but the fix function defined above was never called on `ipums`.
ipums = fix_IPUMS_conflicting_international_boundaries(wb, ipums)
ipums.to_parquet(r"D:\World Bank\CLIENT v2\Data\Data_proc\IPUMS_full_clipped.parquet")

In [None]:
# Interactive check for India: draw the IPUMS shapes (CNTRY_CODE "356") first,
# then add the World Bank polygons for India to the same map.
m = ipums.loc[ipums["CNTRY_CODE"].eq("356")].explore()
wb.loc[wb.ADM0_NAME.eq("India")].explore(m=m)

# add layer control
folium.LayerControl().add_to(m)  # use folium to add layer control
m