# Data Preparation

## Setup

### Library import


In [None]:
import glob
import logging
import os
import random
import shlex
import subprocess
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd
import rasterio as rio
import rioxarray as rxr
from shapely.geometry import Point

In [None]:
# Module-level logger; `create_mbtiles` reports its progress through it.
logger = logging.getLogger(__name__)

# Let INFO records through this logger. No handler is attached here, so whether
# they are actually displayed depends on logging configuration done elsewhere.
logger.setLevel(logging.INFO)

## Utils

**create_mbtiles**

In [None]:
def create_mbtiles(
    source_path: Path,
    output_path: Path,
    layer_name: str,
    max_zoom: int,
    opts="--read-parallel --no-tile-compression -s EPSG:4326 -B4",
):
    """
    Run tippecanoe to build a vector-tile archive from GeoJSON input.

    The command is executed as an argument list (no shell), so paths and layer
    names containing spaces are passed to tippecanoe intact. Because no shell is
    involved, wildcard patterns in source_path (e.g. ``dir/*.geojson``) are
    expanded here with ``glob``.

    ``-z{max_zoom}`` is always appended after ``opts``, i.e. it comes after any
    zoom option already present in ``opts``.

    More info: https://github.com/mapbox/tippecanoe#options

    Args:
        source_path (Path | str): path (or wildcard pattern) of the source geojson
        output_path (Path | str): path to output .mbtiles
        layer_name (str): name of layer in the MBTILE
        max_zoom (int): max zoom level
        opts (str): extra tippecanoe options, written as on a command line

    Returns:
        (int): tippecanoe's exit code (0 on success), or 1 if the command could
        not be built or started.
    """
    try:
        # Expand wildcards ourselves; if nothing matches, pass the path through
        # unchanged so that tippecanoe reports the missing input.
        sources = sorted(glob.glob(str(source_path))) or [str(source_path)]
        cmd = [
            "tippecanoe",
            "-o",
            str(output_path),
            "-l",
            str(layer_name),
            *shlex.split(opts or ""),
            f"-z{max_zoom}",
            *sources,
        ]
        logger.info("Processing: %s", shlex.join(cmd))
        r = subprocess.call(cmd)
    except (OSError, ValueError) as e:
        # OSError: tippecanoe missing / not executable.
        # ValueError: `opts` could not be tokenised (e.g. unbalanced quotes).
        logger.error(e)
        return 1

    if r == 0:
        logger.info("Task created")
    else:
        logger.error("tippecanoe exited with code %s", r)
    return r

**random_point_in_polygon**

In [None]:
def random_point_in_polygon(polygon):
    """
    Generate a random point within a given polygon.

    Candidates are drawn uniformly from the polygon's bounding box and rejected
    until one lies inside the polygon.

    Args:
        polygon (shapely.geometry.Polygon): The polygon within which to generate a random point.
            Any geometry exposing ``bounds``, ``area`` and ``contains`` works.
    Returns:
        shapely.geometry.Point: A random point within the polygon.
    Raises:
        ValueError: If the geometry is empty or has zero area. No candidate could
            ever be accepted, so the sampling loop would otherwise never end.
    """
    if polygon.is_empty or polygon.area == 0:
        raise ValueError("Cannot sample a point inside an empty or zero-area geometry.")

    min_x, min_y, max_x, max_y = polygon.bounds
    while True:
        candidate = Point(random.uniform(min_x, max_x), random.uniform(min_y, max_y))
        if polygon.contains(candidate):
            return candidate

***
## Dhaka
### Settlement extent

In [None]:
# Source rasters; each one lives in `input_path/<dataset key>/`.
file_01 = "GDA_Urban_Phase1_DLR_BGD_Dhaka_Settlement_Extent_and_Change_1985-2015_30m_1_0.tif"
file_02 = (
    "GDA_Urban_Phase1_DLR_BGD_Dhaka_SettlementExtentandChange_HFC_"
    "2016-2022_quarterly_10m_1_0.tif"
)

_historic_key = "01_Settlement_Extent_and_Change_1985-2015"
_recent_key = "02_Settlement_Extent_and_Change_2016-2022"

# Dataset folder name -> raster file name.
datasets = {_historic_key: file_01, _recent_key: file_02}

# Dataset folder name -> years for which a settlement mask is exported.
years = {
    _historic_key: np.arange(1985, 2016),
    _recent_key: np.arange(2016, 2023),
}

# Year -> pixel-value threshold for the 2016-2022 raster:
# 2022 -> 1, 2021 -> 5, ..., 2016 -> 25 (the threshold grows by 4 per year back).
year_relations = {str(year): 1 + 4 * (2022 - year) for year in range(2022, 2015, -1)}

input_path = "../data/raw/04_Products/BGD/Phase_1"
output_path = "../data/raw/Dhaka/GeoTIFFs/Settlement"

# RGB colour painted on settled pixels.
mask_color = [251, 171, 24]

In [None]:
# Export one RGBA GeoTIFF per year showing the settled area in `mask_color`.

# Make sure the destination folder exists before any file is written.
os.makedirs(output_path, exist_ok=True)

for dataset, input_file in datasets.items():
    # Read the raster and reduce it to a 2-D array; the scalar "band"
    # coordinate left behind by squeeze() is not needed.
    da = (
        rxr.open_rasterio(os.path.join(input_path, dataset, input_file))
        .squeeze()
        .drop_vars("band")
    )

    for year in years[dataset]:
        # Select the pixels counted as settled in `year`. The 2016-2022 raster
        # is compared against a per-year threshold, the 1985-2015 raster
        # against the year itself.
        if dataset == "02_Settlement_Extent_and_Change_2016-2022":
            settled = da >= year_relations[str(year)]
        else:
            settled = da <= year

        # Binary mask: 1 where settled (and the source value is non-zero), else 0.
        mask = da.where(settled, 0)
        mask = mask.where(mask == 0, 1)

        # Reproject to 3857
        mask_3857 = mask.rio.reproject("epsg:3857")

        # Reprojection may introduce fill values other than 0 and 1, so derive
        # both colour and alpha from an explicit "== 1" test instead of
        # multiplying the raw values by 255.
        is_settled = mask_3857.values == 1

        # RGBA image: coloured and opaque where settled, fully transparent elsewhere.
        rgba = np.zeros((*mask_3857.shape, 4), dtype=np.uint8)
        rgba[is_settled, :3] = mask_color
        rgba[is_settled, 3] = 255

        # Write the image with the georeferencing of the reprojected mask.
        with rio.open(
            os.path.join(output_path, f"Settlement_{year}.tif"),
            "w",
            driver="GTiff",
            height=rgba.shape[0],
            width=rgba.shape[1],
            count=4,
            dtype=rgba.dtype,
            crs=mask_3857.rio.crs,
            transform=mask_3857.rio.transform(),
        ) as dst:
            # rasterio expects (band, row, col)
            dst.write(rgba.transpose((2, 0, 1)))

### Population density

In [None]:
# Tile the Dhaka 2021 population-density GeoJSON with tippecanoe.
file_path = "../data/processed/Dhaka/population_density_2021.geojson"
output_path = "../data/processed/Dhaka/population_density_2021.mbtiles"
create_mbtiles(
    source_path=file_path,
    output_path=output_path,
    layer_name="population_density_2021",
    max_zoom=16,
    opts="--force --read-parallel -zg --drop-densest-as-needed --extend-zooms-if-still-dropping",
)

***
## South Sudan
### [HydroRIVERS v1.0](https://www.hydrosheds.org/products/hydrorivers/)

In [None]:
# Tile the South Sudan HydroRIVERS GeoJSON with tippecanoe (minimum zoom 4).
file_path = "../data/processed/South Sudan/HydroRIVERS_v10_af_SSD.geojson"
output_path = "../data/processed/South Sudan/HydroRIVERS_v10_af_SSD.mbtiles"
create_mbtiles(
    source_path=file_path,
    output_path=output_path,
    layer_name="HydroRIVERS_v10_af_SSD",
    max_zoom=16,
    opts="--force --read-parallel -zg -Z4 --drop-densest-as-needed --extend-zooms-if-still-dropping",
)

### EOIDs

In [None]:
# Country boundaries, reduced to ISO code, English name and geometry.
countries = gpd.read_file(
    "../data/raw/Climate Resilience/South Sudan/WB_countries_Admin0_10m/WB_countries_Admin0_10m.shp"
)[["ISO_A3", "NAME_EN", "geometry"]]
countries

In [None]:
# The long file name is wrapped with adjacent string literals. A backslash
# continuation inside a single literal would pull the next line's indentation
# into the path.
# NOTE(review): assumes the real file name has a single space between
# "GDA AID" and "activities" - confirm against the file on disk.
df = pd.read_csv(
    "../data/raw/Climate Resilience/South Sudan/"
    "ESA GDA Programme Dashboard (GDA Website)_GDA AID "
    "activities - in process and completed_Geo chart - Sheet1.csv"
)

In [None]:
# Attach ISO code and geometry to every row by matching the CSV's "Country"
# against the boundaries' English name. Rows without a match are kept.
df = df.merge(countries, how="left", left_on="Country", right_on="NAME_EN")
df

In [None]:
gdf = gpd.GeoDataFrame(df)

# One record per EOID: each row contributes `EOIDs` random points placed inside
# that row's geometry.
# NOTE(review): the points are drawn from the unseeded `random` module, so the
# output differs between runs.
new_rows = [
    {
        "Country": row["Country"],
        "ISO_A3": row["ISO_A3"],
        "geometry": random_point_in_polygon(row["geometry"]),
    }
    for _idx, row in gdf.iterrows()
    for _ in range(int(row["EOIDs"]))
]

# Build the point layer with the same CRS as the source frame.
new_gdf = gpd.GeoDataFrame(new_rows, crs=gdf.crs)

In [None]:
# Write the generated random points to GeoJSON.
new_gdf.to_file("../data/processed/EOIDs_random_points.geojson", driver="GeoJSON")

***
## Ukraine
### Looting Locations 

In [None]:
# The path is wrapped with adjacent string literals. A backslash continuation
# inside a single literal would pull the next line's indentation into the path
# (yielding a directory name with leading spaces).
df = pd.read_excel(
    "../data/raw/Fragility Conflict Security/"
    "UC4_LandAndConflict/D3.4/D3.43/Locations Looting Ukraine.xlsx"
)
df.head()

Create a GeoDataFrame from the Excel file containing the looting locations.

In [None]:
# Rows without coordinates cannot be turned into points.
df = df.dropna(subset=["Latitude", "Longitude"])

# Build all point geometries in one vectorised call (x = longitude, y = latitude).
# The column is stored on `df` as well because later cells drop it by name.
df["geometry"] = gpd.points_from_xy(df["Longitude"], df["Latitude"])

# Convert the DataFrame to a GeoDataFrame.
# NOTE(review): no CRS is set here; the coordinates look like geographic
# lon/lat - consider passing crs="EPSG:4326" once confirmed.
gdf = gpd.GeoDataFrame(df, geometry="geometry")

gdf.head()

**Save as `GeoJSON`**

In [None]:
# Write the looting-location points to GeoJSON.
gdf.to_file("../data/processed/Ukraine/looting_locations.geojson", driver="GeoJSON")

**Looting Locations in Ukraine**

In [None]:
# Count the records per date: drop the coordinate/source/geometry columns,
# count the remaining values for each "Date", then expose the result as
# "date" / "count" columns.
df_ukraine = (
    df.drop(columns=["Latitude", "Longitude", "Source", "geometry"])
    .groupby("Date")
    .count()
    .reset_index()
    .rename(columns={"Date": "date", "Location": "count"})
)
df_ukraine.plot(kind="line", x="date", y="count")

In [None]:
# Save the per-date counts for all locations (without the index column).
df_ukraine.to_csv(
    "../data/processed/Ukraine/looting_land_grabbing_mentions_ukraine.csv", index=False
)

**Looting Locations in Kherson**

In [None]:
# Same per-date count as above, restricted to rows whose Location is "Kherson".
is_kherson = df["Location"] == "Kherson"
df_kherson = (
    df[is_kherson]
    .drop(columns=["Latitude", "Longitude", "Source", "geometry"])
    .groupby("Date")
    .count()
    .reset_index()
    .rename(columns={"Date": "date", "Location": "count"})
)
df_kherson.plot(kind="line", x="date", y="count")

In [None]:
# Save the per-date counts for Kherson (without the index column).
df_kherson.to_csv(
    "../data/processed/Ukraine/looting_land_grabbing_mentions_kherson.csv", index=False
)

### Seasonal Crop Monitoring 

In [None]:
# The path is wrapped with adjacent string literals. A backslash continuation
# inside a single literal would pull the next line's indentation into the path.
gdf = gpd.read_file(
    "../data/raw/Fragility Conflict Security/UC4_LandAndConflict/"
    "D3.4/D3.49/Harvestdates_zaporizhzya/test_shp.shp"
)

# Number of harvests per feature = how many of the Harvest_* columns are filled.
harvest_cols = ["Harvest_1", "Harvest_2", "Harvest_3", "Harvest_4"]
gdf["#_of_harvest"] = gdf[harvest_cols].notnull().sum(axis=1)

# Keep 'geometry' as the last column.
cols = [col for col in gdf.columns if col != "geometry"] + ["geometry"]
gdf = gdf[cols]
gdf.head()

**Save as `GeoJSON`**

In [None]:
# Reproject to EPSG:4326 and write the result to GeoJSON.
gdf = gdf.to_crs(epsg=4326)
gdf.to_file(
    "../data/processed/Ukraine/seasonal_crop_monitoring.geojson", driver="GeoJSON"
)

**Create `MBTiles`**

In [None]:
# Tile the seasonal crop monitoring GeoJSON with tippecanoe (minimum zoom 7).
file_path = "../data/processed/Ukraine/seasonal_crop_monitoring.geojson"
output_path = "../data/processed/Ukraine/seasonal_crop_monitoring.mbtiles"
create_mbtiles(
    source_path=file_path,
    output_path=output_path,
    layer_name="Seasonal Crop Monitoring",
    max_zoom=16,
    opts="--force --read-parallel -zg -Z7 --drop-densest-as-needed --extend-zooms-if-still-dropping",
)

***
## Nigeria
### Ground Displacement 

In [None]:
# Load the Warri ground-displacement layer from the processed folder.
gdf = gpd.read_file(
    "../data/processed/Nigeria/NGA_Warri_Ground_Displacement_SQ_2D_SNT_VERT_1_0.geojson"
)

In [None]:
# Columns from position 6 up to (but excluding) the last one are treated as the
# per-date columns and removed.
# (A disabled draft summed them into a 'VCGD' column first and then moved
# 'geometry' to the end; neither step is performed.)
date_columns = gdf.columns[6:-1].tolist()
gdf = gdf.drop(columns=date_columns)

gdf.head()

**Save as `GeoJSON`**

In [None]:
# Write the reduced layer to GeoJSON.
# NOTE(review): this is the same path the layer was read from, so the input is
# overwritten; re-running the positional column drop on the already reduced
# file would remove a different set of columns.
gdf.to_file(
    "../data/processed/Nigeria/NGA_Warri_Ground_Displacement_SQ_2D_SNT_VERT_1_0.geojson",
    driver="GeoJSON",
)

### Land Use / Land Cover

**Create `MBTiles`**

In [None]:
# Tile the Warri land use / land cover GeoJSON with tippecanoe (minimum zoom 10).
file_path = "../data/processed/Nigeria/NGA_Warri_LULC_2022_1_0.geojson"
output_path = "../data/processed/Nigeria/NGA_Warri_LULC_2022_1_0.mbtiles"
create_mbtiles(
    source_path=file_path,
    output_path=output_path,
    layer_name="Land Use Land Cover",
    max_zoom=16,
    opts="--force --read-parallel -zg -Z10 --drop-densest-as-needed --extend-zooms-if-still-dropping",
)

***
## Pakistan
### Transport Network

In [None]:
# Load the Sargodha transport network, label features that have no road type
# as "Transport", and reproject to EPSG:4326.
gdf = gpd.read_file(
    "../data/raw/Urban Sustainability/04_Products/PAK/Phase_1/02_Transport_Network_2022/"
    "GDA_Urban_Phase1_GAF_PAK_Sargodha_Transport_Network_2022_1_0/"
    "GDA_Urban_Phase1_GAF_PAK_Sargodha_Transport_Network_2022_1_0.shp"
)
gdf = gdf.assign(Road_Type=gdf["Road_Type"].fillna("Transport")).to_crs(epsg=4326)

In [None]:
# Show the distinct road types present after filling the missing values.
gdf["Road_Type"].unique()

**Save as `GeoJSON`**

In [None]:
# Write the transport network to GeoJSON.
gdf.to_file(
    "../data/processed/Pakistan/PAK_Sargodha_Transport_Network_2022_1_0.geojson",
    driver="GeoJSON",
)

### Land Use / Land Cover
**Create `MBTiles`**

In [None]:
# Tile the Sargodha land use / land cover GeoJSON with tippecanoe (minimum zoom 10).
file_path = "../data/processed/Pakistan/PAK_Sargodha_LULC_2022_1_0.geojson"
output_path = "../data/processed/Pakistan/PAK_Sargodha_LULC_2022_1_0.mbtiles"
create_mbtiles(
    source_path=file_path,
    output_path=output_path,
    layer_name="Land Use Land Cover",
    max_zoom=16,
    opts="--force --read-parallel -zg -Z10 --drop-densest-as-needed --extend-zooms-if-still-dropping",
)

### Surface Urban Heat Island Intensity 

In [None]:
# Load the Sargodha LULC layer that carries the SUHII attributes.
gdf = gpd.read_file(
    "../data/raw/Urban Sustainability/04_Products/PAK/Phase_1/"
    "04_Surface_Urban_Heat_Island_Intensity_2021/"
    "GDA_Urban_Phase1_GAF_PAK_Sargodha_LULCinclSUHII_2021_1_0/"
    "GDA_Urban_Phase1_GAF_PAK_Sargodha_LULCinclSUHII_2021_1_0.shp"
)

In [None]:
# Category names, ordered from the lowest to the highest 'SUHII_AVG' bin.
labels = [
    "coolest",
    "moderately warmer",
    "warmer",
    "moderately hotter",
    "hotter",
    "hottest",
]

# Quantile-bin 'SUHII_AVG' into as many bins as there are labels (each bin
# holds roughly the same number of records) and store the label of each
# record, as a plain string, in the 'SUHII_Category' column.
suhii_bins = pd.qcut(gdf["SUHII_AVG"], q=len(labels), labels=labels)
gdf["SUHII_Category"] = suhii_bins.astype(str)

# Keep 'geometry' as the last column, then switch the CRS to EPSG 4326.
cols = [col for col in gdf.columns if col != "geometry"] + ["geometry"]
gdf = gdf[cols].to_crs(epsg=4326)

In [None]:
# Preview the first rows, including the new 'SUHII_Category' column.
gdf.head()

**Save as `GeoJSON`**

In [None]:
# Write the categorised SUHII layer to GeoJSON.
gdf.to_file(
    "../data/processed/Pakistan/PAK_Sargodha_LULCinclSUHII_2021_1_0.geojson",
    driver="GeoJSON",
)

**Create `MBTiles`**

In [None]:
# Tile the Sargodha SUHII GeoJSON with tippecanoe (minimum zoom 10).
# NOTE(review): the layer name is the same "Land Use Land Cover" used for the
# LULC tiles - confirm that consumers of these tiles expect it.
file_path = "../data/processed/Pakistan/PAK_Sargodha_LULCinclSUHII_2021_1_0.geojson"
output_path = "../data/processed/Pakistan/PAK_Sargodha_LULCinclSUHII_2021_1_0.mbtiles"
create_mbtiles(
    source_path=file_path,
    output_path=output_path,
    layer_name="Land Use Land Cover",
    max_zoom=16,
    opts="--force --read-parallel -zg -Z10 --drop-densest-as-needed --extend-zooms-if-still-dropping",
)