In [88]:
import numpy as np
import pandas as pd
import re
import os
import csv
import json

import rasterio
from rasterio.plot import show
from rasterio.warp import calculate_default_transform, reproject, Resampling
from rasterio.features import geometry_mask

import geopandas as gpd
from pyproj import CRS

from matplotlib import pyplot as plt

# 1 Reprojection

**CRS of Land as reprojection reference.**


In [7]:
# Define years list
years = range(2000, 2024)

In [6]:
# Define input and output paths
data_path = 'Datasets_Hackathon'

reproject_path = 'Datasets_Hackathon/Reprojected_Data'
if not os.path.exists(reproject_path):
        os.makedirs(reproject_path)
        
csv_path = 'For_dashboard'
if not os.path.exists(csv_path):
        os.makedirs(csv_path)

## 1.1 Dynamic Data

In [None]:
"""
Reprojecting all dynamic data
"""

# List of data categories with their appropriate resampling methods and filename formats
datasets = [
    {'short_name':'land', 'name': 'Land_Cover_Data', 'file_format': '{year}LCT.tif', 'is_reference': True},
    {'short_name':'rainfall', 'name': 'Climate_Precipitation_Data', 'file_format': '{year}R.tif', 'resampling': Resampling.bilinear},
    {'short_name':'pop', 'name': 'Gridded_Population_Density_Data', 'file_format': 'Assaba_Pop_{year}.tif','resampling': Resampling.bilinear},
    {'short_name':'popdens', 'name': 'Gridded_Population_Density_Data', 'file_format': 'mrt_pd_{year}_1km.tif','resampling': Resampling.bilinear},
    {'short_name':'gpp', 'name': 'Gross_Primary_Production_GPP', 'file_format': '{year}_GP.tif','resampling': Resampling.nearest},
]

# Dictionary to store data for all years and all datasets
all_data = {d['short_name']: {} for d in datasets}

# Loop through each year
for year in years:
    print(f"Processing data for year {year}...")
    
    # First, open the reference dataset (GPP)
    ref_dataset = next(d for d in datasets if d['is_reference'])
    ref_file = os.path.join(data_path, ref_dataset['name'], ref_dataset['file_format'].format(year=year))
    
    try:
        with rasterio.open(ref_file) as src_ref:
            # Get reference metadata
            dst_crs = src_ref.crs
            dst_transform = src_ref.transform
            dst_height = src_ref.height
            dst_width = src_ref.width
            
            # Read reference data (land)
            land_data = src_ref.read(1)
            all_data['land'][year] = land_data
            
            # Store reference profile for output files
            profile = src_ref.profile.copy()
            profile.update(dtype=rasterio.float64, count=1)
            
            # Process each non-reference dataset
            for dataset in [d for d in datasets if not d.get('is_reference', False)]:
                dataset_name = dataset['short_name']  # Extract short name
                
                # Construct input filename using the file format template
                input_file = os.path.join(data_path, dataset['name'], dataset['file_format'].format(year=year))
                output_file = os.path.join(reproject_path, f"{dataset_name}_reprojected_{year}.tif")
                
                # Create destination array
                dst_array = np.zeros((dst_height, dst_width), dtype=rasterio.float32)
                
                # Open and reproject
                try:
                    with rasterio.open(input_file) as src:
                        reproject(
                            source=rasterio.band(src, 1),
                            destination=dst_array,
                            src_transform=src.transform,
                            src_crs=src.crs,
                            dst_transform=dst_transform,
                            dst_crs=dst_crs,
                            resampling=dataset['resampling']
                        )
                        
                        # Store in all_data dictionary by year
                        all_data[dataset_name][year] = dst_array
                        
                        # Save reprojected data
                        with rasterio.open(output_file, 'w', **profile) as dst:
                            dst.write(dst_array, 1)
                            
                        print(f"Successfully reprojected and saved {output_file}")
                        
                except Exception as e:
                    print(f"Error processing {input_file}: {e}")
                    continue

    except Exception as e:
        print(f"Error processing year {year}: {e}")
        continue

Processing data for year 2000...
Error processing year 2000: Datasets_Hackathon/Land_Cover_Data/2000LCT.tif: No such file or directory
Processing data for year 2001...
Error processing year 2001: Datasets_Hackathon/Land_Cover_Data/2001LCT.tif: No such file or directory
Processing data for year 2002...
Error processing year 2002: Datasets_Hackathon/Land_Cover_Data/2002LCT.tif: No such file or directory
Processing data for year 2003...
Error processing year 2003: Datasets_Hackathon/Land_Cover_Data/2003LCT.tif: No such file or directory
Processing data for year 2004...
Error processing year 2004: Datasets_Hackathon/Land_Cover_Data/2004LCT.tif: No such file or directory
Processing data for year 2005...
Error processing year 2005: Datasets_Hackathon/Land_Cover_Data/2005LCT.tif: No such file or directory
Processing data for year 2006...
Error processing year 2006: Datasets_Hackathon/Land_Cover_Data/2006LCT.tif: No such file or directory
Processing data for year 2007...
Error processing year 

## 1.2 Static Data

In [16]:
def preprocess_cols(gdf):
    """
    Preprocesses GeoDataFrame for shapefile compatibility.

    Converts large integers to strings and formats datetime columns as ISO strings.

    Parameters:
    -----------
    gdf: geopandas.GeoDataFrame
        Input GeoDataFrame.

    Returns:
    -----------
    gdf_copy: geopandas.GeoDataFrame
        Processed GeoDataFrame.
    """

    gdf_copy = gdf.copy()
    
    for col in gdf_copy.columns:
        # Convert large integers to strings, eg. "osm_id" col in Water shp
        if gdf_copy[col].dtype == 'float64':
            gdf_copy[col] = gdf_copy[col].astype(str)
        
        # Handle datetime columns, eg. "date" cols
        if pd.api.types.is_datetime64_any_dtype(gdf_copy[col]): 
            # Convert to string in ISO format
            gdf_copy[col] = gdf_copy[col].dt.strftime('%Y-%m-%d %H:%M:%S')

    return gdf_copy
    

In [17]:
def reproject_and_save_shapefile(input_gdf, dst_crs, output_path):
    """
    Reprojects GeoDataFrame and saves as shapefile.

    Handles large integers, datetime, and CRS conversion.

    Parameters:
    -----------
    input_gdf: geopandas.GeoDataFrame
        Input GeoDataFrame.
        
    dst_crs: str or pyproj.CRS
        Destination CRS.

    output_path: str
        Output shapefile path.

    Returns:
    -----------
    gdf_reprojected: geopandas.GeoDataFrame
        Reprojected GeoDataFrame.
    """

    gdf_preprocessed = preprocess_cols(input_gdf)
    
    # Reproject
    gdf_reprojected = gdf_preprocessed.to_crs(dst_crs)
    
    # Save with modified field handling
    gdf_reprojected.to_file(output_path)
    
    return gdf_reprojected

In [15]:
def shp_to_tif(gdf, ref_raster, output_tif_path):
    """
    Converts GeoDataFrame to TIFF raster file based on a reference raster.

    Parameters:
    -----------
    gdf: geopandas.GeoDataFrame
        Input GeoDataFrame.
        
    ref_raster: rasterio.DatasetReader
        Reference raster dataset to use for getting transform, crs, width, height.
    
    output_tif_path: str
        Output path for the TIFF raster file.
    """
    shapes = ((geom, 1) for geom in gdf.geometry)
    
    # Use rasterio.features.rasterize to rasterize the shapefile
    rasterized_array = rasterio.features.rasterize(
        shapes=shapes,
        out_shape=(ref_raster.height, ref_raster.width),
        transform=ref_raster.transform,
        fill=ref_raster.nodata,
        dtype='float64'
    )
    
    # Create a profile for the new TIFF file
    profile = ref_raster.profile.copy()
    profile.update({
        'dtype': 'float64',
        'compress': 'lzw'
    })
    
    # Write the rasterized shapefile to a new TIFF file
    with rasterio.open(output_tif_path, 'w', **profile) as dst:
        dst.write(rasterized_array, 1)

In [18]:
# Paths
Road_path = "Datasets_Hackathon/Streamwater_Line_Road_Network/Main_Road.shp"
Water_path = "Datasets_Hackathon/Streamwater_Line_Road_Network/Streamwater.shp"
Dist_path = 'Datasets_Hackathon/Admin_layers/Assaba_Districts_layer.shp'
ref_path = 'Datasets_Hackathon/Land_Cover_Data/2010LCT.tif'

In [19]:
"""
Reproject static data
"""

# Load shapefiles
road = gpd.read_file(Road_path)
water = gpd.read_file(Water_path)
dist = gpd.read_file(Dist_path)

# Open reference raster to get CRS and other parameters
with rasterio.open(ref_path) as src_ref:
    dst_crs = src_ref.crs
    dst_transform = src_ref.transform
    dst_height = src_ref.height
    dst_width = src_ref.width
    nodata_value = src_ref.nodata
    
    # Read the reference raster data
    ref_array = src_ref.read(1)
    
    # Create a mask for valid data (where values are not nodata)
    valid_mask = ref_array != nodata_value

    # Reproject and save shapefiles with field handling
    road_reprojected = reproject_and_save_shapefile(
        road, 
        dst_crs, 
        os.path.join(reproject_path, 'road_reprojected.shp')
    )
    
    water_reprojected = reproject_and_save_shapefile(
        water, 
        dst_crs, 
        os.path.join(reproject_path, 'water_reprojected.shp')
    )
    
    dist_reprojected = reproject_and_save_shapefile(
        dist, 
        dst_crs, 
        os.path.join(reproject_path, 'dist_reprojected.shp')
    )

    # Save each reprojected shapefile as TIFF
    shp_to_tif(road_reprojected, src_ref, os.path.join(reproject_path, 'road_reprojected.tif'))
    shp_to_tif(water_reprojected, src_ref, os.path.join(reproject_path, 'water_reprojected.tif'))
    shp_to_tif(dist_reprojected, src_ref, os.path.join(reproject_path, 'dist_reprojected.tif'))

    # Create a new raster with the valid data mask
    profile = src_ref.profile.copy()
    profile.update({
        'dtype': rasterio.float64,
        'nodata': nodata_value,
        'compress': 'lzw'
    })

    # Mask the reference raster
    masked_array = np.where(valid_mask, ref_array, nodata_value).astype(rasterio.float32)

    # Save the masked raster
    masked_raster_path = os.path.join(reproject_path, '2010LCT_masked.tif')
    with rasterio.open(masked_raster_path, 'w', **profile) as dst:
        dst.write(masked_array, 1)

print("Reprojection and masking completed successfully!")

Reprojection and masking completed successfully!


# 2 Cleaning

## 2.1 Overview of invalid data

In [10]:
# overview of all data
all_data

# invalid data in each dict:
# 1. negative value:
#   -128.0 in land
#   -3.4028235e+38 in rainfall and pop
# 2. positive value:
#   65535 in GPP

{'land': {2010: array([[-128, -128, -128, ..., -128, -128, -128],
         [-128, -128, -128, ..., -128, -128, -128],
         [-128, -128, -128, ..., -128, -128, -128],
         ...,
         [-128, -128, -128, ..., -128, -128, -128],
         [-128, -128, -128, ..., -128, -128, -128],
         [-128, -128, -128, ..., -128, -128, -128]], dtype=int8),
  2011: array([[255, 255, 255, ..., 255, 255, 255],
         [255, 255, 255, ..., 255, 255, 255],
         [255, 255, 255, ..., 255, 255, 255],
         ...,
         [255, 255, 255, ..., 255, 255, 255],
         [255, 255, 255, ..., 255, 255, 255],
         [255, 255, 255, ..., 255, 255, 255]], dtype=uint8),
  2012: array([[255, 255, 255, ..., 255, 255, 255],
         [255, 255, 255, ..., 255, 255, 255],
         [255, 255, 255, ..., 255, 255, 255],
         ...,
         [255, 255, 255, ..., 255, 255, 255],
         [255, 255, 255, ..., 255, 255, 255],
         [255, 255, 255, ..., 255, 255, 255]], dtype=uint8),
  2013: array([[255, 255

In [11]:
all_data.keys()

dict_keys(['land', 'rainfall', 'pop', 'popdens', 'gpp'])

## 2.2 Functions for masking

In [25]:
def create_mask(data_dict, invalid_criteria=None):
    """
    Creates masks for data based on specified invalid criteria.

    Parameters:
    -----------
    data_dict: dict
        Dictionary with data types as keys and yearly data arrays as values.
    invalid_criteria: dict, optional
        Dictionary defining invalid conditions per data type using 'condition' lambda functions.
        Defaults to masking values less than 0 if not provided.

    Returns:
    -----------
    mask_dict: dict
        Dictionary with boolean masks where True marks invalid data.

    Notes:
    ------
    - Custom invalid criteria can be defined for each data type.
    - Default condition is values less than 0.
    """

    # Default invalid data criteria
    default_criteria = {
        'default': {'condition': lambda x: (x < 0)}
    }
    
    # Merge default criteria with provided criteria
    if invalid_criteria is None:
        invalid_criteria = default_criteria
    else:
        for key, value in default_criteria.items():
            if key not in invalid_criteria:
                invalid_criteria[key] = value
    
    # Create mask dictionary
    mask_dict = {}
    
    # Iterate through data types
    for data_type, year_data in data_dict.items():
        mask_dict[data_type] = {}
        
        # Determine criteria for this data type
        criteria = invalid_criteria.get(data_type, invalid_criteria['default'])
        condition = criteria['condition']
        
        # Create masks for each year
        for year, array in year_data.items():
            # Apply the condition to create a boolean mask
            mask_dict[data_type][year] = condition(array)
    
    return mask_dict


In [26]:
def apply_mask(data_dict, mask_dict):
    """
    Applies masks to a dictionary of data arrays.

    Parameters:
    -----------
    data_dict: dict
        Dictionary with data arrays by data type and year.
    mask_dict: dict
        Dictionary with boolean masks by data type and year, where True indicates invalid data.

    Returns:
    -----------
    masked_data_dict: dict
        Dictionary with masked data arrays, where invalid values are masked.

    Notes:
    ------
    - The function uses `numpy.ma.array()` to create masked arrays.
    - The mask is applied to the data arrays, so invalid data points 
        (those corresponding to `True` in the mask) will be masked (ignored).
    """

    masked_data_dict = {}
    
    # Iterate through data types
    for data_type, year_data in data_dict.items():
        masked_data_dict[data_type] = {}
        
        # Apply mask for each year
        for year, array in year_data.items():
            masked_data_dict[data_type][year] = np.ma.array(
                array, 
                mask=mask_dict[data_type][year]
            )
    
    return masked_data_dict

In [27]:
def dict_to_df(masked_data_dict):    
    """
    Convert masked data dictionary to a comprehensive DataFrame with location information.
    
    Parameters:
    -----------
    masked_data_dict : dict
        Nested dictionary of masked arrays
    
    Returns:
    --------
    pandas.DataFrame
        Comprehensive DataFrame with data information and coordinates
    """
    
    data_rows = []
    
    # Iterate through data types
    for data_type, year_data in masked_data_dict.items():
        # Iterate through years
        for year, masked_array in year_data.items():
            # Create a grid of row and column indices
            rows, cols = np.indices(masked_array.shape)
            
            # Create a mask for non-masked elements
            valid_mask = ~masked_array.mask
            
            # Get valid data points
            valid_data = masked_array.data[valid_mask]
            valid_rows = rows[valid_mask]
            valid_cols = cols[valid_mask]
            
            # Create rows for each valid data point
            data_rows.extend([{
                'year': year,
                'lat': valid_rows[idx],
                'lon': valid_cols[idx],
                data_type: value
            } for idx, value in enumerate(valid_data)])
    
    # Create DataFrame
    df = pd.DataFrame(data_rows)
    
    # If multiple data types exist, pivot and merge
    if len(df.columns) > 4:
        # Pivot the DataFrame to have one row per (year, lat, lon)
        df_pivoted = df.pivot_table(
            index=['year', 'lat', 'lon'], 
            values=df.columns[3:],  # Use the data type columns
            aggfunc='first'         # Use 'first' to aggregate if needed
        ).reset_index()
        
        return df_pivoted
    
    return df


In [28]:
def analyze_masked_data(masked_data_dict):
    """
    Computes summary statistics for masked data.

    Parameters:
    -----------
    masked_data_dict: dict
        Dictionary of masked arrays where keys represent data types and years.

    Returns:
    -----------
    dict
        A dictionary with summary statistics (e.g., valid points, masked percentage, 
        min, max, mean, and median) for each data type and year.
    """
    
    summary_stats = {}
    
    # Iterate through data types
    for data_type, year_data in masked_data_dict.items():
        summary_stats[data_type] = {}
        
        # Compute statistics for each year
        for year, masked_array in year_data.items():
            # Compute statistics on valid (unmasked) data
            valid_data = masked_array.compressed()
            
            summary_stats[data_type][year] = {
                'total_points': masked_array.size,
                'valid_points': len(valid_data),
                'masked_points': masked_array.size - len(valid_data),
                'masked_percentage': (masked_array.size - len(valid_data)) / masked_array.size * 100,
                'min': np.min(valid_data) if len(valid_data) > 0 else None,
                'max': np.max(valid_data) if len(valid_data) > 0 else None,
                'mean': np.mean(valid_data) if len(valid_data) > 0 else None,
                'median': np.median(valid_data) if len(valid_data) > 0 else None
            }
    
    return summary_stats

In [29]:
def main(data_dict, invalid_criteria=None):
    """
    Main processing function for data masking.
    
    Parameters:
    -----------
    data_dict : dict
        Nested dictionary of data arrays
    invalid_criteria : dict, optional
        Custom invalid data criteria
    
    Returns:
    --------
    dict
        Processed data results
    
    Example:
    --------
    >>> result = main(data_dict, invalid_criteria=lambda x: x < 0)
    >>> result['dataframe']  # Access the resulting DataFrame
    >>> result['summary']  # View the summary statistics
    """
    
    # Create masks
    mask_dict = create_mask(data_dict, invalid_criteria)
    
    # Apply masks
    masked_data_dict = apply_mask(data_dict, mask_dict)
    
    # Convert to DataFrame
    # df = dict_to_df(masked_data_dict)
    df = dict_to_df(masked_data_dict)

    # Get summary statistics
    summary = analyze_masked_data(masked_data_dict)
    
    return {
        'masked_data': masked_data_dict,
        'dataframe': df,
        'summary': summary,
        'mask_dict': mask_dict
    }

In [52]:
if __name__ == '__main__':
    print("Nested Dictionary Data Masking Module")
    print("Supports NumPy array-compatible masking for different data types")

Nested Dictionary Data Masking Module
Supports NumPy array-compatible masking for different data types


## 2.3 Processing - dynamic data - applying mask

In [None]:
# Custom criteria

custom_criteria_dynamic = {
    'land': {'condition': lambda x: np.logical_or(x < 0, x == 255)},
    'gpp': {'condition': lambda x: np.logical_or(x == 65533, x == 65535)},
    'pop': {'condition': lambda x: x < 0}
}

# Process data
prepared_dict = main(all_data, invalid_criteria=custom_criteria_dynamic)


In [83]:
prepared_dict.keys()

dict_keys(['masked_data', 'dataframe', 'summary', 'mask_dict'])

In [84]:
# check masking results
data_types = ['land', 'gpp', 'pop', 'popdens', 'rainfall']
years = [2023, 2023, 2020, 2020, 2023]

for data_type, year in zip(data_types, years):
    print(f"Masking Summary for {data_type.capitalize()} in {year}\n {prepared_dict['summary'][data_type][year]}")


Masking Summary for Land in 2023
 {'total_points': 434485, 'valid_points': 168212, 'masked_points': 266273, 'masked_percentage': 61.28473940412212, 'min': 7, 'max': 16, 'mean': 11.579542482105914, 'median': 10.0}
Masking Summary for Gpp in 2023
 {'total_points': 434485, 'valid_points': 108061, 'masked_points': 326424, 'masked_percentage': 75.12894576337503, 'min': 233.0, 'max': 3720.0, 'mean': 1159.0868, 'median': 1087.0}
Masking Summary for Pop in 2020
 {'total_points': 434485, 'valid_points': 167202, 'masked_points': 267283, 'masked_percentage': 61.51719852238857, 'min': 0.12940584, 'max': 1664.9595, 'mean': 10.732144, 'median': 4.919127}
Masking Summary for Popdens in 2020
 {'total_points': 434485, 'valid_points': 409430, 'masked_points': 25055, 'masked_percentage': 5.76659723580791, 'min': 0.05303842, 'max': 1713.4237, 'mean': 11.089264, 'median': 4.8886147}
Masking Summary for Rainfall in 2023
 {'total_points': 434485, 'valid_points': 167811, 'masked_points': 266674, 'masked_perce

In [92]:
prepared_df = prepared_dict['dataframe']
prepared_df[prepared_df['gpp']>100]

Unnamed: 0,year,lat,lon,gpp,land,pop,popdens,rainfall
99795,2010,176,392,627.0,16.0,2.201975,2.201975,145.020981
99796,2010,176,393,633.0,10.0,2.400784,2.400784,144.054916
106553,2010,188,370,695.0,16.0,3.405080,3.405080,172.263367
107115,2010,189,367,1437.0,10.0,3.903843,3.903843,173.341553
108246,2010,191,368,1492.0,10.0,5.662525,5.662525,174.425308
...,...,...,...,...,...,...,...,...
3125410,2023,766,452,1378.0,10.0,,,
3125411,2023,767,449,2433.0,10.0,,,
3125412,2023,767,450,2450.0,10.0,,,
3125413,2023,767,451,2079.0,10.0,,,


In [None]:
# save prepared_df for modeling and dashboard

# prepared_df.to_csv(os.path.join(csv_path, f"prepared_df.csv"))

## 2.4 Processing - static data

In [44]:
static_data = {}

for i in ['road', 'water', 'dist']:
    static_year_data = {}

    with rasterio.open(os.path.join(reproject_path, f'{i}_reprojected.tif')) as raster:
        raster_arr = raster.read(1)
        for year in years:
            static_year_data[year] = raster_arr
        
    static_data[i] = static_year_data

In [70]:
# Invalid criteria for static data
custom_criteria_static = {
    'road': {'condition': lambda x: x == -128},
    'water': {'condition': lambda x: x == -128},
    'dist': {'condition': lambda x: x == -128}
}

# Process data
prepared_dict_2 = main(static_data, invalid_criteria=custom_criteria_static)

In [72]:
# check masking results
data_types_2 = ['road', 'water', 'dist']
years_2 = [2023, 2023, 2023]

for data_type, year in zip(data_types_2, years_2):
    print(f"Masking Summary for {data_type.capitalize()} in {year}\n {prepared_dict_2['summary'][data_type][year]}")


Masking Summary for Road in 2023
 {'total_points': 434485, 'valid_points': 1205, 'masked_points': 433280, 'masked_percentage': 99.72266016088012, 'min': 1.0, 'max': 1.0, 'mean': 1.0, 'median': 1.0}
Masking Summary for Water in 2023
 {'total_points': 434485, 'valid_points': 4966, 'masked_points': 429519, 'masked_percentage': 98.85703764226614, 'min': 1.0, 'max': 1.0, 'mean': 1.0, 'median': 1.0}
Masking Summary for Dist in 2023
 {'total_points': 434485, 'valid_points': 168212, 'masked_points': 266273, 'masked_percentage': 61.28473940412212, 'min': 1.0, 'max': 1.0, 'mean': 1.0, 'median': 1.0}


In [83]:
static_df = prepared_dict_2['dataframe']
static_df

Unnamed: 0,year,lat,lon,dist,road,water
0,2000,0,308,,1.0,
1,2000,0,475,1.0,,
2,2000,1,308,,1.0,
3,2000,1,473,,,1.0
4,2000,1,474,1.0,,1.0
...,...,...,...,...,...,...
4059739,2023,767,449,1.0,,
4059740,2023,767,450,1.0,,
4059741,2023,767,451,1.0,,1.0
4059742,2023,768,450,1.0,,1.0


In [84]:
second_prepared_df = pd.merge(prepared_df, static_df, on=['year','lat', 'lon'], how='left')
second_prepared_df

Unnamed: 0,year,lat,lon,gpp,land,pop,popdens,rainfall,actual_lat,actual_lon,dist,road,water
0,2010,0,0,,,,0.597434,,15.035006,-12.994451,,,
1,2010,0,1,,,,0.596992,,15.035006,-12.989952,,,
2,2010,0,2,,,,0.595006,,15.035006,-12.985453,,,
3,2010,0,3,,,,0.593069,,15.035006,-12.980954,,,
4,2010,0,4,,,,0.591274,,15.035006,-12.976455,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3125410,2023,766,452,1378.0,10.0,,,,18.475537,-10.960858,1.0,,1.0
3125411,2023,767,449,2433.0,10.0,,,,18.480029,-10.974355,1.0,,
3125412,2023,767,450,2450.0,10.0,,,,18.480029,-10.969856,1.0,,
3125413,2023,767,451,2079.0,10.0,,,,18.480029,-10.965357,1.0,,1.0


## 2.5 Remove invalid data

In [85]:
clean_df = second_prepared_df.dropna(subset=['land'])
clean_df

Unnamed: 0,year,lat,lon,gpp,land,pop,popdens,rainfall,actual_lat,actual_lon,dist,road,water
475,2010,0,475,,16.0,,0.776286,,15.035006,-10.857379,1.0,,
1039,2010,1,474,,16.0,0.682958,0.709646,,15.039497,-10.861878,1.0,,1.0
1040,2010,1,475,,16.0,,0.703224,,15.039497,-10.857379,1.0,,
1603,2010,2,473,,16.0,0.675067,0.693418,,15.043989,-10.866377,1.0,,1.0
1604,2010,2,474,,16.0,0.677053,0.668584,,15.043989,-10.861878,1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3125410,2023,766,452,1378.0,10.0,,,,18.475537,-10.960858,1.0,,1.0
3125411,2023,767,449,2433.0,10.0,,,,18.480029,-10.974355,1.0,,
3125412,2023,767,450,2450.0,10.0,,,,18.480029,-10.969856,1.0,,
3125413,2023,767,451,2079.0,10.0,,,,18.480029,-10.965357,1.0,,1.0


In [86]:
clean_df = clean_df.fillna(0)
clean_df

Unnamed: 0,year,lat,lon,gpp,land,pop,popdens,rainfall,actual_lat,actual_lon,dist,road,water
475,2010,0,475,0.0,16.0,0.000000,0.776286,0.0,15.035006,-10.857379,1.0,0.0,0.0
1039,2010,1,474,0.0,16.0,0.682958,0.709646,0.0,15.039497,-10.861878,1.0,0.0,1.0
1040,2010,1,475,0.0,16.0,0.000000,0.703224,0.0,15.039497,-10.857379,1.0,0.0,0.0
1603,2010,2,473,0.0,16.0,0.675067,0.693418,0.0,15.043989,-10.866377,1.0,0.0,1.0
1604,2010,2,474,0.0,16.0,0.677053,0.668584,0.0,15.043989,-10.861878,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3125410,2023,766,452,1378.0,10.0,0.000000,0.000000,0.0,18.475537,-10.960858,1.0,0.0,1.0
3125411,2023,767,449,2433.0,10.0,0.000000,0.000000,0.0,18.480029,-10.974355,1.0,0.0,0.0
3125412,2023,767,450,2450.0,10.0,0.000000,0.000000,0.0,18.480029,-10.969856,1.0,0.0,0.0
3125413,2023,767,451,2079.0,10.0,0.000000,0.000000,0.0,18.480029,-10.965357,1.0,0.0,1.0


In [None]:
clean_df.to_csv(os.path.join(csv_path, f"clean_df.csv"), index=0)

In [89]:
with open(os.path.join(csv_path, f"clean_df.csv"), mode='r', encoding='utf-8') as csv_file:
    csv_reader = csv.DictReader(csv_file) 
    rows = list(csv_reader)

# 将数据写入JSON文件
with open(os.path.join(csv_path, f"clean_data.json"), mode='w', encoding='utf-8') as json_file:
    json.dump(rows, json_file, indent=4, ensure_ascii=False)

# to be updated

In [None]:
# 2010,2015,2020 population per pixel
# 1. <0 invalid data ->0
# 2. LAT LON df
# 3. calculation
all_data['gridded']

In [None]:
# Gridded land use -- get the hotspots top3 pixel
# Store transformed dataframes
df_dict = {}

# Process each year's array
for year, array in all_data['gridded'].items():
    # Replace negative values with 0
    array = np.maximum(array, 0)

    # Get row (lat) and column (lon) indices
    rows, cols = np.indices(array.shape)

    # Convert to DataFrame
    df = pd.DataFrame({
        'lat': rows.flatten(),  # Row index as latitude
        'lon': cols.flatten(),  # Column index as longitude
        'value': array.flatten()  # Flattened values
    })

    # Store in dictionary
    df_dict[year] = df
    
    # create population dataframe for finalised merged df
    population_df_list = []
    for year, df in df_dict.items():
        # Add a 'year' column to each DataFrame
        df['year'] = year
        
        # Append the DataFrame to the list
        population_df_list.append(df)

    # Concatenate all DataFrames into a single DataFrame
    population_df = pd.concat(population_df_list, ignore_index=True)
    
    population_df.to_csv('population_df.csv')

# Compute differences
df_diff_2015_2010 = df_dict[2015].copy()
df_diff_2020_2015 = df_dict[2020].copy()

# Subtract values
df_diff_2015_2010['value'] -= df_dict[2010]['value']
df_diff_2020_2015['value'] -= df_dict[2015]['value']

# rename value column to avoid confusion
df_diff_2015_2010 = df_diff_2015_2010.rename(columns={'value': 'diff'})
df_diff_2020_2015 = df_diff_2020_2015.rename(columns={'value': 'diff'})

# Sort df_diff_20xx_20xx in descending order
df_diff_2015_2010_sorted = df_diff_2015_2010.sort_values(by='diff', ascending=False)
df_diff_2020_2015_sorted = df_diff_2020_2015.sort_values(by='diff', ascending=False)

# Extract the top 3 largest and bottom 3 smallest values
top_3_2015_2010 = df_diff_2015_2010_sorted.head(3)
bottom_3_2015_2010 = df_diff_2015_2010_sorted.tail(3)
selected_2015_2010 = pd.concat([top_3_2015_2010, bottom_3_2015_2010], ignore_index=True)
selected_2015_2010['year'] = 2015
selected_2015_2010['type'] = 'population'

# Extract the top 3 largest and bottom 3 smallest values
top_3_2020_2015 = df_diff_2020_2015_sorted.head(3)
bottom_3_2020_2015 = df_diff_2020_2015_sorted.tail(3)
selected_2020_2015 = pd.concat([top_3_2020_2015, bottom_3_2020_2015], ignore_index=True)
selected_2020_2015['year'] = 2020
selected_2020_2015['type'] = 'population'


In [None]:
# GPP extract the top 3 largest and bottem 3 smallest data point
gross_df = pd.read_csv('merged_df.csv')

# Extract data for each year
df_2010 = gross_df[gross_df['year'] == 2010][['lat', 'lon', 'gross']]
df_2015 = gross_df[gross_df['year'] == 2015][['lat', 'lon', 'gross']]
df_2020 = gross_df[gross_df['year'] == 2020][['lat', 'lon', 'gross']]

# Merge on lat and lon
gross_df_2015_2010 = df_2015.merge(df_2010, on=['lat', 'lon'], suffixes=('_2015', '_2010'))
gross_df_2020_2015 = df_2020.merge(df_2015, on=['lat', 'lon'], suffixes=('_2020', '_2015'))


# Compute difference
gross_df_2015_2010['diff'] = gross_df_2015_2010['gross_2015'] - gross_df_2015_2010['gross_2010']
gross_df_2020_2015['diff'] = gross_df_2020_2015['gross_2020'] - gross_df_2020_2015['gross_2015']


# Keep only relevant columns
gross_df_2020_2015 = gross_df_2020_2015[['lat', 'lon', 'diff']]
gross_df_2015_2010 = gross_df_2015_2010[['lat', 'lon', 'diff']]

# Sort by 'value' in descending order
gross_df_diff_2015_2010_sorted = gross_df_2015_2010.sort_values(by='diff', ascending=False)
gross_df_diff_2020_2015_sorted = gross_df_2020_2015.sort_values(by='diff', ascending=False)

# Extract the top 3 largest and bottom 3 smallest values
gross_top_3_2015_2010 = gross_df_diff_2015_2010_sorted.head(3)
gross_bottom_3_2015_2010 = gross_df_diff_2015_2010_sorted.tail(3)
gross_select_2015_2010 = pd.concat([gross_top_3_2015_2010, gross_bottom_3_2015_2010], ignore_index=True)
gross_select_2015_2010['year'] = 2015
gross_select_2015_2010['type'] = 'gpp'

gross_top_3_2020_2015 = gross_df_diff_2020_2015_sorted.head(3)
gross_bottom_3_2020_2015 = gross_df_diff_2020_2015_sorted.tail(3)
gross_select_2020_2015 = pd.concat([gross_top_3_2020_2015, gross_bottom_3_2020_2015], ignore_index=True)
gross_select_2020_2015['year'] = 2020
gross_select_2020_2015['type'] = 'gpp'


In [None]:
# concat the dataframes
dfs = [selected_2015_2010, selected_2020_2015, gross_select_2015_2010, gross_select_2020_2015]
combined_df = pd.concat(dfs, ignore_index=True)

file_path = os.path.join(save_path, 'combined_population_gpp_pixcel.csv')

combined_df.to_csv(file_path, index=False)

In [None]:
arrrr = all_data['gridded'][2010]
filtered_arr = arrrr[arrrr >= 0]
len(filtered_arr)