In [None]:
import numpy as np
import pandas as pd
import re
import os

import rasterio
from rasterio.plot import show
from rasterio.warp import calculate_default_transform, reproject, Resampling
from rasterio.features import geometry_mask

import geopandas as gpd
from pyproj import CRS

from matplotlib import pyplot as plt

# Reprojection

**The CRS and pixel grid of the Land Cover rasters are used as the reprojection reference for all other datasets.**


In [None]:
# Output locations used by the rest of the notebook.
# Folder for CSV exports.
csv_path = 'For_dashboard'
# Folder that receives the reprojected GeoTIFFs.
reproject_path = 'Datasets_Hackathon/Reprojected_Data'

In [None]:
# Reproject every non-reference dataset onto the grid of the reference
# (Land Cover) raster, year by year. Results are kept in memory in `all_data`
# and written to `reproject_path` as single-band float32 GeoTIFFs.

# Years to process
years = range(2010, 2023)

# Root folder holding one sub-folder per raw dataset
data_path = 'Datasets_Hackathon'

# Dataset catalogue: folder name, filename template and, for non-reference
# datasets, the resampling method to use. Exactly one entry is flagged as the
# reference whose CRS / transform / shape define the target grid.
datasets = [
    {'name': 'Land_Cover_Data', 'file_format': '{year}LCT.tif', 'is_reference': True},
    {'name': 'Gridded_Population_Density_Data', 'file_format': 'Assaba_Pop_{year}.tif','resampling': Resampling.bilinear},
]

# Make sure the output folder exists so writing the GeoTIFFs cannot fail on a
# fresh checkout.
os.makedirs(reproject_path, exist_ok=True)

# all_data[<short name>][<year>] -> 2-D numpy array on the reference grid.
# The short name is the first word of the dataset folder name, lower-cased
# ('land', 'gridded').
all_data = {}
for dataset in datasets:
    short_name = dataset['name'].split('_')[0].lower()
    all_data[short_name] = {}

# The reference dataset does not change between years, so resolve it once.
# `.get` is used because non-reference entries do not carry the flag at all.
ref_dataset = next(d for d in datasets if d.get('is_reference', False))
ref_short_name = ref_dataset['name'].split('_')[0].lower()

# Loop through each year
for year in years:
    print(f"Processing data for year {year}...")

    # Reference (Land Cover) raster for this year
    ref_file = os.path.join(data_path, ref_dataset['name'], ref_dataset['file_format'].format(year=year))

    try:
        with rasterio.open(ref_file) as src_ref:
            # Target grid definition taken from the reference raster
            dst_crs = src_ref.crs
            dst_transform = src_ref.transform
            dst_height = src_ref.height
            dst_width = src_ref.width

            # Keep the reference band itself
            land_data = src_ref.read(1)
            all_data[ref_short_name][year] = land_data

            # Base profile for output files: reference grid, one float32 band
            profile = src_ref.profile.copy()
            profile.update(dtype=rasterio.float32, count=1)

            # Process each non-reference dataset
            for dataset in [d for d in datasets if not d.get('is_reference', False)]:
                dataset_name = dataset['name'].split('_')[0].lower()  # Extract short name

                # Construct input/output filenames from the templates
                input_file = os.path.join(data_path, dataset['name'], dataset['file_format'].format(year=year))
                output_file = os.path.join(reproject_path, f"{dataset_name}_reprojected_{year}.tif")

                # Destination array on the reference grid
                dst_array = np.zeros((dst_height, dst_width), dtype=rasterio.float32)

                # Open and reproject. A dataset missing for this year is
                # reported and skipped; the remaining datasets still run.
                try:
                    with rasterio.open(input_file) as src:
                        reproject(
                            source=rasterio.band(src, 1),
                            destination=dst_array,
                            src_transform=src.transform,
                            src_crs=src.crs,
                            dst_transform=dst_transform,
                            dst_crs=dst_crs,
                            # Fall back to nearest (rasterio's own default)
                            # when a catalogue entry omits the method.
                            resampling=dataset.get('resampling', Resampling.nearest)
                        )

                        # Store in all_data dictionary by year
                        all_data[dataset_name][year] = dst_array

                        # The base profile carries the *reference* raster's
                        # nodata value. Record the source dataset's nodata
                        # instead, which is presumably what fills the
                        # uncovered pixels of dst_array (NOTE(review): confirm
                        # against rasterio's dst_nodata default).
                        out_profile = profile.copy()
                        if src.nodata is not None:
                            out_profile.update(nodata=src.nodata)

                        # Save reprojected data
                        with rasterio.open(output_file, 'w', **out_profile) as dst:
                            dst.write(dst_array, 1)

                        print(f"Successfully reprojected and saved {output_file}")

                except Exception as e:
                    print(f"Error processing {input_file}: {e}")

    except Exception as e:
        # Typically the reference raster for this year could not be opened.
        print(f"Error processing year {year}: {e}")

Processing data for year 2010...
Successfully reprojected and saved Datasets_Hackathon/reprojected_data/gridded_reprojected_2010.tif
Processing data for year 2011...
Error processing Datasets_Hackathon/Gridded_Population_Density_Data/Assaba_Pop_2011.tif: Datasets_Hackathon/Gridded_Population_Density_Data/Assaba_Pop_2011.tif: No such file or directory
Processing data for year 2012...
Error processing Datasets_Hackathon/Gridded_Population_Density_Data/Assaba_Pop_2012.tif: Datasets_Hackathon/Gridded_Population_Density_Data/Assaba_Pop_2012.tif: No such file or directory
Processing data for year 2013...
Error processing Datasets_Hackathon/Gridded_Population_Density_Data/Assaba_Pop_2013.tif: Datasets_Hackathon/Gridded_Population_Density_Data/Assaba_Pop_2013.tif: No such file or directory
Processing data for year 2014...
Error processing Datasets_Hackathon/Gridded_Population_Density_Data/Assaba_Pop_2014.tif: Datasets_Hackathon/Gridded_Population_Density_Data/Assaba_Pop_2014.tif: No such file 

# Cleaning & Preparation

In [None]:
# Inspect the reprojected population rasters held in memory, keyed by year
# (population per pixel; 2010, 2015 and 2020 are the years used further down).
# Preparation steps carried out in the following cells:
#   1. Replace invalid (negative) values with 0.
#   2. Flatten each raster into a DataFrame with 'lat' / 'lon' / 'value' columns.
#   3. Compute the change between years.
all_data['gridded']

{2010: array([[-3.4028235e+38, -3.4028235e+38, -3.4028235e+38, ...,
         -3.4028235e+38, -3.4028235e+38, -3.4028235e+38],
        [-3.4028235e+38, -3.4028235e+38, -3.4028235e+38, ...,
         -3.4028235e+38, -3.4028235e+38, -3.4028235e+38],
        [-3.4028235e+38, -3.4028235e+38, -3.4028235e+38, ...,
         -3.4028235e+38, -3.4028235e+38, -3.4028235e+38],
        ...,
        [-3.4028235e+38, -3.4028235e+38, -3.4028235e+38, ...,
         -3.4028235e+38, -3.4028235e+38, -3.4028235e+38],
        [-3.4028235e+38, -3.4028235e+38, -3.4028235e+38, ...,
         -3.4028235e+38, -3.4028235e+38, -3.4028235e+38],
        [-3.4028235e+38, -3.4028235e+38, -3.4028235e+38, ...,
         -3.4028235e+38, -3.4028235e+38, -3.4028235e+38]], dtype=float32),
 2015: array([[-3.4028235e+38, -3.4028235e+38, -3.4028235e+38, ...,
         -3.4028235e+38, -3.4028235e+38, -3.4028235e+38],
        [-3.4028235e+38, -3.4028235e+38, -3.4028235e+38, ...,
         -3.4028235e+38, -3.4028235e+38, -3.4028235e+38]

In [None]:
# Gridded population -- find the hotspot pixels: the 3 largest increases and
# the 3 largest decreases between 2010 -> 2015 and between 2015 -> 2020.

# Per-year long-format tables, keyed by year
df_dict = {}

# Process each year's array
for year, array in all_data['gridded'].items():
    # Negative values are invalid / nodata; treat them as 0
    array = np.maximum(array, 0)

    # Row and column indices of every pixel. NOTE: 'lat' / 'lon' below are
    # pixel indices on the raster grid, not geographic coordinates.
    rows, cols = np.indices(array.shape)

    # One row per pixel, tagged with its year
    df_dict[year] = pd.DataFrame({
        'lat': rows.flatten(),    # Row index as latitude
        'lon': cols.flatten(),    # Column index as longitude
        'value': array.flatten(), # Flattened values
        'year': year,
    })

# Build the combined population table once, after every year has been
# converted, so the CSV is written a single time instead of once per year.
population_df_list = list(df_dict.values())
population_df = pd.concat(population_df_list, ignore_index=True)

population_df.to_csv('population_df.csv')

# Compute differences (later year minus earlier year). All frames share the
# same pixel ordering because every raster sits on the same reference grid.
df_diff_2015_2010 = df_dict[2015].copy()
df_diff_2020_2015 = df_dict[2020].copy()

# Subtract values
df_diff_2015_2010['value'] -= df_dict[2010]['value']
df_diff_2020_2015['value'] -= df_dict[2015]['value']

# Rename value column to avoid confusion with the raw population values
df_diff_2015_2010 = df_diff_2015_2010.rename(columns={'value': 'diff'})
df_diff_2020_2015 = df_diff_2020_2015.rename(columns={'value': 'diff'})

# Sort by change, largest increase first
df_diff_2015_2010_sorted = df_diff_2015_2010.sort_values(by='diff', ascending=False)
df_diff_2020_2015_sorted = df_diff_2020_2015.sort_values(by='diff', ascending=False)

# 2010 -> 2015: top 3 largest and bottom 3 smallest changes, labelled with the
# end year of the period
top_3_2015_2010 = df_diff_2015_2010_sorted.head(3)
bottom_3_2015_2010 = df_diff_2015_2010_sorted.tail(3)
selected_2015_2010 = pd.concat([top_3_2015_2010, bottom_3_2015_2010], ignore_index=True)
selected_2015_2010['year'] = 2015
selected_2015_2010['type'] = 'population'

# 2015 -> 2020: top 3 largest and bottom 3 smallest changes
top_3_2020_2015 = df_diff_2020_2015_sorted.head(3)
bottom_3_2020_2015 = df_diff_2020_2015_sorted.tail(3)
selected_2020_2015 = pd.concat([top_3_2020_2015, bottom_3_2020_2015], ignore_index=True)
selected_2020_2015['year'] = 2020
selected_2020_2015['type'] = 'population'


In [None]:
# GPP hotspots: for each period (2010 -> 2015, 2015 -> 2020) pick the 3 pixels
# with the largest increase and the 3 with the largest decrease in 'gross'.
gross_df = pd.read_csv('merged_df.csv')

# Per-year slices restricted to the pixel key and the GPP value
gpp_cols = ['lat', 'lon', 'gross']
df_2010 = gross_df.loc[gross_df['year'] == 2010, gpp_cols]
df_2015 = gross_df.loc[gross_df['year'] == 2015, gpp_cols]
df_2020 = gross_df.loc[gross_df['year'] == 2020, gpp_cols]

# Pair up the same pixel across two years, add the change (later minus
# earlier) and keep only the pixel key plus that change.
gross_df_2015_2010 = pd.merge(df_2015, df_2010, on=['lat', 'lon'], suffixes=('_2015', '_2010'))
gross_df_2015_2010 = gross_df_2015_2010.assign(
    diff=gross_df_2015_2010['gross_2015'] - gross_df_2015_2010['gross_2010']
)[['lat', 'lon', 'diff']]

gross_df_2020_2015 = pd.merge(df_2020, df_2015, on=['lat', 'lon'], suffixes=('_2020', '_2015'))
gross_df_2020_2015 = gross_df_2020_2015.assign(
    diff=gross_df_2020_2015['gross_2020'] - gross_df_2020_2015['gross_2015']
)[['lat', 'lon', 'diff']]

# Order by change, largest increase first
gross_df_diff_2015_2010_sorted = gross_df_2015_2010.sort_values('diff', ascending=False)
gross_df_diff_2020_2015_sorted = gross_df_2020_2015.sort_values('diff', ascending=False)

# 2010 -> 2015 extremes, labelled with the end year of the period
gross_top_3_2015_2010 = gross_df_diff_2015_2010_sorted.head(3)
gross_bottom_3_2015_2010 = gross_df_diff_2015_2010_sorted.tail(3)
gross_select_2015_2010 = pd.concat(
    [gross_top_3_2015_2010, gross_bottom_3_2015_2010], ignore_index=True
).assign(year=2015, type='gpp')

# 2015 -> 2020 extremes
gross_top_3_2020_2015 = gross_df_diff_2020_2015_sorted.head(3)
gross_bottom_3_2020_2015 = gross_df_diff_2020_2015_sorted.tail(3)
gross_select_2020_2015 = pd.concat(
    [gross_top_3_2020_2015, gross_bottom_3_2020_2015], ignore_index=True
).assign(year=2020, type='gpp')


In [None]:
# Combine the population and GPP hotspot tables into a single file.
dfs = [selected_2015_2010, selected_2020_2015, gross_select_2015_2010, gross_select_2020_2015]
combined_df = pd.concat(dfs, ignore_index=True)

# `save_path` was never defined before this cell, so a fresh
# Restart & Run All failed here with a NameError. The CSV export folder
# configured at the top of the notebook is `csv_path`; bind `save_path` to it
# and make sure the folder exists before writing.
save_path = csv_path
os.makedirs(save_path, exist_ok=True)

file_path = os.path.join(save_path, 'combined_population_gpp_pixcel.csv')

combined_df.to_csv(file_path, index=False)

In [None]:
# Sanity check: how many pixels of the 2010 population raster hold a valid
# (non-negative) value?
arrrr = all_data['gridded'][2010]
valid_mask = np.greater_equal(arrrr, 0)
filtered_arr = arrrr[valid_mask]
filtered_arr.shape[0]

167202