In [13]:
import numpy as np
import pandas as pd
import re
import os
import csv
import json

In [14]:
import rasterio
from rasterio.plot import show
from rasterio.warp import calculate_default_transform, reproject, Resampling
from rasterio.features import geometry_mask
import geopandas as gpd
from pyproj import CRS

In [15]:
from matplotlib import pyplot as plt

In [16]:
# pip install pydrive
# pip install google-api-python-client google-auth-httplib2

from googleapiclient.discovery import build
from google_auth_httplib2 import AuthorizedHttp
from google.oauth2 import credentials

Credentials_file = 'client_secret.json'
creds = credentials.Credentials.from_authorized_user_file(Credentials_file)

# 创建 Google Drive API 客户端
service = build('drive', 'v3', http=AuthorizedHttp(creds, http=None))

FileNotFoundError: [Errno 2] No such file or directory: 'client_secret.json'

# 1 Reprojection

**CRS of Land as reprojection reference.**


In [None]:
# Define years list
years = range(2000, 2024)

In [11]:
# Define input and output paths
data_path = 'Datasets_Hackathon'

reproject_path = 'Datasets_Hackathon/Reprojected_Data'
if not os.path.exists(reproject_path):
        os.makedirs(reproject_path)
        
csv_path = 'For_dashboard'
if not os.path.exists(csv_path):
        os.makedirs(csv_path)

## 1.1 Dynamic Data

In [None]:
"""
Reprojecting all dynamic data
"""

# List of data categories with their appropriate resampling methods and filename formats
datasets = [
    {'short_name':'land', 'name': 'Land_Cover_Data', 'file_format': '{year}LCT.tif', 'is_reference': True},
    {'short_name':'rainfall', 'name': 'Climate_Precipitation_Data', 'file_format': '{year}R.tif', 'resampling': Resampling.bilinear},
    {'short_name':'pop', 'name': 'Gridded_Population_Density_Data', 'file_format': 'Assaba_Pop_{year}.tif','resampling': Resampling.bilinear},
    {'short_name':'popdens', 'name': 'Gridded_Population_Density_Data', 'file_format': 'mrt_pd_{year}_1km.tif','resampling': Resampling.bilinear},
    {'short_name':'gpp', 'name': 'Gross_Primary_Production_GPP', 'file_format': '{year}_GP.tif','resampling': Resampling.nearest},
]

# Dictionary to store data for all years and all datasets
all_data = {d['short_name']: {} for d in datasets}

# Loop through each year
for year in years:
    print(f"Processing data for year {year}...")
    
    # First, open the reference dataset (GPP)
    ref_dataset = next(d for d in datasets if d['is_reference'])
    ref_file = os.path.join(data_path, ref_dataset['name'], ref_dataset['file_format'].format(year=year))
    
    try:
        with rasterio.open(ref_file) as src_ref:
            # Get reference metadata
            dst_crs = src_ref.crs
            dst_transform = src_ref.transform
            dst_height = src_ref.height
            dst_width = src_ref.width
            
            # Read reference data (land)
            land_data = src_ref.read(1)
            all_data['land'][year] = land_data
            
            # Store reference profile for output files
            profile = src_ref.profile.copy()
            profile.update(dtype=rasterio.float64, count=1)
            
            # Process each non-reference dataset
            for dataset in [d for d in datasets if not d.get('is_reference', False)]:
                dataset_name = dataset['short_name']  # Extract short name
                
                # Construct input filename using the file format template
                input_file = os.path.join(data_path, dataset['name'], dataset['file_format'].format(year=year))
                output_file = os.path.join(reproject_path, f"{dataset_name}_reprojected_{year}.tif")
                
                # Create destination array
                dst_array = np.zeros((dst_height, dst_width), dtype=rasterio.float32)
                
                # Open and reproject
                try:
                    with rasterio.open(input_file) as src:
                        reproject(
                            source=rasterio.band(src, 1),
                            destination=dst_array,
                            src_transform=src.transform,
                            src_crs=src.crs,
                            dst_transform=dst_transform,
                            dst_crs=dst_crs,
                            resampling=dataset['resampling']
                        )
                        
                        # Store in all_data dictionary by year
                        all_data[dataset_name][year] = dst_array
                        
                        # Save reprojected data
                        with rasterio.open(output_file, 'w', **profile) as dst:
                            dst.write(dst_array, 1)
                            
                        print(f"Successfully reprojected and saved {output_file}")
                        
                except Exception as e:
                    print(f"Error processing {input_file}: {e}")
                    continue

    except Exception as e:
        print(f"Error processing year {year}: {e}")
        continue

## 1.2 Static Data

In [None]:
def preprocess_cols(gdf):
    """
    Preprocesses GeoDataFrame for shapefile compatibility.

    Converts large integers to strings and formats datetime columns as ISO strings.

    Parameters:
    -----------
    gdf: geopandas.GeoDataFrame
        Input GeoDataFrame.

    Returns:
    -----------
    gdf_copy: geopandas.GeoDataFrame
        Processed GeoDataFrame.
    """

    gdf_copy = gdf.copy()
    
    for col in gdf_copy.columns:
        # Convert large integers to strings, eg. "osm_id" col in Water shp
        if gdf_copy[col].dtype == 'float64':
            gdf_copy[col] = gdf_copy[col].astype(str)
        
        # Handle datetime columns, eg. "date" cols
        if pd.api.types.is_datetime64_any_dtype(gdf_copy[col]): 
            # Convert to string in ISO format
            gdf_copy[col] = gdf_copy[col].dt.strftime('%Y-%m-%d %H:%M:%S')

    return gdf_copy
    

In [None]:
def reproject_and_save_shapefile(input_gdf, dst_crs, output_path):
    """
    Reprojects GeoDataFrame and saves as shapefile.

    Handles large integers, datetime, and CRS conversion.

    Parameters:
    -----------
    input_gdf: geopandas.GeoDataFrame
        Input GeoDataFrame.
        
    dst_crs: str or pyproj.CRS
        Destination CRS.

    output_path: str
        Output shapefile path.

    Returns:
    -----------
    gdf_reprojected: geopandas.GeoDataFrame
        Reprojected GeoDataFrame.
    """

    gdf_preprocessed = preprocess_cols(input_gdf)
    
    # Reproject
    gdf_reprojected = gdf_preprocessed.to_crs(dst_crs)
    
    # Save with modified field handling
    gdf_reprojected.to_file(output_path)
    
    return gdf_reprojected

In [None]:
def shp_to_tif(gdf, ref_raster, output_tif_path):
    """
    Converts GeoDataFrame to TIFF raster file based on a reference raster.

    Parameters:
    -----------
    gdf: geopandas.GeoDataFrame
        Input GeoDataFrame.
        
    ref_raster: rasterio.DatasetReader
        Reference raster dataset to use for getting transform, crs, width, height.
    
    output_tif_path: str
        Output path for the TIFF raster file.
    """
    shapes = ((geom, 1) for geom in gdf.geometry)
    
    # Use rasterio.features.rasterize to rasterize the shapefile
    rasterized_array = rasterio.features.rasterize(
        shapes=shapes,
        out_shape=(ref_raster.height, ref_raster.width),
        transform=ref_raster.transform,
        fill=ref_raster.nodata,
        dtype='float64'
    )
    
    # Create a profile for the new TIFF file
    profile = ref_raster.profile.copy()
    profile.update({
        'dtype': 'float64',
        'compress': 'lzw'
    })
    
    # Write the rasterized shapefile to a new TIFF file
    with rasterio.open(output_tif_path, 'w', **profile) as dst:
        dst.write(rasterized_array, 1)

In [None]:
# Paths
Road_path = "Datasets_Hackathon/Streamwater_Line_Road_Network/Main_Road.shp"
Water_path = "Datasets_Hackathon/Streamwater_Line_Road_Network/Streamwater.shp"
Dist_path = 'Datasets_Hackathon/Admin_layers/Assaba_Districts_layer.shp'
ref_path = 'Datasets_Hackathon/Land_Cover_Data/2010LCT.tif'

In [None]:
"""
Reproject static data
"""

# Load shapefiles
road = gpd.read_file(Road_path)
water = gpd.read_file(Water_path)
dist = gpd.read_file(Dist_path)

# Open reference raster to get CRS and other parameters
with rasterio.open(ref_path) as src_ref:
    dst_crs = src_ref.crs
    dst_transform = src_ref.transform
    dst_height = src_ref.height
    dst_width = src_ref.width
    nodata_value = src_ref.nodata
    
    # Read the reference raster data
    ref_array = src_ref.read(1)
    
    # Create a mask for valid data (where values are not nodata)
    valid_mask = ref_array != nodata_value

    # Reproject and save shapefiles with field handling
    road_reprojected = reproject_and_save_shapefile(
        road, 
        dst_crs, 
        os.path.join(reproject_path, 'road_reprojected.shp')
    )
    
    water_reprojected = reproject_and_save_shapefile(
        water, 
        dst_crs, 
        os.path.join(reproject_path, 'water_reprojected.shp')
    )
    
    dist_reprojected = reproject_and_save_shapefile(
        dist, 
        dst_crs, 
        os.path.join(reproject_path, 'dist_reprojected.shp')
    )

    # Save each reprojected shapefile as TIFF
    shp_to_tif(road_reprojected, src_ref, os.path.join(reproject_path, 'road_reprojected.tif'))
    shp_to_tif(water_reprojected, src_ref, os.path.join(reproject_path, 'water_reprojected.tif'))
    shp_to_tif(dist_reprojected, src_ref, os.path.join(reproject_path, 'dist_reprojected.tif'))

    # Create a new raster with the valid data mask
    profile = src_ref.profile.copy()
    profile.update({
        'dtype': rasterio.float64,
        'nodata': nodata_value,
        'compress': 'lzw'
    })

    # Mask the reference raster
    masked_array = np.where(valid_mask, ref_array, nodata_value).astype(rasterio.float32)

    # Save the masked raster
    masked_raster_path = os.path.join(reproject_path, '2010LCT_masked.tif')
    with rasterio.open(masked_raster_path, 'w', **profile) as dst:
        dst.write(masked_array, 1)

print("Reprojection and masking completed successfully!")

# 2 Cleaning

## 2.1 Overview of invalid data

In [None]:
# overview of all data
all_data

# invalid data in each dict:
# 1. negative value:
#   -128.0 in land
#   -3.4028235e+38 in rainfall and pop
# 2. positive value:
#   65535 in GPP

In [None]:
all_data.keys()

## 2.2 Functions for masking

In [None]:
def create_mask(data_dict, invalid_criteria=None):
    """
    Creates masks for data based on specified invalid criteria.

    Parameters:
    -----------
    data_dict: dict
        Dictionary with data types as keys and yearly data arrays as values.
    invalid_criteria: dict, optional
        Dictionary defining invalid conditions per data type using 'condition' lambda functions.
        Defaults to masking values less than 0 if not provided.

    Returns:
    -----------
    mask_dict: dict
        Dictionary with boolean masks where True marks invalid data.

    Notes:
    ------
    - Custom invalid criteria can be defined for each data type.
    - Default condition is values less than 0.
    """

    # Default invalid data criteria
    default_criteria = {
        'default': {'condition': lambda x: (x < 0)}
    }
    
    # Merge default criteria with provided criteria
    if invalid_criteria is None:
        invalid_criteria = default_criteria
    else:
        for key, value in default_criteria.items():
            if key not in invalid_criteria:
                invalid_criteria[key] = value
    
    # Create mask dictionary
    mask_dict = {}
    
    # Iterate through data types
    for data_type, year_data in data_dict.items():
        mask_dict[data_type] = {}
        
        # Determine criteria for this data type
        criteria = invalid_criteria.get(data_type, invalid_criteria['default'])
        condition = criteria['condition']
        
        # Create masks for each year
        for year, array in year_data.items():
            # Apply the condition to create a boolean mask
            mask_dict[data_type][year] = condition(array)
    
    return mask_dict


In [None]:
def apply_mask(data_dict, mask_dict):
    """
    Applies masks to a dictionary of data arrays.

    Parameters:
    -----------
    data_dict: dict
        Dictionary with data arrays by data type and year.
    mask_dict: dict
        Dictionary with boolean masks by data type and year, where True indicates invalid data.

    Returns:
    -----------
    masked_data_dict: dict
        Dictionary with masked data arrays, where invalid values are masked.

    Notes:
    ------
    - The function uses `numpy.ma.array()` to create masked arrays.
    - The mask is applied to the data arrays, so invalid data points 
        (those corresponding to `True` in the mask) will be masked (ignored).
    """

    masked_data_dict = {}
    
    # Iterate through data types
    for data_type, year_data in data_dict.items():
        masked_data_dict[data_type] = {}
        
        # Apply mask for each year
        for year, array in year_data.items():
            masked_data_dict[data_type][year] = np.ma.array(
                array, 
                mask=mask_dict[data_type][year]
            )
    
    return masked_data_dict

In [None]:
def dict_to_df(masked_data_dict):    
    """
    Convert masked data dictionary to a comprehensive DataFrame with location information.
    
    Parameters:
    -----------
    masked_data_dict : dict
        Nested dictionary of masked arrays
    
    Returns:
    --------
    pandas.DataFrame
        Comprehensive DataFrame with data information and coordinates
    """
    
    data_rows = []
    
    # Iterate through data types
    for data_type, year_data in masked_data_dict.items():
        # Iterate through years
        for year, masked_array in year_data.items():
            # Create a grid of row and column indices
            rows, cols = np.indices(masked_array.shape)
            
            # Create a mask for non-masked elements
            valid_mask = ~masked_array.mask
            
            # Get valid data points
            valid_data = masked_array.data[valid_mask]
            valid_rows = rows[valid_mask]
            valid_cols = cols[valid_mask]
            
            # Create rows for each valid data point
            data_rows.extend([{
                'year': year,
                'lat': valid_rows[idx],
                'lon': valid_cols[idx],
                data_type: value
            } for idx, value in enumerate(valid_data)])
    
    # Create DataFrame
    df = pd.DataFrame(data_rows)
    
    # If multiple data types exist, pivot and merge
    if len(df.columns) > 4:
        # Pivot the DataFrame to have one row per (year, lat, lon)
        df_pivoted = df.pivot_table(
            index=['year', 'lat', 'lon'], 
            values=df.columns[3:],  # Use the data type columns
            aggfunc='first'         # Use 'first' to aggregate if needed
        ).reset_index()
        
        return df_pivoted
    
    return df


In [None]:
def analyze_masked_data(masked_data_dict):
    """
    Computes summary statistics for masked data.

    Parameters:
    -----------
    masked_data_dict: dict
        Dictionary of masked arrays where keys represent data types and years.

    Returns:
    -----------
    dict
        A dictionary with summary statistics (e.g., valid points, masked percentage, 
        min, max, mean, and median) for each data type and year.
    """
    
    summary_stats = {}
    
    # Iterate through data types
    for data_type, year_data in masked_data_dict.items():
        summary_stats[data_type] = {}
        
        # Compute statistics for each year
        for year, masked_array in year_data.items():
            # Compute statistics on valid (unmasked) data
            valid_data = masked_array.compressed()
            
            summary_stats[data_type][year] = {
                'total_points': masked_array.size,
                'valid_points': len(valid_data),
                'masked_points': masked_array.size - len(valid_data),
                'masked_percentage': (masked_array.size - len(valid_data)) / masked_array.size * 100,
                'min': np.min(valid_data) if len(valid_data) > 0 else None,
                'max': np.max(valid_data) if len(valid_data) > 0 else None,
                'mean': np.mean(valid_data) if len(valid_data) > 0 else None,
                'median': np.median(valid_data) if len(valid_data) > 0 else None
            }
    
    return summary_stats

In [None]:
def main(data_dict, invalid_criteria=None):
    """
    Main processing function for data masking.
    
    Parameters:
    -----------
    data_dict : dict
        Nested dictionary of data arrays
    invalid_criteria : dict, optional
        Custom invalid data criteria
    
    Returns:
    --------
    dict
        Processed data results
    
    Example:
    --------
    >>> result = main(data_dict, invalid_criteria=lambda x: x < 0)
    >>> result['dataframe']  # Access the resulting DataFrame
    >>> result['summary']  # View the summary statistics
    """
    
    # Create masks
    mask_dict = create_mask(data_dict, invalid_criteria)
    
    # Apply masks
    masked_data_dict = apply_mask(data_dict, mask_dict)
    
    # Convert to DataFrame
    # df = dict_to_df(masked_data_dict)
    df = dict_to_df(masked_data_dict)

    # Get summary statistics
    summary = analyze_masked_data(masked_data_dict)
    
    return {
        'masked_data': masked_data_dict,
        'dataframe': df,
        'summary': summary,
        'mask_dict': mask_dict
    }

In [None]:
if __name__ == '__main__':
    print("Nested Dictionary Data Masking Module")
    print("Supports NumPy array-compatible masking for different data types")

## 2.3 Processing - dynamic data - applying mask

In [None]:
# Custom criteria

custom_criteria_dynamic = {
    'land': {'condition': lambda x: np.logical_or(x < 0, x == 255)},
    'gpp': {'condition': lambda x: np.logical_or(x == 65533, x == 65535)},
    'pop': {'condition': lambda x: x < 0}
}

# Process data
prepared_dict = main(all_data, invalid_criteria=custom_criteria_dynamic)


In [None]:
prepared_dict.keys()

In [None]:
# check masking results
data_types = ['land', 'gpp', 'pop', 'popdens', 'rainfall']
show_years = [2023, 2023, 2020, 2020, 2023]

for data_type, year in zip(data_types, show_years):
    print(f"Masking Summary for {data_type.capitalize()} in {year}\n {prepared_dict['summary'][data_type][year]}")


In [None]:
prepared_df = prepared_dict['dataframe']
prepared_df[prepared_df['gpp']>100]

## 2.4 Processing - static data

In [None]:
static_data = {}

for i in ['road', 'water', 'dist']:
    static_year_data = {}

    with rasterio.open(os.path.join(reproject_path, f'{i}_reprojected.tif')) as raster:
        raster_arr = raster.read(1)
        for year in years:
            static_year_data[year] = raster_arr
        
    static_data[i] = static_year_data

In [None]:
# Invalid criteria for static data
custom_criteria_static = {
    'road': {'condition': lambda x: x == -128},
    'water': {'condition': lambda x: x == -128},
    'dist': {'condition': lambda x: x == -128}
}

# Process data
prepared_dict_2 = main(static_data, invalid_criteria=custom_criteria_static)

In [None]:
# check masking results
data_types_2 = ['road', 'water', 'dist']
years_2 = [2023, 2023, 2023]

for data_type, year in zip(data_types_2, years_2):
    print(f"Masking Summary for {data_type.capitalize()} in {year}\n {prepared_dict_2['summary'][data_type][year]}")


In [None]:
static_df = prepared_dict_2['dataframe']
static_df

In [None]:
second_prepared_df = pd.merge(prepared_df, static_df, on=['year','lat', 'lon'], how='left')
second_prepared_df

## 2.5 Finalise dataset preparation

- Remove invalid data;
- Combine dynamic and static data;
- Add actual latitude and longitude.

In [None]:
# drop invalid data
clean_df = second_prepared_df.dropna(subset=['land'])

In [None]:
# missing value in dynamic data like 'pop' and static data like 'road', 'water' is not invalid 
# but states that there is 0 people/no road/no water etc.
# so fill with 0
clean_df = clean_df.fillna(0)

In [None]:
# import import_ipynb
import import_ipynb
from Actual_Coords_Projection import gen_coords

lat_center = 16.759763
lon_center = -11.725705
pixel_resolution = 463.312716525
rows, cols = 769, 565

coords_dict = gen_coords(lat_center, lon_center, pixel_resolution, rows, cols)
coords_dict

clean_df['actual_lat'] = clean_df.apply(lambda row: coords_dict.get((row['lat'], row['lon']), (None, None))[0], axis=1)
clean_df['actual_lon'] = clean_df.apply(lambda row: coords_dict.get((row['lat'], row['lon']), (None, None))[1], axis=1)

In [None]:
clean_df[clean_df.select_dtypes(include=['float64']).columns] = clean_df.select_dtypes(include=['float64']).astype('float32')
clean_df

In [None]:
print(f"Clean Data info\n:{clean_df.info()}")
print(f"Clean Data Null value summary\n:{clean_df.isnull().sum()}")
print(f"Clean Data NaN value summary\n:{clean_df.isna().sum()}")

In [None]:
# pip install pydrive

In [None]:
# save file to local (git res.)
# clean_df.to_json(os.path.join(csv_path,f"clean_data.json"))
clean_df.to_csv(os.path.join(csv_path,f"clean_data.csv"), index=0)

In [12]:
# 删除本地的CSV文件
if os.path.exists(os.path.join(csv_path, f"clean_data.csv")):
    os.remove(os.path.join(csv_path, f"clean_data.csv"))
    print(f'Local file {os.path.join(csv_path, f"clean_data.csv")} has been deleted.')
else:
    print(f'File {os.path.join(csv_path, f"clean_data.csv")} does not exist.')

Local file For_dashboard/clean_data.csv has been deleted.


In [None]:
# 定义文件路径
file_path = os.path.join(csv_path, f"clean_data.csv")
folder_id = 'START_Hackathon'

# 上传文件到指定文件夹
media = MediaFileUpload(CSV_FILE, mimetype='text/csv')
if FOLDER_ID:
    file_metadata['parents'] = [FOLDER_ID]
file = service.files().create(body=file_metadata, media_body=media, fields='id').execute()
print(f"File ID: {file.get('id')}")

# 删除本地的CSV文件
if os.path.exists(file_path):
    os.remove(file_path)
    print(f'Local file {file_path} has been deleted.')
else:
    print(f'File {file_path} does not exist.')

In [None]:
# clean_df.to_csv(os.path.join(csv_path, f"clean_df.csv"), index=0)
# with open(os.path.join(csv_path, f"clean_df.csv"), mode='r', encoding='utf-8') as csv_file:
#     csv_reader = csv.DictReader(csv_file) 
#     rows = list(csv_reader)

# # save csv to json
# with open(os.path.join(csv_path, f"clean_data.json"), mode='w', encoding='utf-8') as json_file:
#     json.dump(rows, json_file, indent=4, ensure_ascii=False)