# Phase 5: Python Analysis (Hybrid Approach)

This notebook implements **Phase 5** of the Cologne Green Space Analysis workflow.
It uses `geopandas` and `pandas` to analyze the zonal statistics calculated in QGIS and stored in the GeoPackage database.

**Prerequisites:**
- You must have completed **Phase 3** (Zonal Statistics) in QGIS.
- The file `cologne_analysis.gpkg` must exist in the project root and contain the layer `veedel_with_stats`.

### Step 5.1: Load Data from GeoPackage

In [None]:
import geopandas as gpd
import pandas as pd
import os

# Define path to GeoPackage
# Assuming this notebook is in 'Notebooks/' and the gpkg is in the root.
# NOTE: the relative ".." is resolved against the kernel's current working
# directory, so start Jupyter from (or chdir into) the notebook's folder.
gpkg_path = os.path.abspath(os.path.join("..", "cologne_analysis.gpkg"))

print(f"Loading data from: {gpkg_path}")

# Discard any `gdf` left over from a previous run of this cell. The cells
# below only check `'gdf' in locals()`, so without this a failed reload
# would let them silently analyse stale data.
globals().pop("gdf", None)

try:
    # Read the zonal-statistics layer from the GeoPackage database
    gdf = gpd.read_file(
        gpkg_path,
        layer="veedel_with_stats"
    )

    print("✅ Data loaded from GeoPackage database!")
    print(f"Total Veedel: {len(gdf)}")

    # Display first few rows
    display(gdf.head())

except Exception as e:
    # Deliberately best-effort: report the problem and leave `gdf` undefined
    # so the remaining cells skip themselves instead of raising.
    print(f"❌ Error loading data: {e}")
    print("Make sure you have completed Phase 3 and saved the layer as 'veedel_with_stats'.")

Loading data from: /Users/rk/Sync/courses/correleraid/cologne_green_project/cologne_green_analysis.gpkg
❌ Error loading data: /Users/rk/Sync/courses/correleraid/cologne_green_project/cologne_green_analysis.gpkg: No such file or directory
Make sure you have completed Phase 3 and saved the layer as 'veedel_with_stats'.


### Step 5.2: Basic Statistics

In [None]:
# Summarise the per-Veedel mean NDVI. The whole cell is skipped when the
# load cell did not define `gdf`.
if 'gdf' in locals():
    print("\n📊 NDVI Statistics:")
    if 'ndvi_mean' in gdf.columns:
        print(gdf['ndvi_mean'].describe())
    else:
        # Print a plain list rather than the pandas Index repr, so every
        # available column name is shown to help locate the right one.
        print("Column 'ndvi_mean' not found. Available columns:", list(gdf.columns))

### Step 5.3: Identify Greenest and Neglected Areas

In [None]:
# Rank Veedel by mean NDVI: list the ten greenest and those below the
# "neglected" cut-off. Skipped when `gdf` or its 'ndvi_mean' column is missing.
if 'gdf' in locals() and 'ndvi_mean' in gdf.columns:
    # Mean-NDVI cut-off below which a Veedel is flagged as potentially neglected.
    # Kept in one place so the filter and the printed message cannot drift apart.
    neglected_threshold = 0.25

    # Column holding the Veedel name.
    # Adjust 'NAME' to your actual name column if different (e.g., 'STADTVIERTEL')
    if 'NAME' in gdf.columns:
        name_col = 'NAME'
    else:
        name_col = gdf.columns[1]  # Fallback to 2nd column
        # Make the guess visible instead of silently labelling rows with
        # whatever happens to be the second column.
        print(f"Column 'NAME' not found; using '{name_col}' as the name column.")

    # Find greenest
    top10 = gdf.nlargest(10, 'ndvi_mean')[[name_col, 'ndvi_mean']]
    print("\n🌳 Top 10 Greenest Veedel:")
    print(top10.to_string(index=False))

    # Find neglected (mean NDVI below the threshold)
    neglected = gdf[gdf['ndvi_mean'] < neglected_threshold][[name_col, 'ndvi_mean']]
    print(f"\n⚠️  Potentially Neglected Veedel (NDVI < {neglected_threshold}): {len(neglected)}")
    if neglected.empty:
        # Avoid pandas' "Empty DataFrame / Columns / Index" dump.
        print("(none)")
    else:
        print(neglected.to_string(index=False))

### Step 5.4: Categorize and Save Results

In [None]:
def categorize_green(ndvi):
    """Map a mean NDVI value to a green-space category label.

    Args:
        ndvi: Mean NDVI of one Veedel; may be missing (NaN/None).

    Returns:
        'Unknown' for missing values, 'Critical' below 0.2, 'Low' below 0.3,
        'Moderate' below 0.4, otherwise 'Good'.
    """
    if pd.isna(ndvi):
        return 'Unknown'
    if ndvi < 0.2:
        return 'Critical'
    elif ndvi < 0.3:
        return 'Low'
    elif ndvi < 0.4:
        return 'Moderate'
    else:
        return 'Good'


# Categorise and rank every Veedel, then persist the result as a new
# GeoPackage layer and as a CSV ranking. Skipped when `gdf` or its
# 'ndvi_mean' column is missing.
if 'gdf' in locals() and 'ndvi_mean' in gdf.columns:
    # `name_col` is normally set in Step 5.3. Derive it here when that cell
    # was not run (or left a name that is not a column), so this cell does
    # not fail with a NameError/KeyError; a user-adjusted value is kept.
    if 'name_col' not in locals() or name_col not in gdf.columns:
        name_col = 'NAME' if 'NAME' in gdf.columns else gdf.columns[1]

    # Calculate green space category and rank (rank 1 = highest mean NDVI;
    # ties share the lowest rank of their group).
    gdf['green_category'] = gdf['ndvi_mean'].apply(categorize_green)
    gdf['green_rank'] = gdf['ndvi_mean'].rank(ascending=False, method='min')

    # Save back to GeoPackage as a NEW layer
    try:
        gdf.to_file(
            gpkg_path,
            layer="veedel_analysis_final",
            driver="GPKG"
        )
        print("\n✅ Analysis saved to database as 'veedel_analysis_final' layer!")
    except Exception as e:
        # Best-effort: report the failure and still attempt the CSV export.
        print(f"\n❌ Error saving to GeoPackage: {e}")

    # Also export rankings as CSV
    output_csv = os.path.abspath(os.path.join("..", "data", "outputs", "veedel_greenness_ranking.csv"))

    # Ensure output directory exists
    os.makedirs(os.path.dirname(output_csv), exist_ok=True)

    rankings = gdf[[name_col, 'ndvi_mean', 'green_category', 'green_rank']].copy()
    rankings = rankings.sort_values('green_rank')
    rankings.to_csv(
        output_csv,
        index=False
    )

    print(f"✅ CSV exported to: {output_csv}")