# Geo-Enrichment Dataset Analysis

**Purpose**: Explore the geo-enrichment dataset, which combines energy metrics with spatial features derived from OpenStreetMap (OSM).

**Date**: January 16, 2026

## Objectives
1. Explore relationships between energy metrics and OSM features
2. Analyze regional patterns (NUTS0 and NUTS2)
3. Identify correlations between infrastructure and energy consumption
4. Visualize spatial distributions of features
5. Generate insights for ML model development

In [None]:
import os
import warnings
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sqlalchemy import create_engine
# Silence library warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Database connection
# Settings are read from the standard libpq environment variables so that no
# credential is stored in the notebook. Non-secret values fall back to the
# defaults this notebook has always used; the password has no fallback.
# NOTE: the password that used to be hard-coded here should be treated as
# exposed and rotated.
DB_CONFIG = {
    'host': os.environ.get('PGHOST', '172.18.0.1'),
    'port': int(os.environ.get('PGPORT', '5432')),
    'database': os.environ.get('PGDATABASE', 'lianel_energy'),
    'user': os.environ.get('PGUSER', 'airflow'),
    'password': os.environ.get('PGPASSWORD'),
}

if not DB_CONFIG['password']:
    raise RuntimeError(
        "Database password not configured: set the PGPASSWORD environment "
        "variable before running this notebook."
    )

# User and password are URL-escaped so that characters such as '@', ':' or '/'
# cannot corrupt the connection URL.
connection_string = (
    f"postgresql://{quote_plus(DB_CONFIG['user'])}:{quote_plus(DB_CONFIG['password'])}"
    f"@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}"
)
engine = create_engine(connection_string)

# create_engine() is lazy and does not contact the server; open (and close)
# one connection so the success message below is actually true.
with engine.connect():
    pass

print("✅ Database connection established")

## 1. Load Geo-Enrichment Dataset

In [None]:
# Load geo-enrichment dataset
# One row per (region, year) from 2020 onwards, newest year first.
query = """
SELECT
    region_id,
    level_code,
    cntr_code,
    region_name,
    year,
    total_energy_gwh,
    renewable_energy_gwh,
    fossil_energy_gwh,
    pct_renewable,
    area_km2,
    power_plant_count,
    power_generator_count,
    power_substation_count,
    industrial_area_km2,
    railway_station_count,
    airport_count,
    power_plant_density_per_km2,
    power_generator_density_per_km2,
    industrial_area_pct,
    energy_density_gwh_per_km2,
    renewable_density_gwh_per_km2,
    osm_feature_count
FROM ml_dataset_geo_enrichment_v1
WHERE year >= 2020
ORDER BY year DESC, region_id
"""

df = pd.read_sql(query, engine)

# Count rows with boolean masks instead of materialising filtered copies.
nuts0_rows = (df['level_code'] == 0).sum()
nuts2_rows = (df['level_code'] == 2).sum()
rows_with_osm = (df['osm_feature_count'] > 0).sum()

print(f"✅ Loaded {len(df):,} records")
print(f"   Years: {df['year'].min()} - {df['year'].max()}")
print(f"   Regions: {df['region_id'].nunique()}")
print(f"   NUTS0: {nuts0_rows}")
print(f"   NUTS2: {nuts2_rows}")
print(f"   With OSM features: {rows_with_osm}")

# Last expression of the cell: the notebook renders the first rows.
df.head()

## 2. Energy vs. OSM Features Analysis

In [None]:
# Filter to regions with OSM features and latest year
# The latest year is taken from the OSM-bearing rows themselves; using the
# maximum year of the whole frame yields an empty selection whenever the
# newest year has no rows with OSM features.
# NOTE(review): this selection mixes every level_code (NUTS0 and NUTS2 rows
# together) — confirm that is intended for the correlation matrix.
rows_with_osm = df[df['osm_feature_count'] > 0]
df_osm = rows_with_osm[rows_with_osm['year'] == rows_with_osm['year'].max()].copy()

if not df_osm.empty:
    # Correlation analysis
    numeric_cols = [
        'total_energy_gwh', 'renewable_energy_gwh', 'fossil_energy_gwh',
        'power_plant_count', 'power_generator_count', 'power_substation_count',
        'industrial_area_km2', 'railway_station_count', 'airport_count',
        'energy_density_gwh_per_km2', 'power_plant_density_per_km2',
        'industrial_area_pct',
    ]

    corr_matrix = df_osm[numeric_cols].corr()

    # Plot correlation heatmap
    plt.figure(figsize=(14, 10))
    sns.heatmap(
        corr_matrix,
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=1,
        cbar_kws={"shrink": 0.8},
    )
    plt.title('Correlation Matrix: Energy vs. OSM Features', fontsize=16, pad=20)
    plt.tight_layout()
    plt.show()

    # (label, row, column) of the coefficients worth calling out explicitly.
    key_pairs = [
        ('Energy vs Power Plants', 'total_energy_gwh', 'power_plant_count'),
        ('Energy vs Industrial Area', 'total_energy_gwh', 'industrial_area_km2'),
        ('Energy Density vs Power Plant Density',
         'energy_density_gwh_per_km2', 'power_plant_density_per_km2'),
    ]
    print("\n📊 Key Correlations:")
    for label, row, col in key_pairs:
        print(f"{label}: {corr_matrix.loc[row, col]:.3f}")
else:
    print("⚠️ No data with OSM features found")

## 3. Regional Patterns Analysis

In [None]:
# Analyze by country (NUTS0)
df_nuts0 = df[df['level_code'] == 0].copy()
df_nuts0_latest = df_nuts0[df_nuts0['year'] == df_nuts0['year'].max()]

if not df_nuts0_latest.empty:
    # One entry per panel: (column, divisor applied before plotting,
    # x-axis label, title). Total energy is divided by 1000 to show TWh.
    panels = [
        ('total_energy_gwh', 1000, 'Total Energy (TWh)',
         'Top 10 Countries by Energy Consumption'),
        ('pct_renewable', 1, 'Renewable Energy (%)',
         'Top 10 Countries by Renewable Percentage'),
        ('energy_density_gwh_per_km2', 1, 'Energy Density (GWh/km²)',
         'Top 10 Countries by Energy Density'),
        ('renewable_density_gwh_per_km2', 1, 'Renewable Density (GWh/km²)',
         'Top 10 Countries by Renewable Density'),
    ]

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    for ax, (column, divisor, xlabel, title) in zip(axes.flat, panels):
        top10 = df_nuts0_latest.nlargest(10, column)
        ax.barh(top10['cntr_code'], top10[column] / divisor)
        # barh draws the first row at the bottom; flip the axis so the
        # highest-ranked country is at the top of each chart.
        ax.invert_yaxis()
        ax.set_xlabel(xlabel)
        ax.set_title(title)
        ax.grid(axis='x', alpha=0.3)

    plt.tight_layout()
    plt.show()

    print(f"\n📊 Country Summary (Latest Year: {df_nuts0_latest['year'].max()})")
    print(df_nuts0_latest[['cntr_code', 'total_energy_gwh', 'pct_renewable',
                           'energy_density_gwh_per_km2']].describe())
else:
    print("⚠️ No NUTS0 data found")

## 4. OSM Features Analysis (NUTS2 Regions)

In [None]:
# Analyze NUTS2 regions with OSM features
# Select the NUTS2 rows that carry OSM features first, then keep the most
# recent year within that subset (same approach as the NUTS0 analysis). Using
# the maximum year of the whole frame leaves nothing to plot whenever the
# newest year has no NUTS2 rows with OSM features.
nuts2_with_osm = df[(df['level_code'] == 2) & (df['osm_feature_count'] > 0)]
df_nuts2_osm = nuts2_with_osm[
    nuts2_with_osm['year'] == nuts2_with_osm['year'].max()
].copy()

if not df_nuts2_osm.empty:
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))

    energy_twh = df_nuts2_osm['total_energy_gwh'] / 1000  # GWh -> TWh

    # Top row, scatter plots: (x values, y values, x label, y label, title)
    scatter_panels = [
        (df_nuts2_osm['power_plant_count'], energy_twh,
         'Power Plant Count', 'Total Energy (TWh)',
         'Power Plants vs Energy Consumption'),
        (df_nuts2_osm['industrial_area_km2'], energy_twh,
         'Industrial Area (km²)', 'Total Energy (TWh)',
         'Industrial Area vs Energy Consumption'),
        (df_nuts2_osm['power_plant_density_per_km2'],
         df_nuts2_osm['energy_density_gwh_per_km2'],
         'Power Plant Density (per km²)', 'Energy Density (GWh/km²)',
         'Power Plant Density vs Energy Density'),
    ]
    for ax, (x, y, xlabel, ylabel, title) in zip(axes[0], scatter_panels):
        ax.scatter(x, y, alpha=0.6)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        ax.grid(alpha=0.3)

    # Bottom row, histograms: (column, x label, title)
    hist_panels = [
        ('airport_count', 'Airport Count', 'Distribution of Airport Count'),
        ('railway_station_count', 'Railway Station Count',
         'Distribution of Railway Station Count'),
        ('industrial_area_pct', 'Industrial Area (%)',
         'Distribution of Industrial Area Percentage'),
    ]
    for ax, (column, xlabel, title) in zip(axes[1], hist_panels):
        # Missing values carry no information for a histogram; drop them so
        # binning only ever sees finite data.
        ax.hist(df_nuts2_osm[column].dropna(), bins=20, edgecolor='black', alpha=0.7)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Number of Regions')
        ax.set_title(title)
        ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

    print("\n📊 NUTS2 Regions with OSM Features Summary")
    print(df_nuts2_osm[['power_plant_count', 'power_generator_count', 'industrial_area_km2',
                        'railway_station_count', 'airport_count']].describe())
else:
    print("⚠️ No NUTS2 data with OSM features found")

In [None]:
# Analyze relationship between infrastructure and renewable energy
# Relies on df_nuts2_osm created by the NUTS2 cell.
if not df_nuts2_osm.empty:
    # One entry per panel: (x column, y column, x label, y label, title,
    # label used in the printed correlation summary)
    panels = [
        ('power_plant_count', 'pct_renewable',
         'Power Plant Count', 'Renewable Energy (%)',
         'Power Plants vs Renewable Percentage',
         'Power Plants vs Renewable %'),
        ('power_generator_count', 'pct_renewable',
         'Power Generator Count', 'Renewable Energy (%)',
         'Power Generators vs Renewable Percentage',
         'Power Generators vs Renewable %'),
        ('industrial_area_km2', 'pct_renewable',
         'Industrial Area (km²)', 'Renewable Energy (%)',
         'Industrial Area vs Renewable Percentage',
         'Industrial Area vs Renewable %'),
        ('power_plant_density_per_km2', 'renewable_density_gwh_per_km2',
         'Power Plant Density (per km²)', 'Renewable Density (GWh/km²)',
         'Power Plant Density vs Renewable Density',
         'Power Plant Density vs Renewable Density'),
    ]

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    for ax, (x_col, y_col, xlabel, ylabel, title, _) in zip(axes.flat, panels):
        ax.scatter(df_nuts2_osm[x_col], df_nuts2_osm[y_col], alpha=0.6)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        ax.grid(alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Statistical summary: pairwise correlation for each plotted pair
    print("\n📊 Infrastructure vs Renewable Energy Correlations:")
    for x_col, y_col, _, _, _, summary_label in panels:
        coefficient = df_nuts2_osm[x_col].corr(df_nuts2_osm[y_col])
        print(f"{summary_label}: {coefficient:.3f}")
else:
    print("⚠️ No NUTS2 data with OSM features found")

## 6. Regional Clustering Insights

This analysis helps identify regions with similar energy and infrastructure profiles.

In [None]:
# Identify regions with similar profiles
# Relies on df_nuts2_osm created by the NUTS2 cell; adds two categorical
# columns to it.
if not df_nuts2_osm.empty:
    category_labels = ['Low', 'Medium', 'High', 'Very High']

    # Create feature groups
    # include_lowest=True makes the first bin closed on the left. Without it
    # a value of exactly 0 (e.g. a region with no power plants) falls outside
    # (0, ...] and becomes NaN, silently disappearing from the crosstab.
    df_nuts2_osm['energy_category'] = pd.cut(
        df_nuts2_osm['total_energy_gwh'] / 1000,  # GWh -> TWh
        bins=[0, 10, 50, 100, float('inf')],
        labels=category_labels,
        include_lowest=True,
    )

    df_nuts2_osm['infrastructure_category'] = pd.cut(
        df_nuts2_osm['power_plant_count'],
        bins=[0, 50, 200, 400, float('inf')],
        labels=category_labels,
        include_lowest=True,
    )

    # Cross-tabulation
    crosstab = pd.crosstab(
        df_nuts2_osm['energy_category'],
        df_nuts2_osm['infrastructure_category'],
        margins=True,
    )

    print("📊 Energy vs Infrastructure Categories:")
    print(crosstab)

    # Top regions by different metrics:
    # (heading, ranking column, companion column shown next to it)
    rankings = [
        ('Energy Consumption', 'total_energy_gwh', 'power_plant_count'),
        ('Power Plant Count', 'power_plant_count', 'total_energy_gwh'),
        ('Industrial Area', 'industrial_area_km2', 'total_energy_gwh'),
        ('Renewable Percentage', 'pct_renewable', 'renewable_energy_gwh'),
    ]
    for heading, metric, companion in rankings:
        print(f"\n🏆 Top 5 Regions by {heading}:")
        top5 = df_nuts2_osm.nlargest(5, metric)
        print(top5[['region_id', 'region_name', metric, companion]])

## 7. Key Insights and Recommendations

### Summary of Findings

In [None]:
# Print a textual summary of the NUTS2 / OSM analysis.
# Relies on df_nuts2_osm created by the NUTS2 cell.
separator = "=" * 60

print("📋 Key Insights:")
print(separator)

if not df_nuts2_osm.empty:
    print("\n1. Data Coverage:")
    print(f"   - {len(df_nuts2_osm)} NUTS2 regions with OSM features")
    print(f"   - {df_nuts2_osm['cntr_code'].nunique()} countries represented")
    print(f"   - Average power plants per region: {df_nuts2_osm['power_plant_count'].mean():.1f}")
    print(f"   - Average industrial area: {df_nuts2_osm['industrial_area_km2'].mean():.2f} km²")

    print("\n2. Energy Patterns:")
    # Energy is stored in GWh; divide by 1000 to report TWh.
    print(f"   - Average energy consumption: {df_nuts2_osm['total_energy_gwh'].mean() / 1000:.1f} TWh")
    print(f"   - Average renewable percentage: {df_nuts2_osm['pct_renewable'].mean():.1f}%")
    print(f"   - Average energy density: {df_nuts2_osm['energy_density_gwh_per_km2'].mean():.2f} GWh/km²")

    print("\n3. Infrastructure Patterns:")
    print(f"   - Power plants: {df_nuts2_osm['power_plant_count'].sum():,} total")
    print(f"   - Power generators: {df_nuts2_osm['power_generator_count'].sum():,} total")
    print(f"   - Industrial area: {df_nuts2_osm['industrial_area_km2'].sum():.2f} km² total")
    print(f"   - Railway stations: {df_nuts2_osm['railway_station_count'].sum():,} total")
    print(f"   - Airports: {df_nuts2_osm['airport_count'].sum():,} total")

    print("\n4. Recommendations for ML Models:")
    print("   - Use power plant count and density as features for energy forecasting")
    print("   - Include industrial area as a predictor of energy consumption")
    print("   - Consider infrastructure density for regional clustering")
    print("   - Use renewable percentage with infrastructure for policy analysis")
else:
    print("⚠️ Limited data available - expand OSM coverage for better insights")

print("\n" + separator)