In [None]:
import os
import warnings
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine
# Silence every warning for the rest of the notebook to keep cell output compact.
# NOTE(review): this also hides deprecation warnings from the imported libraries.
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Database connection
# Settings are read from the standard libpq environment variables
# (PGHOST, PGPORT, PGDATABASE, PGUSER, PGPASSWORD). Non-secret values fall back
# to the defaults below; the password deliberately has no default so that no
# credential is stored in the notebook.
# NOTE(review): rotate any password that was previously committed with this file.
DB_CONFIG = {
    'host': os.environ.get('PGHOST', '172.18.0.1'),
    'port': int(os.environ.get('PGPORT', '5432')),
    'database': os.environ.get('PGDATABASE', 'lianel_energy'),
    'user': os.environ.get('PGUSER', 'airflow'),
    'password': os.environ.get('PGPASSWORD'),
}

if not DB_CONFIG['password']:
    raise RuntimeError(
        "Database password is not set. "
        "Export PGPASSWORD before starting the notebook kernel."
    )

# Percent-encode user and password so that characters such as '@', ':' or '/'
# cannot corrupt the connection URL.
connection_string = (
    f"postgresql://{quote_plus(DB_CONFIG['user'])}:{quote_plus(DB_CONFIG['password'])}"
    f"@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}"
)
engine = create_engine(connection_string)

# create_engine() is lazy and does not contact the server; open (and
# immediately release) one connection so the success message below is only
# printed when the database is actually reachable.
with engine.connect():
    pass

print("✅ Database connection established")

## 1. Load and Explore ML Dataset Features

In [None]:
# Pull every feature column of the ML forecasting dataset into a DataFrame.
# The SQL groups the columns by role (targets, time, lags, YoY changes,
# rolling statistics, trends, percentages, spatial) and keeps rows with
# year >= 2018, ordered by country code and year.
query = """
SELECT 
    cntr_code,
    year,
    -- Target variables
    total_energy_gwh,
    renewable_energy_gwh,
    fossil_energy_gwh,
    -- Time features
    year_index,
    is_first_year,
    is_last_year,
    -- Lagged features
    lag_1_year_total_energy_gwh,
    lag_2_year_total_energy_gwh,
    lag_3_year_total_energy_gwh,
    lag_1_year_renewable_gwh,
    lag_2_year_renewable_gwh,
    -- YoY changes
    yoy_change_total_energy_pct,
    yoy_change_renewable_pct,
    yoy_change_absolute_gwh,
    -- Rolling statistics
    rolling_3y_mean_total_energy_gwh,
    rolling_5y_mean_total_energy_gwh,
    rolling_3y_mean_renewable_gwh,
    rolling_5y_mean_renewable_gwh,
    -- Trend indicators
    trend_3y_slope,
    trend_5y_slope,
    is_increasing_trend,
    is_decreasing_trend,
    -- Percentages
    pct_renewable,
    pct_fossil,
    -- Spatial features
    area_km2,
    energy_density_gwh_per_km2,
    feature_count
FROM ml_dataset_forecasting_v1
WHERE year >= 2018  -- Filter incomplete years
ORDER BY cntr_code, year
"""

df = pd.read_sql(query, engine)

# Report the size of the result: row count first, then column count.
row_count, column_count = df.shape
print(f"✅ Loaded {row_count} records")
print(f"Features: {column_count}")

# Numbered listing of every column returned by the query.
print("\nFeature list:")
for position, column_name in enumerate(df.columns, start=1):
    print(f"  {position:2d}. {column_name}")

# Preview the first rows as the cell's rich output.
df.head()