# ML Feature Analysis

**Purpose**: Analyze ML dataset features, feature importance, distributions, and relationships

**Date**: January 12, 2026

## Objectives
1. Analyze feature distributions and statistics
2. Calculate feature importance and correlations
3. Identify feature relationships and interactions
4. Assess feature quality for ML models
5. Visualize feature patterns

In [None]:
import os
import warnings
from itertools import combinations
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine
# NOTE: this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Database connection
# Settings come from the environment so that no secret is stored in the
# notebook. Non-secret values fall back to the defaults used previously.
# NOTE(review): the password that used to be committed here should be rotated.
DB_CONFIG = {
    'host': os.environ.get('POSTGRES_HOST', '172.18.0.1'),
    'port': int(os.environ.get('POSTGRES_PORT', '5432')),
    'database': os.environ.get('POSTGRES_DB', 'lianel_energy'),
    'user': os.environ.get('POSTGRES_USER', 'airflow'),
    'password': os.environ.get('POSTGRES_PASSWORD'),
}

if not DB_CONFIG['password']:
    raise RuntimeError(
        "POSTGRES_PASSWORD is not set; export it before running this notebook"
    )

# Percent-encode user and password so characters such as '@', ':' or '/'
# cannot corrupt the URL.
connection_string = (
    f"postgresql://{quote_plus(DB_CONFIG['user'])}:{quote_plus(DB_CONFIG['password'])}"
    f"@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}"
)
engine = create_engine(connection_string)

# create_engine() is lazy and does not touch the database; open (and
# immediately release) one connection so the message below is actually true.
with engine.connect():
    pass

print("✅ Database connection established")

## 1. Load and Explore ML Dataset Features

In [None]:
# Load all features from the ML forecasting dataset.
# The query text is static (no user input is interpolated into it).
query = """
SELECT 
    cntr_code,
    year,
    -- Target variables
    total_energy_gwh,
    renewable_energy_gwh,
    fossil_energy_gwh,
    -- Time features
    year_index,
    is_first_year,
    is_last_year,
    -- Lagged features
    lag_1_year_total_energy_gwh,
    lag_2_year_total_energy_gwh,
    lag_3_year_total_energy_gwh,
    lag_1_year_renewable_gwh,
    lag_2_year_renewable_gwh,
    -- YoY changes
    yoy_change_total_energy_pct,
    yoy_change_renewable_pct,
    yoy_change_absolute_gwh,
    -- Rolling statistics
    rolling_3y_mean_total_energy_gwh,
    rolling_5y_mean_total_energy_gwh,
    rolling_3y_mean_renewable_gwh,
    rolling_5y_mean_renewable_gwh,
    -- Trend indicators
    trend_3y_slope,
    trend_5y_slope,
    is_increasing_trend,
    is_decreasing_trend,
    -- Percentages
    pct_renewable,
    pct_fossil,
    -- Spatial features
    area_km2,
    energy_density_gwh_per_km2,
    feature_count
FROM ml_dataset_forecasting_v1
WHERE year >= 2018  -- Filter incomplete years
ORDER BY cntr_code, year
"""

df = pd.read_sql(query, engine)

# Report what was loaded: row count, column count, then a numbered column list.
print(f"✅ Loaded {len(df)} records")
print(f"Features: {len(df.columns)}")
print("\nFeature list:")
for i, col in enumerate(df.columns, 1):
    print(f"  {i:2d}. {col}")

# Last expression of the cell: the notebook renders the first rows.
df.head()

## 2. Feature Statistics and Distributions

In [None]:
# Select numeric features for analysis
numeric_features = df.select_dtypes(include=[np.number]).columns.tolist()
# Drop the time-index columns, which are not treated as features here.
# (cntr_code is presumably non-numeric and therefore already excluded by the
# dtype filter above -- confirm against the table schema.)
numeric_features = [f for f in numeric_features if f not in ['year', 'year_index']]

print(f"📊 Analyzing {len(numeric_features)} numeric features")

# Calculate comprehensive statistics (one row per feature)
numeric_df = df[numeric_features]
feature_stats = numeric_df.describe().T
feature_stats['missing_count'] = numeric_df.isnull().sum()
feature_stats['missing_pct'] = (feature_stats['missing_count'] / len(df)) * 100
# Every column here is numeric, so count zeros directly; the previous check
# against int64/float64 only reported 0 for int32/float32 columns.
feature_stats['zero_count'] = (numeric_df == 0).sum()
# scipy's estimators are kept (rather than DataFrame.skew/kurt) because the
# pandas versions apply a bias correction and would change the numbers.
feature_stats['skewness'] = [stats.skew(df[col].dropna()) for col in numeric_features]
feature_stats['kurtosis'] = [stats.kurtosis(df[col].dropna()) for col in numeric_features]

print("\n📈 Feature Statistics Summary:")
print(feature_stats[['count', 'mean', 'std', 'min', 'max', 'missing_pct', 'skewness']].round(2).head(20))

# Visualize feature distributions (at most the first 20 features)
n_features = len(numeric_features)
plot_features = numeric_features[:20]
n_cols = 4
# Size the grid for what is actually plotted, so no blank rows are created
# when there are more than 20 features; always keep at least one row.
n_rows = max(1, (len(plot_features) + n_cols - 1) // n_cols)

# squeeze=False guarantees a 2-D array of axes regardless of the grid shape.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5 * n_rows), squeeze=False)
axes = axes.flatten()

for ax, feature in zip(axes, plot_features):
    data = df[feature].dropna()
    if len(data) > 0:
        ax.hist(data, bins=30, alpha=0.7, edgecolor='black')
        ax.set_title(f'{feature}\n(mean={data.mean():.2f})', fontsize=9)
        ax.set_xlabel('Value')
        ax.set_ylabel('Frequency')
        ax.grid(True, alpha=0.3)
    else:
        ax.text(0.5, 0.5, 'No data', ha='center', va='center')
        ax.set_title(feature, fontsize=9)

# Hide every subplot that did not receive a histogram
for ax in axes[len(plot_features):]:
    ax.axis('off')

plt.tight_layout()
plt.show()

## 3. Feature Correlations and Relationships

In [None]:
# Calculate correlation matrix for key features
key_features = [
    'total_energy_gwh', 'renewable_energy_gwh', 'fossil_energy_gwh',
    'pct_renewable', 'pct_fossil',
    'lag_1_year_total_energy_gwh', 'lag_2_year_total_energy_gwh',
    'yoy_change_total_energy_pct', 'yoy_change_renewable_pct',
    'rolling_3y_mean_total_energy_gwh', 'rolling_5y_mean_total_energy_gwh',
    'trend_3y_slope', 'trend_5y_slope',
    'energy_density_gwh_per_km2', 'area_km2'
]

# Filter to features that exist
key_features = [f for f in key_features if f in df.columns]

corr_matrix = df[key_features].corr()

# Visualize correlation matrix
plt.figure(figsize=(14, 12))
# The annotation font size must go through annot_kws: extra keyword
# arguments to heatmap() are forwarded to pcolormesh, which rejects
# `fontsize` and raises.
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": 0.8},
            xticklabels=True, yticklabels=True, annot_kws={"size": 8})
plt.title('Feature Correlation Matrix', fontsize=14, fontweight='bold')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Find highly correlated features.
# combinations() yields each unordered pair exactly once (upper triangle),
# in the same order as a nested i < j index loop. NaN correlations fail the
# comparison and are therefore skipped.
high_corr_pairs = [
    (feat_a, feat_b, corr_matrix.loc[feat_a, feat_b])
    for feat_a, feat_b in combinations(corr_matrix.columns, 2)
    if abs(corr_matrix.loc[feat_a, feat_b]) > 0.8
]

print("\n🔗 Highly Correlated Feature Pairs (|r| > 0.8):")
if high_corr_pairs:
    for feat1, feat2, corr in sorted(high_corr_pairs, key=lambda x: abs(x[2]), reverse=True):
        print(f"  {feat1} <-> {feat2}: {corr:.3f}")
else:
    print("  No highly correlated pairs found")

# Correlation with target variable.
# NOTE: numeric_features is defined by the feature-statistics cell and
# contains the target itself, so the target appears in this listing.
target = 'total_energy_gwh'
if target in df.columns:
    target_corr = df[numeric_features].corrwith(df[target]).sort_values(ascending=False)
    print(f"\n🎯 Correlation with Target ({target}):")
    print(target_corr.head(15).to_string())

## 4. Feature Importance Analysis

In [None]:
# Simple feature importance using variance and correlation with target.
# This is a heuristic score (0.3 * normalized variance + 0.7 * |correlation|),
# not a model-based importance.
target = 'total_energy_gwh'

if target in df.columns:
    numeric_df = df[numeric_features]

    # Column-wise statistics. All columns in numeric_features are numeric, so
    # no per-dtype special-casing is needed (the previous int64/float64 check
    # zeroed out int32/float32 columns).
    correlations = numeric_df.corrwith(df[target])
    if target in correlations.index:
        # Pin the self-correlation to exactly 1.0 rather than a rounded float.
        correlations[target] = 1.0

    feature_importance = pd.DataFrame({
        'feature': numeric_features,
        'variance': numeric_df.var().reindex(numeric_features).to_numpy(),
        'correlation_with_target': correlations.reindex(numeric_features).to_numpy(),
        'mean_abs_value': numeric_df.abs().mean().reindex(numeric_features).to_numpy(),
    })

    # Normalize for importance score. Variance is min-max scaled; the small
    # epsilon avoids division by zero when all variances are equal.
    variance = feature_importance['variance']
    feature_importance['variance_norm'] = (variance - variance.min()) / (variance.max() - variance.min() + 1e-10)
    feature_importance['corr_norm'] = feature_importance['correlation_with_target'].abs()
    feature_importance['importance_score'] = (
        feature_importance['variance_norm'] * 0.3 +
        feature_importance['corr_norm'] * 0.7
    )

    feature_importance = feature_importance.sort_values('importance_score', ascending=False)

    print("📊 Feature Importance Ranking:")
    print(feature_importance[['feature', 'correlation_with_target', 'variance', 'importance_score']].head(20).to_string(index=False))

    # Visualize top features
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Plot 1: Top features by importance
    ax1 = axes[0]
    top_features = feature_importance.head(15)
    ax1.barh(top_features['feature'], top_features['importance_score'], color='steelblue', alpha=0.7)
    ax1.invert_yaxis()  # barh draws the first row at the bottom; put rank 1 on top
    ax1.set_xlabel('Importance Score')
    ax1.set_title('Top 15 Features by Importance Score')
    ax1.grid(True, alpha=0.3, axis='x')

    # Plot 2: Correlation with target.
    # Rank by absolute correlation so that strong negative relationships are
    # shown too (in red); ranking by the signed value would only ever pick
    # the most positive ones.
    ax2 = axes[1]
    top_corr = feature_importance.loc[feature_importance['corr_norm'].nlargest(15).index]
    colors = ['green' if x > 0 else 'red' for x in top_corr['correlation_with_target']]
    ax2.barh(top_corr['feature'], top_corr['correlation_with_target'], color=colors, alpha=0.7)
    ax2.invert_yaxis()
    ax2.set_xlabel('Correlation with Target')
    ax2.set_title('Top 15 Features by Correlation with Target')
    ax2.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
    ax2.grid(True, alpha=0.3, axis='x')

    plt.tight_layout()
    plt.show()

## 5. Feature Quality Assessment

### Summary
- Feature distributions and statistics analyzed
- Correlations and relationships identified
- Feature importance calculated
- Ready for ML model training