# In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial.distance import cdist
import math
import optuna
from optuna.trial import TrialState
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, Tuple, List
import warnings
warnings.filterwarnings('ignore')

# ==================== HYPERPARAMETERS & CONFIG ====================
class Config:
    """Static configuration shared by preprocessing, the model and the Optuna search.

    Sizes that describe the data (node count, feature count, target count) are
    derived from the lists they describe so they cannot drift out of sync.
    """

    # Fixed parameters
    STATIONS = [101, 102, 105, 106, 107, 109, 111, 112, 113, 119, 120, 121, 122]
    TARGET_COLS = ['NO2', 'O3', 'CO', 'SO2', 'PM10', 'PM2.5']
    NUM_NODES = len(STATIONS)
    # Per-node input width: the pollutant channels plus the six cyclical time
    # encodings (hour/day/month, sin and cos) stacked by preprocess_seoul_data.
    # The previous hard-coded value of 10 did not match that 12-channel input.
    NUM_FEATURES = len(TARGET_COLS) + 6
    TARGET_DIM = len(TARGET_COLS)

    # Optuna tuning ranges, each given as (low, high)
    HIDDEN_DIM_RANGE = (32, 128)  # Increased range for better exploration
    LEARNING_RATE_RANGE = (1e-4, 1e-2)
    BATCH_SIZE_RANGE = (16, 64)
    SEQ_LEN_RANGE = (12, 48)  # Multiple options for sequence length
    DROPOUT_RANGE = (0.1, 0.5)

    # Training
    N_TRIALS = 30  # Number of Optuna trials
    EPOCHS = 50  # Upper bound on epochs per training run
    EARLY_STOPPING_PATIENCE = 10

    # Graph parameters
    GRAPH_THRESHOLD = 0.1  # Edge weights below this value are zeroed
    GRAPH_DENSITY_TARGET = (0.2, 0.5)

# ==================== DATA PREPROCESSING ====================
def preprocess_seoul_data(df: pd.DataFrame, scalers: Dict, look_back_window: int = 24, 
                         target_cols: List[str] = Config.TARGET_COLS) -> Tuple[torch.Tensor, torch.Tensor]:
    """Convert the long-format measurement table into sliding-window tensors.

    Each pollutant column is pivoted to a (timestamp x station) matrix, gaps
    are filled, and the matrix is min-max scaled per station column. Six
    cyclical time encodings (hour/day-of-week/month as sin and cos) are
    appended as extra channels.

    Args:
        df: Table with 'Measurement date', 'Station code' and the columns
            named in ``target_cols``. It is copied, not modified.
        scalers: Dict that is filled in place with one fitted ``MinMaxScaler``
            per target column (key = column name).
        look_back_window: Number of consecutive time steps in each input window.
        target_cols: Pollutant columns to scale and to predict.

    Returns:
        ``(X, Y)`` as float32 tensors where ``X`` has shape
        (samples, look_back_window, stations, features) and ``Y`` has shape
        (samples, stations, len(target_cols)); ``Y[i]`` is the time step that
        immediately follows window ``X[i]``.

    Note:
        The scalers are fitted on the full series passed in, i.e. before any
        train/validation/test split is made by the caller.
    """
    df = df.copy()
    df['Measurement date'] = pd.to_datetime(df['Measurement date'])

    stations = sorted(df['Station code'].unique())
    num_stations = len(stations)
    time_index = pd.DatetimeIndex(df['Measurement date'].unique()).sort_values()

    time_feature_cols = ['hour_sin', 'hour_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos']
    feature_cols = list(target_cols) + time_feature_cols

    processed_features = []

    for feat in target_cols:
        # Pivot on (timestamp, station) rather than reshaping the flat column:
        # a reshape silently misaligns stations as soon as one row is missing.
        # Duplicate (timestamp, station) rows are averaged.
        wide = (
            df.groupby(['Measurement date', 'Station code'])[feat]
            .mean()
            .unstack('Station code')
            .reindex(index=time_index, columns=stations)
        )

        # Short gaps are interpolated linearly; whatever remains is filled
        # from the nearest available value.
        wide = wide.interpolate(method='linear', limit_direction='both', limit=3)
        wide = wide.bfill().ffill()

        # Only the physical pollutants are normalised (one min/max per station).
        scaler = MinMaxScaler()
        processed_features.append(scaler.fit_transform(wide.values))
        scalers[feat] = scaler

    # Time encodings depend only on the timestamp, so compute them once from
    # the time axis and repeat them for every station.
    hour = time_index.hour.to_numpy()
    day_of_week = time_index.dayofweek.to_numpy()
    month = time_index.month.to_numpy()
    cyclical_values = [
        np.sin(2 * np.pi * hour / 24.0),
        np.cos(2 * np.pi * hour / 24.0),
        np.sin(2 * np.pi * day_of_week / 7.0),
        np.cos(2 * np.pi * day_of_week / 7.0),
        np.sin(2 * np.pi * month / 12.0),
        np.cos(2 * np.pi * month / 12.0),
    ]  # same order as time_feature_cols
    for values in cyclical_values:
        processed_features.append(np.repeat(values[:, np.newaxis], num_stations, axis=1))

    # Stack features: (Time, Nodes, Features). Cast once here so the windowed
    # copy below is built directly in float32.
    data_block = np.stack(processed_features, axis=-1).astype(np.float32)

    total_time_steps = data_block.shape[0]
    target_indices = [feature_cols.index(col) for col in target_cols]

    # Window i covers steps [i, i + L) and predicts step i + L, so there are
    # exactly T - L samples (the previous "- 1" dropped the last valid one).
    num_samples = max(total_time_steps - look_back_window, 0)

    X_array = np.empty(
        (num_samples, look_back_window, num_stations, len(feature_cols)),
        dtype=np.float32,
    )
    for i in range(num_samples):
        X_array[i] = data_block[i:i + look_back_window]

    # Slice the time axis first and apply the channel list separately. Writing
    # data_block[t, :, target_indices] mixes a scalar and a list index around a
    # slice, which makes NumPy return (targets, nodes) instead of (nodes, targets).
    Y_array = data_block[look_back_window:look_back_window + num_samples][:, :, target_indices]

    X_tensor = torch.from_numpy(X_array)
    Y_tensor = torch.from_numpy(np.ascontiguousarray(Y_array))

    return X_tensor, Y_tensor

# ==================== GRAPH CONSTRUCTION ====================
def get_adjacency_matrix_dynamic(lat_lon_df: pd.DataFrame, 
                                threshold: float = Config.GRAPH_THRESHOLD,
                                k_nearest: int = 5) -> torch.Tensor:
    """Build a row-normalised k-nearest-neighbour graph with Gaussian edge weights.

    For every station the ``k_nearest`` closest other stations receive the
    weight ``exp(-d^2 / (2 * sigma^2))``; the diagonal is set to 1. Weights
    below ``threshold`` are zeroed and each row is divided by its sum. The
    neighbour relation is directed, so the result need not be symmetric.

    Args:
        lat_lon_df: One row per station with 'Latitude' and 'Longitude'
            columns. Distances are plain Euclidean distances on these two
            columns (presumably degrees - no geodesic correction is applied).
        threshold: Edge weights strictly below this value are removed.
        k_nearest: Number of neighbours kept per station; capped at the number
            of other stations.

    Returns:
        Float32 tensor of shape (num_nodes, num_nodes) whose rows sum to 1.
    """
    # Calculate pairwise distances
    coords = lat_lon_df[['Latitude', 'Longitude']].values
    dists = cdist(coords, coords, metric='euclidean')
    num_nodes = len(lat_lon_df)

    # Kernel width = spread of the non-zero distances. With a single station,
    # or when all distances are equal, that spread is undefined/zero and would
    # turn the kernel into a division by zero, so fall back to the mean
    # distance (or 1.0 when there is no non-zero distance at all).
    non_zero_dists = dists[dists > 0]
    sigma = float(np.std(non_zero_dists)) if non_zero_dists.size else 0.0
    if not np.isfinite(sigma) or sigma == 0.0:
        sigma = float(np.mean(non_zero_dists)) if non_zero_dists.size else 1.0

    # Neighbour selection: mask the diagonal with +inf so a station can never
    # be picked as its own neighbour, even if another station is co-located.
    k = max(0, min(k_nearest, num_nodes - 1))
    masked = dists.copy()
    np.fill_diagonal(masked, np.inf)
    neighbours = np.argsort(masked, axis=1, kind='stable')[:, :k]

    # Gaussian kernel on the selected edges, self-loops with weight 1
    kernel = np.exp(-(dists ** 2) / (2 * sigma ** 2))
    adj = np.zeros((num_nodes, num_nodes))
    rows = np.arange(num_nodes)[:, np.newaxis]
    adj[rows, neighbours] = kernel[rows, neighbours]
    np.fill_diagonal(adj, 1.0)

    # Apply threshold
    adj[adj < threshold] = 0.0

    # A row can only be empty if the threshold also removed its self-loop;
    # restore the self-loop so normalisation never divides by zero.
    isolated = np.flatnonzero(adj.sum(axis=1) == 0)
    adj[isolated, isolated] = 1.0

    # Normalize rows to sum to 1
    row_sum = adj.sum(axis=1)
    row_sum[row_sum == 0] = 1.0
    adj_norm = adj / row_sum[:, np.newaxis]

    # Graph statistics (self-loops are not counted as edges)
    num_edges = np.count_nonzero(adj) - num_nodes
    possible_edges = num_nodes * (num_nodes - 1)
    density = num_edges / possible_edges if possible_edges > 0 else 0.0

    print(f"Graph Statistics:")
    print(f"  Sigma: {sigma:.6f}")
    print(f"  Edges: {num_edges}")
    print(f"  Density: {density:.2%}")
    print(f"  K-nearest neighbors: {k_nearest}")

    return torch.tensor(adj_norm, dtype=torch.float32)

# ==================== ENHANCED TGCN MODEL ====================
class EnhancedTGCN(nn.Module):
    """Two graph-convolution layers followed by a BiLSTM, self-attention and an MLP head.

    Input  shape: (batch, seq_len, num_nodes, num_features)
    Output shape: (batch, num_nodes, output_dim)

    The graph convolution is ``Linear(A @ X)`` with a fixed adjacency matrix
    ``A``. Every node's sequence is then processed independently by a 2-layer
    bidirectional LSTM and a multi-head self-attention layer; the last time
    step feeds the prediction head.
    """

    def __init__(self, num_nodes: int, num_features: int, hidden_dim: int, 
                 output_dim: int, adj_matrix: torch.Tensor, dropout_rate: float = 0.3):
        """
        Args:
            num_nodes: Number of graph nodes (stations).
            num_features: Input channels per node and time step.
            hidden_dim: Width of the GCN layers and of each LSTM direction.
            output_dim: Number of values predicted per node.
            adj_matrix: (num_nodes, num_nodes) adjacency used by both GCN layers.
            dropout_rate: Dropout probability used throughout the network.
        """
        super(EnhancedTGCN, self).__init__()

        self.num_nodes = num_nodes
        self.hidden_dim = hidden_dim
        # Register the adjacency as a buffer so .to(device)/.double() move it
        # together with the weights; as a plain attribute it stayed on the CPU
        # and the forward pass failed on CUDA. persistent=False keeps it out of
        # state_dict, so saved checkpoints keep exactly the same keys.
        self.register_buffer('adj', adj_matrix, persistent=False)

        # Enhanced GCN with multiple layers
        self.gcn1 = nn.Linear(num_features, hidden_dim)
        self.gcn2 = nn.Linear(hidden_dim, hidden_dim)

        # Dropout for regularization
        self.dropout = nn.Dropout(dropout_rate)

        # Two-layer bidirectional LSTM (output width is 2 * hidden_dim)
        self.lstm = nn.LSTM(
            input_size=hidden_dim, 
            hidden_size=hidden_dim,
            num_layers=2,
            batch_first=True,
            dropout=dropout_rate,
            bidirectional=True
        )

        # Attention mechanism. embed_dim must be divisible by num_heads;
        # 2 * hidden_dim is always even, so fall back from 4 to 2 heads when
        # hidden_dim is odd instead of failing at construction time.
        embed_dim = hidden_dim * 2  # Bidirectional
        num_heads = 4 if embed_dim % 4 == 0 else 2
        self.attention = nn.MultiheadAttention(
            embed_dim=embed_dim,
            num_heads=num_heads,
            dropout=dropout_rate,
            batch_first=True
        )

        # Final prediction layers
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

        # Batch normalization. bn2 is not used in forward(); it is kept so that
        # state_dict keys stay compatible with previously saved weights.
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim)

        # Activation
        self.relu = nn.ReLU()

        self._init_weights()

    def _init_weights(self):
        """Kaiming-normal init for Linear layers; Xavier/orthogonal/zero init for the LSTM."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.LSTM):
                for name, param in m.named_parameters():
                    if 'weight_ih' in name:
                        nn.init.xavier_uniform_(param.data)
                    elif 'weight_hh' in name:
                        nn.init.orthogonal_(param.data)
                    elif 'bias' in name:
                        nn.init.constant_(param.data, 0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map (batch, seq_len, num_nodes, num_features) to (batch, num_nodes, output_dim)."""
        batch_size, seq_len, num_nodes, num_features = x.size()

        # Fold time into the batch axis for the GCN: (batch*seq, nodes, features).
        # reshape (not view) so non-contiguous inputs are accepted as well.
        x_reshaped = x.reshape(-1, num_nodes, num_features)

        # GCN layers: aggregate neighbours with A, then apply a shared Linear
        ax = torch.matmul(self.adj, x_reshaped)
        gcn_out1 = self.relu(self.gcn1(ax))
        gcn_out1 = self.dropout(gcn_out1)

        ax2 = torch.matmul(self.adj, gcn_out1)
        gcn_out2 = self.relu(self.gcn2(ax2))
        gcn_out2 = self.dropout(gcn_out2)

        # Regroup so each node becomes its own sequence: (batch*nodes, seq, hidden)
        gcn_out2 = gcn_out2.reshape(batch_size, seq_len, num_nodes, self.hidden_dim)
        lstm_input = gcn_out2.permute(0, 2, 1, 3).contiguous()
        lstm_input = lstm_input.view(batch_size * num_nodes, seq_len, self.hidden_dim)

        # LSTM processing -> (batch*nodes, seq, 2*hidden)
        lstm_out, _ = self.lstm(lstm_input)

        # Self-attention over the time axis
        attn_out, _ = self.attention(lstm_out, lstm_out, lstm_out)

        # Take the last time step
        last_out = attn_out[:, -1, :]

        # Final fully connected layers
        out = self.relu(self.bn1(self.fc1(last_out)))
        out = self.dropout(out)
        out = self.fc2(out)

        # Reshape to (batch, nodes, output_dim)
        out = out.view(batch_size, num_nodes, -1)

        return out

# ==================== TRAINING UTILITIES ====================
class EarlyStopping:
    """Signal a stop once the validation loss has not improved for ``patience`` calls.

    A loss counts as an improvement when it is at most ``best_loss - min_delta``.
    Calling the instance returns the current ``early_stop`` flag.
    """

    def __init__(self, patience: int = 10, min_delta: float = 0):
        """
        Args:
            patience: Number of non-improving calls after which to stop.
            min_delta: Minimum decrease required to count as an improvement.
        """
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0          # consecutive calls without improvement
        self.best_loss = None     # lowest loss seen so far (None until the first valid loss)
        self.early_stop = False

    def __call__(self, val_loss: float):
        """Record ``val_loss`` and return True once training should stop."""
        # A NaN loss compares False against everything, so it used to be taken
        # as an "improvement", overwrote best_loss and disabled stopping for
        # the rest of the run. Treat it as a non-improving epoch instead.
        is_nan = math.isnan(val_loss)
        improved = (not is_nan) and (
            self.best_loss is None
            or val_loss <= self.best_loss - self.min_delta
        )

        if improved:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        return self.early_stop

def create_data_loaders(X: torch.Tensor, Y: torch.Tensor, batch_size: int, 
                       train_ratio: float = 0.8, val_ratio: float = 0.1) -> Tuple:
    """Split (X, Y) in order into train/validation/test loaders.

    The first ``train_ratio`` share of the samples is used for training, the
    next ``val_ratio`` share for validation and the remainder for testing.
    Only the training loader shuffles, and it drops an incomplete last batch.

    Returns:
        ``(train_loader, val_loader, test_loader)``.
    """
    n_total = len(X)

    # Boundaries of the three consecutive slices
    train_end = int(n_total * train_ratio)
    val_end = int(n_total * (train_ratio + val_ratio))
    bounds = {
        'train': slice(None, train_end),
        'val': slice(train_end, val_end),
        'test': slice(val_end, None),
    }

    # One TensorDataset per split
    datasets = {
        split: TensorDataset(X[span], Y[span])
        for split, span in bounds.items()
    }

    train_loader = DataLoader(
        datasets['train'], batch_size=batch_size, shuffle=True, drop_last=True
    )
    val_loader = DataLoader(datasets['val'], batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(datasets['test'], batch_size=batch_size, shuffle=False)

    n_train, n_val, n_test = (len(datasets[s]) for s in ('train', 'val', 'test'))
    print(f"Data splits: Train={n_train}, Val={n_val}, Test={n_test}")

    return train_loader, val_loader, test_loader

def train_epoch(model: nn.Module, loader: DataLoader, criterion: nn.Module, 
               optimizer: optim.Optimizer, device: torch.device) -> float:
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Args:
        model: Network to train; switched to training mode.
        loader: Iterable of ``(inputs, targets)`` batches that supports ``len()``.
        criterion: Loss applied to ``(predictions, targets)``.
        optimizer: Optimizer stepping ``model``'s parameters.
        device: Device the batches are moved to.

    Returns:
        Average of the per-batch losses.

    Raises:
        ValueError: If the loader yields no batches, or if predictions and
            targets still differ in shape after the last-axis trim.
    """
    num_batches = len(loader)
    if num_batches == 0:
        # Happens e.g. when drop_last=True and the split is smaller than one
        # batch; previously this surfaced as a bare ZeroDivisionError.
        raise ValueError(
            "train_epoch received a loader with no batches "
            "(is the training split smaller than the batch size?)"
        )

    model.train()
    total_loss = 0

    for batch_x, batch_y in loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        optimizer.zero_grad()
        predictions = model(batch_x)

        # If the model predicts more channels than the targets carry, keep
        # only the leading ones.
        if predictions.shape != batch_y.shape:
            predictions = predictions[:, :, :batch_y.shape[-1]]
            if predictions.shape != batch_y.shape:
                # Anything else would be silently broadcast by the loss.
                raise ValueError(
                    f"Prediction shape {tuple(predictions.shape)} does not match "
                    f"target shape {tuple(batch_y.shape)}"
                )

        loss = criterion(predictions, batch_y)
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        total_loss += loss.item()

    return total_loss / num_batches

def validate(model: nn.Module, loader: DataLoader, criterion: nn.Module, 
            device: torch.device) -> Tuple[float, Dict]:
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Args:
        model: Network to evaluate; switched to eval mode.
        loader: Iterable of ``(inputs, targets)`` batches that supports ``len()``.
        criterion: Loss applied to ``(predictions, targets)``.
        device: Device the batches are moved to.

    Returns:
        ``(avg_loss, metrics)``. ``avg_loss`` is the mean of the per-batch
        losses, or NaN when the loader yields no batches. ``metrics`` always
        contains 'overall_loss' and, when there was data, '<col>_MAE' and
        '<col>_RMSE' for each column of ``Config.TARGET_COLS`` that has a
        matching output channel. Metrics are computed on the values as given
        (no inverse scaling is applied here).

    Raises:
        ValueError: If predictions and targets still differ in shape after
            the last-axis trim.
    """
    model.eval()
    total_loss = 0
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for batch_x, batch_y in loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            predictions = model(batch_x)

            # If the model predicts more channels than the targets carry, keep
            # only the leading ones.
            if predictions.shape != batch_y.shape:
                predictions = predictions[:, :, :batch_y.shape[-1]]
                if predictions.shape != batch_y.shape:
                    # Anything else would be silently broadcast by the loss.
                    raise ValueError(
                        f"Prediction shape {tuple(predictions.shape)} does not match "
                        f"target shape {tuple(batch_y.shape)}"
                    )

            loss = criterion(predictions, batch_y)
            total_loss += loss.item()

            all_preds.append(predictions.cpu().numpy())
            all_targets.append(batch_y.cpu().numpy())

    # An empty loader used to raise ZeroDivisionError here, which made the
    # "no predictions" branch below unreachable. Report NaN instead.
    num_batches = len(loader)
    avg_loss = total_loss / num_batches if num_batches else float('nan')

    # Calculate additional metrics
    if all_preds:
        preds = np.concatenate(all_preds, axis=0)
        targets = np.concatenate(all_targets, axis=0)

        # MAE and RMSE for each pollutant (last axis = pollutant channel)
        metrics = {}
        for i, col in enumerate(Config.TARGET_COLS):
            if i < preds.shape[-1]:
                errors = preds[..., i] - targets[..., i]
                metrics[f'{col}_MAE'] = np.mean(np.abs(errors))
                metrics[f'{col}_RMSE'] = np.sqrt(np.mean(errors ** 2))

        metrics['overall_loss'] = avg_loss
    else:
        metrics = {'overall_loss': avg_loss}

    return avg_loss, metrics

# ==================== OPTUNA HYPERPARAMETER TUNING ====================
def objective(trial, train_loader, val_loader, adj_matrix, device):
    """Optuna objective: train one EnhancedTGCN and return its best validation loss.

    Args:
        trial: Optuna trial used to sample hyperparameters and report progress.
        train_loader: DataLoader whose ``.dataset`` holds the training split.
        val_loader: DataLoader whose ``.dataset`` holds the validation split.
        adj_matrix: Adjacency matrix passed to the model.
        device: Device to train on.

    Returns:
        Lowest validation loss observed during the trial.

    Raises:
        optuna.exceptions.TrialPruned: When the study's pruner stops the trial.
    """
    # Suggest hyperparameters.
    # hidden_dim is sampled in steps of 2: the model's attention layer works on
    # 2 * hidden_dim with 4 heads, which requires an even hidden_dim (this
    # relies on the lower bound of HIDDEN_DIM_RANGE being even).
    hidden_dim = trial.suggest_int('hidden_dim', *Config.HIDDEN_DIM_RANGE, step=2)
    learning_rate = trial.suggest_float('learning_rate', *Config.LEARNING_RATE_RANGE, log=True)
    batch_size = trial.suggest_int('batch_size', *Config.BATCH_SIZE_RANGE)
    dropout_rate = trial.suggest_float('dropout_rate', *Config.DROPOUT_RANGE)

    # Only parameters that actually influence training are sampled. Options
    # the model cannot consume (bidirectional switch, LSTM depth, attention
    # switch) were previously suggested but ignored, which only added noise to
    # the search and to the parameter-importance analysis.

    # Re-batch the given splits so the sampled batch size really takes effect;
    # the loaders passed in were built with a fixed batch size.
    trial_train_loader = DataLoader(
        train_loader.dataset, batch_size=batch_size, shuffle=True, drop_last=True
    )
    trial_val_loader = DataLoader(
        val_loader.dataset, batch_size=batch_size, shuffle=False
    )

    # Create model with suggested parameters
    model = EnhancedTGCN(
        num_nodes=Config.NUM_NODES,
        num_features=Config.NUM_FEATURES,
        hidden_dim=hidden_dim,
        output_dim=Config.TARGET_DIM,
        adj_matrix=adj_matrix,
        dropout_rate=dropout_rate
    ).to(device)

    # Optimizer with suggested learning rate
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-4)

    # Learning rate scheduler. The `verbose` argument is deprecated in recent
    # PyTorch releases and defaulted to silent anyway, so it is not passed.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=5
    )

    # Loss function (unweighted mean over all outputs)
    criterion = nn.MSELoss()

    # Early stopping
    early_stopping = EarlyStopping(patience=Config.EARLY_STOPPING_PATIENCE)

    # Training loop
    best_val_loss = float('inf')

    for epoch in range(Config.EPOCHS):
        # Train
        train_epoch(model, trial_train_loader, criterion, optimizer, device)

        # Validate
        val_loss, _ = validate(model, trial_val_loader, criterion, device)

        # Learning rate scheduling
        scheduler.step(val_loss)

        # Report intermediate value to Optuna
        trial.report(val_loss, epoch)

        # Handle pruning
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

        # Update best loss
        if val_loss < best_val_loss:
            best_val_loss = val_loss

        # Early stopping
        if early_stopping(val_loss):
            break

    return best_val_loss

# ==================== MAIN EXECUTION ====================
def main():
    """Run the full pipeline: load data, build the graph, tune, train, evaluate, save.

    Steps:
        1. Download the Seoul air-pollution dataset and keep ``Config.STATIONS``.
        2. Build and save the adjacency matrix (plus a heatmap).
        3. Preprocess into sliding windows and save the fitted scalers.
        4. Tune hyperparameters with Optuna.
        5. Retrain with the best hyperparameters, keeping the weights of the
           epoch with the lowest validation loss.
        6. Evaluate on the held-out test split, save weights, plots and the
           Optuna trial table.

    Returns:
        Tuple ``(final_model, scalers, adj_matrix, best_params)``.
    """
    # Function-scope imports: only this entry point needs them.
    import copy
    from pathlib import Path

    print("=" * 60)
    print("TGCN HYPERPARAMETER TUNING AND MODEL TRAINING")
    print("=" * 60)
    
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    
    # Load data
    print("\n1. Loading and preprocessing data...")
    import kagglehub
    path = kagglehub.dataset_download("bappekim/air-pollution-in-seoul")
    
    # Build paths with pathlib instead of string concatenation
    data_dir = Path(path) / 'AirPollutionSeoul'
    df = pd.read_csv(data_dir / 'Measurement_summary.csv')
    df_station = pd.read_csv(data_dir / 'Original Data' / 'Measurement_station_info.csv')
    
    # Filter stations
    df = df[df['Station code'].isin(Config.STATIONS)]
    df_station = df_station[df_station['Station code'].isin(Config.STATIONS)]
    # Sorted by station code so graph rows line up with the sorted station
    # order used during preprocessing.
    df_station = df_station.sort_values('Station code')
    
    # Generate adjacency matrix
    print("\n2. Generating adjacency matrix...")
    adj_matrix = get_adjacency_matrix_dynamic(df_station)
    torch.save(adj_matrix, "adj_matrix.pt")
    print("Adjacency matrix saved to 'adj_matrix.pt'")
    
    # Visualize adjacency matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(adj_matrix.numpy(), cmap='viridis', 
                xticklabels=Config.STATIONS, yticklabels=Config.STATIONS)
    plt.title("Spatial Graph Connections")
    plt.tight_layout()
    plt.savefig('adjacency_matrix.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    # Preprocess data with default sequence length
    scalers = {}
    train_x, train_y = preprocess_seoul_data(df, scalers, look_back_window=24)
    
    # Save scalers
    joblib.dump(scalers, "seoul_scalers.pkl")
    print("Scalers saved to 'seoul_scalers.pkl'")
    
    print(f"\n3. Data shape: Input {train_x.shape}, Target {train_y.shape}")
    
    # Data loaders for tuning: 70% train / 15% validation of the full series
    # (the remaining 15% is not used during tuning).
    train_loader, val_loader, _ = create_data_loaders(
        train_x, train_y, 
        batch_size=32,  # Default for tuning
        train_ratio=0.7,
        val_ratio=0.15
    )
    
    # Hyperparameter tuning with Optuna
    print("\n4. Starting hyperparameter tuning with Optuna...")
    print(f"   Number of trials: {Config.N_TRIALS}")
    
    study = optuna.create_study(
        direction='minimize',
        sampler=optuna.samplers.TPESampler(seed=42),
        pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=10)
    )
    
    study.optimize(
        lambda trial: objective(trial, train_loader, val_loader, adj_matrix, device),
        n_trials=Config.N_TRIALS,
        show_progress_bar=True
    )
    
    # Display best hyperparameters
    print("\n5. Best hyperparameters found:")
    print("=" * 40)
    for key, value in study.best_params.items():
        print(f"{key:20}: {value}")
    print(f"{'Best validation loss':20}: {study.best_value:.6f}")
    
    # Train final model with best hyperparameters
    print("\n6. Training final model with best hyperparameters...")
    
    best_params = study.best_params
    
    # Create final data loaders with optimal batch size
    final_train_loader, final_val_loader, test_loader = create_data_loaders(
        train_x, train_y,
        batch_size=best_params['batch_size'],
        train_ratio=0.8,
        val_ratio=0.1
    )
    
    # Create final model
    final_model = EnhancedTGCN(
        num_nodes=Config.NUM_NODES,
        num_features=Config.NUM_FEATURES,
        hidden_dim=best_params['hidden_dim'],
        output_dim=Config.TARGET_DIM,
        adj_matrix=adj_matrix,
        dropout_rate=best_params['dropout_rate']
    ).to(device)
    
    # Optimizer and scheduler
    optimizer = optim.AdamW(
        final_model.parameters(), 
        lr=best_params['learning_rate'],
        weight_decay=1e-4
    )
    
    # The scheduler's `verbose` flag is deprecated in recent PyTorch releases;
    # learning-rate reductions are reported explicitly in the loop below.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=7
    )
    
    criterion = nn.MSELoss()
    early_stopping = EarlyStopping(patience=15)
    
    # Training history
    train_history = []
    val_history = []
    
    # Snapshot of the weights with the lowest validation loss. Without it the
    # saved/evaluated model would be the last epoch, i.e. up to `patience`
    # epochs past the best one.
    best_val_loss = float('inf')
    best_state = None
    
    print("\nTraining Progress:")
    print("-" * 60)
    
    for epoch in range(Config.EPOCHS):
        # Train
        train_loss = train_epoch(final_model, final_train_loader, criterion, optimizer, device)
        train_history.append(train_loss)
        
        # Validate
        val_loss, _ = validate(final_model, final_val_loader, criterion, device)
        val_history.append(val_loss)
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_state = copy.deepcopy(final_model.state_dict())
        
        # Learning rate scheduling
        lr_before = optimizer.param_groups[0]['lr']
        scheduler.step(val_loss)
        lr_after = optimizer.param_groups[0]['lr']
        if lr_after < lr_before:
            print(f"Epoch {epoch+1:3d}: reducing learning rate to {lr_after:.4e}")
        
        # Print progress
        if (epoch + 1) % 5 == 0 or epoch == 0:
            print(f"Epoch {epoch+1:3d}/{Config.EPOCHS} | "
                  f"Train Loss: {train_loss:.6f} | "
                  f"Val Loss: {val_loss:.6f} | "
                  f"LR: {optimizer.param_groups[0]['lr']:.6f}")
        
        # Early stopping
        if early_stopping(val_loss):
            print(f"\nEarly stopping triggered at epoch {epoch+1}")
            break
    
    # Roll back to the best epoch before evaluating and saving
    if best_state is not None:
        final_model.load_state_dict(best_state)
        print(f"Restored weights of best epoch (val loss {best_val_loss:.6f})")
    
    # Final evaluation on test set
    print("\n7. Final evaluation on test set...")
    test_loss, test_metrics = validate(final_model, test_loader, criterion, device)
    
    print("\nTest Set Performance:")
    print("=" * 60)
    print(f"Overall Test Loss: {test_loss:.6f}")
    
    # Print metrics for each pollutant
    for col in Config.TARGET_COLS:
        if f'{col}_MAE' in test_metrics:
            print(f"{col:6} - MAE: {test_metrics[f'{col}_MAE']:.4f}, "
                  f"RMSE: {test_metrics[f'{col}_RMSE']:.4f}")
    
    # Save final model
    torch.save(final_model.state_dict(), 'model_weights.pth')
    print("\nFinal model saved to 'model_weights.pth'")
    
    # Plot training history
    plt.figure(figsize=(12, 5))
    
    plt.subplot(1, 2, 1)
    plt.plot(train_history, label='Train Loss', alpha=0.8)
    plt.plot(val_history, label='Validation Loss', alpha=0.8)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training History')
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    plt.subplot(1, 2, 2)
    plt.plot(train_history, label='Train Loss', alpha=0.8)
    plt.plot(val_history, label='Validation Loss', alpha=0.8)
    plt.yscale('log')
    plt.xlabel('Epoch')
    plt.ylabel('Loss (log scale)')
    plt.title('Training History (Log Scale)')
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.savefig('training_history.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    # Visualize predictions vs actuals
    print("\n8. Generating prediction visualizations...")
    
    # Look the channel up by name instead of hard-coding its position
    pm25_idx = Config.TARGET_COLS.index('PM2.5')
    
    final_model.eval()
    with torch.no_grad():
        test_batch_x, test_batch_y = next(iter(test_loader))
        test_batch_x, test_batch_y = test_batch_x.to(device), test_batch_y.to(device)
        predictions = final_model(test_batch_x)
    
    # First sample of the batch, PM2.5 channel, one value per station
    pm25_predictions = predictions[0, :, pm25_idx].cpu().numpy()
    pm25_actual = test_batch_y[0, :, pm25_idx].cpu().numpy()
    
    # Inverse transform using scaler. The scaler was fitted on a
    # (time, station) matrix, so it expects one row with a column per station.
    if 'PM2.5' in scalers:
        pm25_predictions_original = scalers['PM2.5'].inverse_transform(
            pm25_predictions.reshape(1, -1)
        ).flatten()
        pm25_actual_original = scalers['PM2.5'].inverse_transform(
            pm25_actual.reshape(1, -1)
        ).flatten()
    else:
        pm25_predictions_original = pm25_predictions
        pm25_actual_original = pm25_actual
    
    # Plot predictions vs actuals
    plt.figure(figsize=(14, 6))
    
    plt.subplot(1, 2, 1)
    plt.bar(range(len(Config.STATIONS)), pm25_predictions_original, 
            alpha=0.7, label='Predicted')
    plt.bar(range(len(Config.STATIONS)), pm25_actual_original, 
            alpha=0.5, label='Actual')
    plt.xlabel('Station')
    plt.ylabel('PM2.5 Value')
    plt.title('PM2.5 Predictions vs Actuals (All Stations)')
    plt.xticks(range(len(Config.STATIONS)), Config.STATIONS, rotation=45)
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    plt.subplot(1, 2, 2)
    plt.scatter(pm25_actual_original, pm25_predictions_original, alpha=0.6)
    plt.plot([min(pm25_actual_original), max(pm25_actual_original)], 
             [min(pm25_actual_original), max(pm25_actual_original)], 
             'r--', alpha=0.8, label='Perfect Prediction')
    plt.xlabel('Actual PM2.5')
    plt.ylabel('Predicted PM2.5')
    plt.title('Prediction Scatter Plot')
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.savefig('predictions_vs_actuals.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    # Persist the trial table first: the interactive plots below need optional
    # plotting backends and must not be able to prevent the CSV from being written.
    study_df = study.trials_dataframe()
    study_df.to_csv('optuna_study_results.csv', index=False)
    print("\nOptuna study results saved to 'optuna_study_results.csv'")
    
    # Interactive Optuna plots (best effort)
    try:
        optuna.visualization.plot_optimization_history(study).show()
        optuna.visualization.plot_param_importances(study).show()
    except (ImportError, ValueError, RuntimeError) as exc:
        print(f"Skipping Optuna visualizations: {exc}")
    
    print("\n" + "=" * 60)
    print("HYPERPARAMETER TUNING COMPLETE!")
    print("=" * 60)
    print("\nSaved files:")
    print("  1. adj_matrix.pt - Spatial graph adjacency matrix")
    print("  2. seoul_scalers.pkl - Feature normalization scalers")
    print("  3. model_weights.pth - Best model weights")
    print("  4. training_history.png - Training curves")
    print("  5. adjacency_matrix.png - Graph visualization")
    print("  6. predictions_vs_actuals.png - Prediction quality")
    print("  7. optuna_study_results.csv - All tuning trials")
    
    # Create inference-ready model class (simplified version)
    # NOTE(review): the TGCN class printed below has a different architecture
    # (single GCN weight, unidirectional LSTM, no attention) than EnhancedTGCN,
    # so 'model_weights.pth' cannot be loaded into it as-is. Confirm whether
    # the snippet should instead rebuild EnhancedTGCN with best_params.
    print("\n\nFor inference, use this simplified model class:")
    print("=" * 60)
    inference_code = '''
import torch
import torch.nn as nn
import math

class TGCN(nn.Module):
    def __init__(self, num_nodes, num_features, hidden_dim, output_dim, adj_matrix):
        super(TGCN, self).__init__()
        self.num_nodes = num_nodes
        self.hidden_dim = hidden_dim
        self.adj = adj_matrix
        
        self.gcn_weight = nn.Parameter(torch.FloatTensor(num_features, hidden_dim))
        self.gcn_bias = nn.Parameter(torch.FloatTensor(hidden_dim))
        
        self.lstm = nn.LSTM(input_size=hidden_dim, hidden_size=hidden_dim, 
                           batch_first=True, num_layers=2)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.3)
        self._init_weights()
    
    def _init_weights(self):
        stdv = 1.0 / math.sqrt(self.hidden_dim)
        self.gcn_weight.data.uniform_(-stdv, stdv)
        self.gcn_bias.data.uniform_(-stdv, stdv)
    
    def forward(self, x):
        batch_size, seq_len, num_nodes, _ = x.size()
        x_reshaped = x.view(-1, num_nodes, x.size(3))
        ax = torch.matmul(self.adj, x_reshaped)
        gcn_out = torch.relu(torch.matmul(ax, self.gcn_weight) + self.gcn_bias)
        gcn_out = self.dropout(gcn_out)
        gcn_out = gcn_out.view(batch_size, seq_len, num_nodes, self.hidden_dim)
        lstm_input = gcn_out.permute(0, 2, 1, 3).contiguous().view(
            batch_size * num_nodes, seq_len, self.hidden_dim)
        lstm_out, _ = self.lstm(lstm_input)
        out = self.fc(lstm_out[:, -1, :])
        return out.view(batch_size, num_nodes, -1)
'''
    print(inference_code)
    
    return final_model, scalers, adj_matrix, best_params

if __name__ == "__main__":
    # Run the complete pipeline only when executed as a script (not on import)
    # and keep its results available as module-level names.
    best_model, scalers, adj_matrix, best_params = main()

# Recorded output of the run (the failure occurs while loading PyTorch's c10.dll):
# OSError: [WinError 1114] A dynamic link library (DLL) initialization routine failed. Error loading "c:\Users\ASUS\AppData\Local\Programs\Python\Python312\Lib\site-packages\torch\lib\c10.dll" or one of its dependencies.