In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import joblib
import matplotlib.pyplot as plt

# Initial random seed setting
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Hyperparameters
alpha = 0.1  # Smoothness penalty coefficient
beta = 0.1   # Monthly weight coefficient
epochs = 5000
input_window = 12
output_window = 9
n_features = 5  # PC1 to PC5 (updated from 4 to 5)
batch_size = 32

# Define LSTM architecture - using only one model with two hidden layers
lstm_architecture = [32, 16]  # 2-layer

# Custom loss function
def create_custom_flow_loss(alpha, beta):
    def custom_flow_loss(y_true, y_pred):
        month_weights = tf.constant([3, 3, 3, 2, 2, 1, 1, 1, 1], dtype=tf.float32)
        mse_loss = tf.keras.losses.MSE(y_true, y_pred)
        
        # Smoothness penalty
        smoothness_penalty = tf.reduce_mean(tf.square(y_pred[:, 1:] - y_pred[:, :-1]))
        
        # Monthly weighted loss
        monthly_weight_loss = tf.reduce_mean(month_weights * tf.square(y_true - y_pred))
        
        return mse_loss + alpha * smoothness_penalty + beta * monthly_weight_loss
    return custom_flow_loss

class EnhancedDataPreprocessor:
    def __init__(self):
        self.scaler = StandardScaler()
        self.pca = PCA(n_components=5)  # Using 5 components as requested
        self.detrend_params = {}  # Will store detrending parameters
        
    def fit_transform(self, df):
        # Step 1: Detrend the significant variables (T and V)
        df_detrended = df.copy()
        features_to_detrend = ['T', 'V']  # Focus on statistically significant variables
        
        for col in features_to_detrend:
            values = df[col].values
            x = np.arange(len(values))
            slope, intercept = np.polyfit(x, values, 1)
            trend = slope * x + intercept
            
            # Store detrending parameters for future transformations
            self.detrend_params[col] = {'slope': slope, 'intercept': intercept}
            
            # Replace original with detrended and add trend as new feature
            df_detrended[col] = values - trend
            df_detrended[f'{col}_trend'] = trend
        
        # Step 2: Engineer additional features
        # Add key interaction features
        df_detrended['T_V_ratio'] = df['T'] / df['V']
        df_detrended['seasonal_T'] = self._extract_seasonal_component(df, 'T')
        df_detrended['seasonal_P'] = self._extract_seasonal_component(df, 'P')
        
        # Step 3: Select final feature set for PCA
        feature_columns = ['T', 'V', 'P', 'F', 'T_trend', 'V_trend', 
                          'T_V_ratio', 'seasonal_T', 'seasonal_P']
        features = df_detrended[feature_columns].values
        
        # Step 4: Apply scaling and PCA
        features_scaled = self.scaler.fit_transform(features)
        pca_features = self.pca.fit_transform(features_scaled)
        
        # Print explained variance to understand component importance
        explained_variance = self.pca.explained_variance_ratio_
        print(f"Explained variance by 5 PCs: {explained_variance}")
        print(f"Total variance explained: {sum(explained_variance):.4f}")
        
        return pca_features
    
    def _extract_seasonal_component(self, df, column):
        # Extract monthly seasonal components
        series = df[column]
        grouped = series.groupby(pd.DatetimeIndex(df['Date']).month)
        monthly_means = df[column].copy()
        
        for month, group in grouped:
            mask = pd.DatetimeIndex(df['Date']).month == month
            monthly_means.loc[mask] = group.mean()
            
        return series - monthly_means
        
    def transform(self, df):
        # Apply same transformations to new data
        df_detrended = df.copy()
        
        # Apply stored detrending parameters
        for col, params in self.detrend_params.items():
            x = np.arange(len(df))
            trend = params['slope'] * x + params['intercept']
            df_detrended[col] = df[col] - trend
            df_detrended[f'{col}_trend'] = trend
        
        # Create same engineered features
        df_detrended['T_V_ratio'] = df['T'] / df['V']
        df_detrended['seasonal_T'] = self._extract_seasonal_component(df, 'T')
        df_detrended['seasonal_P'] = self._extract_seasonal_component(df, 'P')
        
        # Select same feature set and apply transformations
        feature_columns = ['T', 'V', 'P', 'F', 'T_trend', 'V_trend', 
                          'T_V_ratio', 'seasonal_T', 'seasonal_P']
        features = df_detrended[feature_columns].values
        features_scaled = self.scaler.transform(features)
        pca_features = self.pca.transform(features_scaled)
        
        return pca_features
    
    def save(self, filename):
        preprocessor_dict = {
            'scaler': self.scaler,
            'pca': self.pca,
            'detrend_params': self.detrend_params,
        }
        joblib.dump(preprocessor_dict, filename)
    
    @classmethod
    def load(cls, filename):
        preprocessor = cls()
        loaded_dict = joblib.load(filename)
        preprocessor.scaler = loaded_dict['scaler']
        preprocessor.pca = loaded_dict['pca']
        preprocessor.detrend_params = loaded_dict['detrend_params']
        return preprocessor

def prepare_sequences(features, target):
    X, y = [], []
    for i in range(len(features) - input_window - output_window + 1):
        X.append(features[i:(i + input_window)])
        y.append(target[i + input_window:i + input_window + output_window])
    return np.array(X), np.array(y)

def build_model(architecture):
    # Ensure clean state for model creation
    model = Sequential()
    
    # First LSTM layer
    model.add(LSTM(architecture[0], activation='relu', return_sequences=True if len(architecture) > 1 else False,
                  input_shape=(input_window, n_features)))
    
    # Middle LSTM layers
    for i in range(1, len(architecture) - 1):
        model.add(LSTM(architecture[i], activation='relu', return_sequences=True))
    
    # Last LSTM layer
    if len(architecture) > 1:
        model.add(LSTM(architecture[-1], activation='relu'))
    
    # Output layer
    model.add(Dense(output_window, activation='relu'))
    
    custom_loss = create_custom_flow_loss(alpha, beta)
    model.compile(optimizer='adam', loss=custom_loss)
    return model

def plot_train_test_losses(train_history, val_history, test_losses, save_path='train_val_test_losses.png'):
    plt.figure(figsize=(15, 8))
    
    # Plot training loss
    plt.plot(train_history, label='Training Loss', color='blue', alpha=0.8)
    
    # Plot validation loss
    plt.plot(val_history, label='Validation Loss (1943-1962)', color='green', alpha=0.8)
    
    # Plot test loss for each epoch
    plt.plot(test_losses, label='Test Loss (1923-1942)', color='red', alpha=0.8)
    
    plt.title('Training, Validation and Test Loss Comparison')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.yscale('log')  # Use log scale for better visualization
    plt.grid(True, which="both", ls="-", alpha=0.2)
    plt.legend()
    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()

def get_model_name(architecture, alpha, beta):
    # Create name based on number of hidden layers and neurons
    arch_str = f"h{len(architecture)}_" + "_".join(map(str, architecture))
    return f"enhanced_lstm_{arch_str}_a{alpha}_b{beta}_pc5"

def main():
    try:
        df_full = pd.read_csv('Data/Clean/Orov_clean.csv')
        
        # Convert date column to datetime
        df_full['Date'] = pd.to_datetime(df_full['Date'])
        
        # Preprocess ALL data at once with enhanced preprocessing
        print("Applying enhanced preprocessing with detrending and feature engineering...")
        preprocessor = EnhancedDataPreprocessor()
        all_features = preprocessor.fit_transform(df_full)
        
        # Save preprocessor for future use
        preprocessor.save('enhanced_preprocessor_pc5.joblib')
        print("Preprocessor saved to enhanced_preprocessor_pc5.joblib")
        
        # Define date ranges for splits
        test_start_date = pd.to_datetime('1923-01-01')
        test_end_date = pd.to_datetime('1942-12-31')
        train_start_date = pd.to_datetime('1963-01-01')
        validation_start_date = pd.to_datetime('1943-01-01')
        validation_end_date = pd.to_datetime('1962-12-31')
        
        # Create masks for each subset
        test_mask = (df_full['Date'] >= test_start_date) & (df_full['Date'] <= test_end_date)
        validation_mask = (df_full['Date'] >= validation_start_date) & (df_full['Date'] <= validation_end_date)
        train_mask = df_full['Date'] >= train_start_date
        
        # Print dataset sizes
        print(f"Full dataset size: {len(df_full)}")
        print(f"Training dataset size (1963-2021): {sum(train_mask)}")
        print(f"Validation dataset size (1943-1962): {sum(validation_mask)}")
        print(f"Test dataset size (1923-1942): {sum(test_mask)}")
        
        # Prepare sequences for each subset
        X_train, y_train = prepare_sequences(
            all_features[train_mask], 
            df_full.loc[train_mask, 'F'].values
        )
        
        X_val, y_val = prepare_sequences(
            all_features[validation_mask], 
            df_full.loc[validation_mask, 'F'].values
        )
        
        X_test, y_test = prepare_sequences(
            all_features[test_mask], 
            df_full.loc[test_mask, 'F'].values
        )
        
        print(f"Training sequences: {X_train.shape}")
        print(f"Validation sequences: {X_val.shape}")
        print(f"Test sequences: {X_test.shape}")
        
        # Clear everything for clean start
        tf.keras.backend.clear_session()
        
        # Reset random seeds for reproducibility
        np.random.seed(RANDOM_SEED)
        tf.random.set_seed(RANDOM_SEED)
        
        # Get model name
        model_name = get_model_name(lstm_architecture, alpha, beta)
        print(f"\nTraining model: {model_name}")
        print(f"Architecture: {lstm_architecture}")
        
        # Build model
        model = build_model(lstm_architecture)
        print(model.summary())
        
        # Create custom loss function for evaluation
        custom_loss = create_custom_flow_loss(alpha, beta)
        
        # Store losses for tracking
        train_losses = []
        val_losses = []
        test_losses = []
        
        # Set up callbacks
        # 1. Early stopping based on validation loss
        early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=200,  # Number of epochs with no improvement to wait before stopping
            restore_best_weights=True,  # Restore model weights from the epoch with the best value of the monitored quantity
            verbose=1
        )
        
        # 2. Custom callback to evaluate on test data after each epoch
        class TestEvaluationCallback(tf.keras.callbacks.Callback):
            def on_epoch_end(self, epoch, logs=None):
                # Store training and validation losses from logs
                train_losses.append(logs['loss'])
                val_losses.append(logs['val_loss'])
                
                # Evaluate on test data
                test_loss = model.evaluate(X_test, y_test, verbose=0)
                test_losses.append(test_loss)
                
                if (epoch + 1) % 100 == 0:  # Print every 100 epochs
                    print(f"Epoch {epoch + 1} - Train: {logs['loss']:.4f}, Val: {logs['val_loss']:.4f}, Test: {test_loss:.4f}")
        
        # Train model with proper validation split and callbacks
        history = model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),  # Properly use validation data
            epochs=epochs,
            batch_size=batch_size,
            verbose=0,
            callbacks=[
                early_stopping,
                TestEvaluationCallback()
            ]
        )
        
        # Save model
        model.save(f'{model_name}.keras')
        print(f"Model saved as {model_name}.keras")
        
        # Plot train, validation and test losses
        plot_train_test_losses(
            train_losses, 
            val_losses, 
            test_losses, 
            save_path=f'{model_name}_train_val_test_losses.png'
        )
        
        # Print final losses and training information
        final_train_loss = train_losses[-1]
        final_val_loss = val_losses[-1]
        final_test_loss = test_losses[-1]
        
        print(f"\nModel training stopped after {len(train_losses)} epochs")
        if len(train_losses) < epochs:
            print(f"Early stopping activated (patience={early_stopping.patience})")
            
        print(f"Best epoch: {np.argmin(val_losses) + 1}")
        print(f"Final training loss: {final_train_loss:.4f}")
        print(f"Final validation loss: {final_val_loss:.4f}")
        print(f"Final test loss: {final_test_loss:.4f}")
        print(f"Best validation loss: {min(val_losses):.4f}")
        
        # Generate predictions for test data
        predictions = model.predict(X_test)
        
        # Plot actual vs predicted for the first few test samples
        n_samples = min(5, len(predictions))
        plt.figure(figsize=(15, 10))
        
        for i in range(n_samples):
            plt.subplot(n_samples, 1, i+1)
            plt.plot(y_test[i], 'b-', label='Actual')
            plt.plot(predictions[i], 'r--', label='Predicted')
            plt.legend()
            plt.title(f'Test Sample {i+1}')
            plt.ylabel('Flow')
            
        plt.tight_layout()
        plt.savefig(f'{model_name}_predictions.png', dpi=300, bbox_inches='tight')
        plt.close()
        
        print("Training and evaluation completed successfully!")
        print(f"Results and plots saved with prefix: {model_name}")
            
    except Exception as e:
        print(f"An error occurred during training: {str(e)}")
        raise

if __name__ == "__main__":
    main()

Applying enhanced preprocessing with detrending and feature engineering...
Explained variance by 5 PCs: [0.44724241 0.236193   0.15201838 0.09396612 0.05300994]
Total variance explained: 0.9824
Preprocessor saved to enhanced_preprocessor_pc5.joblib
Full dataset size: 1200
Training dataset size (1963-2021): 705
Validation dataset size (1943-1962): 240
Test dataset size (1923-1942): 240
Training sequences: (685, 12, 5)
Validation sequences: (220, 12, 5)
Test sequences: (220, 12, 5)

Training model: enhanced_lstm_h2_32_16_a0.1_b0.1_pc5
Architecture: [32, 16]


  super().__init__(**kwargs)


None
Epoch 100 - Train: 103949.9453, Val: 408276.1875, Test: 1429634.3750
Epoch 200 - Train: 64954.0664, Val: 501435.2812, Test: 2227711.2500
Epoch 234: early stopping
Restoring model weights from the end of the best epoch: 34.
Model saved as enhanced_lstm_h2_32_16_a0.1_b0.1_pc5.keras

Model training stopped after 234 epochs
Early stopping activated (patience=200)
Best epoch: 34
Final training loss: 62072.8828
Final validation loss: 410917.6875
Final test loss: 1700400.1250
Best validation loss: 176773.7812
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
Training and evaluation completed successfully!
Results and plots saved with prefix: enhanced_lstm_h2_32_16_a0.1_b0.1_pc5


In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import joblib
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# Initial random seed setting
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Hyperparameters
alpha = 0.1  # Smoothness penalty coefficient
beta = 0.1   # Monthly weight coefficient
epochs = 5000
input_window = 12
output_window = 9
batch_size = 32

# Define LSTM architecture - using only one model with two hidden layers
lstm_architecture = [32, 16]  # 2-layer

# Custom loss function
def create_custom_flow_loss(alpha, beta):
    def custom_flow_loss(y_true, y_pred):
        month_weights = tf.constant([3, 3, 3, 2, 2, 1, 1, 1, 1], dtype=tf.float32)
        mse_loss = tf.keras.losses.MSE(y_true, y_pred)
        
        # Smoothness penalty
        smoothness_penalty = tf.reduce_mean(tf.square(y_pred[:, 1:] - y_pred[:, :-1]))
        
        # Monthly weighted loss
        monthly_weight_loss = tf.reduce_mean(month_weights * tf.square(y_true - y_pred))
        
        return mse_loss + alpha * smoothness_penalty + beta * monthly_weight_loss
    return custom_flow_loss

class BasicDataPreprocessor:
    """Original preprocessing approach: Just PCA with 4 PCs"""
    def __init__(self):
        self.scaler = StandardScaler()
        self.pca = PCA(n_components=4)  # Original: 4 components
        
    def fit_transform(self, df):
        features = df[['T', 'V', 'P', 'F']].values
        features_scaled = self.scaler.fit_transform(features)
        pca_features = self.pca.fit_transform(features_scaled)
        
        # Print explained variance
        explained_variance = self.pca.explained_variance_ratio_
        print(f"Basic PCA - Explained variance by 4 PCs: {explained_variance}")
        print(f"Basic PCA - Total variance explained: {sum(explained_variance):.4f}")
        
        return pca_features
    
    def transform(self, df):
        features = df[['T', 'V', 'P', 'F']].values
        features_scaled = self.scaler.transform(features)
        pca_features = self.pca.transform(features_scaled)
        return pca_features
    
    def save(self, filename):
        preprocessor_dict = {
            'scaler': self.scaler,
            'pca': self.pca
        }
        joblib.dump(preprocessor_dict, filename)
    
    @classmethod
    def load(cls, filename):
        preprocessor = cls()
        loaded_dict = joblib.load(filename)
        preprocessor.scaler = loaded_dict['scaler']
        preprocessor.pca = loaded_dict['pca']
        return preprocessor

class EnhancedDataPreprocessor:
    """Enhanced preprocessing approach: Detrending + Feature Engineering + PCA with 5 PCs"""
    def __init__(self):
        self.scaler = StandardScaler()
        self.pca = PCA(n_components=5)  # Enhanced: 5 components
        self.detrend_params = {}  # Will store detrending parameters
        
    def fit_transform(self, df):
        # Step 1: Detrend the significant variables (T and V)
        df_detrended = df.copy()
        features_to_detrend = ['T', 'V']  # Focus on statistically significant variables
        
        for col in features_to_detrend:
            values = df[col].values
            x = np.arange(len(values))
            slope, intercept = np.polyfit(x, values, 1)
            trend = slope * x + intercept
            
            # Store detrending parameters for future transformations
            self.detrend_params[col] = {'slope': slope, 'intercept': intercept}
            
            # Replace original with detrended and add trend as new feature
            df_detrended[col] = values - trend
            df_detrended[f'{col}_trend'] = trend
        
        # Step 2: Engineer additional features
        # Add key interaction features
        df_detrended['T_V_ratio'] = df['T'] / df['V']
        df_detrended['seasonal_T'] = self._extract_seasonal_component(df, 'T')
        df_detrended['seasonal_P'] = self._extract_seasonal_component(df, 'P')
        
        # Step 3: Select final feature set for PCA
        feature_columns = ['T', 'V', 'P', 'F', 'T_trend', 'V_trend', 
                          'T_V_ratio', 'seasonal_T', 'seasonal_P']
        features = df_detrended[feature_columns].values
        
        # Step 4: Apply scaling and PCA
        features_scaled = self.scaler.fit_transform(features)
        pca_features = self.pca.fit_transform(features_scaled)
        
        # Print explained variance
        explained_variance = self.pca.explained_variance_ratio_
        print(f"Enhanced PCA - Explained variance by 5 PCs: {explained_variance}")
        print(f"Enhanced PCA - Total variance explained: {sum(explained_variance):.4f}")
        
        return pca_features
    
    def _extract_seasonal_component(self, df, column):
        # Extract monthly seasonal components
        series = df[column]
        grouped = series.groupby(pd.DatetimeIndex(df['Date']).month)
        monthly_means = df[column].copy()
        
        for month, group in grouped:
            mask = pd.DatetimeIndex(df['Date']).month == month
            monthly_means.loc[mask] = group.mean()
            
        return series - monthly_means
        
    def transform(self, df):
        # Apply same transformations to new data
        df_detrended = df.copy()
        
        # Apply stored detrending parameters
        for col, params in self.detrend_params.items():
            x = np.arange(len(df))
            trend = params['slope'] * x + params['intercept']
            df_detrended[col] = df[col] - trend
            df_detrended[f'{col}_trend'] = trend
        
        # Create same engineered features
        df_detrended['T_V_ratio'] = df['T'] / df['V']
        df_detrended['seasonal_T'] = self._extract_seasonal_component(df, 'T')
        df_detrended['seasonal_P'] = self._extract_seasonal_component(df, 'P')
        
        # Select same feature set and apply transformations
        feature_columns = ['T', 'V', 'P', 'F', 'T_trend', 'V_trend', 
                          'T_V_ratio', 'seasonal_T', 'seasonal_P']
        features = df_detrended[feature_columns].values
        features_scaled = self.scaler.transform(features)
        pca_features = self.pca.transform(features_scaled)
        
        return pca_features
    
    def save(self, filename):
        preprocessor_dict = {
            'scaler': self.scaler,
            'pca': self.pca,
            'detrend_params': self.detrend_params,
        }
        joblib.dump(preprocessor_dict, filename)
    
    @classmethod
    def load(cls, filename):
        preprocessor = cls()
        loaded_dict = joblib.load(filename)
        preprocessor.scaler = loaded_dict['scaler']
        preprocessor.pca = loaded_dict['pca']
        preprocessor.detrend_params = loaded_dict['detrend_params']
        return preprocessor

def prepare_sequences(features, target):
    X, y = [], []
    for i in range(len(features) - input_window - output_window + 1):
        X.append(features[i:(i + input_window)])
        y.append(target[i + input_window:i + input_window + output_window])
    return np.array(X), np.array(y)

def build_model(architecture, n_features):
    # Ensure clean state for model creation
    model = Sequential()
    
    # First LSTM layer
    model.add(LSTM(architecture[0], activation='relu', return_sequences=True if len(architecture) > 1 else False,
                  input_shape=(input_window, n_features)))
    
    # Middle LSTM layers
    for i in range(1, len(architecture) - 1):
        model.add(LSTM(architecture[i], activation='relu', return_sequences=True))
    
    # Last LSTM layer
    if len(architecture) > 1:
        model.add(LSTM(architecture[-1], activation='relu'))
    
    # Output layer
    model.add(Dense(output_window, activation='relu'))
    
    custom_loss = create_custom_flow_loss(alpha, beta)
    model.compile(optimizer='adam', loss=custom_loss)
    return model

def plot_comparison_losses(basic_losses, enhanced_losses, save_path='preprocessing_comparison.png'):
    """Plot losses from both preprocessing approaches on the same graph"""
    plt.figure(figsize=(15, 8))
    
    # Extract losses
    basic_train, basic_val, basic_test = basic_losses
    enhanced_train, enhanced_val, enhanced_test = enhanced_losses
    
    # Plot training losses
    plt.plot(basic_train, color='blue', linestyle='-', alpha=0.7, label='Basic - Training')
    plt.plot(enhanced_train, color='blue', linestyle='--', alpha=0.7, label='Enhanced - Training')
    
    # Plot validation losses
    plt.plot(basic_val, color='green', linestyle='-', alpha=0.7, label='Basic - Validation')
    plt.plot(enhanced_val, color='green', linestyle='--', alpha=0.7, label='Enhanced - Validation')
    
    # Plot test losses
    plt.plot(basic_test, color='red', linestyle='-', alpha=0.7, label='Basic - Test')
    plt.plot(enhanced_test, color='red', linestyle='--', alpha=0.7, label='Enhanced - Test')
    
    # Add custom legend with color groups
    legend_elements = [
        Line2D([0], [0], color='blue', lw=2, label='Training Loss'),
        Line2D([0], [0], color='green', lw=2, label='Validation Loss'),
        Line2D([0], [0], color='red', lw=2, label='Test Loss'),
        Line2D([0], [0], color='black', linestyle='-', lw=2, label='Basic Preprocessing'),
        Line2D([0], [0], color='black', linestyle='--', lw=2, label='Enhanced Preprocessing')
    ]
    
    plt.legend(handles=legend_elements, loc='upper right')
    
    plt.title('Comparison of Basic vs Enhanced Preprocessing')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.yscale('log')  # Use log scale for better visualization
    plt.grid(True, which="both", ls="-", alpha=0.2)
    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()
    
    return save_path

def get_model_name(architecture, alpha, beta, approach):
    # Create name based on number of hidden layers and neurons
    arch_str = f"h{len(architecture)}_" + "_".join(map(str, architecture))
    return f"lstm_{approach}_{arch_str}_a{alpha}_b{beta}"

def train_model(X_train, y_train, X_val, y_val, X_test, y_test, n_features, approach):
    """Train model with early stopping and return loss histories"""
    # Clear everything for clean start
    tf.keras.backend.clear_session()
    
    # Reset random seeds for reproducibility
    np.random.seed(RANDOM_SEED)
    tf.random.set_seed(RANDOM_SEED)
    
    # Get model name
    model_name = get_model_name(lstm_architecture, alpha, beta, approach)
    print(f"\nTraining model: {model_name}")
    
    # Build model
    model = build_model(lstm_architecture, n_features)
    
    # Set up callbacks and loss tracking
    train_losses = []
    val_losses = []
    test_losses = []
    
    # Early stopping callback
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=200,
        restore_best_weights=True,
        verbose=1
    )
    
    # Callback to track test loss
    class TestEvaluationCallback(tf.keras.callbacks.Callback):
        def on_epoch_end(self, epoch, logs=None):
            # Store training and validation losses from logs
            train_losses.append(logs['loss'])
            val_losses.append(logs['val_loss'])
            
            # Evaluate on test data
            test_loss = model.evaluate(X_test, y_test, verbose=0)
            test_losses.append(test_loss)
            
            if (epoch + 1) % 100 == 0:  # Print every 100 epochs
                print(f"Epoch {epoch + 1} - Train: {logs['loss']:.4f}, Val: {logs['val_loss']:.4f}, Test: {test_loss:.4f}")
    
    # Train model
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        verbose=0,
        callbacks=[
            early_stopping,
            TestEvaluationCallback()
        ]
    )
    
    # Save model
    model.save(f'{model_name}.keras')
    
    # Print final performance
    final_train_loss = train_losses[-1]
    final_val_loss = val_losses[-1]
    final_test_loss = test_losses[-1]
    
    print(f"\nModel training stopped after {len(train_losses)} epochs")
    if len(train_losses) < epochs:
        print(f"Early stopping activated (patience={early_stopping.patience})")
        
    print(f"Best epoch: {np.argmin(val_losses) + 1}")
    print(f"Final training loss: {final_train_loss:.4f}")
    print(f"Final validation loss: {final_val_loss:.4f}")
    print(f"Final test loss: {final_test_loss:.4f}")
    print(f"Best validation loss: {min(val_losses):.4f}")
    
    return model, (train_losses, val_losses, test_losses)

def main():
    try:
        df_full = pd.read_csv('Data/Clean/Orov_clean.csv')
        
        # Convert date column to datetime
        df_full['Date'] = pd.to_datetime(df_full['Date'])
        
        # Define date ranges for splits
        test_start_date = pd.to_datetime('1923-01-01')
        test_end_date = pd.to_datetime('1942-12-31')
        train_start_date = pd.to_datetime('1963-01-01')
        validation_start_date = pd.to_datetime('1943-01-01')
        validation_end_date = pd.to_datetime('1962-12-31')
        
        # Create masks for each subset
        test_mask = (df_full['Date'] >= test_start_date) & (df_full['Date'] <= test_end_date)
        validation_mask = (df_full['Date'] >= validation_start_date) & (df_full['Date'] <= validation_end_date)
        train_mask = df_full['Date'] >= train_start_date
        
        # Print dataset sizes
        print(f"Full dataset size: {len(df_full)}")
        print(f"Training dataset size (1963-2021): {sum(train_mask)}")
        print(f"Validation dataset size (1943-1962): {sum(validation_mask)}")
        print(f"Test dataset size (1923-1942): {sum(test_mask)}")
        
        #------------------------------------------------------------------------------
        # Scenario 1: Basic preprocessing (just PCA with 4 components)
        #------------------------------------------------------------------------------
        print("\n=== Scenario 1: Basic Preprocessing (PCA with 4 components) ===")
        
        # Initialize and fit basic preprocessor on all data
        basic_preprocessor = BasicDataPreprocessor()
        basic_features = basic_preprocessor.fit_transform(df_full)
        
        # Save basic preprocessor
        basic_preprocessor.save('basic_preprocessor.joblib')
        
        # Prepare sequences
        basic_X_train, basic_y_train = prepare_sequences(
            basic_features[train_mask], 
            df_full.loc[train_mask, 'F'].values
        )
        
        basic_X_val, basic_y_val = prepare_sequences(
            basic_features[validation_mask], 
            df_full.loc[validation_mask, 'F'].values
        )
        
        basic_X_test, basic_y_test = prepare_sequences(
            basic_features[test_mask], 
            df_full.loc[test_mask, 'F'].values
        )
        
        # Train basic model
        basic_model, basic_losses = train_model(
            basic_X_train, basic_y_train,
            basic_X_val, basic_y_val,
            basic_X_test, basic_y_test,
            n_features=4,  # 4 PCs
            approach="basic"
        )
        
        #------------------------------------------------------------------------------
        # Scenario 2: Enhanced preprocessing (Detrending + Feature Engineering + PCA)
        #------------------------------------------------------------------------------
        print("\n=== Scenario 2: Enhanced Preprocessing (Detrending + Feature Engineering + PCA) ===")
        
        # Initialize and fit enhanced preprocessor on all data
        enhanced_preprocessor = EnhancedDataPreprocessor()
        enhanced_features = enhanced_preprocessor.fit_transform(df_full)
        
        # Save enhanced preprocessor
        enhanced_preprocessor.save('enhanced_preprocessor.joblib')
        
        # Prepare sequences
        enhanced_X_train, enhanced_y_train = prepare_sequences(
            enhanced_features[train_mask], 
            df_full.loc[train_mask, 'F'].values
        )
        
        enhanced_X_val, enhanced_y_val = prepare_sequences(
            enhanced_features[validation_mask], 
            df_full.loc[validation_mask, 'F'].values
        )
        
        enhanced_X_test, enhanced_y_test = prepare_sequences(
            enhanced_features[test_mask], 
            df_full.loc[test_mask, 'F'].values
        )
        
        # Train enhanced model
        enhanced_model, enhanced_losses = train_model(
            enhanced_X_train, enhanced_y_train,
            enhanced_X_val, enhanced_y_val,
            enhanced_X_test, enhanced_y_test,
            n_features=5,  # 5 PCs
            approach="enhanced"
        )
        
        #------------------------------------------------------------------------------
        # Compare Results
        #------------------------------------------------------------------------------
        print("\n=== Comparing Preprocessing Approaches ===")
        
        # Plot comparison of training, validation, and test losses
        comparison_plot = plot_comparison_losses(basic_losses, enhanced_losses, 'preprocessing_comparison_losses.png')
        print(f"Comparison plot saved to {comparison_plot}")
        
        # Calculate and report improvement percentages
        basic_best_val = min(basic_losses[1])
        enhanced_best_val = min(enhanced_losses[1])
        val_improvement = ((basic_best_val - enhanced_best_val) / basic_best_val) * 100
        
        basic_best_test = min(basic_losses[2])
        enhanced_best_test = min(enhanced_losses[2])
        test_improvement = ((basic_best_test - enhanced_best_test) / basic_best_test) * 100
        
        print(f"\nBest validation loss - Basic: {basic_best_val:.4f}, Enhanced: {enhanced_best_val:.4f}")
        print(f"Validation improvement: {val_improvement:.2f}%")
        
        print(f"Best test loss - Basic: {basic_best_test:.4f}, Enhanced: {enhanced_best_test:.4f}")
        print(f"Test improvement: {test_improvement:.2f}%")
        
        #------------------------------------------------------------------------------
        # Generate and Compare Predictions
        #------------------------------------------------------------------------------
        # Make predictions with both models
        basic_predictions = basic_model.predict(basic_X_test)
        enhanced_predictions = enhanced_model.predict(enhanced_X_test)
        
        # Plot sample predictions
        n_samples = 3  # Number of samples to plot
        plt.figure(figsize=(15, 12))
        
        for i in range(n_samples):
            plt.subplot(n_samples, 1, i+1)
            plt.plot(basic_y_test[i], 'k-', linewidth=2, label='Actual')
            plt.plot(basic_predictions[i], 'b--', linewidth=1.5, label='Basic Preprocessing')
            plt.plot(enhanced_predictions[i], 'r--', linewidth=1.5, label='Enhanced Preprocessing')
            plt.legend()
            plt.title(f'Test Sample {i+1}')
            plt.ylabel('Flow')
            
        plt.tight_layout()
        plt.savefig('preprocessing_comparison_predictions.png', dpi=300, bbox_inches='tight')
        plt.close()
        
        print("\nComparison predictions plot saved to preprocessing_comparison_predictions.png")
        print("\nPreprocessing comparison analysis completed successfully!")
            
    except Exception as e:
        print(f"An error occurred during analysis: {str(e)}")
        raise

if __name__ == "__main__":
    main()

Full dataset size: 1200
Training dataset size (1963-2021): 705
Validation dataset size (1943-1962): 240
Test dataset size (1923-1942): 240

=== Scenario 1: Basic Preprocessing (PCA with 4 components) ===
Basic PCA - Explained variance by 4 PCs: [0.71998728 0.19835198 0.07562216 0.00603858]
Basic PCA - Total variance explained: 1.0000

Training model: lstm_basic_h2_32_16_a0.1_b0.1


  super().__init__(**kwargs)


Epoch 100 - Train: 129918.3906, Val: 102819.2812, Test: 100848.6250
Epoch 200 - Train: 91359.2734, Val: 138433.0312, Test: 117339.0703
Epoch 245: early stopping
Restoring model weights from the end of the best epoch: 45.

Model training stopped after 245 epochs
Early stopping activated (patience=200)
Best epoch: 45
Final training loss: 81492.0234
Final validation loss: 151824.1562
Final test loss: 122227.5391
Best validation loss: 97051.4766

=== Scenario 2: Enhanced Preprocessing (Detrending + Feature Engineering + PCA) ===
Enhanced PCA - Explained variance by 5 PCs: [0.44724241 0.236193   0.15201838 0.09396612 0.05300994]
Enhanced PCA - Total variance explained: 0.9824

Training model: lstm_enhanced_h2_32_16_a0.1_b0.1


  super().__init__(**kwargs)


Epoch 100 - Train: 92307.0391, Val: 363780.0000, Test: 2022806.5000
Epoch 200 - Train: 65822.2891, Val: 372287.4062, Test: 1776052.6250
Epoch 278: early stopping
Restoring model weights from the end of the best epoch: 78.

Model training stopped after 278 epochs
Early stopping activated (patience=200)
Best epoch: 78
Final training loss: 56768.9336
Final validation loss: 374259.5625
Final test loss: 1487142.1250
Best validation loss: 178161.4688

=== Comparing Preprocessing Approaches ===
Comparison plot saved to preprocessing_comparison_losses.png

Best validation loss - Basic: 97051.4766, Enhanced: 178161.4688
Validation improvement: -83.57%
Best test loss - Basic: 96535.7734, Enhanced: 205529.6719
Test improvement: -112.91%
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step

Comparison predictions plot saved to preprocessing_comparison_predictions.png

Preprocessing comparison analysis comple