In [1]:
import pandas as pd

# Columns kept for inspection, in the order they should be displayed
columns_to_analyze = ['Date','T', 'V', 'P', 'F']

# Read the CSV file and immediately narrow it to the columns of interest
df = pd.read_csv('Data/Clean/Orov_clean.csv')[columns_to_analyze]

# Bare last expression so the notebook renders the frame
df

Unnamed: 0,Date,T,V,P,F
0,10/1/1921,50.711506,9.680717,1.173617,81.200
1,11/1/1921,42.727805,6.427311,2.666764,100.000
2,12/1/1921,36.407265,3.427978,10.194451,194.000
3,1/1/1922,32.000000,2.775463,2.969249,192.000
4,2/1/1922,32.000000,2.077533,14.370800,422.000
...,...,...,...,...,...
1195,5/1/2021,54.776440,11.438191,0.265424,136.858
1196,6/1/2021,66.966054,19.161114,0.136572,88.619
1197,7/1/2021,73.537018,24.017705,0.086182,70.112
1198,8/1/2021,70.023603,21.625689,0.024613,56.391


In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.activations import relu
import joblib
import matplotlib.pyplot as plt

# Seed NumPy and TensorFlow when this cell runs so results are repeatable.
# train_and_evaluate_model re-applies the same seed right before building the model.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Hyperparameters
alpha = 0.1  # Smoothness penalty coefficient (scales the step-to-step difference term of the loss)
beta = 0.1   # Monthly weight coefficient (scales the step-weighted squared-error term of the loss)
epochs = 5000  # Number of epochs passed to model.fit
input_window = 12  # Consecutive rows of features fed to the LSTM per sample
output_window = 9  # Consecutive rows of F predicted per sample; the loss hard-codes 9 step weights, keep in sync
n_features = 4  # PC1 to PC4; must equal the n_components used by DataPreprocessor's PCA
batch_size = 32  # Mini-batch size passed to model.fit

# Define LSTM architecture - using only one model with two hidden layers
lstm_architecture = [32, 16]  # 2-layer: units of each stacked LSTM layer, first to last

# Custom loss function
def create_custom_flow_loss(alpha, beta):
    """Build the composite loss used to train the multi-step flow forecaster.

    The returned callable adds three terms:
      * Keras' mean squared error between ``y_true`` and ``y_pred``;
      * ``alpha`` times the mean squared difference between consecutive
        predicted steps (the smoothness penalty);
      * ``beta`` times the mean squared error after each of the nine forecast
        steps is scaled by a fixed weight (3, 3, 3, 2, 2, 1, 1, 1, 1).

    Args:
        alpha: Multiplier applied to the smoothness penalty.
        beta: Multiplier applied to the step-weighted error.

    Returns:
        A function ``custom_flow_loss(y_true, y_pred)`` that can be passed to
        ``model.compile(loss=...)``.
    """
    def custom_flow_loss(y_true, y_pred):
        # One weight per forecast step; the vector length pins the horizon to 9 steps.
        step_weights = tf.constant([3, 3, 3, 2, 2, 1, 1, 1, 1], dtype=tf.float32)

        # Plain MSE as computed by Keras.
        base_mse = tf.keras.losses.MSE(y_true, y_pred)

        # Penalise large jumps between neighbouring predicted steps.
        step_to_step = y_pred[:, 1:] - y_pred[:, :-1]
        roughness = tf.reduce_mean(tf.square(step_to_step))

        # Squared error with the early steps weighted more heavily.
        squared_error = tf.square(y_true - y_pred)
        weighted_error = tf.reduce_mean(step_weights * squared_error)

        return base_mse + alpha * roughness + beta * weighted_error
    return custom_flow_loss

class DataPreprocessor:
    """Standardise the T, V, P and F columns, then project them with a 4-component PCA.

    The scaler and PCA are fitted together in ``fit_transform`` and re-applied
    unchanged in ``transform``; ``save``/``load`` persist both with joblib.
    """

    # Columns read from every incoming frame, in the order given to the scaler.
    _FEATURE_COLUMNS = ['T', 'V', 'P', 'F']

    def __init__(self):
        """Create an unfitted scaler and an unfitted 4-component PCA."""
        self.scaler = StandardScaler()
        self.pca = PCA(n_components=4)

    def fit_transform(self, df):
        """Fit scaler and PCA on ``df`` and return the projected feature array."""
        raw_values = df[self._FEATURE_COLUMNS].values
        return self.pca.fit_transform(self.scaler.fit_transform(raw_values))

    def transform(self, df):
        """Project ``df`` with the already-fitted scaler and PCA."""
        raw_values = df[self._FEATURE_COLUMNS].values
        return self.pca.transform(self.scaler.transform(raw_values))

    def save(self, filename):
        """Write the scaler and PCA to ``filename`` as a single joblib dict."""
        joblib.dump({'scaler': self.scaler, 'pca': self.pca}, filename)

    @classmethod
    def load(cls, filename):
        """Rebuild a preprocessor from a file written by ``save``.

        joblib files are pickle-based, so only load files from a trusted source.
        """
        restored = cls()
        state = joblib.load(filename)
        restored.scaler, restored.pca = state['scaler'], state['pca']
        return restored

def prepare_sequences(features, target, n_in=None, n_out=None):
    """Slice aligned feature/target series into sliding-window training samples.

    Sample ``i`` pairs ``features[i : i + n_in]`` with the ``n_out`` target
    values that immediately follow that window.

    Args:
        features: Array-like whose first axis is time.
        target: Array-like whose first axis is time, aligned row-for-row with
            ``features``.
        n_in: Length of each input window. Defaults to the module-level
            ``input_window``.
        n_out: Length of each target window. Defaults to the module-level
            ``output_window``.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays with shapes
        ``(n_samples, n_in, *features.shape[1:])`` and
        ``(n_samples, n_out, *target.shape[1:])``. When the series is too
        short for even one sample, ``n_samples`` is 0 but the trailing
        dimensions are still correct.

    Raises:
        ValueError: If ``target`` has fewer rows than ``features``; the last
            windows would otherwise be silently truncated into ragged arrays.
    """
    # Resolve defaults at call time so later edits to the module constants are honoured.
    n_in = input_window if n_in is None else n_in
    n_out = output_window if n_out is None else n_out

    features = np.asarray(features)
    target = np.asarray(target)
    if len(target) < len(features):
        raise ValueError(
            f"target has {len(target)} rows but features has {len(features)}; "
            "they must be aligned row-for-row"
        )

    n_samples = len(features) - n_in - n_out + 1
    if n_samples <= 0:
        # Keep the expected rank so downstream shape checks fail clearly, not cryptically.
        empty_X = np.empty((0, n_in) + features.shape[1:], dtype=features.dtype)
        empty_y = np.empty((0, n_out) + target.shape[1:], dtype=target.dtype)
        return empty_X, empty_y

    X = np.stack([features[start:start + n_in] for start in range(n_samples)])
    y = np.stack([target[start + n_in:start + n_in + n_out] for start in range(n_samples)])
    return X, y

def build_model(architecture):
    """Build and compile the stacked-LSTM forecaster.

    The network takes windows of shape ``(input_window, n_features)``, passes
    them through one ReLU LSTM layer per entry of ``architecture`` and ends in
    a ReLU ``Dense`` layer with ``output_window`` units. It is compiled with
    the Adam optimizer and the loss from
    ``create_custom_flow_loss(alpha, beta)``.

    Args:
        architecture: Sequence of LSTM unit counts, first layer to last.

    Returns:
        The compiled ``Sequential`` model.

    Raises:
        ValueError: If ``architecture`` is empty.
    """
    if len(architecture) == 0:
        raise ValueError("architecture must contain at least one LSTM layer size")

    model = Sequential()

    # Declare the input shape with an explicit Input layer. Passing
    # input_shape= to the first LSTM makes recent Keras emit a UserWarning.
    model.add(tf.keras.Input(shape=(input_window, n_features)))

    # Every LSTM except the last must return the full sequence so the next
    # LSTM receives 3-D input; the last one returns only its final state.
    last_index = len(architecture) - 1
    for index, units in enumerate(architecture):
        model.add(LSTM(units, activation='relu', return_sequences=index < last_index))

    # Output layer: one unit per forecast step.
    model.add(Dense(output_window, activation='relu'))

    custom_loss = create_custom_flow_loss(alpha, beta)
    model.compile(optimizer='adam', loss=custom_loss)
    return model

def plot_train_test_losses(train_history, test_losses, save_path='train_test_losses.png'):
    """Save a log-scale plot of per-epoch training and test loss.

    Args:
        train_history: Mapping with a ``'loss'`` entry holding the training
            loss per epoch (e.g. ``History.history`` from ``model.fit``).
        test_losses: Sequence of test losses, one per epoch.
        save_path: Destination file for the figure.

    The figure is written to ``save_path`` and then closed; nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(15, 8))

    # Two curves on shared axes: training in blue, test in red.
    ax.plot(train_history['loss'], label='Training Loss', color='blue', alpha=0.8)
    ax.plot(test_losses, label='Test Loss', color='red', alpha=0.8)

    ax.set_title('Training and Test Loss Comparison (1943-2021 vs 1923-1942)')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    # Log scale keeps both the early large losses and the later small ones readable.
    ax.set_yscale('log')
    ax.grid(True, which="both", ls="-", alpha=0.2)
    ax.legend()

    fig.tight_layout()
    fig.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close(fig)

def get_model_name(architecture, alpha, beta):
    """Return the file-name stem that identifies one model configuration.

    The stem encodes the number of hidden layers, each layer's size, and the
    two loss coefficients, followed by the fixed ``1943_2021`` suffix.
    """
    depth = len(architecture)
    layer_sizes = "_".join(str(units) for units in architecture)
    return f"lstm_h{depth}_{layer_sizes}_a{alpha}_b{beta}_1943_2021"

def evaluate_on_test_data(model, X_test, y_test, custom_loss):
    """Return ``model.evaluate`` on the held-out sequences, run silently.

    ``custom_loss`` is accepted for call-site compatibility but is not used;
    the result is whatever ``model.evaluate`` returns.
    """
    test_result = model.evaluate(X_test, y_test, verbose=0)
    return test_result

def train_and_evaluate_model(df_train, df_test):
    """Fit the preprocessing + LSTM pipeline on ``df_train`` and track loss on ``df_test``.

    Steps, in order:
      1. Fit a ``DataPreprocessor`` on ``df_train``, save it to
         ``preprocessor_1943_2021.joblib`` and apply it to ``df_test``.
      2. Cut both projected series into sliding-window samples whose targets
         are the raw ``F`` column.
      3. Clear the Keras session, re-seed NumPy/TensorFlow and build the model
         described by ``lstm_architecture``.
      4. Train for ``epochs`` epochs, evaluating on the test samples after
         every epoch and printing progress every 100 epochs.
      5. Save the model as ``<model_name>.keras`` and the loss curves as
         ``<model_name>_train_test_losses.png``.

    Args:
        df_train: DataFrame with at least the columns ``T``, ``V``, ``P``, ``F``.
        df_test: DataFrame with the same columns, used only for evaluation.

    Returns:
        Tuple ``(history, test_losses, preprocessor)``: the ``History.history``
        dict from ``model.fit``, the list of per-epoch test losses, and the
        fitted ``DataPreprocessor``.

    Raises:
        ValueError: If either split is too short to produce a single
            ``input_window`` + ``output_window`` sample.
    """
    # Fit the preprocessor on training data only, then reuse it for the test split.
    preprocessor = DataPreprocessor()
    pca_features_train = preprocessor.fit_transform(df_train)

    # Save preprocessor
    preprocessor.save('preprocessor_1943_2021.joblib')

    # Transform test data using the same preprocessor
    pca_features_test = preprocessor.transform(df_test)

    # Prepare sequences for training and testing
    X_train, y_train = prepare_sequences(pca_features_train, df_train['F'].values)
    X_test, y_test = prepare_sequences(pca_features_test, df_test['F'].values)

    # Fail early with a clear message rather than deep inside Keras.
    if len(X_train) == 0 or len(X_test) == 0:
        raise ValueError(
            f"Not enough rows to build sequences: each split needs at least "
            f"{input_window + output_window} rows, got {len(df_train)} (train) "
            f"and {len(df_test)} (test)."
        )

    # Clear everything for clean start
    tf.keras.backend.clear_session()

    # Reset random seeds for reproducibility
    np.random.seed(RANDOM_SEED)
    tf.random.set_seed(RANDOM_SEED)

    # Get model name
    model_name = get_model_name(lstm_architecture, alpha, beta)
    print(f"\nTraining model: {model_name}")
    print(f"Architecture: {lstm_architecture}")

    # Build model
    model = build_model(lstm_architecture)
    # summary() prints the table itself and returns None, so it must not be wrapped in print().
    model.summary()

    # Passed through to evaluate_on_test_data to satisfy its signature.
    custom_loss = create_custom_flow_loss(alpha, beta)

    # Store test losses for each epoch
    test_losses = []

    # Custom callback to evaluate on test data after each epoch; it closes over
    # X_test, y_test and test_losses from this function's scope.
    class TestEvaluationCallback(tf.keras.callbacks.Callback):
        def on_epoch_end(self, epoch, logs=None):
            test_loss = evaluate_on_test_data(self.model, X_test, y_test, custom_loss)
            test_losses.append(test_loss)
            if (epoch + 1) % 100 == 0:  # Print every 100 epochs
                print(f"Epoch {epoch + 1}/{epochs} - Train Loss: {logs['loss']:.4f} - Test Loss: {test_loss:.4f}")

    # Train model with callback
    history = model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        verbose=0,
        callbacks=[TestEvaluationCallback()]
    )

    # Save model
    model.save(f'{model_name}.keras')

    # Plot train and test losses
    plot_train_test_losses(history.history, test_losses, save_path=f'{model_name}_train_test_losses.png')

    # Print final losses
    final_train_loss = history.history['loss'][-1]
    final_test_loss = test_losses[-1]
    print(f"\nFinal training loss: {final_train_loss:.4f}")
    print(f"Final test loss: {final_test_loss:.4f}")

    return history.history, test_losses, preprocessor

def main():
    """Load the cleaned dataset, split it by date and run training.

    Rows dated 1943-01-01 or later form the training split; rows from
    1923-01-01 through 1942-12-31 form the test split. If the CSV has no
    ``Date`` column an error message is printed and nothing is trained.
    Any exception is reported and then re-raised.
    """
    try:
        df_full = pd.read_csv('Data/Clean/Orov_clean.csv')

        # Without a date column the chronological split below is impossible.
        if 'Date' not in df_full.columns:
            print("Error: Dataset does not contain a 'Date' column. Please adjust the code to match your date column name.")
            return

        # Convert date column to datetime if it's not already
        df_full['Date'] = pd.to_datetime(df_full['Date'])
        dates = df_full['Date']

        # Boundaries of the train period (1943-2021) and test period (1923-1942)
        train_start_date = pd.to_datetime('1943-01-01')
        test_start_date = pd.to_datetime('1923-01-01')
        test_end_date = pd.to_datetime('1942-12-31')

        # Boolean masks selecting each split
        in_train_period = dates >= train_start_date
        in_test_period = (dates >= test_start_date) & (dates <= test_end_date)
        df_train = df_full[in_train_period]
        df_test = df_full[in_test_period]

        print(f"Full dataset size: {len(df_full)}")
        print(f"Training dataset size (1943-2021): {len(df_train)}")
        print(f"Test dataset size (1923-1942): {len(df_test)}")

        # Train model and evaluate on test data
        print("Starting training process...")
        train_history, test_losses, preprocessor = train_and_evaluate_model(df_train, df_test)
        print("\nTraining and evaluation completed successfully!")

        # The plot itself was written inside train_and_evaluate_model.
        print("Train-test loss plot saved.")

    except Exception as e:
        print(f"An error occurred during training: {str(e)}")
        raise

# Entry point: run the full load/split/train pipeline when this code is
# executed directly (in a notebook kernel __name__ is "__main__", so the cell
# trains immediately when run).
if __name__ == "__main__":
    main()

Full dataset size: 1200
Training dataset size (1943-2021): 945
Test dataset size (1923-1942): 240
Starting training process...


Training model: lstm_h2_32_16_a0.1_b0.1_1943_2021
Architecture: [32, 16]


  super().__init__(**kwargs)


None
Epoch 100/5000 - Train Loss: 111465.2812 - Test Loss: 83630.3672
Epoch 200/5000 - Train Loss: 83203.8828 - Test Loss: 101200.0156
Epoch 300/5000 - Train Loss: 70841.6562 - Test Loss: 110538.4062
Epoch 400/5000 - Train Loss: 58147.6719 - Test Loss: 104707.1172
Epoch 500/5000 - Train Loss: 69494.2031 - Test Loss: 129351.0156
Epoch 600/5000 - Train Loss: 49491.5977 - Test Loss: 126448.3438
Epoch 700/5000 - Train Loss: 40558.8945 - Test Loss: 131068.9062
Epoch 800/5000 - Train Loss: 37590.0312 - Test Loss: 134244.2656
Epoch 900/5000 - Train Loss: 38138.4492 - Test Loss: 145591.6875
Epoch 1000/5000 - Train Loss: 33566.5547 - Test Loss: 149449.1875
Epoch 1100/5000 - Train Loss: 30338.3398 - Test Loss: 155379.4688
Epoch 1200/5000 - Train Loss: 28242.7461 - Test Loss: 154576.5938
Epoch 1300/5000 - Train Loss: 27615.6191 - Test Loss: 150224.4219
Epoch 1400/5000 - Train Loss: 28563.2109 - Test Loss: 167964.2188
Epoch 1500/5000 - Train Loss: 39347.8477 - Test Loss: 144514.1875
Epoch 1600/500