In [1]:
import pandas as pd
import numpy as np
import threading
import time
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from typing import Dict, List, Tuple, Optional, Any
from queue import Queue
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
import os
import datetime

warnings.filterwarnings('ignore')
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
File_Path = '/Users/shreyasravi/PycharmProjects/Embedded-Systems/London_Weather.csv'

A thread-safe class for preprocessing weather data, implementing feature selection, and preparing data chunks for LSTM training and inference processes. This class acts as a producer in a producer-consumer pattern, sending data to both training and inference threads.

Semaphore is like a counter which is used to control the number of threads that can access a shared resource at the same time. It is a signaling mechanism like traffic light block threads when resources are not available and allow threads when the resources are available.

In the LSTM, there are three key gates:

1. The Forget Gate manages what should be forgotten.
2. The Input Gate manages what should be kept, and
3. The Output Gate manages what information is stored in the carry and hidden states.

Each of these gates comprise of their own neural network layer that handles the mathematics needed to retain the relevant data we need to store in both the short-term memory and long-term memory.

This class acts as a consumer in the producer-consumer pattern,
consuming data from the DataPreprocessing class and training an LSTM model.

Thread-safe class for inferring using a trained LSTM model for weather forecasting.

This class acts as a consumer in a producer-consumer pattern, consuming test data
from the preprocessing thread and generating predictions using the trained model.

In [2]:
class Preprocessing:
    def __init__(self, file_path):
        self.file_path = file_path
        self.data = None
        self.scaler = None
        self.normalized_data = None
        self.train_size = None
        self.X_train = None
        self.y_train = None
        self.X_test = None
        self.y_test = None
        self.T = 20  # Number of timesteps to look while predicting

    def load_data(self):
        self.data = pd.read_csv(self.file_path)
        print(f"Data loaded with shape: {self.data.shape}")
        return self.data

    def clean_data(self):
        # Convert date to datetime format
        self.data['date'] = pd.to_datetime(self.data['date'], format='%Y%m%d')

        # Fill missing values in snow_depth with 0
        self.data['snow_depth'].fillna(0, inplace=True)

        # Drop rows with any NaN values
        self.data.dropna(inplace=True)

        print(f"Data cleaned. New shape: {self.data.shape}")
        return self.data

    def prepare_data_for_model(self, train_ratio=0.8, target_column='mean_temp'):
        # Separate features and target
        input_data = self.data.drop(['date'], axis=1)
        targets = self.data[target_column].values

        # Define dimensions
        D = input_data.shape[1]  # Number of features
        N = len(input_data) - self.T

        # Calculate train size
        self.train_size = int(len(input_data) * train_ratio)

        # Normalize input data
        self.scaler = StandardScaler()
        self.scaler.fit(input_data[:self.train_size + self.T - 1])
        self.normalized_data = self.scaler.transform(input_data)

        # Prepare X_train and y_train
        self.X_train = np.zeros((self.train_size, self.T, D))
        self.y_train = np.zeros((self.train_size, 1))

        for t in range(self.train_size):
            self.X_train[t, :, :] = self.normalized_data[t:t+self.T]
            self.y_train[t] = targets[t+self.T]

        # Prepare X_test and y_test
        self.X_test = np.zeros((N - self.train_size, self.T, D))
        self.y_test = np.zeros((N - self.train_size, 1))

        for i in range(N - self.train_size):
            t = i + self.train_size
            self.X_test[i, :, :] = self.normalized_data[t:t+self.T]
            self.y_test[i] = targets[t+self.T]

        # Convert to PyTorch tensors
        self.X_train = torch.from_numpy(self.X_train.astype(np.float32))
        self.y_train = torch.from_numpy(self.y_train.astype(np.float32))
        self.X_test = torch.from_numpy(self.X_test.astype(np.float32))
        self.y_test = torch.from_numpy(self.y_test.astype(np.float32))

        print(f"Data prepared for model -> X_train shape: {self.X_train.shape}, y_train shape: {self.y_train.shape}")
        return self.X_train, self.y_train, self.X_test, self.y_test

    def get_train_test_data(self):
        return self.X_train, self.y_train, self.X_test, self.y_test

    def get_original_data(self):
        return self.data

    def get_scaler(self):
        return self.scaler

In [3]:
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.M = hidden_dim
        self.L = layer_dim

        self.rnn = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=layer_dim,
            batch_first=True
        )
        # batch_first to have (batch_dim, seq_dim, feature_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, X):
        # Set device - ideally this would be a class property set during initialization
        device = X.device

        # Initial hidden state and cell state
        h0 = torch.zeros(self.L, X.size(0), self.M).to(device)
        c0 = torch.zeros(self.L, X.size(0), self.M).to(device)

        out, (hn, cn) = self.rnn(X, (h0.detach(), c0.detach()))

        # h(T) at the final time step
        out = self.fc(out[:, -1, :])
        return out

In [4]:
class Training:
    def __init__(self):
        self.model = None
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else
                                   "mps" if torch.backends.mps.is_available() else "cpu")
        self.train_losses = None
        self.test_losses = None
        print(f"Using device: {self.device}")

    def create_model(self, input_dim, hidden_dim=512, layer_dim=2, output_dim=1):
        self.model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim)
        self.model.to(self.device)
        return self.model

    def train_model(self, X_train, y_train, X_test, y_test, learning_rate=0.01, epochs=200):
        if self.model is None:
            raise ValueError("Model not created. Call create_model first.")

        # Move data to device
        X_train, y_train = X_train.to(self.device), y_train.to(self.device)
        X_test, y_test = X_test.to(self.device), y_test.to(self.device)

        # Loss and optimizer
        criterion = nn.MSELoss()
        optimizer = torch.optim.SGD(
            self.model.parameters(),
            lr=learning_rate,
            momentum=0.9,
            weight_decay=1e-4
        )

        self.train_losses = np.zeros(epochs)
        self.test_losses = np.zeros(epochs)

        for epoch in range(epochs):
            optimizer.zero_grad()

            # Forward pass
            outputs = self.model(X_train)
            loss = criterion(outputs, y_train)

            # Backpropagation
            loss.backward()
            optimizer.step()

            # Train loss
            self.train_losses[epoch] = loss.item()

            # Test loss
            test_outputs = self.model(X_test)
            test_loss = criterion(test_outputs, y_test)
            self.test_losses[epoch] = test_loss.item()

            if (epoch + 1) % 10 == 0:
                print(f'Epoch {epoch+1}/{epochs}, Train Loss: {loss.item():.3f}, Test Loss: {test_loss.item():.3f}')

        return self.train_losses, self.test_losses

    def save_model(self, model_path, weights_path):
        if self.model is None:
            raise ValueError("Model not created. Call create_model first.")

        torch.save(self.model, model_path)
        torch.save(self.model.state_dict(), weights_path)
        print(f"Model saved to {model_path} and weights saved to {weights_path}")

    def load_model(self, model_path=None, weights_path=None):
        if model_path is not None:
            self.model = torch.load(model_path, map_location=self.device)
            print(f"Model loaded from {model_path}")
        elif weights_path is not None and self.model is not None:
            self.model.load_state_dict(torch.load(weights_path, map_location=self.device))
            print(f"Weights loaded from {weights_path}")
        else:
            raise ValueError("Either model_path or (weights_path and a created model) must be provided")

        return self.model

    def get_model(self):
        return self.model

    def get_losses(self):
        return self.train_losses, self.test_losses

In [5]:
class Inference:
    def __init__(self, model=None, device=None):
        self.model = model
        self.device = device if device is not None else torch.device(
            "cuda:0" if torch.cuda.is_available() else
            "mps" if torch.backends.mps.is_available() else "cpu"
        )
        self.predictions = None

    def set_model(self, model):
        self.model = model
        self.model.to(self.device)

    def predict(self, X_test):
        if self.model is None:
            raise ValueError("Model not set. Call set_model first.")

        # Move data to device
        X_test = X_test.to(self.device)

        # Set model to evaluation mode
        self.model.eval()

        # Generate predictions
        self.predictions = []
        with torch.no_grad():
            for i in range(len(X_test)):
                input_ = X_test[i].reshape(1, X_test.shape[1], X_test.shape[2])
                p = self.model(input_)[0, 0].item()
                self.predictions.append(p)

        print(f"Generated {len(self.predictions)} predictions")
        return self.predictions

    def get_predictions(self):
        return self.predictions

In [6]:
class Analysis:
    def __init__(self):
        self.true_values = None
        self.predictions = None
        self.metrics = {}

    def set_data(self, true_values, predictions):
        if isinstance(true_values, torch.Tensor):
            self.true_values = true_values.cpu().detach().numpy()
        else:
            self.true_values = true_values

        self.predictions = predictions

    def calculate_metrics(self):
        if self.true_values is None or self.predictions is None:
            raise ValueError("Data not set. Call set_data first.")

        # Calculate MAE
        self.metrics['MAE'] = mean_absolute_error(self.true_values, self.predictions)

        # Calculate MSE
        self.metrics['MSE'] = mean_squared_error(self.true_values, self.predictions)

        # Calculate RMSE
        self.metrics['RMSE'] = np.sqrt(self.metrics['MSE'])

        print(f"Metrics calculated - MAE: {self.metrics['MAE']:.3f}, MSE: {self.metrics['MSE']:.3f}, RMSE: {self.metrics['RMSE']:.3f}")
        return self.metrics

    def save_metrics(self, file_path):
        if not self.metrics:
            raise ValueError("Metrics not calculated. Call calculate_metrics first.")

        # Create a DataFrame from the metrics
        metrics_df = pd.DataFrame([self.metrics])

    def analyze_error_distribution(self):
        if self.true_values is None or self.predictions is None:
            raise ValueError("Data not set. Call set_data first.")

        # Calculate errors
        errors = self.true_values.flatten() - np.array(self.predictions)

        # Basic statistics of errors
        error_stats = {
            'mean': np.mean(errors),
            'std': np.std(errors),
            'min': np.min(errors),
            'max': np.max(errors)
        }

        print(f"Error statistics - Mean: {error_stats['mean']:.3f}, Std: {error_stats['std']:.3f}, Min: {error_stats['min']:.3f}, Max: {error_stats['max']:.3f}")
        return errors, error_stats

    def get_metrics(self):
        return self.metrics

In [7]:
class Display:
    def __init__(self):
        self.data = None
        self.predictions = None
        self.dates = None

    def set_data(self, original_data, predictions, start_idx=None):
        self.data = original_data
        self.predictions = predictions

        # Prepare the plot DataFrame
        plot_len = len(predictions)
        if start_idx is None:
            start_idx = -plot_len

        self.plot_df = original_data[['date', 'mean_temp']].copy(deep=True)
        self.plot_df = self.plot_df.iloc[start_idx:]
        self.plot_df['prediction'] = predictions
        self.plot_df.set_index('date', inplace=True)

    def plot_results(self, title=None, figsize=(20, 10)):
        if self.plot_df is None:
            raise ValueError("Data not set. Call set_data first.")

        plt.figure(figsize=figsize)
        plt.plot(self.plot_df['mean_temp'], label='Actual Temperature', linewidth=1)
        plt.plot(self.plot_df['prediction'], label='Predicted Temperature', linewidth=1)
        plt.xlabel('Date')
        plt.ylabel('Temperature (°C)')
        plt.legend(loc='lower right')

        if title:
            plt.title(title)

        plt.tight_layout()
        plt.show()

    def plot_by_year(self, figsize=(20, 10)):
        if self.plot_df is None:
            raise ValueError("Data not set. Call set_data first.")

        # Group data by year
        plot_df_by_years = []
        for y in self.plot_df.index.year.unique():
            plot_df_by_years.append((y, self.plot_df.loc[self.plot_df.index.year == y]))

        # Plot each year separately
        for year, year_df in plot_df_by_years:
            plt.figure(figsize=figsize)
            plt.plot(year_df['mean_temp'], label='Actual Temperature', linewidth=1)
            plt.plot(year_df['prediction'], label='Predicted Temperature', linewidth=1)
            plt.xlabel('Date')
            plt.ylabel('Temperature (°C)')
            plt.legend(loc='lower right')
            plt.title(f'Temperature in {year}')
            plt.tight_layout()
            plt.show()

    def plot_error_histogram(self, errors, bins=25, figsize=(12, 8)):
        plt.figure(figsize=figsize)
        plt.hist(errors, bins=bins)
        plt.xlabel('Temperature Difference (Actual - Predicted)')
        plt.ylabel('Count')
        plt.title('Distribution of Prediction Errors')
        plt.tight_layout()
        plt.show()

    def plot_training_history(self, train_losses, test_losses, figsize=(12, 8)):
        plt.figure(figsize=figsize)
        plt.plot(train_losses, label='Train Loss')
        plt.plot(test_losses, label='Test Loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        plt.title('Training History')
        plt.tight_layout()
        plt.show()

In [8]:
class SharedData:
    def __init__(self):
        self.data = {}

        # Semaphores for synchronization
        self.preprocess_train_empty = threading.Semaphore(1)  # Initially empty for training data
        self.preprocess_train_full = threading.Semaphore(0)   # Initially no data available

        self.train_infer_empty = threading.Semaphore(1)       # Initially empty for model
        self.train_infer_full = threading.Semaphore(0)        # Initially no model available

        self.infer_analysis_empty = threading.Semaphore(1)    # Initially empty for predictions
        self.infer_analysis_full = threading.Semaphore(0)     # Initially no predictions available

        self.analysis_display_empty = threading.Semaphore(1)  # Initially empty for metrics
        self.analysis_display_full = threading.Semaphore(0)   # Initially no metrics available

        # Mutex for shared data access
        self.mutex = threading.Semaphore(1)

In [9]:
def preprocess_task(shared_data, file_path):
    print("Starting preprocessing task...")

    # Create preprocessing object
    preprocessor = Preprocessing(file_path)

    # Load and clean data
    preprocessor.load_data()
    preprocessor.clean_data()

    # Prepare data for model
    X_train, y_train, X_test, y_test = preprocessor.prepare_data_for_model(target_column='mean_temp')

    # Store data in shared object
    shared_data.preprocess_train_empty.acquire()
    shared_data.mutex.acquire()

    shared_data.data['X_train'] = X_train
    shared_data.data['y_train'] = y_train
    shared_data.data['X_test'] = X_test
    shared_data.data['y_test'] = y_test
    shared_data.data['preprocessor'] = preprocessor

    shared_data.mutex.release()
    shared_data.preprocess_train_full.release()

    print("Preprocessing task completed.")

def train_task(shared_data, epochs=200):
    print("Waiting for preprocessing to complete...")

    # Wait for preprocessing to complete
    shared_data.preprocess_train_full.acquire()
    shared_data.mutex.acquire()

    # Get data from shared object
    X_train = shared_data.data['X_train']
    y_train = shared_data.data['y_train']
    X_test = shared_data.data['X_test']
    y_test = shared_data.data['y_test']

    shared_data.mutex.release()
    shared_data.preprocess_train_empty.release()

    print("Starting training task...")

    # Create and train model
    trainer = Training()
    input_dim = X_train.shape[2]  # Number of features
    trainer.create_model(input_dim=input_dim)

    train_losses, test_losses = trainer.train_model(
        X_train, y_train, X_test, y_test,
        learning_rate=0.01,
        epochs=epochs
    )

    # Save the model
    trainer.save_model('lstm_model.pt', 'lstm_weights.pt')

    # Store model in shared object
    shared_data.train_infer_empty.acquire()
    shared_data.mutex.acquire()

    shared_data.data['model'] = trainer.get_model()
    shared_data.data['train_losses'] = train_losses
    shared_data.data['test_losses'] = test_losses

    shared_data.mutex.release()
    shared_data.train_infer_full.release()

    print("Training task completed.")

def infer_task(shared_data):
    print("Waiting for training to complete...")

    # Wait for training to complete
    shared_data.train_infer_full.acquire()
    shared_data.mutex.acquire()

    # Get model and test data from shared object
    model = shared_data.data['model']
    X_test = shared_data.data['X_test']
    y_test = shared_data.data['y_test']

    shared_data.mutex.release()
    shared_data.train_infer_empty.release()

    print("Starting inference task...")

    # Create inference object and generate predictions
    inference = Inference()
    inference.set_model(model)
    predictions = inference.predict(X_test)

    # Store predictions in shared object
    shared_data.infer_analysis_empty.acquire()
    shared_data.mutex.acquire()

    shared_data.data['predictions'] = predictions
    shared_data.data['true_values'] = y_test

    shared_data.mutex.release()
    shared_data.infer_analysis_full.release()

    print("Inference task completed.")

def analysis_task(shared_data):
    print("Waiting for inference to complete...")

    # Wait for inference to complete
    shared_data.infer_analysis_full.acquire()
    shared_data.mutex.acquire()

    # Get predictions and true values from shared object
    predictions = shared_data.data['predictions']
    true_values = shared_data.data['true_values']

    shared_data.mutex.release()
    shared_data.infer_analysis_empty.release()

    print("Starting analysis task...")

    # Create analysis object and calculate metrics
    analyzer = Analysis()
    analyzer.set_data(true_values, predictions)
    metrics = analyzer.calculate_metrics()

    # Analyze error distribution
    errors, error_stats = analyzer.analyze_error_distribution()

    # Store metrics and errors in shared object
    shared_data.analysis_display_empty.acquire()
    shared_data.mutex.acquire()

    shared_data.data['metrics'] = metrics
    shared_data.data['errors'] = errors
    shared_data.data['error_stats'] = error_stats

    shared_data.mutex.release()
    shared_data.analysis_display_full.release()

    print("Analysis task completed.")

def display_task(shared_data):
    print("Waiting for analysis to complete...")

    # Wait for analysis to complete
    shared_data.analysis_display_full.acquire()
    shared_data.mutex.acquire()

    # Get data from shared object
    metrics = shared_data.data['metrics']
    errors = shared_data.data['errors']
    predictions = shared_data.data['predictions']
    preprocessor = shared_data.data['preprocessor']
    train_losses = shared_data.data['train_losses']
    test_losses = shared_data.data['test_losses']

    shared_data.mutex.release()
    shared_data.analysis_display_empty.release()

    print("Starting display task...")

    # Get original data from preprocessor
    original_data = preprocessor.get_original_data()

    # Create display object and set data
    displayer = Display()
    displayer.set_data(original_data, predictions)

    # Plot results
    displayer.plot_results(title='LSTM Temperature Forecast')

    # Plot by year
    displayer.plot_by_year()

    # Plot error histogram
    displayer.plot_error_histogram(errors)

    # Plot training history
    displayer.plot_training_history(train_losses, test_losses)

    print("Display task completed.")

    # Print summary of metrics
    print("\nModel Performance Metrics:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

def run_pipeline(file_path, epochs=200):
    # Create shared data object
    shared_data = SharedData()

    # Create threads
    preprocess_thread = threading.Thread(target=preprocess_task, args=(shared_data, file_path))
    train_thread = threading.Thread(target=train_task, args=(shared_data, epochs))
    infer_thread = threading.Thread(target=infer_task, args=(shared_data,))
    analysis_thread = threading.Thread(target=analysis_task, args=(shared_data,))
    display_thread = threading.Thread(target=display_task, args=(shared_data,))

    # Start threads
    start_time = time.time()

    preprocess_thread.start()
    train_thread.start()
    infer_thread.start()
    analysis_thread.start()
    display_thread.start()

    # Wait for all threads to complete
    preprocess_thread.join()
    train_thread.join()
    infer_thread.join()
    analysis_thread.join()
    display_thread.join()

    end_time = time.time()

    # Print execution time
    print(f"\nTotal execution time: {end_time - start_time:.2f} seconds")

In [10]:
if __name__ == "__main__":
    # Run the pipeline
    run_pipeline('London_Weather.csv', epochs=200)

Starting preprocessing task...
Waiting for preprocessing to complete...
Waiting for training to complete...
Waiting for inference to complete...
Waiting for analysis to complete...
Data loaded with shape: (15341, 10)
Data cleaned. New shape: (15261, 10)
Data prepared for model -> X_train shape: torch.Size([12208, 20, 9]), y_train shape: torch.Size([12208, 1])
Preprocessing task completed.
Starting training task...
Using device: mps


Exception in thread Thread-6 (train_task):
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/threading.py", line 1073, in _bootstrap_inner
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/threading.py", line 1010, in run
    self._target(*self._args, **self._kwargs)
  File "/var/folders/0b/tynds1f1557gyf7jj3cn07jc0000gn/T/ipykernel_59985/2036808291.py", line 52, in train_task
  File "/var/folders/0b/tynds1f1557gyf7jj3cn07jc0000gn/T/ipykernel_59985/2835125935.py", line 39, in train_model
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  

KeyboardInterrupt: 