# Import the required libraries

In [1]:
import os
import re
import copy
import pickle
import json
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso, LinearRegression, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV, cross_val_score, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from xgboost import XGBRegressor

from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import pacf
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from scipy.signal import periodogram
import statsmodels.api as sm


#conda install -c conda-forge xgboost  




# The DataLoader class loads the CSV files from the specified train folder into individual datasets (DataFrames) and drops the 'anomaly' column if it exists. The datasets are keyed by the number in the file name.

In [2]:

class DataLoader:
    @staticmethod
    def load_data(train_folder):
        """
        Read every CSV file found directly inside ``train_folder`` into a DataFrame.

        Each frame has its 'anomaly' column removed when that column is present, and is
        stored under the first integer that appears in its file name
        (e.g. ``train_9.csv`` -> ``9``).

        Args:
        - train_folder (str): Path to the folder containing the train CSV files.

        Returns:
        - dict: Maps the number extracted from each file name to the corresponding DataFrame.

        Raises:
        - ValueError: If a CSV file name contains no digits.
        """

        def read_frame(csv_path, csv_name):
            # The file is read before the name is inspected, so an unreadable CSV
            # is reported ahead of a badly named one.
            frame = pd.read_csv(csv_path)
            if 'anomaly' in frame.columns:
                frame = frame.drop(columns=['anomaly'])
            digits = re.search(r'\d+', csv_name)
            if digits is None:
                raise ValueError(f"No number found in file name: {csv_name}")
            return int(digits.group()), frame

        # Only entries ending in '.csv' are considered; everything else is ignored.
        csv_names = [name for name in os.listdir(train_folder) if name.endswith('.csv')]

        # When two files yield the same number, the later one replaces the earlier one.
        return dict(
            read_frame(os.path.join(train_folder, name), name)
            for name in csv_names
        )

# Define the Dataset_Processing class

In [None]:

# Per-dataset summary (e.g. the 'num_lags' entry read further down), keyed by the
# dataset number as a string.
json_file_path = 'train_datasets_summary.json'
with open(json_file_path, 'r') as json_file:
    train_dataset_summary = json.loads(json_file.read())

# Quick sanity check: show the summary entry of dataset 505.
print(train_dataset_summary["505"])
class Dataset_Processing():
    def __init__(self, df, key_number, value_column_name):
        """
        Initialize the Dataset_Processing class with the given parameters.

        Parameters:
        df (pd.DataFrame): The DataFrame containing the dataset. It is deep-copied, so the
            caller's frame is never modified.
        key_number (int): Key or index that identifies the dataset.
        value_column_name (str): The column name for the target variable (e.g., 'value').
        """
        self.df = copy.deepcopy(df)  # Work on a private copy of the data
        self.value = value_column_name  # Name of the column containing target values
        self.key_number = key_number  # Key or index to identify the dataset

    def data_processing(self, lags=5, threshold=0.2):
        """
        Run the preprocessing pipeline: extract time features, fill missing target values
        and add the Fourier features.

        Lag features are not created here; the downstream model code builds its own
        lagged windows.

        Parameters:
        lags (int): Currently unused; kept so existing callers keep working.
        threshold (float): Currently unused; kept so existing callers keep working.
        """
        self.extract_time_features()  # Needs the raw 'timestamp' column
        # Filling is done after sorting so that the forward fill follows time order.
        self.fill_missing_values(self.key_number)
        self.extract_fourier_features()  # Needs 'hour', 'dayofweek' and 'month'

    def extract_time_features(self):
        """
        Converts the 'timestamp' column to datetime format, sorts the DataFrame by it,
        extracts the date and time features and finally drops the 'timestamp' column.

        Unparseable timestamps become NaT (errors='coerce').
        """
        self.df['timestamp'] = pd.to_datetime(self.df['timestamp'], errors='coerce')
        self.df = self.df.sort_values(by='timestamp')

        timestamps = self.df['timestamp'].dt
        self.df['year'] = timestamps.year
        self.df['month'] = timestamps.month
        self.df['day'] = timestamps.day
        self.df['dayofweek'] = timestamps.dayofweek  # 0 = Monday, 6 = Sunday
        self.df['hour'] = timestamps.hour
        self.df['minute'] = timestamps.minute

        # The raw timestamp is no longer needed once its components are extracted.
        self.df.drop(columns=['timestamp'], inplace=True)

    def fill_missing_values(self, key):
        """
        Fill missing values in the target column.

        Gaps are forward-filled; values that are still missing afterwards (only possible
        at the start of the series) are set to 0.

        Parameters:
        key (int): Key or index of the dataset. Currently unused; kept for compatibility.
        """
        # Use the configured target column instead of a hard-coded 'value'.
        # The former rolling-mean fallback was dropped: after a forward fill only leading
        # NaNs remain, and a rolling mean over them is NaN as well, so it never filled anything.
        self.df[self.value] = self.df[self.value].ffill().fillna(0)

    def select_lags_pacf(self, lags, threshold):
        """
        Select the number of significant lags based on the PACF (Partial Autocorrelation Function).

        Lags are counted from lag 1 upwards until the first PACF value whose absolute
        value does not exceed the threshold.

        Parameters:
        lags (int): Number of lags to consider for PACF.
        threshold (float): Threshold for absolute PACF values to select significant lags.

        Returns:
        int: The number of significant lags (never negative).
        """
        pacf_values = pacf(self.df[self.value], nlags=lags)

        significant = 0
        for val in pacf_values:
            # Written as "not >" so that a NaN PACF value also stops the count.
            if not abs(val) > threshold:
                break
            significant += 1

        # pacf_values[0] is lag 0, which is not a real lag; clamp so that a very high
        # threshold yields 0 instead of -1.
        return max(significant - 1, 0)

    def extract_fourier_features(self):
        """
        Add sine/cosine encodings of the hour (period 24), the day of week (period 7)
        and the month (period 12).
        """
        hour_angle = 2 * np.pi * self.df['hour'] / 24
        self.df['fourier_hour_sin'] = np.sin(hour_angle)
        self.df['fourier_hour_cos'] = np.cos(hour_angle)

        # The 'day' features encode the day of the week, not the day of the month.
        day_angle = 2 * np.pi * self.df['dayofweek'] / 7
        self.df['fourier_day_sin'] = np.sin(day_angle)
        self.df['fourier_day_cos'] = np.cos(day_angle)

        month_angle = 2 * np.pi * self.df['month'] / 12
        self.df['fourier_month_sin'] = np.sin(month_angle)
        self.df['fourier_month_cos'] = np.cos(month_angle)

    def extract_trend(self, window_size=2):
        """
        Add a 'trend' column holding the moving average of the target column.

        Parameters:
        window_size (int): The window size for the moving average (default is 2).
        """
        # min_periods=1 keeps the first rows defined instead of NaN.
        self.df['trend'] = self.df[self.value].rolling(window=window_size, min_periods=1).mean()



# Define the TimeSeriesPlotter, which is used for visualization purposes.

In [29]:
class TimeSeriesPlotter:
    # Date/time component columns that can be combined into a timestamp.
    _REQUIRED_DATE_PARTS = ('year', 'month', 'day')
    _OPTIONAL_DATE_PARTS = ('hour', 'minute', 'second')

    def __init__(self, df):
        """
        Initializes the TimeSeriesPlotter with the given DataFrame.

        Args:
        - df (pd.DataFrame): The DataFrame containing the time series data. It is
          deep-copied, so plotting never modifies the caller's frame.
        """
        self.df = copy.deepcopy(df)

    @staticmethod
    def _assemble_timestamps(frame):
        """
        Combine the date/time component columns of ``frame`` into a datetime Series.

        'year', 'month' and 'day' are required; 'hour', 'minute' and 'second' are used
        only when present.

        Raises:
        - KeyError: If one of the required columns is missing.
        """
        missing = [part for part in TimeSeriesPlotter._REQUIRED_DATE_PARTS
                   if part not in frame.columns]
        if missing:
            raise KeyError(f"Missing date columns: {missing}")

        parts = list(TimeSeriesPlotter._REQUIRED_DATE_PARTS)
        parts += [part for part in TimeSeriesPlotter._OPTIONAL_DATE_PARTS
                  if part in frame.columns]
        return pd.to_datetime(frame[parts])

    def plot_pacf_graph(self, value_col, alpha=0.05, lags=10):
        """
        Plots the Partial Autocorrelation Function (PACF) for the specified value column.

        Args:
        - value_col (str): The column name for the values.
        - alpha (float): The significance level for the confidence interval (default: 0.05).
        - lags (int): Number of lags to display (default: 10).

        Returns:
        - None: Displays the PACF plot.
        """
        # plot_pacf opens a figure of its own unless it is handed an axis, which used to
        # leave an empty 10x6 figure behind; draw into an explicit axis instead.
        _, ax = plt.subplots(figsize=(10, 6))
        plot_pacf(self.df[value_col], lags=lags, alpha=alpha, ax=ax)
        ax.set_title('Partial Autocorrelation Function (PACF)')
        ax.set_xlabel('Lags')
        ax.set_ylabel('PACF')
        plt.show()

    @staticmethod
    def plot_actual_vs_predicted(y_true, y_pred, title="plot_actual_vs_predicted", save_path=None):
        """Plots and optionally saves the actual vs predicted time series.

        Args:
            y_true (array-like): Actual values.
            y_pred (array-like): Predicted values.
            title (str): The title of the plot.
            save_path (str, optional): If provided, the plot is saved at this location
                instead of being shown. Defaults to None.
        """
        plt.figure(figsize=(10, 6))
        plt.plot(y_true, label='Actual', color='blue')
        plt.plot(y_pred, label='Predicted', linestyle='--', color='orange')
        plt.title(title)
        plt.xlabel('Time')
        plt.ylabel('Value')
        plt.legend()
        plt.grid(True)

        if save_path:
            plt.savefig(save_path)
            print(f"Plot saved to {save_path}")
        else:
            plt.show()

        # Release the figure so that repeated calls do not accumulate open figures.
        plt.close()

    def plot_aggregated_data(self, value_col='value'):
        """
        Sums the values per day of week, per month and per year and plots the three
        aggregates as stacked subplots.

        The DataFrame must already contain 'dayofweek', 'month' and 'year' columns.

        Args:
        - value_col (str): The column name for the values to plot (default: 'value').

        Returns:
        - None: The function displays the plots.
        """
        # Aggregating the data
        weekly_data = self.df.groupby('dayofweek')[value_col].sum().reset_index()
        monthly_data = self.df.groupby('month')[value_col].sum().reset_index()
        yearly_data = self.df.groupby('year')[value_col].sum().reset_index()

        plt.figure(figsize=(15, 8))

        # Plot for day of week
        plt.subplot(3, 1, 1)
        plt.plot(weekly_data['dayofweek'], weekly_data[value_col],
                 label='Weekly Data', color='blue', linestyle='--', marker='o')
        plt.xlabel('Day of Week (0=Mon, 6=Sun)')
        plt.ylabel(f'Sum of {value_col}')
        plt.title('Weekly Aggregated Data')
        plt.xticks(rotation=0)
        plt.legend()

        # Plot for month
        plt.subplot(3, 1, 2)
        plt.plot(monthly_data['month'], monthly_data[value_col],
                 label='Monthly Data', color='green', linestyle='--', marker='o')
        plt.xlabel('Month')
        plt.ylabel(f'Sum of {value_col}')
        plt.title('Monthly Aggregated Data')
        plt.xticks(rotation=0)
        plt.legend()

        # Plot for year
        plt.subplot(3, 1, 3)
        plt.plot(yearly_data['year'], yearly_data[value_col],
                 label='Yearly Data', color='orange', linestyle='--', marker='o')
        plt.xlabel('Year')
        plt.ylabel(f'Sum of {value_col}')
        plt.title('Yearly Aggregated Data')
        # One tick per year so that the axis does not show fractional years.
        plt.xticks(yearly_data['year'], rotation=45)
        plt.legend()

        plt.tight_layout()
        plt.show()

    def plot_time_series(self, title='Time Series Data Plot', xlabel='Time', ylabel='Value',
                         color='blue', rotation=45, value_col='value'):
        """
        Plots a time series from a DataFrame whose timestamp is split into the columns
        year, month, day and, when available, hour, minute and second.

        Args:
        - title (str): The title of the plot (default: 'Time Series Data Plot').
        - xlabel (str): The label for the x-axis (default: 'Time').
        - ylabel (str): The label for the y-axis (default: 'Value').
        - color (str): The color of the plot line (default: 'blue').
        - rotation (int): The rotation angle for the x-axis labels (default: 45).
        - value_col (str): The column holding the values to plot (default: 'value').

        Returns:
        - None: The function displays the plot.

        Raises:
        - KeyError: If 'year', 'month' or 'day' is missing from the DataFrame.
        """
        # 'second' (and the other time-of-day parts) are optional: requiring them made the
        # method fail on frames that only carry year..minute columns.
        self.df['timestamp'] = self._assemble_timestamps(self.df)

        plt.figure(figsize=(10, 6))
        plt.plot(self.df['timestamp'], self.df[value_col], label='Value over Time', color=color)

        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.title(title)
        plt.legend()

        # Rotate the x-axis labels for better readability
        plt.xticks(rotation=rotation)
        plt.tight_layout()
        plt.show()


# Function time_series_train_val_split: splits a dataset chronologically (not randomly) into train and validation parts

In [31]:
def time_series_train_val_split(dataset, train_size=0.8):
    """
    Cut a time-ordered dataset into a leading training part and a trailing validation part.

    No shuffling takes place: the first ``int(len(dataset) * train_size)`` rows form the
    training set and the remaining rows form the validation set.

    Parameters:
    dataset (DataFrame): The time series dataset, already in temporal order.
    train_size (float): Proportion of the dataset to include in the train split (between 0 and 1).

    Returns:
    train (DataFrame): Training dataset.
    val (DataFrame): Validation dataset.
    """
    # Number of leading rows that go to the training part (rounded down).
    cutoff = int(len(dataset) * train_size)
    return dataset.iloc[:cutoff], dataset.iloc[cutoff:]

# Define Evaluator class which is used to evaluate the model

In [33]:
class Evaluator:
    @staticmethod
    def _build_lagged_arrays(X, y, lag):
        """
        Turn a feature table and its targets into flattened sliding windows.

        Sample ``i`` consists of rows ``i .. i + lag - 1`` of ``X`` (flattened row by row)
        and is paired with the target at position ``i + lag``. Rows are addressed purely
        by position, so the index labels of ``X`` and ``y`` are irrelevant.

        Returns:
            tuple: ``(features, targets)`` with shapes ``(n - lag, lag * n_features)``
            and ``(n - lag,)``.

        Raises:
            ValueError: If ``lag`` is smaller than 1, if ``X`` and ``y`` differ in length,
                or if there are not more rows than ``lag``.
        """
        features = np.asarray(X)
        targets = np.asarray(y)
        if features.ndim == 1:
            features = features.reshape(-1, 1)

        if lag < 1:
            raise ValueError(f"lag must be at least 1, got {lag}")
        if len(features) != len(targets):
            raise ValueError(
                f"X and y must have the same length, got {len(features)} and {len(targets)}")
        n_windows = len(features) - lag
        if n_windows < 1:
            raise ValueError(
                f"Need more than {lag} rows to build lagged samples, got {len(features)}")

        windows = np.stack([features[start:start + lag].reshape(-1)
                            for start in range(n_windows)])
        return windows, targets[lag:]

    @staticmethod
    def evaluate_model_on_data(model_filename, X, y,lag=1):
        """Re-fits a saved model on lagged features of the given data and reports its RMSE.

        The model is loaded from ``model_filename``, fitted on the lagged windows built
        from ``X``/``y`` and then scored on those same windows, so the returned value is
        an in-sample (training) error.

        Args:
            model_filename (str): Pickle file holding the model to evaluate.
            X (DataFrame or array-like): Feature rows in temporal order.
            y (Series or array-like): Target values aligned row by row with ``X``.
            lag (int): Number of consecutive rows that form one sample (default: 1).

        Returns:
            tuple: ``(train_rmse, train_predictions)``; the predictions belong to the
            targets from position ``lag`` onwards.

        Raises:
            ValueError: If the data is too short for the requested lag.
        """
        # NOTE: the model is unpickled from disk; only load files from a trusted source.
        best_model = ML_Model_Loader.load_model(model_filename)

        # Positional windows: the previous label-based ``y[i + lag]`` lookup picked the
        # wrong rows (or failed) whenever the index was not 0..n-1.
        X_lagged, y_lagged = Evaluator._build_lagged_arrays(X, y, lag)

        # Fit the model on the lagged training data
        best_model.fit(X_lagged, y_lagged)

        # Make predictions on the lagged training data
        train_predictions = best_model.predict(X_lagged)

        # Evaluate the model's performance on the training data
        train_mse = mean_squared_error(y_lagged, train_predictions)
        train_rmse = np.sqrt(train_mse)
        print(f"Training RMSE: {train_rmse}")

        return train_rmse, train_predictions


# Define class MLModelGetter to get the best model with the best parameters

In [35]:
class MLModelGetter:
    def __init__(self, train_datasets_split, val_datasets_split, models):
        """
        Initializes the MLModelGetter with training and validation datasets, as well as models and their parameter grids.
        
        Args:
            train_datasets_split (dict): Dictionary of training datasets.
            val_datasets_split (dict): Dictionary of validation datasets.
            models (dict): Dictionary of models and corresponding hyperparameter grids.
        """
        self.train_datasets_split = copy.deepcopy(train_datasets_split)
        self.val_datasets_split = copy.deepcopy(val_datasets_split)
        self.models = models
        self.best_models_info = {}
        self.X_train_dict = {}
        self.y_train_dict = {}
        self.X_val_dict = {}
        self.y_val_dict = {}

    def prepare_data(self):
        """
        Prepares the features (X) and target values (y) for both training and validation datasets by splitting the data.
        """
        for key, train in self.train_datasets_split.items():
            val = self.val_datasets_split[key]
            self.y_train_dict[key] = train['value']
            self.X_train_dict[key] = train
            self.y_val_dict[key] = val['value']
            self.X_val_dict[key] = val
            


    def get_best_models(self):
        """
        Finds and returns the best models for each dataset by performing grid search with cross-validation.
        Chooses the best model based on the lowest validation MSE (val_mse).
        Saves the best models and their information.
    
        Returns:
            dict: A dictionary containing information about the best models for each dataset.
        """
        self.prepare_data()
    
        for key in self.train_datasets_split.keys():
            X_train = self.X_train_dict[key]
            X_train = X_train.reset_index(drop=True)

            y_train = self.y_train_dict[key]
            y_train = y_train.reset_index(drop=True)

            X_val = self.X_val_dict[key]
            X_val = X_val.reset_index(drop=True)
            
            y_val = self.y_val_dict[key]
            y_val = y_val.reset_index(drop=True)

            lag=train_dataset_summary[str(key)]['num_lags']

            best_params = {}
            best_scores = {}
            best_val_mse = float('inf')  # Initialize with a high value to find the minimum
            best_model_info = None  # To store the best model's information

            # Loop through each model and perform GridSearchCV
            for model_name, (model, param_grid) in self.models.items():
                grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
                # Create lagged features for X_train
                X_train_legged, y_train_legged = [], []
                for i in range(0, len(X_train) - lag):
                    if (i + lag) < len(y_val):
                        X_train_legged.append(X_train[i:i + lag]) 
                        y_train_legged.append(y_train[i + lag])
            
                X_train_legged = np.array(X_train_legged)
                y_train_legged = np.array(y_train_legged)
                n_samples, n_lags, n_features = X_train_legged.shape
                X_train_legged = X_train_legged.reshape(n_samples, n_lags * n_features)  # Shape (115, 32)

                y_train_legged = np.array(y_train_legged)  # Shape (115,)            
                # Fit GridSearchCV with the lagged training data
                grid_search.fit(X_train_legged, y_train_legged)
                # Get the best model from GridSearchCV
                best_model = grid_search.best_estimator_

                # Create lagged features for X_val
                X_val_legged, y_val_legged = [], []
                for i in range(0, len(X_val) - lag):
                    if (i + lag) < len(y_val):
                        X_val_legged.append(X_val[i:i + lag])  
                        y_val_legged.append(y_val[i + lag])
            
                X_val_legged = np.array(X_val_legged)
                y_val_legged = np.array(y_val_legged)
                n_samples, n_lags, n_features = X_val_legged.shape
                X_val_legged = X_val_legged.reshape(n_samples, n_lags * n_features)  # Shape (115, 32)
                y_val_legged = np.array(y_train_legged)  # Shape (115,)

                # Predict on the validation set and calculate MSE
                y_val_pred = best_model.predict(X_val_legged)
                val_mse = mean_squared_error(y_val_legged, y_val_pred)
                y_val_pred = best_model.predict(X_val_legged)
                val_mse = mean_squared_error(y_val_legged, y_val_pred)

                # Get the best parameters and score (cross-validated training score)
                best_params[model_name] = grid_search.best_params_
                best_scores[model_name] = -grid_search.best_score_  # Negate to get the positive MSE

                # If current model has a lower validation MSE, update the best model info
                if val_mse < best_val_mse:
                    best_val_mse = val_mse
                    best_model_info = {
                        'Best Model': model_name,
                        'Best Parameters': grid_search.best_params_,
                        'Best Validation MSE': val_mse,
                        'Model Instance': best_model  # Save the best model instance
                        }

            # Store the best model info for the current dataset
            if best_model_info:
                self.best_models_info[key] = best_model_info

                # Print the best model details for the current dataset
                print(f"Best Model for dataset {key}: {best_model_info['Best Model']}")
                print(f"Best Parameters for dataset {key}: {best_model_info['Best Parameters']}")
                print(f"Best Validation MSE for dataset {key}: {best_model_info['Best Validation MSE']}\n")

                # Save the best model to a pickle file
                self.save_best_model(best_model_info['Best Model'], key, best_model_info['Model Instance'])

        return self.best_models_info


    def save_best_model(self, model_name, dataset_index, model_instance):
        """
        Saves the best model to a pickle file for later use.
        
        Args:
            model_name (str): The name of the best model.
            dataset_index (str/int): The index or key of the dataset.
            model_instance: The instance of the best model to be saved.
        """
        # Create a filename based on the dataset index
        filename = f"{dataset_index}_model.pkl"
        # Save the model to a pickle file
        with open(filename, 'wb') as file:
            pickle.dump(model_instance, file)
        print(f"Saved best model for dataset {dataset_index} as {filename}")


# Define ML_Model_Loader, which saves models to and loads models from pickle files

In [37]:
class ML_Model_Loader:
    @staticmethod
    def save_model(model, filename):
        """Serializes a model with pickle and writes it to a file.

        Args:
            model: The model object to save.
            filename (str): The name of the file to save the model to. An existing file
                is overwritten.
        """
        with open(filename, 'wb') as sink:
            sink.write(pickle.dumps(model))

    @staticmethod
    def load_model(filename):
        """Reads a pickled model back from a file.

        Args:
            filename (str): The name of the file to load the model from.

        Returns:
            The loaded model.
        """
        # Unpickling can execute arbitrary code, so only open files you trust.
        with open(filename, 'rb') as source:
            return pickle.loads(source.read())

# Read the datasets

In [39]:
# Folder (relative to the working directory) that holds the training CSV files.
train_folder = "Train"
# Maps each dataset number to its DataFrame.
train_datasets = DataLoader.load_data(train_folder=train_folder)

# Preprocess all train datasets

In [41]:
# Replace every raw frame by its preprocessed version; only existing keys are
# reassigned, so the dictionary can be updated while it is being iterated.
for key in train_datasets:
    data_processor = Dataset_Processing(
        df=train_datasets[key],
        key_number=key,
        value_column_name='value',
    )
    data_processor.data_processing()
    data_processor.extract_trend()
    # Keep the processed copy held by the processor.
    train_datasets[key] = data_processor.df

# Preview of one processed dataset (rendered as the cell output in the notebook).
train_datasets[102].head()



Unnamed: 0,value,year,month,day,dayofweek,hour,minute,fourier_hour_sin,fourier_hour_cos,fourier_day_sin,fourier_day_cos,fourier_month_sin,fourier_month_cos,trend
0,0.450116,2021,7,1,3,0,0,0.0,1.0,0.433884,-0.900969,-0.5,-0.866025,0.450116
1,-0.39345,2021,7,2,4,0,0,0.0,1.0,-0.433884,-0.900969,-0.5,-0.866025,0.028333
2,-0.853038,2021,7,3,5,0,0,0.0,1.0,-0.974928,-0.222521,-0.5,-0.866025,-0.623244
3,-0.865014,2021,7,4,6,0,0,0.0,1.0,-0.781831,0.62349,-0.5,-0.866025,-0.859026
4,0.014318,2021,7,5,0,0,0,0.0,1.0,0.0,1.0,-0.5,-0.866025,-0.425348


# Split each dataset into training and validation parts

In [43]:
# Per-dataset containers, all keyed by the dataset number.
train_datasets_split, val_datasets_split = {}, {}
X_train_dict, y_train_dict = {}, {}
X_val_dict, y_val_dict = {}, {}

# Chronological 80/20 split of every dataset.
for key, dataset in train_datasets.items():
    train, val = time_series_train_val_split(dataset, train_size=0.8)
    train_datasets_split[key], val_datasets_split[key] = train, val

    # The feature frames are the full frames, i.e. they still contain 'value'.
    X_train_dict[key], y_train_dict[key] = train, train['value']
    X_val_dict[key], y_val_dict[key] = val, val['value']



# Get the best model for every dataset

In [22]:
# Example usage for evaluating all datasets and saving plots using TimeSeriesPlotter
for dataset_index in train_datasets_split.keys():
    # Define file names based on the dataset index
    model_filename = f"{dataset_index}_model.pkl"  # Use the index for each saved model file
    
    # Get the corresponding training and test datasets
    X_train = X_train_dict[dataset_index]
    y_train = y_train_dict[dataset_index]
    # Evaluate the model using the Evaluator class
    train_rmse, train_predictions=  Evaluator.evaluate_model_on_data(model_filename, X_train, y_train,train_dataset_summary[str(dataset_index)]['num_lags'])

    # Print the RMSE results for each dataset
    print(f"Dataset {dataset_index} - Training RMSE: {train_rmse}")

    # Define file paths for saving the plots
    training_plot_path = f'plot_Training_{dataset_index}.png'

    # Use TimeSeriesPlotter to plot and save the results for training data
    TimeSeriesPlotter.plot_actual_vs_predicted(y_train, train_predictions, f'Training: Actual vs Predicted for Dataset {dataset_index}', training_plot_path)


Training RMSE: 0.0029834176568009525
Dataset 102 - Training RMSE: 0.0029834176568009525
Plot saved to plot_Training_102.png
Training RMSE: 0.1025776667299522
Dataset 103 - Training RMSE: 0.1025776667299522
Plot saved to plot_Training_103.png
Training RMSE: 0.10720128616086828
Dataset 105 - Training RMSE: 0.10720128616086828
Plot saved to plot_Training_105.png
Training RMSE: 0.12068056841439087
Dataset 110 - Training RMSE: 0.12068056841439087
Plot saved to plot_Training_110.png
Training RMSE: 0.08550447588049973
Dataset 115 - Training RMSE: 0.08550447588049973
Plot saved to plot_Training_115.png
Training RMSE: 0.19615331278106804
Dataset 118 - Training RMSE: 0.19615331278106804
Plot saved to plot_Training_118.png
Training RMSE: 0.1622961268123497
Dataset 119 - Training RMSE: 0.1622961268123497
Plot saved to plot_Training_119.png
Training RMSE: 0.20686029890211788
Dataset 12 - Training RMSE: 0.20686029890211788
Plot saved to plot_Training_12.png
Training RMSE: 0.21521638666740234
Dataset

# Hybrid Model
## Trying a hybrid model: one model extracts the trend and another extracts the seasonality


In [72]:

# Hybrid model: fit and predict combine a trend model (model_1) with a seasonality model (model_2)
class BoostedHybrid:
    def __init__(self, model_1, model_2):
        """
        Class BoostedHybrid:
        Parameters:
        model_1: model used to extract the trend. A PolynomialFeatures instance is also
                 accepted; it is then paired with a LinearRegression when fitting.
        model_2: model used to extract the seasonality (fitted on the trend residuals).
        """
        self.model_1 = model_1
        self.model_2 = model_2
        self.poly = False  # True once a polynomial trend expansion is in use
        self.poly_model = None  # The PolynomialFeatures transformer, if any

    def _trend_features(self, X):
        """Return the linear time-trend feature built from the index of X."""
        # NOTE(review): the process is rebuilt from X.index on every call, so the trend
        # feature is numbered relative to whatever X is passed in - confirm that this is
        # intended for data that follows the training period.
        dp = DeterministicProcess(index=X.index, order=1)
        return dp.in_sample()

    def predict(self, X):
        """
        Method predict: used to make predictions.
        parameters:
        inputs:
        X: the features for both models (model_1 only uses the index of X)
        outputs:
        y_pred: the trend prediction plus the seasonal prediction
        """
        # X_1: Features for model_1 (Trend)
        X_1 = self._trend_features(X)
        if self.poly:  # Apply the polynomial expansion fitted in fit()
            X_1 = self.poly_model.transform(X_1)
        # X_2: Features for model_2 (Seasonality); copied so model_2 cannot alter X
        X_2 = copy.deepcopy(X)

        # Predict trend and add seasonal predictions
        return self.model_1.predict(X_1) + self.model_2.predict(X_2)

    def fit(self, X, y):
        """
        Method fit: fit the given data to train the model.
        parameters:
        inputs:
        X: the features for both model_1 and model_2
        y: the output of the train data
        outputs:
        the fitted hybrid model itself
        """
        # X_1: Features for model_1 (Trend)
        X_1 = self._trend_features(X)
        # X_2: Features for model_2 (Seasonality); copied so model_2 cannot alter X
        X_2 = copy.deepcopy(X)

        # A PolynomialFeatures "trend model" is split into the transformer and a
        # LinearRegression that is fitted on the expanded features.
        if isinstance(self.model_1, PolynomialFeatures):
            self.poly = True
            self.poly_model = copy.deepcopy(self.model_1)
            self.model_1 = LinearRegression()
        # Keyed on self.poly rather than on the type of model_1, so that fitting a second
        # time expands the features again instead of training on the raw trend while
        # predict() still expands it.
        if self.poly:
            X_1 = self.poly_model.fit_transform(X_1)

        # Fit the trend model
        self.model_1.fit(X_1, y)

        # Get fitted values and calculate residuals
        y_fit = self.model_1.predict(X_1)
        y_resid = y - y_fit

        # Fit the seasonal model on residuals
        self.model_2.fit(X_2, y_resid)
        return self

    def save_model(self, file_path):
        """
        method: save_model: used for saving the model in a given path
        parameters:
        file_path: the path to save the model to.
        """
        with open(file_path, 'wb') as f:
            # Save models and their state
            pickle.dump({
                'model_1': self.model_1,
                'model_2': self.model_2,
                'poly': self.poly,
                'poly_model': self.poly_model
            }, f)

    def get_params(self, deep=True):
        """
        method:
        get_params: used to get the parameters of the model
        inputs:
        deep: accepted for compatibility; it does not change the result.
        outputs:
        return the hybrid model parameters (the two wrapped models).
        """
        return {
            'model_1': self.model_1,
            'model_2': self.model_2
        }

    def _seasonal_accepts(self, name):
        """Tell whether the seasonality model exposes a parameter called ``name``."""
        getter = getattr(self.model_2, 'get_params', None)
        return callable(getter) and name in getter()

    def set_params(self, **params):
        """
        method:
        set_params: used to set the parameters of the model
        inputs:
         **params: the hybrid model parameters to set:
            - 'model_1' / 'model_2' replace the wrapped models;
            - 'model_1__<name>' / 'model_2__<name>' are passed to that model's set_params;
            - a plain name that the seasonality model accepts (e.g. 'max_depth') is
              passed to model_2, so a parameter grid written for the seasonality model
              actually takes effect;
            - any other name is stored as an attribute of the hybrid, as before.
        outputs:
        the hybrid model itself
        """
        # Replace the wrapped models first so that forwarded values reach the new ones.
        for name in ('model_1', 'model_2'):
            if name in params:
                setattr(self, name, params[name])

        for param, value in params.items():
            if param in ('model_1', 'model_2'):
                continue
            owner, separator, sub_param = param.partition('__')
            if separator and owner in ('model_1', 'model_2'):
                getattr(self, owner).set_params(**{sub_param: value})
            elif self._seasonal_accepts(param):
                self.model_2.set_params(**{param: value})
            else:
                setattr(self, param, value)
        return self


#  Define Model_Getter to get the best hybrid model

In [None]:
class Model_Getter:
    """
    Searches combinations of trend and seasonality models for the best
    BoostedHybrid on a single dataset.

    Attributes:
        train_dataset (DataFrame): Private deep copy of the dataset passed in.
        y_column (str): Name of the target column.
        X (DataFrame): Feature columns (everything except ``y_column``).
        y (Series): Target column.
        best_model: Best fitted hybrid found so far, or None.
        best_score (float): Validation MSE of ``best_model`` (inf until set).
        best_params (dict): Per-combination best seasonality hyperparameters,
            keyed by '<TrendClass>+<seasonality name>'.
        best_scores (dict): Per-combination best cross-validated MSE, same keys.
    """

    def __init__(self, train_dataset, y_column='value'):
        """
        Initializes the Model_Getter class with the training dataset and target column.

        Parameters:
        train_dataset (DataFrame): The dataset used for training the models.
        y_column (str): The name of the target variable column. Default is 'value'.
        """
        self.train_dataset = copy.deepcopy(train_dataset)
        self.y_column = y_column
        self.y = self.train_dataset[y_column]
        # Drop the configured target column rather than a hard-coded 'value',
        # so a custom y_column does not leak the target into the features.
        self.X = self.train_dataset.drop(columns=y_column)
        self.best_model = None
        self.best_score = float('inf')
        self.best_params = {}
        self.best_scores = {}

    def define_models(self):
        """Defines trend and seasonality models."""
        # Candidate trend models. A PolynomialFeatures instance is a marker
        # that BoostedHybrid.fit expands into polynomial features followed by
        # a LinearRegression.
        self.trend_models = [
            LinearRegression(),  # Simple linear regression
            Ridge(alpha=1.0),
            PolynomialFeatures(degree=2),
        ]

        # Candidate seasonality models with the hyperparameter grid of each.
        self.seasonality_models = {
            'XGBRegressor': (XGBRegressor(), {'max_depth': [3, 5], 'n_estimators': [150, 200]}),
            'DecisionTreeRegressor': (DecisionTreeRegressor(), {'max_depth': [3, 5]}),
        }

    def evaluate_models(self, train_size=0.8):
        """
        Evaluates all combinations of trend and seasonality models using GridSearchCV.

        The instance's own data (``self.X`` / ``self.y``) is split in row
        order: the first ``train_size`` fraction is used for the grid search
        and the remainder is held out to compare the winning candidates.
        The best hybrid (lowest hold-out MSE) is stored in ``self.best_model``
        and its MSE in ``self.best_score``. Per-combination results are kept
        in ``self.best_params`` and ``self.best_scores``.

        Parameters:
        train_size (float): Fraction of rows (taken from the start) used for
            training; must be strictly between 0 and 1. Default is 0.8.

        Raises:
        ValueError: If ``train_size`` is out of range or the split leaves the
            training or validation part empty.
        """
        if not 0 < train_size < 1:
            raise ValueError("train_size must be strictly between 0 and 1.")

        # Function-scope import: ParameterGrid is not among the file-level imports.
        from sklearn.model_selection import ParameterGrid

        self.define_models()

        # Chronological split of this instance's data; no notebook globals.
        split_at = int(len(self.X) * train_size)
        X_train, X_val = self.X.iloc[:split_at], self.X.iloc[split_at:]
        y_train, y_val = self.y.iloc[:split_at], self.y.iloc[split_at:]
        if len(X_train) == 0 or len(X_val) == 0:
            raise ValueError("Dataset is too small to split into train and validation parts.")

        tscv = TimeSeriesSplit(n_splits=5)  # Define the TimeSeriesSplit
        self.best_params = {}
        self.best_scores = {}

        for trend_model in self.trend_models:
            trend_name = type(trend_model).__name__
            for model_name, (seasonality_model, param_grid) in self.seasonality_models.items():
                # BoostedHybrid.set_params only assigns attributes on the
                # hybrid itself, so keys such as 'max_depth' would never reach
                # model_2. Search over fully configured model_2 candidates
                # instead, one per point of the hyperparameter grid.
                grid_points = list(ParameterGrid(param_grid))
                candidates = [
                    copy.deepcopy(seasonality_model).set_params(**point)
                    for point in grid_points
                ]
                model = BoostedHybrid(model_1=trend_model, model_2=seasonality_model)

                grid_search = GridSearchCV(
                    estimator=model,
                    param_grid={'model_2': candidates},
                    scoring='neg_mean_squared_error',
                    cv=tscv,
                )
                grid_search.fit(X_train, y_train)

                # Key by trend AND seasonality so results of different trend
                # models do not overwrite each other. With a single grid key
                # the candidate order equals the order of grid_points.
                combo_key = f'{trend_name}+{model_name}'
                self.best_params[combo_key] = grid_points[grid_search.best_index_]
                self.best_scores[combo_key] = -grid_search.best_score_  # Negate to get the positive MSE

                # Compare the refitted winner of this combination on the hold-out part.
                best_model = grid_search.best_estimator_
                y_val_pred = best_model.predict(X_val)
                val_mse = mean_squared_error(y_val, y_val_pred)
                if val_mse < self.best_score:
                    self.best_score = val_mse
                    self.best_model = best_model

    def save_best_model(self, model_path):
        """
        Saves the best found model.

        Parameters:
        model_path (str): Path handed to the best model's ``save_model``.
            If no model has been selected yet, a message is printed instead.
        """
        # Explicit None check: an estimator's truthiness is not a reliable
        # "has been set" signal.
        if self.best_model is not None:
            self.best_model.save_model(model_path)
        else:
            print("No model has been evaluated yet.")



# Define Hybrid_Model_Loader, which is used to save and load the hybrid model

In [None]:
class Hybrid_Model_Loader:
    """Static helpers that pickle a BoostedHybrid's state and restore it."""

    @staticmethod
    def save_model(boosted_hybrid, file_path):
        """
        Write the state of a BoostedHybrid model to a file.

        The file holds one pickled dictionary with the keys 'model_1',
        'model_2', 'poly' and 'poly_model', read from the attributes of the
        same name on ``boosted_hybrid``.

        Parameters:
            boosted_hybrid (BoostedHybrid): The model whose state is saved.
            file_path (str): Destination path, opened in binary write mode.
        """
        state_keys = ('model_1', 'model_2', 'poly', 'poly_model')
        with open(file_path, 'wb') as sink:
            state = {key: getattr(boosted_hybrid, key) for key in state_keys}
            pickle.dump(state, sink)

    @staticmethod
    def load_model(file_path):
        """
        Rebuild a BoostedHybrid from a file written by ``save_model``.

        Parameters:
            file_path (str): Path of the pickled state dictionary.

        Returns:
            BoostedHybrid: A new instance built from the stored 'model_1' and
            'model_2', with ``poly`` and ``poly_model`` restored afterwards.
        """
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(file_path, 'rb') as source:
            state = pickle.load(source)

        # Read all four entries before constructing the model.
        model_1, model_2, poly, poly_model = (
            state[key] for key in ('model_1', 'model_2', 'poly', 'poly_model')
        )
        restored = BoostedHybrid(model_1, model_2)
        restored.poly = poly
        restored.poly_model = poly_model
        return restored

# Load the datasets and do the preprocessing
## Load the train datasets and preprocess each one

In [None]:
train_folder = "Train"
train_datasets = DataLoader.load_data(train_folder)

# Run each loaded dataset through Dataset_Processing.data_processing() and
# replace it in place with the processed DataFrame (data_processor.df).
for key, raw_frame in train_datasets.items():
    data_processor = Dataset_Processing(raw_frame, key, 'value')
    data_processor.data_processing()
    train_datasets[key] = data_processor.df

# Example usage
## Get the best model for dataset 9

In [None]:
# Dataset to work on, split in time order into train and validation parts.
data_key = 9
train, val = time_series_train_val_split(train_datasets[data_key], train_size=0.8)

X_train = train.drop(columns='value')
y_train = train['value']

# Validation features/target must come from the held-out `val` part.
# Building them from `train` made the validation score a training score.
X_val = val.drop(columns='value')
y_val = val['value']

In [None]:
# Build the model search for the dataset selected by data_key
model_getter = Model_Getter(train_datasets[data_key])

# Try every trend/seasonality combination and keep the best hybrid
model_getter.evaluate_models()

# Persist the best model to disk
best_model_path = 'boosted_hybrid_model.pkl'
model_getter.save_best_model(best_model_path)


In [None]:
# Restore the persisted hybrid model from disk
loaded_model = Hybrid_Model_Loader.load_model('boosted_hybrid_model.pkl')

# Predict over the whole dataset selected by data_key
true_y = train_datasets[data_key]['value']
pred_y = loaded_model.predict(train_datasets[data_key].drop(columns='value'))

# Plot only the first 200 points so the curves stay readable
TimeSeriesPlotter.plot_actual_vs_predicted(true_y[:200], pred_y[:200])

# Error metrics over the full series
mse = mean_squared_error(true_y, pred_y)
rmse = np.sqrt(mse)
mae = mean_absolute_error(true_y, pred_y)
r2 = r2_score(true_y, pred_y)

print(f'MAE: {mae}, MSE: {mse}, RMSE: {rmse}, R²: {r2}')