In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import warnings

# NOTE(review): with no category/module arguments this filter silences every
# warning for the rest of the kernel session, which can hide deprecation or
# numerical warnings from the libraries imported above. Consider narrowing it
# to the specific category that was noisy.
warnings.filterwarnings('ignore')


In [18]:


class EnergyPredictionModel:
    """Forecast an ``energy`` series from calendar features with a random forest.

    Intended call order: ``load_and_preprocess_data()`` -> ``split_data()`` ->
    ``train_model()`` -> ``predict_for_datetime(...)``.

    Parameters
    ----------
    file_path : str or path-like
        CSV file containing a ``datetime`` column and an ``energy`` column.
    x_values : int, default 5
        Look-back window (in rows) used when a missing ``energy`` value is
        replaced by the mean of the preceding valid values.
    split_ratio : float, default 0.8
        Fraction of the (chronologically ordered) rows assigned to the
        training set. Must lie strictly between 0 and 1.

    Raises
    ------
    ValueError
        If ``split_ratio`` is not strictly between 0 and 1.
    """

    def __init__(self, file_path, x_values=5, split_ratio=0.8):
        if not 0 < split_ratio < 1:
            raise ValueError(f"split_ratio must be in (0, 1), got {split_ratio!r}")

        self.file_path = file_path
        self.x_values = x_values
        self.split_ratio = split_ratio

        # Populated by load_and_preprocess_data().
        self.data = None
        self.split_date_index = None
        self.split_date = None

        # Populated by split_data().
        self.train_data = None
        self.test_data = None

        # Populated by train_model() / predict_for_datetime().
        self.model = None
        self.trend_value = None

        self.FEATURES = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month', 'year']
        self.TARGET = 'energy'

    def load_and_preprocess_data(self):
        """Read the CSV, index it by time, fill missing targets and pick the split date.

        Side effects: sets ``self.data``, ``self.split_date_index`` and
        ``self.split_date``.

        Raises
        ------
        ValueError
            If the CSV contains no rows.
        """
        frame = pd.read_csv(self.file_path)
        frame['datetime'] = pd.to_datetime(frame['datetime'])
        frame = frame.set_index('datetime')

        if len(frame) == 0:
            raise ValueError(f"No rows found in {self.file_path!r}")

        # Both the look-back NaN fill and the positional split below assume
        # chronological order. Already-sorted data is left untouched; the
        # stable sort keeps the relative order of duplicate timestamps.
        if not frame.index.is_monotonic_increasing:
            frame = frame.sort_index(kind='mergesort')

        # Handle NaN values
        frame[self.TARGET] = self.check_nan_and_replace_with_avg(frame[self.TARGET])
        self.data = frame

        # The row at this position marks the first timestamp of the test set.
        self.split_date_index = int(len(self.data) * self.split_ratio)
        self.split_date = self.data.index[self.split_date_index]

        print("NaN values handled. Data loaded successfully.")

    def check_nan_and_replace_with_avg(self, series):
        """Return a copy of ``series`` with NaNs replaced by a trailing mean.

        Each NaN is replaced by the mean of the valid values among the
        ``self.x_values`` rows immediately before it. Values filled earlier in
        the pass count as valid for later gaps. A NaN with no valid value in
        its window (e.g. at the very start) is left as NaN. The input is not
        modified.
        """
        values = series.to_numpy(dtype=float, copy=True)
        missing = np.isnan(values)
        if not missing.any():
            return series.copy()

        window = max(int(self.x_values), 0)
        # Only NaN positions are visited; ascending order lets earlier fills
        # feed later windows.
        for pos in np.flatnonzero(missing):
            # Reverse so the values are averaged nearest-first.
            lookback = values[max(0, pos - window):pos][::-1]
            valid = lookback[~np.isnan(lookback)]
            if valid.size:
                values[pos] = np.mean(valid)

        return pd.Series(values, index=series.index, name=series.name)

    @staticmethod
    def _add_time_features(df):
        """Add the calendar columns derived from ``df``'s DatetimeIndex, in place."""
        df['hour'] = df.index.hour
        df['dayofweek'] = df.index.dayofweek
        df['quarter'] = df.index.quarter
        df['month'] = df.index.month
        df['year'] = df.index.year
        df['dayofyear'] = df.index.dayofyear
        return df

    def create_features(self, df):
        """Return a copy of ``df`` with calendar feature columns added.

        Rows that still contain a NaN in any column (for example a target that
        could not be filled) are dropped. ``df`` must have a DatetimeIndex.
        """
        featured = self._add_time_features(df.copy())
        return featured.dropna()

    def split_data(self):
        """Split ``self.data`` at ``self.split_date`` and build features for both parts.

        Rows strictly before the split date form the training set; the rest
        form the test set.

        Raises
        ------
        RuntimeError
            If ``load_and_preprocess_data()`` has not been called.
        """
        if self.data is None or self.split_date is None:
            raise RuntimeError("No data loaded; call load_and_preprocess_data() first.")

        before_split = self.data.index < self.split_date
        self.train_data = self.create_features(self.data.loc[before_split])
        self.test_data = self.create_features(self.data.loc[~before_split])
        print("Data split into train and test sets.")

    def train_model(self, n_estimators=200, max_depth=15, random_state=42):
        """Fit a RandomForestRegressor on the training features.

        Parameters
        ----------
        n_estimators, max_depth, random_state
            Passed straight to ``RandomForestRegressor``. The defaults
            reproduce the previously hard-coded configuration.

        Raises
        ------
        RuntimeError
            If ``split_data()`` has not been called.
        """
        if self.train_data is None:
            raise RuntimeError("No training data; call split_data() first.")

        X_train = self.train_data[self.FEATURES]
        y_train = self.train_data[self.TARGET]

        self.model = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=random_state,
        )
        self.model.fit(X_train, y_train)
        print("Model trained.")

    def calculate_trend_value(self, dt):
        """Return the linear slope of the training target up to ``dt``.

        The slope comes from an ordinary least-squares fit of the target
        against the row position (0, 1, 2, ...), so it is expressed per row,
        not per unit of time. Returns ``0.0`` when fewer than two training
        rows fall at or before ``dt``.

        Raises
        ------
        RuntimeError
            If ``split_data()`` has not been called.
        """
        if self.train_data is None:
            raise RuntimeError("No training data; call split_data() first.")

        history = self.train_data[self.train_data.index <= pd.to_datetime(dt)]

        # A line needs at least two points.
        if len(history) < 2:
            return 0.0

        steps = np.arange(len(history)).reshape(-1, 1)
        trend_model = LinearRegression()
        trend_model.fit(steps, history[self.TARGET].values)

        # The single coefficient is the slope of the fitted line.
        return trend_model.coef_[0]

    def predict_for_datetime(self, dt):
        """Predict the target for one timestamp and add the training trend slope.

        Parameters
        ----------
        dt : str or datetime-like
            Anything accepted by ``pd.to_datetime``.

        Returns
        -------
        float
            Random-forest prediction plus ``calculate_trend_value(dt)``. The
            slope is also stored on ``self.trend_value``.

        Raises
        ------
        RuntimeError
            If the model has not been trained.
        """
        if self.model is None:
            raise RuntimeError("Model has not been trained; call train_model() first.")

        self.trend_value = self.calculate_trend_value(dt)

        # One-row frame indexed by the target timestamp, with the same
        # calendar features the model was trained on.
        frame = self._add_time_features(pd.DataFrame(index=[pd.to_datetime(dt)]))
        prediction = self.model.predict(frame[self.FEATURES])

        # NOTE(review): the raw per-row slope is added unscaled; confirm that
        # this is the intended trend correction.
        final_prediction = prediction[0] + self.trend_value

        print(f"Predicted energy for {dt} with trend: {final_prediction}")
        return final_prediction

    def calculate_error_percentage(self, y_true, y_pred):
        """Print and return the mean absolute percentage error.

        Values are compared by position, so a Series and an array (or two
        Series with different indexes) are handled the same way. Zero targets
        are excluded because their percentage error is undefined. If every
        target is zero the result is NaN.
        """
        actual, predicted = np.broadcast_arrays(
            np.asarray(y_true, dtype=float),
            np.asarray(y_pred, dtype=float),
        )
        usable = actual != 0

        if usable.any():
            relative_error = np.abs(actual[usable] - predicted[usable]) / np.abs(actual[usable])
            mean_error_percentage = np.mean(relative_error * 100)
        else:
            mean_error_percentage = float('nan')

        print(f"Mean Error Percentage: {mean_error_percentage:.2f}%")
        return mean_error_percentage

    def calculate_accuracy(self, y_true, y_pred):
        """Print and return ``100 - mean error percentage``.

        Calls ``calculate_error_percentage``, so the error line is printed too.
        """
        error_percentage = self.calculate_error_percentage(y_true, y_pred)
        accuracy = 100 - error_percentage
        print(f"Model Accuracy: {accuracy:.2f}%")
        return accuracy

    def calculate_mean(self, values):
        """Print and return the arithmetic mean of ``values``."""
        mean_value = np.mean(values)
        print(f"Mean: {mean_value:.2f}")
        return mean_value

    def calculate_std(self, values):
        """Print and return the population standard deviation of ``values``."""
        std_value = np.std(values)
        print(f"Standard Deviation: {std_value:.2f}")
        return std_value




In [19]:

# Build the model wrapper and run the pipeline end to end.
# The path is relative to the notebook's working directory.
file_path = './processed_train_data.csv'  # Specify your CSV file path
energy_model = EnergyPredictionModel(file_path)
# Order matters: loading sets the split date that split_data() reads, and
# split_data() produces the training frame that train_model() fits on.
energy_model.load_and_preprocess_data()
energy_model.split_data()
energy_model.train_model()



NaN values handled. Data loaded successfully.
Data split into train and test sets.
Model trained.


In [20]:
# Example prediction
# The returned value is the random-forest output plus the linear trend slope
# computed from the training rows at or before this timestamp.
prediction_datetime = '2018-03-01 12:00:00'
prediction = energy_model.predict_for_datetime(prediction_datetime)
# predict_for_datetime() already prints the value, so this line repeats it.
print(f"Energy prediction for {prediction_datetime}: {prediction}")

Predicted energy for 2018-03-01 12:00:00 with trend: 1833.8987580967564
Energy prediction for 2018-03-01 12:00:00: 1833.8987580967564


In [21]:
# Make predictions on the test set
X_test = energy_model.test_data[energy_model.FEATURES]
y_test = energy_model.test_data[energy_model.TARGET]

# Generate predictions
# This is the raw random-forest output; the trend term that
# predict_for_datetime() adds is not applied here.
y_pred = energy_model.model.predict(X_test)
# Calculate mean and standard deviation of actual and predicted values
mean_actual = energy_model.calculate_mean(y_test)
std_actual = energy_model.calculate_std(y_test)
mean_predicted = energy_model.calculate_mean(y_pred)
std_predicted = energy_model.calculate_std(y_pred)

# Calculate and print error percentage and accuracy
# NOTE(review): calculate_accuracy() calls calculate_error_percentage()
# internally, so the "Mean Error Percentage" line is printed twice.
energy_model.calculate_error_percentage(y_test, y_pred)
energy_model.calculate_accuracy(y_test, y_pred)

Mean: 2002.08
Standard Deviation: 349.63
Mean: 1928.81
Standard Deviation: 290.11
Mean Error Percentage: 8.30%
Mean Error Percentage: 8.30%
Model Accuracy: 91.70%


91.69806471747057

In [22]:
# After training the model, persist it in two formats: joblib and plain pickle.
# NOTE: both formats execute arbitrary code when loaded, so only load model
# files that come from a trusted source.
import joblib
import pickle

# Save the trained model as a joblib file
model_filename = 'energy_prediction_model.joblib'
joblib.dump(energy_model.model, model_filename)
print(f"Model saved as {model_filename}")

# Save the same model as a pickle file. The context manager makes sure the
# file handle is flushed and closed even if pickling fails.
model_filename = 'energy_prediction_model.pkl'
with open(model_filename, 'wb') as model_file:
    pickle.dump(energy_model.model, model_file)

print(f"Model saved as {model_filename}")

Model saved as energy_prediction_model.joblib
Model saved as energy_prediction_model.pkl
