In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import warnings

warnings.filterwarnings('ignore')


In [2]:
# --- Tunable settings --------------------------------------------------------
file_path = "./processed_train_data.csv"
x_values = 10      # look-back window (in rows) handed to the NaN-filling helper
split_ratio = 0.8  # leading fraction of rows that ends up on the training side

# Columns the model is fitted on, and the column it learns to predict.
FEATURES = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month', 'year']
TARGET = 'energy'

# Placeholders; they are rebound once the pipeline cell has been executed.
train_data = test_data = model = None

# Read the file once (first CSV column used as the index) to find the index
# label that sits at the train/test boundary.
data = pd.read_csv(file_path, index_col=0)
split_date_index = int(split_ratio * len(data))
split_date = data.index[split_date_index]

In [3]:
def check_nan_and_replace_with_avg(series, x_values):
    """Fill missing entries with the mean of the valid values just before them.

    For each missing position, the ``x_values`` positions immediately
    preceding it are inspected and the mean of the non-missing values found
    there is written in its place.  Fills are applied front to back, so a
    value filled earlier counts as valid for the positions that follow it.
    A missing entry with no valid value inside its look-back window (for
    example at the very start of the series) is left missing.

    Args:
        series: pandas Series of numeric values; it is not modified.
        x_values: number of preceding positions to look at.  Zero or a
            negative number means nothing is filled.

    Returns:
        A copy of ``series`` with the fillable missing entries replaced.
    """
    filled = series.copy()

    # pd.isna recognises NaN, None and pd.NA, so object and nullable dtypes
    # work too; only the missing positions are visited, not every row.
    missing_positions = np.flatnonzero(pd.isna(filled).to_numpy())

    for pos in missing_positions:
        # Slice of the (already partially filled) series right before `pos`.
        window = filled.iloc[max(0, pos - x_values):pos]
        valid = window.dropna()
        if not valid.empty:
            filled.iloc[pos] = float(np.mean(valid.to_numpy(dtype=float)))
    return filled

In [4]:
def load_and_preprocess_data(file_path,x_values):
    """Load the CSV, index it by timestamp and fill gaps in the energy column.

    The file must provide a ``datetime`` column (parsed with
    ``pd.to_datetime`` and used as the index) and an ``energy`` column.
    Missing ``energy`` values are filled by
    ``check_nan_and_replace_with_avg``.  No train/test split happens here;
    that is the job of ``split_data``.

    Args:
        file_path: path of the CSV file to read.
        x_values: look-back window forwarded to
            ``check_nan_and_replace_with_avg``.

    Returns:
        The loaded DataFrame, indexed by ``datetime``.
    """
    data = pd.read_csv(file_path)
    data['datetime'] = pd.to_datetime(data['datetime'])
    data = data.set_index('datetime')

    # Handle NaN values
    data['energy'] = check_nan_and_replace_with_avg(data['energy'], x_values)

    print("NaN values handled. Data loaded successfully.")
    return data

In [5]:
def create_features(df):
    """Return a copy of ``df`` extended with calendar columns read from its index.

    The columns ``hour``, ``dayofweek``, ``quarter``, ``month``, ``year`` and
    ``dayofyear`` are taken from the index, which therefore has to expose
    those datetime attributes.  Any row containing a NaN in any column is
    then dropped.  The frame passed in is left untouched.
    """
    stamps = df.index
    enriched = df.assign(
        hour=stamps.hour,
        dayofweek=stamps.dayofweek,
        quarter=stamps.quarter,
        month=stamps.month,
        year=stamps.year,
        dayofyear=stamps.dayofyear,
    )
    # Rows holding a NaN anywhere are discarded.
    return enriched.dropna()

In [6]:

def split_data(data,split_date):
    """Partition ``data`` at ``split_date`` and add calendar features to each part.

    Rows whose index is strictly before ``split_date`` form the training
    part; rows at or after it form the test part.  Each part is passed
    through ``create_features`` on its own.

    Returns:
        Tuple ``(train_data, test_data)``.
    """
    before_cutoff = data.index < split_date
    from_cutoff = data.index >= split_date

    # Each partition gets its feature columns independently of the other.
    train_part = create_features(data.loc[before_cutoff])
    test_part = create_features(data.loc[from_cutoff])

    print("Data split into train and test sets.")
    return train_part, test_part

In [7]:
def calculate_trend_value(dt):
    """Return the slope of a straight line fitted to training energy up to ``dt``.

    Reads the module-level ``train_data`` frame, keeps the rows whose index
    is at or before ``dt`` and regresses their ``energy`` values on the row
    position (0, 1, 2, ...).  The slope is therefore expressed per row.
    Returns ``0.0`` when fewer than two rows qualify.
    """
    cutoff = pd.to_datetime(dt)
    history = train_data[train_data.index <= cutoff]

    # A line cannot be fitted through fewer than two observations.
    if len(history) < 2:
        return 0.0

    # Row position serves as the single regressor.
    steps = np.arange(len(history)).reshape(-1, 1)
    energy = history['energy'].values

    fitted_line = LinearRegression().fit(steps, energy)
    return fitted_line.coef_[0]

In [8]:
def predict_for_datetime(dt,model):
    """Predict energy for one timestamp and add the historical trend slope.

    Args:
        dt: timestamp (anything ``pd.to_datetime`` accepts) to predict for.
        model: fitted estimator whose ``predict`` accepts the ``FEATURES``
            columns.

    Returns:
        The model's prediction for ``dt`` plus the value returned by
        ``calculate_trend_value(dt)``.  The result is also printed.
    """
    # Slope of the training energy series up to the requested timestamp.
    trend_value = calculate_trend_value(dt)

    # One-row frame indexed by the timestamp.  Going through create_features
    # keeps the inference features identical to those used for training.
    features = create_features(pd.DataFrame(index=[pd.to_datetime(dt)]))

    # Predict using the trained model
    prediction = model.predict(features[FEATURES])

    # The raw slope is added as-is; no scaling is applied to it.
    final_prediction = prediction[0] + trend_value

    print(f"Predicted energy for {dt} with trend: {final_prediction}")
    return final_prediction

In [9]:
def train_model():
    """Fit a random forest on the module-level ``train_data`` frame.

    The ``FEATURES`` columns are the inputs and the ``TARGET`` column is
    the value to predict.

    Returns:
        Tuple ``(model, X_train, y_train)``: the fitted regressor together
        with the inputs and targets it was fitted on.
    """
    features_frame = train_data[FEATURES]
    target_series = train_data[TARGET]

    # Fixed random_state keeps the fitted forest repeatable between runs.
    forest = RandomForestRegressor(n_estimators=200, max_depth=15, random_state=42)
    forest.fit(features_frame, target_series)

    print("Model trained.")
    return forest, features_frame, target_series

In [10]:
def calculate_error_percentage(y_true, y_pred):
    """Print and return the mean absolute percentage error.

    Each absolute error is divided by the magnitude of the corresponding
    true value.  Dividing by the signed value would give negative
    percentages for negative actuals, which then cancel against positive
    ones in the mean.

    Args:
        y_true: actual values (NumPy array or pandas Series).
        y_pred: predicted values, same length as ``y_true``.

    Returns:
        The mean of ``|y_true - y_pred| / |y_true| * 100``.  A true value of
        zero is not special-cased and yields ``inf`` or ``nan`` for that entry.
    """
    error_percentage = np.abs(y_true - y_pred) / np.abs(y_true) * 100
    mean_error_percentage = np.mean(error_percentage)
    print(f"Mean Error Percentage: {mean_error_percentage:.2f}%")
    return mean_error_percentage

def calculate_accuracy(y_true, y_pred):
    """Print and return ``100`` minus the mean error percentage.

    The error is obtained from ``calculate_error_percentage``, which prints
    its own line before the accuracy line is printed here.
    """
    accuracy = 100 - calculate_error_percentage(y_true, y_pred)
    print(f"Model Accuracy: {accuracy:.2f}%")
    return accuracy

def calculate_mean(values):
    """Print the arithmetic mean of ``values`` to two decimals and return it."""
    average = np.mean(values)
    print(f"Mean: {average:.2f}")
    return average

def calculate_std(values):
    """Print the standard deviation of ``values`` (as computed by ``np.std``) and return it."""
    spread = np.std(values)
    print(f"Standard Deviation: {spread:.2f}")
    return spread

In [11]:
# Path of the CSV read by the cells below. This re-binds `file_path` at
# notebook level, so edit it here to point the following cells at another file.
file_path = './processed_train_data.csv'  # Specify your CSV file path


In [12]:
# Stand-alone load of the CSV, indexed by its parsed `datetime` column.
# No NaN filling is applied to this frame.
# NOTE(review): `energy_data` is not referenced by the later cells visible in
# this notebook; confirm whether it is still needed.
energy_data = (
    pd.read_csv(file_path)
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
    .set_index('datetime')
)


In [13]:
# Load and preprocess the data
data = load_and_preprocess_data(file_path,x_values)

# Split the data into training and testing sets
train_data,test_data = split_data(data,split_date)

# Train the model
model,X_train,y_train=train_model()

print("Data loaded, preprocessed, split, and model trained successfully.")

NaN values handled. Data loaded successfully.
Data split into train and test sets.
Model trained.
Data loaded, preprocessed, split, and model trained successfully.


In [14]:
# Example prediction
prediction_datetime = '2018-03-01 12:00:00'
prediction = predict_for_datetime(prediction_datetime,model)

Predicted energy for 2018-03-01 12:00:00 with trend: 1833.9445607739854


In [15]:
# Make predictions on the test set
X_test = test_data[FEATURES]
y_test = test_data[TARGET]

# Generate predictions
y_pred = model.predict(X_test)
# Calculate mean and standard deviation of actual and predicted values
mean_actual = calculate_mean(y_test)
std_actual = calculate_std(y_test)
mean_predicted = calculate_mean(y_pred)
std_predicted = calculate_std(y_pred)

# Calculate and print error percentage and accuracy
calculate_error_percentage(y_test, y_pred)
calculate_accuracy(y_test, y_pred)

Mean: 2002.16
Standard Deviation: 349.29
Mean: 1929.50
Standard Deviation: 289.15
Mean Error Percentage: 8.35%
Mean Error Percentage: 8.35%
Model Accuracy: 91.65%


91.64752515806303

In [16]:
# NOTE(review): this import sits in the last cell rather than with the other
# imports at the top of the notebook; consider moving it there.
import joblib

# Persist the fitted model to disk so it can be reloaded without retraining.
# `compress=8` is forwarded to joblib.dump (presumably a high compression
# level, trading a slower dump for a smaller file; confirm against joblib docs).
model_filename = 'model.joblib'
joblib.dump(model, model_filename, compress=8)
print(f"Model saved to {model_filename}")

Model saved to model.joblib
