# Making Predictions with Random Forest Regressor using historical data

In this notebook, I would be implementing a Random Forest Regressor model to predict kWHDelivered using three different methods:

1. Using historical data from only one charging station
2. Using historical data from 54 charging stations with cummulative kWhDelivered added based on stationID
3. Using historical data for each 54 charging stations using Transfer Learning

## Method 1: Making Predictions with Random Forest Regressor using historical data from only one charging station

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from smape import smape
import matplotlib.pyplot as plt
from IPython.display import HTML, display

In [None]:
# Load the data from the local file

In [2]:
data = pd.read_csv('./jpl_model_data/1-1-178-817.csv')

In [7]:
# display the data

In [3]:
data.tail()

Unnamed: 0,month,siteID,stationID,timezone,spaceID,kWhDelivered,MinTemp,MaxTemp,AvgTemp,AvgPrecipitation,AvgHumidity,AvgWindSpeed
7,2019-04,1,1-1-178-817,America/Los_Angeles,AG-1F09,398.767,11,29,17.733333,0.009167,50.929167,8.108333
8,2019-05,1,1-1-178-817,America/Los_Angeles,AG-1F09,527.571,10,26,16.645161,0.050806,58.834677,8.802419
9,2019-06,1,1-1-178-817,America/Los_Angeles,AG-1F09,424.334,14,36,21.633333,0.000833,56.3125,7.720833
10,2019-07,1,1-1-178-817,America/Los_Angeles,AG-1F09,537.814406,16,36,24.548387,0.0,52.03629,7.241935
11,2019-08,1,1-1-178-817,America/Los_Angeles,AG-1F09,471.962,17,34,25.483871,0.0,49.379032,6.58871


In [4]:
# Select which parameters to include as features and which parameter to be considered as the label
X = data.drop(["siteID","stationID", "timezone","spaceID","kWhDelivered","month"], axis=1)
y = data["kWhDelivered"]

In [5]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,random_state=19)

In [6]:
# Define and Fit Random forest model
rfr = RandomForestRegressor(random_state=13)
rfr.fit(X_train, y_train)

In [7]:
y_predict = rfr.predict(X_test)

In [8]:
# Obtain the metrics and compare results

mae = mean_absolute_error(y_predict, y_test)
mse = mean_squared_error(y_predict, y_test)
smape_value = smape(y_predict, y_test)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'Symmetric Mean Absolute Percentage Error: {round(smape_value,2)}%')

Mean Squared Error: 12563.041852846864
Mean Absolute Error: 107.31071997500013
Symmetric Mean Absolute Percentage Error: 26.02%


In [9]:
# Model optimization using GridSearch

param_grid = {
    'n_estimators': [100,200,300],
     'max_depth': [10,20,30],
     'min_samples_split': [2,5,10],
     'min_samples_leaf': [1,2,4],
}

In [10]:
rfr_cv = GridSearchCV(estimator=rfr, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)

In [11]:
rfr_cv.fit(X_train, y_train)

In [14]:
y_predict = rfr_cv.predict(X_test)

In [15]:
# Obtain the metrics

mae = mean_absolute_error(y_predict, y_test)
mse = mean_squared_error(y_predict, y_test)
smape_value = smape(y_predict, y_test)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'Symmetric Mean Absolute Percentage Error: {round(smape_value,2)}%')
print(y_test)

Mean Squared Error: 11257.706498887108
Mean Absolute Error: 100.42372094681892
Symmetric Mean Absolute Percentage Error: 24.36%
7    398.767000
1    470.523645
Name: kWhDelivered, dtype: float64


## Method 2: Making Predictions with Random Forest Regressor using historical data from 54 charging stations with cummulative kWhDelivered

In [59]:
# Define the directory containing the CSV files

In [16]:
directory = './jpl_model_data'

In [None]:
# Load data and sum all CSV files into a single dataframe

In [17]:
combined_data = pd.DataFrame()


# Loop through each CSV file in the directory
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        filepath = os.path.join(directory, filename)
        # Load the data from the CSV file
        data = pd.read_csv(filepath)
        # Append the data to the combined DataFrame
        combined_data = pd.concat([combined_data, data], axis=0)

summed_data = combined_data.groupby('month').agg({
    'kWhDelivered': 'sum',
    'MinTemp': 'first',
    'MaxTemp': 'first',
    'AvgTemp': 'first',
    'AvgPrecipitation': 'first',
    'AvgHumidity': 'first',
    'AvgWindSpeed': 'first'
}).reset_index()

# Print or save the result
summed_data.head()

Unnamed: 0,month,kWhDelivered,MinTemp,MaxTemp,AvgTemp,AvgPrecipitation,AvgHumidity,AvgWindSpeed
0,2018-09,9514.851,17,35,24.533333,0.0,49.204167,7.2875
1,2018-10,18161.496291,14,32,21.064516,0.021371,47.125,6.241935
2,2018-11,15311.232726,11,32,17.733333,0.058333,38.8125,6.316667
3,2018-12,14177.875,5,25,13.225806,0.067339,44.790323,5.927419
4,2019-01,19698.515,3,25,12.83871,0.232258,50.447581,6.814516


In [18]:
# Select which parameters to include as features and which parameter to be considered as the label
X = summed_data.drop(["kWhDelivered","month"], axis=1)
y = summed_data["kWhDelivered"]

In [19]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,random_state=19)

In [20]:
# Define and Fit Random forest model
rfr = RandomForestRegressor(random_state=13)
rfr.fit(X_train, y_train)

In [21]:
y_predict = rfr.predict(X_test)

In [22]:
# Obtain the metrics and compare results

mae = mean_absolute_error(y_predict, y_test)
mse = mean_squared_error(y_predict, y_test)
smape_value = smape(y_predict, y_test)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'Symmetric Mean Absolute Percentage Error: {round(smape_value,2)}%')

Mean Squared Error: 7616375.677874967
Mean Absolute Error: 2559.2310841958206
Symmetric Mean Absolute Percentage Error: 14.51%


In [23]:
y_predict = rfr_cv.predict(X_test)

In [24]:
# Obtain the metrics

mae = mean_absolute_error(y_predict, y_test)
mse = mean_squared_error(y_predict, y_test)
smape_value = smape(y_predict, y_test)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'Symmetric Mean Absolute Percentage Error: {round(smape_value,2)}%')

Mean Squared Error: 397690350.16927576
Mean Absolute Error: 19840.16273874698
Symmetric Mean Absolute Percentage Error: 192.29%


## Method 3: Making Predictions with Random Forest Regressor using historical data for each 54 charging stations using Transfer Learning

In [25]:
# Initialize a dictionary to store the models and metrics
models = {}
metrics = {}

# Directory containing the CSV files
directory = "./caltech_model_data"  # Replace with your directory path

# List all files in the directory
filenames = sorted([filename for filename in os.listdir(directory) if filename.endswith('.csv')])

# Placeholder to store the first trained model for Transfer Learning
previous_model = None

# Loop through each CSV file in the directory
for filename in filenames:
    filepath = os.path.join(directory, filename)
    # Load the data from the CSV file
    data = pd.read_csv(filepath)
    
    # Extract the stationID
    stationID = filename.split('.')[0]  # Example: If filename is 'station1.csv'
    
    # Sort the data by the 'Month' column
    data = data.sort_values(by='month')
    
    # Select features (excluding identifiers and kWhDelivered)
    X = data.drop(["siteID", "stationID", "timezone", "spaceID", "kWhDelivered", "month"], axis=1)
    y = data["kWhDelivered"]
    
    # Split data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,random_state=19)
    # X_train, X_test = X[:-1], X[-1:] 
    # y_train, y_test = y[:-1], y[-1:]
    
    # Initialize or use the previous model for transfer learning
    if previous_model is not None:
        model = previous_model
        model.fit(X_train, y_train)
    else:
        model = RandomForestRegressor(random_state=13)
        model.fit(X_train, y_train)
    
    # Predict the next month's kWhDelivered
    forecast = model.predict(X_test)
    
    # Calculate metrics
    mae = mean_absolute_error(y_test.values, forecast)
    mse = mean_squared_error(y_test.values, forecast)
    smape_value = smape(y_test.values, forecast)
    
    # Store the model and metrics
    models[stationID] = model
    metrics[stationID] = {'MAE': mae, 'MSE': mse, 'SMAPE (%)': smape_value, 'Actual kWh':y_test.values[0], 'Predicted kWh':forecast[0]}
    
    # Update the previous model for transfer learning
    previous_model = model

# Convert metrics to DataFrame and print
metrics_df = pd.DataFrame.from_dict(metrics, orient='index')
metrics_df

Unnamed: 0,MAE,MSE,SMAPE (%),Actual kWh,Predicted kWh
2-39-123-23,70.416486,9302.601165,21.407608,491.981,487.47445
2-39-123-557,51.055378,3151.512062,33.574328,171.03,245.427624
2-39-124-22,90.581389,8313.548617,41.241835,360.582,440.744144
2-39-124-558,49.323525,4569.300505,90.912869,88.818,91.91934
2-39-125-21,91.427412,8867.737591,21.708527,492.440214,606.423466
2-39-125-559,62.072228,3852.961542,77.420817,49.139,111.211228
2-39-126-20,148.726055,26366.585932,65.256467,274.391,488.28719
2-39-126-560,43.005152,2870.173874,80.988927,96.008,84.951724
2-39-127-19,109.624501,12026.270983,23.794216,548.545,661.125821
2-39-127-561,41.22681,1699.649863,44.373129,72.296,113.52281


In [26]:
metrics_df = pd.DataFrame.from_dict(metrics, orient='index').reset_index()

# Rename the index column to 'stationID'
metrics_df = metrics_df.rename(columns={'index': 'stationID'})

metrics_df

Unnamed: 0,stationID,MAE,MSE,SMAPE (%),Actual kWh,Predicted kWh
0,2-39-123-23,70.416486,9302.601165,21.407608,491.981,487.47445
1,2-39-123-557,51.055378,3151.512062,33.574328,171.03,245.427624
2,2-39-124-22,90.581389,8313.548617,41.241835,360.582,440.744144
3,2-39-124-558,49.323525,4569.300505,90.912869,88.818,91.91934
4,2-39-125-21,91.427412,8867.737591,21.708527,492.440214,606.423466
5,2-39-125-559,62.072228,3852.961542,77.420817,49.139,111.211228
6,2-39-126-20,148.726055,26366.585932,65.256467,274.391,488.28719
7,2-39-126-560,43.005152,2870.173874,80.988927,96.008,84.951724
8,2-39-127-19,109.624501,12026.270983,23.794216,548.545,661.125821
9,2-39-127-561,41.22681,1699.649863,44.373129,72.296,113.52281
