# Making Predictions with Random Forest Regressor using historical data

In this notebook, I would be implementing a Random Forest Regressor model to predict kWHDelivered using three different methods:

1. Using historical data from only one charging station
2. Using historical data from 54 charging stations with cummulative kWhDelivered added based on stationID
3. Using historical data for each 54 charging stations using Transfer Learning

## Method 1: Making Predictions with Random Forest Regressor using historical data from only one charging station

In [182]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from smape import smape
import matplotlib.pyplot as plt
from IPython.display import HTML, display

In [None]:
# Load the data from the local file

In [183]:
data = pd.read_csv('./caltech_model_data/2-39-95-444.csv')

In [7]:
# display the data

In [184]:
data.tail()

Unnamed: 0,month,siteID,stationID,timezone,spaceID,kWhDelivered,MinTemp,MaxTemp,AvgTemp,AvgPrecipitation,AvgHumidity,AvgWindSpeed
6,2018-11,2,2-39-95-444,America/Los_Angeles,CA-497,182.559,11,32,17.733333,0.058333,38.8125,6.316667
7,2018-12,2,2-39-95-444,America/Los_Angeles,CA-497,224.156,5,25,13.225806,0.067339,44.790323,5.927419
8,2019-01,2,2-39-95-444,America/Los_Angeles,CA-497,312.082,3,25,12.83871,0.232258,50.447581,6.814516
9,2019-02,2,2-39-95-444,America/Los_Angeles,CA-497,312.787,3,19,9.892857,0.315625,53.901786,7.026786
10,2019-03,2,2-39-95-444,America/Los_Angeles,CA-497,196.661,7,28,14.290323,0.095565,52.427419,7.491935


In [185]:
# Select which parameters to include as features and which parameter to be considered as the label
X = data.drop(["siteID","stationID", "timezone","spaceID","kWhDelivered","month"], axis=1)
y = data["kWhDelivered"]

In [186]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,random_state=19)

In [187]:
# Define and Fit Random forest model
rfr = RandomForestRegressor(random_state=13)
rfr.fit(X_train, y_train)

In [188]:
y_predict = rfr.predict(X_test)

In [189]:
# Obtain the metrics and compare results

mae = mean_absolute_error(y_predict, y_test)
mse = mean_squared_error(y_predict, y_test)
smape_value = smape(y_predict, y_test)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'Symmetric Mean Absolute Percentage Error: {round(smape_value,2)}%')

Mean Squared Error: 2772.5193208572696
Mean Absolute Error: 49.315765000000255
Symmetric Mean Absolute Percentage Error: 18.74%


In [190]:
# Model optimization using GridSearch

param_grid = {
    'n_estimators': [100,200,300],
     'max_depth': [10,20,30],
     'min_samples_split': [2,5,10],
     'min_samples_leaf': [1,2,4],
}

In [191]:
rfr_cv = GridSearchCV(estimator=rfr, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)

In [192]:
rfr_cv.fit(X_train, y_train)

In [83]:
y_predict = rfr_cv.predict(X_test)

In [176]:
# Obtain the metrics

mae = mean_absolute_error(y_predict, y_test)
mse = mean_squared_error(y_predict, y_test)
smape_value = smape(y_predict, y_test)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'Symmetric Mean Absolute Percentage Error: {round(smape_value,2)}%')
print(y_test)

Mean Squared Error: 2772.5193208572696
Mean Absolute Error: 49.315765000000255
Symmetric Mean Absolute Percentage Error: 18.74%
1    290.400
7    224.156
Name: kWhDelivered, dtype: float64


## Method 2: Making Predictions with Random Forest Regressor using historical data from 54 charging stations with cummulative kWhDelivered

In [59]:
# Define the directory containing the CSV files

In [140]:
directory = './caltech_model_data'

In [None]:
# Load data and sum all CSV files into a single dataframe

In [141]:
combined_data = pd.DataFrame()


# Loop through each CSV file in the directory
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        filepath = os.path.join(directory, filename)
        # Load the data from the CSV file
        data = pd.read_csv(filepath)
        # Append the data to the combined DataFrame
        combined_data = pd.concat([combined_data, data], axis=0)

summed_data = combined_data.groupby('month').agg({
    'kWhDelivered': 'sum',
    'MinTemp': 'first',
    'MaxTemp': 'first',
    'AvgTemp': 'first',
    'AvgPrecipitation': 'first',
    'AvgHumidity': 'first',
    'AvgWindSpeed': 'first'
}).reset_index()

# Print or save the result
summed_data.head()

Unnamed: 0,month,kWhDelivered,MinTemp,MaxTemp,AvgTemp,AvgPrecipitation,AvgHumidity,AvgWindSpeed
0,2018-05,15635.919485,11,32,18.290323,0.006855,56.451613,9.504032
1,2018-06,16984.573907,14,32,22.433333,0.0,50.483333,9.220833
2,2018-07,18860.532653,17,44,27.548387,0.0,46.762097,8.53629
3,2018-08,20685.462562,18,38,26.516129,0.0,47.41129,8.705645
4,2018-09,20609.835595,17,35,24.533333,0.0,49.204167,7.2875


In [142]:
# Select which parameters to include as features and which parameter to be considered as the label
X = data.drop(["siteID","stationID", "timezone","spaceID","kWhDelivered","month"], axis=1)
y = data["kWhDelivered"]

In [143]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,random_state=19)

In [144]:
# Define and Fit Random forest model
rfr = RandomForestRegressor(random_state=13)
rfr.fit(X_train, y_train)

In [126]:
y_predict = rfr.predict(X_test)

In [145]:
# Obtain the metrics and compare results

mae = mean_absolute_error(y_predict, y_test)
mse = mean_squared_error(y_predict, y_test)
smape_value = smape(y_predict, y_test)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'Symmetric Mean Absolute Percentage Error: {round(smape_value,2)}%')

Mean Squared Error: 47926.12854436765
Mean Absolute Error: 204.64693613750026
Symmetric Mean Absolute Percentage Error: 70.91%


In [146]:
y_predict = rfr_cv.predict(X_test)

In [147]:
# Obtain the metrics

mae = mean_absolute_error(y_predict, y_test)
mse = mean_squared_error(y_predict, y_test)
smape_value = smape(y_predict, y_test)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'Symmetric Mean Absolute Percentage Error: {round(smape_value,2)}%')

Mean Squared Error: 38770.67596011176
Mean Absolute Error: 193.06197857061937
Symmetric Mean Absolute Percentage Error: 68.67%


## Method 3: Making Predictions with Random Forest Regressor using historical data for each 54 charging stations using Transfer Learning

In [197]:
# Initialize a dictionary to store the models and metrics
models = {}
metrics = {}

# Directory containing the CSV files
directory = "./caltech_model_data"  # Replace with your directory path

# List all files in the directory
filenames = sorted([filename for filename in os.listdir(directory) if filename.endswith('.csv')])

# Placeholder to store the first trained model for Transfer Learning
previous_model = None

# Loop through each CSV file in the directory
for filename in filenames:
    filepath = os.path.join(directory, filename)
    # Load the data from the CSV file
    data = pd.read_csv(filepath)
    
    # Extract the stationID
    stationID = filename.split('.')[0]  # Example: If filename is 'station1.csv'
    
    # Sort the data by the 'Month' column
    data = data.sort_values(by='month')
    
    # Select features (excluding identifiers and kWhDelivered)
    X = data.drop(["siteID", "stationID", "timezone", "spaceID", "kWhDelivered", "month"], axis=1)
    y = data["kWhDelivered"]
    
    # Split data into training and test sets
    X_train, X_test = X[:-1], X[-1:]  # Use all but the last row for training, last row for testing
    y_train, y_test = y[:-1], y[-1:]
    
    # Initialize or use the previous model for transfer learning
    if previous_model is not None:
        model = previous_model
        model.fit(X_train, y_train)
    else:
        model = RandomForestRegressor(random_state=13)
        model.fit(X_train, y_train)
    
    # Predict the next month's kWhDelivered
    forecast = model.predict(X_test)
    
    # Calculate metrics
    mae = mean_absolute_error(y_test.values, forecast)
    mse = mean_squared_error(y_test.values, forecast)
    smape_value = smape(y_test.values, forecast)
    
    # Store the model and metrics
    models[stationID] = model
    metrics[stationID] = {'MAE': mae, 'MSE': mse, 'SMAPE (%)': smape_value, 'Actual kWh':y_test.values[0], 'Predicted kWh':forecast[0]}
    
    # Update the previous model for transfer learning
    previous_model = model


In [198]:
metrics_df = pd.DataFrame.from_dict(metrics, orient='index').reset_index()

# Rename the index column to 'stationID'
metrics_df = metrics_df.rename(columns={'index': 'stationID'})

metrics_df

Unnamed: 0,stationID,MAE,MSE,SMAPE (%),Actual kWh,Predicted kWh
0,2-39-123-23,59.271595,3513.121967,18.65413,288.104,347.375595
1,2-39-123-557,84.160411,7082.974723,142.143154,17.128,101.288411
2,2-39-124-22,90.782766,8241.510519,66.36197,91.408,182.190766
3,2-39-124-558,18.44162,340.093348,37.396462,40.093,58.53462
4,2-39-125-21,11.319968,128.141687,3.958959,280.273,291.592968
5,2-39-125-559,9.959608,99.193799,42.279115,18.577,28.536608
6,2-39-126-20,3.28092,10.764436,3.959139,84.51,81.22908
7,2-39-126-560,3.703115,13.713057,11.737496,33.401,29.697885
8,2-39-127-19,19.983548,399.342204,6.069142,319.273,339.256548
9,2-39-127-561,28.48292,811.276732,106.24205,12.568,41.05092
