# Making Predictions with Linear Regression using weather data

In this notebook, I implement a simple Linear Regression model to predict kWhDelivered using three different methods:

1. Using historical weather data from only one charging station
2. Using historical weather data from 54 charging stations, with kWhDelivered summed across all stations for each month
3. Using historical weather data for each of the 54 charging stations using Transfer Learning

## Method 1: Making Predictions with Linear Regression using historical weather data from only one charging station

In this first method, we make a prediction with a Linear Regression model using only the historical data of a single charging station.

In [None]:
# Import the required packages

In [128]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import warnings
import os as os
from smape import smape

In [3]:
#Load the data from the local file

In [129]:
# Read the records for station 2-39-123-23; the path is relative to the
# notebook's working directory.
data = pd.read_csv('./caltech_model_data/2-39-123-23.csv')

In [15]:
# Display the first few rows of the dataframe

In [130]:
# Preview the first five rows (the DataFrame.head default) to check the loaded columns.
data.head()

Unnamed: 0,month,siteID,stationID,timezone,spaceID,kWhDelivered,MinTemp,MaxTemp,AvgTemp,AvgPrecipitation,AvgHumidity,AvgWindSpeed
0,2018-05,2,2-39-123-23,America/Los_Angeles,CA-313,382.524,11,32,18.290323,0.006855,56.451613,9.504032
1,2018-06,2,2-39-123-23,America/Los_Angeles,CA-313,491.981,14,32,22.433333,0.0,50.483333,9.220833
2,2018-07,2,2-39-123-23,America/Los_Angeles,CA-313,395.959351,17,44,27.548387,0.0,46.762097,8.53629
3,2018-08,2,2-39-123-23,America/Los_Angeles,CA-313,666.838745,18,38,26.516129,0.0,47.41129,8.705645
4,2018-09,2,2-39-123-23,America/Los_Angeles,CA-313,548.855,17,35,24.533333,0.0,49.204167,7.2875


In [175]:
# Convert the month to datetime format and set it as the index

In [131]:
# Parse the `month` strings into timestamps and use them as the row index.
# The original `month` column is kept; it is dropped later when the features
# are selected.
data = data.assign(Month=pd.to_datetime(data['month'])).set_index('Month')

In [26]:
# Prepare the data for Linear Regression

In [132]:
# Features: every column except the identifiers, the raw month string and the target.
X = data.drop(columns=["siteID", "stationID", "timezone", "spaceID", "kWhDelivered", "month"])
# Target: energy delivered per row.
y = data.loc[:, "kWhDelivered"]

In [29]:
# Split data into training and testing sets

In [133]:
 # Hold out the final row as the test set; every earlier row is used for training.
 X_train, X_test = X.iloc[:-1], X.iloc[-1:]
 y_train, y_test = y.iloc[:-1], y.iloc[-1:]

In [31]:
# Fit Linear Regression model

In [134]:
# fit() returns the estimator itself, so `model` and `model_fit` refer to the
# same fitted object.
model = LinearRegression().fit(X_train, y_train)
model_fit = model

In [35]:
# Make predictions using the trained model

In [135]:
# X_test is the single held-out final row, so this yields one predicted value.
predictions = model.predict(X_test)

In [38]:
# Evaluate the model using Mean Square Error, Mean Absolute Error and Symmetric Mean Absolute Percentage Error

In [136]:
# Score the prediction against the held-out row. The test set is a single
# row, so MSE is simply the squared absolute error.
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
smape_value = smape(y_test, predictions)

# Emit the report in one call; each entry lands on its own line.
print(
    f'Mean Squared Error: {mse}',
    f'Mean Absolute Error: {mae}',
    f'Symmetric Mean Absolute Percentage Error: {round(smape_value,2)}%',
    f'Actual kWh: {y_test.iloc[0]}',
    f'Predicted kWh: {predictions[0]}',
    sep='\n',
)


Mean Squared Error: 19.93873450089328
Mean Absolute Error: 4.46528101029412
Symmetric Mean Absolute Percentage Error: 1.56%
Actual kWh: 288.104
Predicted kWh: 283.63871898970586


## Method 2: Making Predictions with Linear Regression using historical weather data from 54 charging stations with cumulative kWhDelivered

In [48]:
# Define the directory containing the CSV files

In [137]:
# Folder containing the station CSV files that the next cell reads.
directory = './caltech_model_data'

In [None]:
# Load data and sum all CSV files into a single dataframe

In [138]:
# Collect the station CSV paths first. Sorting the names makes the row order
# (and therefore the 'first' aggregations below) independent of the arbitrary
# order in which os.listdir returns entries.
csv_paths = [
    os.path.join(directory, name)
    for name in sorted(os.listdir(directory))
    if name.endswith('.csv')
]
if not csv_paths:
    # Fail with a clear message instead of a KeyError from the groupby below.
    raise FileNotFoundError(f'No CSV files found in {directory!r}')

# Read every file once and concatenate in a single call. Growing a DataFrame
# with pd.concat inside a loop re-copies all accumulated rows on each
# iteration and starts from an empty frame.
combined_data = pd.concat([pd.read_csv(path) for path in csv_paths], axis=0)

# One row per month: energy is summed over all stations, while each weather
# column takes the first value seen for that month.
# NOTE(review): 'first' assumes every station reports identical weather for a
# given month -- confirm against the source data.
summed_data = combined_data.groupby('month').agg({
    'kWhDelivered': 'sum',
    'MinTemp': 'first',
    'MaxTemp': 'first',
    'AvgTemp': 'first',
    'AvgPrecipitation': 'first',
    'AvgHumidity': 'first',
    'AvgWindSpeed': 'first'
}).reset_index()

# Preview the aggregated result
summed_data.head()

Unnamed: 0,month,kWhDelivered,MinTemp,MaxTemp,AvgTemp,AvgPrecipitation,AvgHumidity,AvgWindSpeed
0,2018-05,15635.919485,11,32,18.290323,0.006855,56.451613,9.504032
1,2018-06,16984.573907,14,32,22.433333,0.0,50.483333,9.220833
2,2018-07,18860.532653,17,44,27.548387,0.0,46.762097,8.53629
3,2018-08,20685.462562,18,38,26.516129,0.0,47.41129,8.705645
4,2018-09,20609.835595,17,35,24.533333,0.0,49.204167,7.2875


In [None]:
# Split the data

In [139]:
# Features are the weather columns; the target is the summed energy per month.
X = summed_data.drop(columns=["kWhDelivered", "month"])
y = summed_data.loc[:, "kWhDelivered"]

# Train on every row except the last, which is held out as the test set.
X_train, X_test = X.iloc[:-1], X.iloc[-1:]
y_train, y_test = y.iloc[:-1], y.iloc[-1:]

In [145]:
#Fit Linear Regression Model

In [146]:
# Fit on the aggregated training rows. fit() returns the estimator itself, so
# both names below point at the same fitted object.
model_fit = model = LinearRegression().fit(X_train, y_train)

In [147]:
## Making Predictions using the model

In [141]:
# X_test is the single held-out final row, so this yields one predicted value.
predictions = model.predict(X_test)

In [None]:
# Evaluate the model using Mean Square Error, Mean Absolute Error and Symmetric Mean Absolute Percentage Error

In [142]:
# Compare the prediction with the single held-out row.
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
smape_value = smape(y_test, predictions)

# Build the report lines first, then print them one per line.
report_lines = [
    f'Mean Squared Error: {mse}',
    f'Mean Absolute Error: {mae}',
    f'Symmetric Mean Absolute Percentage Error: {round(smape_value,2)}%',
    f'Actual kWh: {y_test.iloc[0]}',
    f'Predicted kWh: {predictions[0]}',
]
print('\n'.join(report_lines))

Mean Squared Error: 14483522.62361377
Mean Absolute Error: 3805.7223524074598
Symmetric Mean Absolute Percentage Error: 35.12%
Actual kWh: 8933.09810185185
Predicted kWh: 12738.82045425931


## Method 3: Making Predictions with Linear Regression using historical weather data for each of the 54 charging stations using Transfer Learning

### Using Transfer Learning

In [143]:
# Per-station fitted models and their evaluation metrics, keyed by stationID
models = {}
metrics = {}

# Directory containing the CSV files
directory = "./caltech_model_data"  # Replace with your directory path

# Sort the file names so stations are processed in a reproducible order
filenames = sorted(name for name in os.listdir(directory) if name.endswith('.csv'))

# NOTE: LinearRegression.fit() re-estimates every coefficient from scratch, so
# refitting an already-fitted estimator carries nothing over from the previous
# station. Reusing one instance only made every entry of `models` point at the
# same object (fitted on the last station). A fresh estimator is therefore
# created per station below; predictions and metrics are numerically unchanged.
# `previous_model` is still tracked so code that reads it keeps working.
previous_model = None

# Loop through each CSV file in the directory
for filename in filenames:
    filepath = os.path.join(directory, filename)
    # Load the data from the CSV file
    data = pd.read_csv(filepath)

    # The station ID is the file name without its extension; splitext strips
    # only the final '.csv', so IDs containing dots are not truncated.
    stationID = os.path.splitext(filename)[0]

    # Sort the data by the 'month' column
    data = data.sort_values(by='month')

    # Select features (excluding identifiers and kWhDelivered)
    X = data.drop(["siteID", "stationID", "timezone", "spaceID", "kWhDelivered", "month"], axis=1)
    y = data["kWhDelivered"]

    # Random 90/10 split with a fixed seed. train_test_split shuffles by
    # default, so the test rows are a random sample of months, not the latest.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=19)

    # Independent model per station, so models[stationID] keeps its own fit
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predict kWhDelivered for the held-out rows
    forecast = model.predict(X_test)

    # Calculate metrics over all held-out rows
    mae = mean_absolute_error(y_test, forecast)
    mse = mean_squared_error(y_test, forecast)
    smape_value = smape(y_test, forecast)

    # Store the model and metrics; the actual/predicted pair shown is the
    # first held-out row only.
    models[stationID] = model
    metrics[stationID] = {
        'MAE': mae,
        'MSE': mse,
        'SMAPE (%)': smape_value,
        'Actual kWh': y_test.values[0] if len(y_test) > 0 else None,
        'Predicted kWh': forecast[0] if len(forecast) > 0 else None
    }

    # Remember the most recently fitted model
    previous_model = model

### Results from Transfer Learning

In [148]:
# One row per station: name the index 'stationID' before turning it into a
# regular column, so no separate rename step is needed.
metrics_df = (
    pd.DataFrame.from_dict(metrics, orient='index')
    .rename_axis('stationID')
    .reset_index()
)

metrics_df

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


Unnamed: 0,stationID,MAE,MSE,SMAPE (%),Actual kWh,Predicted kWh
0,2-39-123-23,432.47183,189042.354242,76.387918,491.981,879.614563
1,2-39-123-557,56.007149,3690.598928,36.791339,171.03,250.570066
2,2-39-124-22,339.687997,116041.412481,92.176196,360.582,725.833194
3,2-39-124-558,81.938351,7843.375202,112.285033,88.818,137.148585
4,2-39-125-21,47.134154,4053.231512,14.748381,492.440214,488.103292
5,2-39-125-559,29.688234,881.391263,46.400106,49.139,78.827234
6,2-39-126-20,125.279098,16721.049698,62.087598,274.391,431.704413
7,2-39-126-560,79.048418,6652.677233,103.257706,96.008,154.95605
8,2-39-127-19,112.63192,14656.590426,25.817342,548.545,616.785017
9,2-39-127-561,27.980636,782.916018,32.427663,72.296,100.276636
