# Making Predictions with Linear Regression using weather data

In this notebook, I implement a simple Linear Regression model to predict kWhDelivered using three different methods:

1. Using historical weather data from only one charging station
2. Using historical weather data from 54 charging stations, with cumulative kWhDelivered (summed across all stations for each month)
3. Using historical weather data for each of the 54 charging stations using Transfer Learning

## Method 1: Making Predictions with Linear Regression using historical weather data from only one charging station

In this first method, we make a prediction with a Linear Regression model using only the historical data for a single charging station.

In [None]:
# Import the required packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import warnings
import os as os
from smape import smape

In [3]:
#Load the data from the local file

In [2]:
# Load the monthly history of a single charging station (ID 1-1-178-817).
data = pd.read_csv(
    './jpl_model_data/1-1-178-817.csv'
)

In [15]:
# Display the first few rows of the dataframe

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
data.head(n=5)

Unnamed: 0,month,siteID,stationID,timezone,spaceID,kWhDelivered,MinTemp,MaxTemp,AvgTemp,AvgPrecipitation,AvgHumidity,AvgWindSpeed
0,2018-09,1,1-1-178-817,America/Los_Angeles,AG-1F09,197.982,17,35,24.533333,0.0,49.204167,7.2875
1,2018-10,1,1-1-178-817,America/Los_Angeles,AG-1F09,470.523645,14,32,21.064516,0.021371,47.125,6.241935
2,2018-11,1,1-1-178-817,America/Los_Angeles,AG-1F09,373.943,11,32,17.733333,0.058333,38.8125,6.316667
3,2018-12,1,1-1-178-817,America/Los_Angeles,AG-1F09,290.616,5,25,13.225806,0.067339,44.790323,5.927419
4,2019-01,1,1-1-178-817,America/Los_Angeles,AG-1F09,408.382,3,25,12.83871,0.232258,50.447581,6.814516


In [175]:
# Convert the month to datetime format and set it as the index

In [4]:
# Parse the 'month' strings into timestamps and use them as the row index
# (named 'Month'). The original 'month' column stays in the frame.
data = data.assign(Month=pd.to_datetime(data['month'])).set_index('Month')

In [26]:
# Prepare the data for Linear Regression

In [5]:
# Target: energy delivered per month.
y = data["kWhDelivered"]

# Features: everything left after removing the identifier columns, the raw
# 'month' string and the target itself.
X = data.drop(
    columns=["siteID", "stationID", "timezone", "spaceID", "kWhDelivered", "month"]
)

In [29]:
# Split data into training and testing sets

In [6]:
 # Hold out the final row as the test sample; every earlier row is training data.
 X_train, y_train = X.iloc[:-1], y.iloc[:-1]
 X_test, y_test = X.iloc[-1:], y.iloc[-1:]

In [31]:
# Fit Linear Regression model

In [7]:
# Fit a Linear Regression model on the training rows.
# `fit` returns the estimator itself, so `model_fit` is just another name for `model`.
model = LinearRegression()
model.fit(X_train, y_train)
model_fit = model

In [35]:
# Make predictions using the trained model

In [8]:
# Predict kWhDelivered for the held-out row(s) in X_test.
predictions = model.predict(X=X_test)

In [38]:
# Evaluate the model using Mean Squared Error, Mean Absolute Error and Symmetric Mean Absolute Percentage Error

In [9]:
# Score the held-out prediction with MSE, MAE and SMAPE (the imported `smape`
# helper), then report the metrics alongside the actual and predicted values.
mse, mae = (
    mean_squared_error(y_test, predictions),
    mean_absolute_error(y_test, predictions),
)
smape_value = smape(y_test, predictions)

print(
    f'Mean Squared Error: {mse}',
    f'Mean Absolute Error: {mae}',
    f'Symmetric Mean Absolute Percentage Error: {round(smape_value,2)}%',
    f'Actual kWh: {y_test.values[0]}',
    f'Predicted kWh: {predictions[0]}',
    sep='\n',
)


Mean Squared Error: 15015.685165071229
Mean Absolute Error: 122.53850482632481
Symmetric Mean Absolute Percentage Error: 29.84%
Actual kWh: 471.962
Predicted kWh: 349.4234951736752


## Method 2: Making Predictions with Linear Regression using historical weather data from 54 charging stations with cumulative kWhDelivered

In [48]:
# Define the directory containing the CSV files

In [21]:
# Folder that holds the per-station CSV files.
directory = "./jpl_model_data"

In [None]:
# Load data and sum all CSV files into a single dataframe

In [22]:
# Build one frame containing the rows of every station CSV in `directory`,
# then aggregate it to one row per month.
#
# - File names are sorted so that the 'first' aggregations below do not depend
#   on the arbitrary order returned by os.listdir.
# - All files are read into a list and concatenated once; growing a DataFrame
#   with pd.concat inside the loop copies the accumulated data on every pass.
# - The per-file frames are not bound to `data`, so the single-station frame
#   loaded for Method 1 is not overwritten by this cell.
csv_names = sorted(name for name in os.listdir(directory) if name.endswith('.csv'))
if not csv_names:
    # Fail with a clear message instead of a KeyError from groupby on an empty frame.
    raise FileNotFoundError(f"No CSV files found in {directory!r}")

station_frames = [pd.read_csv(os.path.join(directory, name)) for name in csv_names]
combined_data = pd.concat(station_frames, axis=0)

# Per month: total energy delivered across all stations, plus the weather
# readings taken from the first station row seen for that month.
# NOTE(review): 'first' assumes every station reports the same weather for a
# given month - confirm against the data.
summed_data = combined_data.groupby('month').agg({
    'kWhDelivered': 'sum',
    'MinTemp': 'first',
    'MaxTemp': 'first',
    'AvgTemp': 'first',
    'AvgPrecipitation': 'first',
    'AvgHumidity': 'first',
    'AvgWindSpeed': 'first'
}).reset_index()

# Preview the aggregated monthly data
summed_data.head()

Unnamed: 0,month,kWhDelivered,MinTemp,MaxTemp,AvgTemp,AvgPrecipitation,AvgHumidity,AvgWindSpeed
0,2018-09,9514.851,17,35,24.533333,0.0,49.204167,7.2875
1,2018-10,18161.496291,14,32,21.064516,0.021371,47.125,6.241935
2,2018-11,15311.232726,11,32,17.733333,0.058333,38.8125,6.316667
3,2018-12,14177.875,5,25,13.225806,0.067339,44.790323,5.927419
4,2019-01,19698.515,3,25,12.83871,0.232258,50.447581,6.814516


In [None]:
# Split the data

In [23]:
# Target: total kWhDelivered per month; features: the remaining weather columns.
y = summed_data["kWhDelivered"]
X = summed_data.drop(columns=["kWhDelivered", "month"])

# Hold out the last row (groupby sorts by 'month', so this is the latest month)
# as the test sample and train on everything before it.
X_train, y_train = X.iloc[:-1], y.iloc[:-1]
X_test, y_test = X.iloc[-1:], y.iloc[-1:]

In [145]:
#Fit Linear Regression Model

In [24]:
# Fit a new Linear Regression model on the aggregated training rows.
# `fit` returns the estimator itself, so `model_fit` refers to the same object as `model`.
model = LinearRegression()
model.fit(X_train, y_train)
model_fit = model

In [147]:
## Making Predictions using the model

In [25]:
# Predict the aggregated kWhDelivered for the held-out row(s) in X_test.
predictions = model.predict(X=X_test)

In [None]:
# Evaluate the model using Mean Squared Error, Mean Absolute Error and Symmetric Mean Absolute Percentage Error

In [26]:
# Compute the three error metrics for the held-out prediction.
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
smape_value = smape(y_test, predictions)

# Report the metrics followed by the actual and predicted values, one per line.
for report_line in (
    f'Mean Squared Error: {mse}',
    f'Mean Absolute Error: {mae}',
    f'Symmetric Mean Absolute Percentage Error: {round(smape_value,2)}%',
    f'Actual kWh: {y_test.values[0]}',
    f'Predicted kWh: {predictions[0]}',
):
    print(report_line)

Mean Squared Error: 58121088.49451275
Mean Absolute Error: 7623.718810037051
Symmetric Mean Absolute Percentage Error: 43.1%
Actual kWh: 21499.099601388887
Predicted kWh: 13875.380791351836


## Method 3: Making Predictions with Linear Regression using historical weather data for each of the 54 charging stations using Transfer Learning

### Using Transfer Learning

In [18]:
# Train one Linear Regression model per charging station and record its
# hold-out metrics.
#
# NOTE: LinearRegression.fit() solves the least-squares problem from scratch on
# every call and keeps nothing from a previous fit, so re-fitting an already
# trained estimator does not transfer anything between stations. The earlier
# version reused a single estimator object for all stations, which left every
# entry of `models` pointing at the same object (fitted on the last station
# only). Each station now gets its own estimator; the metrics are unaffected.

# Fitted estimator and evaluation metrics, keyed by station ID
models = {}
metrics = {}

# Directory containing the CSV files
directory = "./jpl_model_data"  # Replace with your directory path

# List all CSV files in the directory, sorted so the processing order is stable
filenames = sorted([filename for filename in os.listdir(directory) if filename.endswith('.csv')])

# Most recently fitted model (None until the first station has been processed).
# Kept only so later cells that reference this name keep working; it is no
# longer reused for fitting.
previous_model = None

# Loop through each CSV file in the directory
for filename in filenames:
    filepath = os.path.join(directory, filename)
    # Load the data from the CSV file
    data = pd.read_csv(filepath)

    # The station ID is the file name up to the first dot,
    # e.g. '1-1-178-817.csv' -> '1-1-178-817'
    stationID = filename.split('.')[0]

    # Sort the rows by the 'month' column ('YYYY-MM' strings sort chronologically)
    data = data.sort_values(by='month')

    # Select features (excluding identifiers and kWhDelivered)
    X = data.drop(["siteID", "stationID", "timezone", "spaceID", "kWhDelivered", "month"], axis=1)
    y = data["kWhDelivered"]

    # Random 90/10 train/test split with a fixed seed.
    # NOTE(review): train_test_split shuffles by default, so the test rows are
    # randomly chosen months, not necessarily the most recent ones.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=19)

    # Fit a fresh estimator so each station keeps its own coefficients
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predict kWhDelivered for the held-out months
    forecast = model.predict(X_test)

    # Calculate metrics over all held-out rows
    mae = mean_absolute_error(y_test, forecast)
    mse = mean_squared_error(y_test, forecast)
    smape_value = smape(y_test, forecast)

    # Store the model and metrics; the actual/predicted values shown are those
    # of the first held-out row only
    models[stationID] = model
    metrics[stationID] = {
        'MAE': mae,
        'MSE': mse,
        'SMAPE (%)': smape_value,
        'Actual kWh': y_test.values[0] if len(y_test) > 0 else None,
        'Predicted kWh': forecast[0] if len(forecast) > 0 else None
    }

    # Remember the most recently fitted model
    previous_model = model

### Results from Transfer Learning

In [19]:
metrics_df = pd.DataFrame.from_dict(metrics, orient='index').reset_index()

# Rename the index column to 'stationID'
metrics_df = metrics_df.rename(columns={'index': 'stationID'})

metrics_df

Unnamed: 0,stationID,MAE,MSE,SMAPE (%),Actual kWh,Predicted kWh
0,1-1-178-817,66.455404,6082.51776,15.980493,398.767,424.403327
1,1-1-178-823,51.032532,4653.690447,11.152765,503.028,406.725488
2,1-1-178-824,75.648739,6405.503264,16.757803,435.347,385.828157
3,1-1-178-828,133.670605,24447.562685,26.269639,645.991,431.204848
4,1-1-179-777,161.294451,36224.432457,41.119225,560.381,298.049266
5,1-1-179-779,166.676326,27781.621792,44.685731,456.344,290.457711
6,1-1-179-781,97.815062,10506.73593,33.467283,362.572,234.114653
7,1-1-179-783,36.719209,1348.711524,10.894952,395.778,431.855926
8,1-1-179-787,37.908228,2860.8254,11.468298,236.24,236.415065
9,1-1-179-788,119.745705,18464.709313,32.483698,503.549,319.571874
