In [127]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [128]:
df = pd.read_csv("processed_data.csv")

In [129]:
df.dtypes

User ID                                      object
Vehicle Model                                object
Battery Capacity (kWh)                      float64
Charging Station ID                          object
Charging Station Location                    object
Charging Start Time                          object
Charging End Time                            object
Energy Consumed (kWh)                       float64
Charging Duration (hours)                   float64
Charging Rate (kW)                          float64
Charging Cost (USD)                         float64
Time of Day                                  object
Day of Week                                  object
State of Charge (Start %)                   float64
State of Charge (End %)                     float64
Distance Driven (since last charge) (km)    float64
Temperature (°C)                            float64
Vehicle Age (years)                         float64
Charger Type                                 object
User Type   

In [130]:

date_columns = ["Charging_Start", "Charging_end"]

for col in date_columns:
    df[col] = pd.to_datetime(df[col], errors="coerce")  

print(df.dtypes)


User ID                                             object
Vehicle Model                                       object
Battery Capacity (kWh)                             float64
Charging Station ID                                 object
Charging Station Location                           object
Charging Start Time                                 object
Charging End Time                                   object
Energy Consumed (kWh)                              float64
Charging Duration (hours)                          float64
Charging Rate (kW)                                 float64
Charging Cost (USD)                                float64
Time of Day                                         object
Day of Week                                         object
State of Charge (Start %)                          float64
State of Charge (End %)                            float64
Distance Driven (since last charge) (km)           float64
Temperature (°C)                                   float

In [131]:
df = pd.read_csv("processed_data.csv", parse_dates=["Charging_Start"], index_col="Charging_Start")


In [132]:
df = df.sort_index()

In [133]:
print(df.head(), "\n", df.info())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 40000 entries, 2000-01-01 07:20:02.839385110 to 2022-12-30 21:49:40.323087360
Data columns (total 30 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   User ID                                   40000 non-null  object 
 1   Vehicle Model                             40000 non-null  object 
 2   Battery Capacity (kWh)                    40000 non-null  float64
 3   Charging Station ID                       40000 non-null  object 
 4   Charging Station Location                 40000 non-null  object 
 5   Charging Start Time                       40000 non-null  object 
 6   Charging End Time                         40000 non-null  object 
 7   Energy Consumed (kWh)                     40000 non-null  float64
 8   Charging Duration (hours)                 40000 non-null  float64
 9   Charging Rate (kW)                        40000 no

In [134]:
object_cols = df.select_dtypes(include=['object']).columns

In [135]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

le = LabelEncoder()
for col in object_cols:
    df[col] = le.fit_transform(df[col])

In [136]:
# Extract time-based features
df["Year"] = df.index.year
df["Month"] = df.index.month
df["Day"] = df.index.day
df["DayOfWeek"] = df.index.dayofweek
df["Hour"] = df.index.hour  # Use only if timestamp contains hours

# Create lag features (previous values as new features)
df["Lag_1"] = df["Charging Cost (USD)"].shift(1)
df["Lag_7"] = df["Charging Cost (USD)"].shift(7)  # Weekly lag

# Drop missing values created by shift()
df.dropna(inplace=True)

# Define X (features) and y (target)
X = df.drop(columns=["Charging Cost (USD)"])
y = df["Charging Cost (USD)"]

In [137]:
# Split into 80% train, 20% test (last 20% is for future prediction)
train_size = int(len(df) * 0.8)
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

print(f"Train Size: {X_train.shape}, Test Size: {X_test.shape}")


Train Size: (31994, 34), Test Size: (7999, 34)


In [138]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf_model.predict(X_test)


In [139]:
from xgboost import XGBRegressor

xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred_xgb = xgb_model.predict(X_test)


In [140]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def evaluate_forecast(y_true, y_pred, model_name):
    print(f"📊 {model_name} Forecasting Performance:")
    print(f"- MAE: {mean_absolute_error(y_true, y_pred):.4f}")
    print(f"- MSE: {mean_squared_error(y_true, y_pred):.4f}")
    print(f"- RMSE: {np.sqrt(mean_squared_error(y_true, y_pred)):.4f}")
    print(f"- R² Score: {r2_score(y_true, y_pred):.4f}\n")

evaluate_forecast(y_test, y_pred_rf, "Random Forest")
evaluate_forecast(y_test, y_pred_xgb, "XGBoost")


📊 Random Forest Forecasting Performance:
- MAE: 0.0000
- MSE: 0.0000
- RMSE: 0.0000
- R² Score: 1.0000

📊 XGBoost Forecasting Performance:
- MAE: 0.1722
- MSE: 0.0502
- RMSE: 0.2240
- R² Score: 0.9996



In [141]:
# Create an empty DataFrame with same columns as X_train
future_df = pd.DataFrame(index=future_dates, columns=X_train.columns)

# Fill time-based features
future_df["Year"] = future_df.index.year
future_df["Month"] = future_df.index.month
future_df["Day"] = future_df.index.day
future_df["DayOfWeek"] = future_df.index.dayofweek

# Fill lag features (use last known values for rolling predictions)
future_df["Lag_1"] = [y_test.iloc[-1]] + list(y_pred_rf[-9:])  # Using last known value + previous predictions

# Fill missing values with mean (or another strategy)
future_df.fillna(X_train.mean(), inplace=True)  # Ensures all columns exist

# Make future predictions
future_df["Predicted_Values"] = rf_model.predict(future_df)

# Show results
print(future_df[["Predicted_Values"]])


                               Predicted_Values
2022-12-30 21:49:40.323087360         21.337275
2022-12-31 21:49:40.323087360         21.337275
2023-01-01 21:49:40.323087360         21.337275
2023-01-02 21:49:40.323087360         21.337275
2023-01-03 21:49:40.323087360         21.337275
2023-01-04 21:49:40.323087360         21.337275
2023-01-05 21:49:40.323087360         21.337275
2023-01-06 21:49:40.323087360         21.337275
2023-01-07 21:49:40.323087360         21.337275
2023-01-08 21:49:40.323087360         21.337275


  future_df.fillna(X_train.mean(), inplace=True)  # Ensures all columns exist


In [142]:
df2 = pd.read_csv("processed_data.csv", parse_dates=["Charging_end"], index_col="Charging_end")

In [143]:
df2 = df2.sort_index()

In [144]:
print(df2.head(), "\n", df2.info())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 40000 entries, 2024-01-01 00:39:00 to 2024-02-24 23:56:00
Data columns (total 30 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   User ID                                   40000 non-null  object 
 1   Vehicle Model                             40000 non-null  object 
 2   Battery Capacity (kWh)                    40000 non-null  float64
 3   Charging Station ID                       40000 non-null  object 
 4   Charging Station Location                 40000 non-null  object 
 5   Charging Start Time                       40000 non-null  object 
 6   Charging End Time                         40000 non-null  object 
 7   Energy Consumed (kWh)                     40000 non-null  float64
 8   Charging Duration (hours)                 40000 non-null  float64
 9   Charging Rate (kW)                        40000 non-null  float64
 10 

In [145]:
object_cols = df2.select_dtypes(include=['object']).columns

In [146]:
le = LabelEncoder()
for col in object_cols:
    df2[col] = le.fit_transform(df2[col])

In [147]:
# Extract time-based features
df2["Year"] = df2.index.year
df2["Month"] = df2.index.month
df2["Day"] = df2.index.day
df2["DayOfWeek"] = df2.index.dayofweek
df2["Hour"] = df2.index.hour  

# Create lag features (previous values as new features)
df2["Lag_1"] = df2["Charging Cost (USD)"].shift(1)
df2["Lag_7"] = df2["Charging Cost (USD)"].shift(7)  # Weekly lag

# Drop missing values created by shift()
df2.dropna(inplace=True)

# Define X (features) and y (target)
X = df2.drop(columns=["Charging Cost (USD)"])
y = df2["Charging Cost (USD)"]

In [148]:
df2.shape

(39993, 35)

In [149]:
# Split into 80% train, 20% test (last 20% is for future prediction)
train_size = int(len(df2) * 0.8)
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

print(f"Train Size: {X_train.shape}, Test Size: {X_test.shape}")


Train Size: (31994, 34), Test Size: (7999, 34)


In [150]:
rf_model2 = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model2.fit(X_train, y_train)

# Make predictions
y_pred_rf2 = rf_model2.predict(X_test)

In [151]:
xgb_model2 = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model2.fit(X_train, y_train)

# Make predictions
y_pred_xgb2 = xgb_model2.predict(X_test)

In [152]:
def evaluate_forecast(y_true, y_pred, model_name):
    print(f"📊 {model_name} Forecasting Performance:")
    print(f"- MAE: {mean_absolute_error(y_true, y_pred):.4f}")
    print(f"- MSE: {mean_squared_error(y_true, y_pred):.4f}")
    print(f"- RMSE: {np.sqrt(mean_squared_error(y_true, y_pred)):.4f}")
    print(f"- R² Score: {r2_score(y_true, y_pred):.4f}\n")

evaluate_forecast(y_test, y_pred_rf2, "Random Forest")
evaluate_forecast(y_test, y_pred_xgb2, "XGBoost")

📊 Random Forest Forecasting Performance:
- MAE: 0.5564
- MSE: 1.0752
- RMSE: 1.0369
- R² Score: 0.9909

📊 XGBoost Forecasting Performance:
- MAE: 0.3848
- MSE: 1.3704
- RMSE: 1.1706
- R² Score: 0.9884



In [153]:
# Create an empty DataFrame with same columns as X_train
future_df = pd.DataFrame(index=future_dates, columns=X_train.columns)

# Fill time-based features
future_df["Year"] = future_df.index.year
future_df["Month"] = future_df.index.month
future_df["Day"] = future_df.index.day
future_df["DayOfWeek"] = future_df.index.dayofweek

# Fill lag features (use last known values for rolling predictions)
future_df["Lag_1"] = [y_test.iloc[-1]] + list(y_pred_rf[-9:])  # Using last known value + previous predictions

# Fill missing values with mean (or another strategy)
future_df.fillna(X_train.mean(), inplace=True)  # Ensures all columns exist

# Make future predictions
future_df["Predicted_Values"] = rf_model2.predict(future_df)

# Show results
print(future_df[["Predicted_Values"]])


                               Predicted_Values
2022-12-30 21:49:40.323087360         18.254089
2022-12-31 21:49:40.323087360         18.696274
2023-01-01 21:49:40.323087360         15.649425
2023-01-02 21:49:40.323087360         18.037481
2023-01-03 21:49:40.323087360         19.280777
2023-01-04 21:49:40.323087360         16.439565
2023-01-05 21:49:40.323087360         16.790471
2023-01-06 21:49:40.323087360         19.478137
2023-01-07 21:49:40.323087360         18.087403
2023-01-08 21:49:40.323087360         19.584232


  future_df.fillna(X_train.mean(), inplace=True)  # Ensures all columns exist


In [154]:
import pickle

In [155]:
with open("rf_forecast_df.pkl", "wb") as f:
    pickle.dump(rf_model, f)

with open("xgb_forecast_df.pkl", "wb") as f:
    pickle.dump(xgb_model, f)

# Save models for df2
with open("rf_forecast_df2.pkl", "wb") as f:
    pickle.dump(rf_model2, f)

with open("xgb_forecast_df2.pkl", "wb") as f:
    pickle.dump(xgb_model2, f)

print("✅ Models for both datasets saved successfully!")

✅ Models for both datasets saved successfully!
