In [120]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error,r2_score,accuracy_score



In [60]:
df= pd.read_csv('ev_charging_patterns.csv')

df.head()

Unnamed: 0,User ID,Vehicle Model,Battery Capacity (kWh),Charging Station ID,Charging Station Location,Charging Start Time,Charging End Time,Energy Consumed (kWh),Charging Duration (hours),Charging Rate (kW),Charging Cost (USD),Time of Day,Day of Week,State of Charge (Start %),State of Charge (End %),Distance Driven (since last charge) (km),Temperature (°C),Vehicle Age (years),Charger Type,User Type
0,User_1,BMW i3,108.463007,Station_391,Houston,2024-01-01 00:00:00,2024-01-01 00:39:00,60.712346,0.591363,36.389181,13.087717,Evening,Tuesday,29.371576,86.119962,293.602111,27.947953,2.0,DC Fast Charger,Commuter
1,User_2,Hyundai Kona,100.0,Station_428,San Francisco,2024-01-01 01:00:00,2024-01-01 03:01:00,12.339275,3.133652,30.677735,21.128448,Morning,Monday,10.115778,84.664344,112.112804,14.311026,3.0,Level 1,Casual Driver
2,User_3,Chevy Bolt,75.0,Station_181,San Francisco,2024-01-01 02:00:00,2024-01-01 04:48:00,19.128876,2.452653,27.513593,35.66727,Morning,Thursday,6.854604,69.917615,71.799253,21.002002,2.0,Level 2,Commuter
3,User_4,Hyundai Kona,50.0,Station_327,Houston,2024-01-01 03:00:00,2024-01-01 06:42:00,79.457824,1.266431,32.88287,13.036239,Evening,Saturday,83.120003,99.624328,199.577785,38.316313,1.0,Level 1,Long-Distance Traveler
4,User_5,Hyundai Kona,50.0,Station_108,Los Angeles,2024-01-01 04:00:00,2024-01-01 05:46:00,19.629104,2.019765,10.215712,10.161471,Morning,Saturday,54.25895,63.743786,203.661847,-7.834199,1.0,Level 1,Long-Distance Traveler


In [61]:

df["Charging Start Time"] = pd.to_datetime(df["Charging Start Time"], format="%Y-%m-%d %H:%M:%S")
df["Charging End Time"]   = pd.to_datetime(df["Charging End Time"], format="%Y-%m-%d %H:%M:%S")

df["total_duration"] = (df["Charging End Time"] - df["Charging Start Time"]).dt.total_seconds() / 3600

df["total_duration"] = df["total_duration"].round(2)

df.head()


Unnamed: 0,User ID,Vehicle Model,Battery Capacity (kWh),Charging Station ID,Charging Station Location,Charging Start Time,Charging End Time,Energy Consumed (kWh),Charging Duration (hours),Charging Rate (kW),...,Time of Day,Day of Week,State of Charge (Start %),State of Charge (End %),Distance Driven (since last charge) (km),Temperature (°C),Vehicle Age (years),Charger Type,User Type,total_duration
0,User_1,BMW i3,108.463007,Station_391,Houston,2024-01-01 00:00:00,2024-01-01 00:39:00,60.712346,0.591363,36.389181,...,Evening,Tuesday,29.371576,86.119962,293.602111,27.947953,2.0,DC Fast Charger,Commuter,0.65
1,User_2,Hyundai Kona,100.0,Station_428,San Francisco,2024-01-01 01:00:00,2024-01-01 03:01:00,12.339275,3.133652,30.677735,...,Morning,Monday,10.115778,84.664344,112.112804,14.311026,3.0,Level 1,Casual Driver,2.02
2,User_3,Chevy Bolt,75.0,Station_181,San Francisco,2024-01-01 02:00:00,2024-01-01 04:48:00,19.128876,2.452653,27.513593,...,Morning,Thursday,6.854604,69.917615,71.799253,21.002002,2.0,Level 2,Commuter,2.8
3,User_4,Hyundai Kona,50.0,Station_327,Houston,2024-01-01 03:00:00,2024-01-01 06:42:00,79.457824,1.266431,32.88287,...,Evening,Saturday,83.120003,99.624328,199.577785,38.316313,1.0,Level 1,Long-Distance Traveler,3.7
4,User_5,Hyundai Kona,50.0,Station_108,Los Angeles,2024-01-01 04:00:00,2024-01-01 05:46:00,19.629104,2.019765,10.215712,...,Morning,Saturday,54.25895,63.743786,203.661847,-7.834199,1.0,Level 1,Long-Distance Traveler,1.77


In [62]:
df = df.drop(columns=["Charging Start Time","Charging End Time","Charging Duration (hours)","User ID"])

In [63]:
df['Charger Type'].value_counts()

Charger Type
Level 1            459
Level 2            431
DC Fast Charger    430
Name: count, dtype: int64

In [64]:
df.head()

Unnamed: 0,Vehicle Model,Battery Capacity (kWh),Charging Station ID,Charging Station Location,Energy Consumed (kWh),Charging Rate (kW),Charging Cost (USD),Time of Day,Day of Week,State of Charge (Start %),State of Charge (End %),Distance Driven (since last charge) (km),Temperature (°C),Vehicle Age (years),Charger Type,User Type,total_duration
0,BMW i3,108.463007,Station_391,Houston,60.712346,36.389181,13.087717,Evening,Tuesday,29.371576,86.119962,293.602111,27.947953,2.0,DC Fast Charger,Commuter,0.65
1,Hyundai Kona,100.0,Station_428,San Francisco,12.339275,30.677735,21.128448,Morning,Monday,10.115778,84.664344,112.112804,14.311026,3.0,Level 1,Casual Driver,2.02
2,Chevy Bolt,75.0,Station_181,San Francisco,19.128876,27.513593,35.66727,Morning,Thursday,6.854604,69.917615,71.799253,21.002002,2.0,Level 2,Commuter,2.8
3,Hyundai Kona,50.0,Station_327,Houston,79.457824,32.88287,13.036239,Evening,Saturday,83.120003,99.624328,199.577785,38.316313,1.0,Level 1,Long-Distance Traveler,3.7
4,Hyundai Kona,50.0,Station_108,Los Angeles,19.629104,10.215712,10.161471,Morning,Saturday,54.25895,63.743786,203.661847,-7.834199,1.0,Level 1,Long-Distance Traveler,1.77


In [65]:
df = df.drop(columns=["Battery Capacity (kWh)","State of Charge (Start %)","State of Charge (End %)","Distance Driven (since last charge) (km)","Temperature (°C)"])

In [66]:
df = df.drop(columns=["Charging Station Location","Charging Station ID","Vehicle Model"])

In [67]:
df.dtypes

Energy Consumed (kWh)    float64
Charging Rate (kW)       float64
Charging Cost (USD)      float64
Time of Day               object
Day of Week               object
Vehicle Age (years)      float64
Charger Type              object
User Type                 object
total_duration           float64
dtype: object

In [68]:
df.head()

Unnamed: 0,Energy Consumed (kWh),Charging Rate (kW),Charging Cost (USD),Time of Day,Day of Week,Vehicle Age (years),Charger Type,User Type,total_duration
0,60.712346,36.389181,13.087717,Evening,Tuesday,2.0,DC Fast Charger,Commuter,0.65
1,12.339275,30.677735,21.128448,Morning,Monday,3.0,Level 1,Casual Driver,2.02
2,19.128876,27.513593,35.66727,Morning,Thursday,2.0,Level 2,Commuter,2.8
3,79.457824,32.88287,13.036239,Evening,Saturday,1.0,Level 1,Long-Distance Traveler,3.7
4,19.629104,10.215712,10.161471,Morning,Saturday,1.0,Level 1,Long-Distance Traveler,1.77


In [69]:
df["Charging Rate (kW)"] = df["Energy Consumed (kWh)"]*df["total_duration"]

In [70]:
df.head()

Unnamed: 0,Energy Consumed (kWh),Charging Rate (kW),Charging Cost (USD),Time of Day,Day of Week,Vehicle Age (years),Charger Type,User Type,total_duration
0,60.712346,39.463025,13.087717,Evening,Tuesday,2.0,DC Fast Charger,Commuter,0.65
1,12.339275,24.925336,21.128448,Morning,Monday,3.0,Level 1,Casual Driver,2.02
2,19.128876,53.560852,35.66727,Morning,Thursday,2.0,Level 2,Commuter,2.8
3,79.457824,293.993949,13.036239,Evening,Saturday,1.0,Level 1,Long-Distance Traveler,3.7
4,19.629104,34.743514,10.161471,Morning,Saturday,1.0,Level 1,Long-Distance Traveler,1.77


In [71]:
df.isnull().sum()

Energy Consumed (kWh)    66
Charging Rate (kW)       66
Charging Cost (USD)       0
Time of Day               0
Day of Week               0
Vehicle Age (years)       0
Charger Type              0
User Type                 0
total_duration            0
dtype: int64

In [72]:
#splitting numerical and categorical cols:

num_cols=df.select_dtypes(include=[np.number]).columns
cat_cols=df.select_dtypes(exclude=[np.number]).columns

In [73]:
num_cols

Index(['Energy Consumed (kWh)', 'Charging Rate (kW)', 'Charging Cost (USD)',
       'Vehicle Age (years)', 'total_duration'],
      dtype='object')

In [74]:
cat_cols

Index(['Time of Day', 'Day of Week', 'Charger Type', 'User Type'], dtype='object')

In [None]:
#only to check whether it is skewed data or not

for col in df.select_dtypes(include=['number']).columns:
    plt.figure(figsize=(4,4))
    sns.histplot(df[col], kde=True, bins=20)
    plt.title(f"Histogram of {col}")
    plt.xlabel(col)
    plt.ylabel("Frequency")
    plt.show()

In [76]:
num_imputer = SimpleImputer(strategy="median")
df[num_cols] = num_imputer.fit_transform(df[num_cols])

In [77]:
cat_imputer=SimpleImputer(strategy="most_frequent")

df[cat_cols]=cat_imputer.fit_transform(df[cat_cols])

In [None]:
#only to check whether it is skewed data or not

for col in df.select_dtypes(include=['number']).columns:
    plt.figure(figsize=(4,4))
    sns.histplot(df[col], kde=True, bins=20)
    plt.title(f"Histogram of {col}")
    plt.xlabel(col)
    plt.ylabel("Frequency")
    plt.show()

In [79]:
df.head()

Unnamed: 0,Energy Consumed (kWh),Charging Rate (kW),Charging Cost (USD),Time of Day,Day of Week,Vehicle Age (years),Charger Type,User Type,total_duration
0,60.712346,39.463025,13.087717,Evening,Tuesday,2.0,DC Fast Charger,Commuter,0.65
1,12.339275,24.925336,21.128448,Morning,Monday,3.0,Level 1,Casual Driver,2.02
2,19.128876,53.560852,35.66727,Morning,Thursday,2.0,Level 2,Commuter,2.8
3,79.457824,293.993949,13.036239,Evening,Saturday,1.0,Level 1,Long-Distance Traveler,3.7
4,19.629104,34.743514,10.161471,Morning,Saturday,1.0,Level 1,Long-Distance Traveler,1.77


In [80]:
# Encode categorical columns using Label encoder

encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    encoders[col] = le


In [81]:
df['Vehicle Age (years)']=np.round(df['Vehicle Age (years)'])

In [82]:
df.head()

Unnamed: 0,Energy Consumed (kWh),Charging Rate (kW),Charging Cost (USD),Time of Day,Day of Week,Vehicle Age (years),Charger Type,User Type,total_duration
0,60.712346,39.463025,13.087717,1,5,2.0,0,1,0.65
1,12.339275,24.925336,21.128448,2,1,3.0,1,0,2.02
2,19.128876,53.560852,35.66727,2,4,2.0,2,1,2.8
3,79.457824,293.993949,13.036239,1,2,1.0,1,2,3.7
4,19.629104,34.743514,10.161471,2,2,1.0,1,2,1.77


In [107]:
df["Cost per Unit"] = df["Charging Cost (USD)"] / df["Energy Consumed (kWh)"]

features = [
    "Charging Rate (kW)", "total_duration",
    "Vehicle Age (years)",
    "Charger Type", "User Type", "Time of Day", "Day of Week"
]

In [110]:
df['Cost per Unit']=np.round(df['Cost per Unit'],2)


In [111]:
df.head()

Unnamed: 0,Energy Consumed (kWh),Charging Rate (kW),Charging Cost (USD),Time of Day,Day of Week,Vehicle Age (years),Charger Type,User Type,total_duration,Cost per Unit
0,60.712346,39.463025,13.087717,1,5,2.0,0,1,0.65,0.22
1,12.339275,24.925336,21.128448,2,1,3.0,1,0,2.02,1.71
2,19.128876,53.560852,35.66727,2,4,2.0,2,1,2.8,1.86
3,79.457824,293.993949,13.036239,1,2,1.0,1,2,3.7,0.16
4,19.629104,34.743514,10.161471,2,2,1.0,1,2,1.77,0.52


In [103]:
X=df[features]
y=df['Cost per Unit']

In [104]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [112]:
numeric_features = ["Charging Rate (kW)", "total_duration", "Vehicle Age (years)"]
categorical_features = ["Charger Type", "User Type", "Time of Day", "Day of Week"]

In [114]:
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric_features),
    ("cat", OneHotEncoder(drop="first"), categorical_features)
])

In [122]:

models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42)
}

# Train and evaluate
for name, model in models.items():
    pipeline = Pipeline([
        ("preprocess", preprocessor),
        ("regressor", model)
    ])
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    print(f"\n📊 {name}")
    print("R² Score:", r2_score(y_test, y_pred))
    print("RMSE:", mean_squared_error(y_test, y_pred))



📊 Linear Regression
R² Score: -7.226625080181634
RMSE: 10.642790673593726

📊 Random Forest
R² Score: 0.6109362989795715
RMSE: 0.5033319846590909

📊 Gradient Boosting
R² Score: 0.5863623468072308
RMSE: 0.5351233239317628

📊 XGBoost
R² Score: 0.5322148893190496
RMSE: 0.605173927908151


In [106]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Train faster XGBoost model
xgb_model_fast = XGBRegressor(
    n_estimators=100,     # number of trees
    learning_rate=0.1,    # step size
    max_depth=4,          # tree depth
    random_state=42
)
xgb_model_fast.fit(X_train, y_train)

# Predictions
y_pred_xgb_fast = xgb_model_fast.predict(X_test)

# Metrics
mae_xgb_fast = mean_absolute_error(y_test, y_pred_xgb_fast)
rmse_xgb_fast = np.sqrt(mean_squared_error(y_test, y_pred_xgb_fast))
r2_xgb_fast = r2_score(y_test, y_pred_xgb_fast)

print("XGBoost Performance:")
print("MAE :", mae_xgb_fast)
print("RMSE:", rmse_xgb_fast)
print("R²  :", r2_xgb_fast)


XGBoost Performance:
MAE : 0.390500612396634
RMSE: 0.7367775259935451
R²  : 0.5803966710971749
