In [1]:
pip install pandas numpy scikit-learn catboost joblib


Collecting catboost
  Downloading catboost-1.2.7-cp312-cp312-win_amd64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting plotly (from catboost)
  Downloading plotly-6.0.0-py3-none-any.whl.metadata (5.6 kB)
Collecting narwhals>=1.15.1 (from plotly->catboost)
  Downloading narwhals-1.29.0-py3-none-any.whl.metadata (10 kB)
Downloading catboost-1.2.7-cp312-cp312-win_amd64.whl (101.7 MB)
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.1/101.7 MB 2.0 MB/s eta 0:00:52
   ---------------------------------------- 0.2/101.7 MB 3.5 MB/s eta 0:00:30
   ---------------------------------------- 0.6/101.7 MB 5.0 MB/s eta 0:00:21
    --------------------------------------- 1.7/101.7 MB 11.0 MB/s eta 0:00:10
   - -------------------------------------- 3.4/101.7 MB 16.8 MB/s eta 0:00:06
   - -------------------------------------- 4.7/101.7 MB 18.9 MB/s e

In [74]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from catboost import CatBoostRegressor
import torch
import torch.nn as nn
import torch.optim as optim
import joblib

# GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

print("Step 1: Loading dataset...")
df = pd.read_csv(r"D:\projects\Raptee_hv\ev_bike_dataset_cleaned.csv")
print("Dataset loaded successfully!")

print("Step 2: Feature Engineering...")
df['Speed_Variation'] = df['Average Speed (km/h)'].diff().fillna(0)
df['Temp_Change'] = df['Temperature (°C)'].diff().fillna(0)
df['Battery_Usage'] = df['Battery Start Percentage (%)'] - df['Battery End Percentage (%)']
df['Traffic_Level'] = df['Traffic'].map({'Low': 1, 'Medium': 2, 'High': 3})
df['Battery Health Status'] = df['Battery Health Status'].map({'Healthy': 0, 'Aging': 1, 'Degraded': 2})
print("Feature Engineering completed!")

print("Step 3: Selecting features and target...")
features = ['Ride Duration (minutes)', 'Distance Traveled (km)', 'Battery Start Percentage (%)',
            'Battery End Percentage (%)', 'Average Speed (km/h)', 'Elevation (%)', 'Temperature (°C)',
            'Traffic_Level', 'Load Weight (kg)', 'Energy Consumed (Wh)', 'Battery Health Status',
            'Efficiency (Wh/km)', 'Speed_Variation', 'Temp_Change', 'Battery_Usage']
target = 'Range Estimation (km)'
print("Feature selection completed!")

print("Step 4: Splitting data into training and testing sets...")
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print("Data split completed!")

print("Step 5: Preprocessing - Scaling and Encoding...")
num_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_features = ['Battery Health Status']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ]
)
print("Preprocessing completed!")

print("Step 6: Training Random Forest Model with Hyperparameter Tuning...")
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10]
}
rf = GridSearchCV(RandomForestRegressor(), rf_params, cv=3, scoring='r2', verbose=2)

# Define pipeline with preprocessing and model
pipeline_rf = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', rf)])

# Train the pipeline
pipeline_rf.fit(X_train, y_train)

print("Best RF Parameters:", rf.best_params_)
print("Random Forest training completed!")

# Save the entire model pipeline
joblib.dump(pipeline_rf, 'random_forest_model.pkl')
print("Random Forest model saved!")

# Extract and save the scaler separately
scaler = pipeline_rf.named_steps['preprocessor'].named_transformers_['num']
joblib.dump(scaler, "scaler.pkl")
print("Scaler saved successfully!")



Using device: cuda
Step 1: Loading dataset...
Dataset loaded successfully!
Step 2: Feature Engineering...
Feature Engineering completed!
Step 3: Selecting features and target...
Feature selection completed!
Step 4: Splitting data into training and testing sets...
Data split completed!
Step 5: Preprocessing - Scaling and Encoding...
Preprocessing completed!
Step 6: Training Random Forest Model with Hyperparameter Tuning...
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] END ..max_depth=5, min_samples_split=2, n_estimators=50; total time=   0.4s
[CV] END ..max_depth=5, min_samples_split=2, n_estimators=50; total time=   0.4s
[CV] END ..max_depth=5, min_samples_split=2, n_estimators=50; total time=   0.5s
[CV] END .max_depth=5, min_samples_split=2, n_estimators=100; total time=   1.0s
[CV] END .max_depth=5, min_samples_split=2, n_estimators=100; total time=   0.9s
[CV] END .max_depth=5, min_samples_split=2, n_estimators=100; total time=   0.9s
[CV] END .max_depth=5, min_

In [16]:
import joblib


pipeline_rf = joblib.load('random_forest_model.pkl')

print("Random Forest model loaded successfully!")


Random Forest model loaded successfully!


In [17]:
# Predict on the test set
y_test_pred = pipeline_rf.predict(X_test)

# Print some sample predictions
print("Sample Predictions:")
print(y_test_pred[:10])


Sample Predictions:
[115.77485198  13.65354454  97.02491341 127.51347988  97.30621534
 108.58300484  31.8394539  124.75900392 117.42275986 115.7534117 ]


In [18]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)

# Print evaluation results
print("Random Forest Model Performance:")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"R² Score: {r2:.4f}")


Random Forest Model Performance:
MAE: 0.9743
MSE: 4.1490
R² Score: 0.9963


In [65]:
new_data = pd.DataFrame([{
    'Ride Duration (minutes)': 30,
    'Distance Traveled (km)': 10.5,
    'Battery Start Percentage (%)': 80,
    'Battery End Percentage (%)': 65,
    'Average Speed (km/h)': 40,
    'Elevation (%)': 5,
    'Temperature (°C)': 28,
    'Traffic_Level': 2,
    'Load Weight (kg)': 75,
    'Energy Consumed (Wh)': 500,
    'Battery Health Status': '0',
    'Efficiency (Wh/km)': 48,
    'Speed_Variation': 2,
    'Temp_Change': 0.5,
    'Battery_Usage': 15
}])

# Apply the same preprocessing (scaling & encoding)
y_pred_new = pipeline_rf.predict(new_data)

print(f"Predicted EV Range: {y_pred_new[0]:.2f} km")


Predicted EV Range: 82.42 km


In [77]:

print("Step 7: Training CatBoost Model with Early Stopping...")
cat_model = CatBoostRegressor(iterations=500, learning_rate=0.05, depth=5, 
                              early_stopping_rounds=50, verbose=10)

# Specify categorical features
cat_features = ['Battery Health Status']

# Fit the model with categorical features specified
cat_model.fit(X_train, y_train, eval_set=(X_test, y_test), cat_features=cat_features)

print("CatBoost training completed!")
joblib.dump(cat_model, 'catboost_model.pkl')
print("CatBoost model saved!")


Step 7: Training CatBoost Model with Early Stopping...
0:	learn: 31.6558084	test: 32.3713896	best: 32.3713896 (0)	total: 89.5ms	remaining: 44.7s
10:	learn: 21.1398970	test: 21.8337491	best: 21.8337491 (10)	total: 647ms	remaining: 28.8s
20:	learn: 14.2838213	test: 14.9630839	best: 14.9630839 (20)	total: 1.2s	remaining: 27.3s
30:	learn: 10.1239496	test: 10.8456092	best: 10.8456092 (30)	total: 1.74s	remaining: 26.3s
40:	learn: 7.1992431	test: 7.8902952	best: 7.8902952 (40)	total: 2.28s	remaining: 25.5s
50:	learn: 5.2645522	test: 5.9839994	best: 5.9839994 (50)	total: 2.85s	remaining: 25.1s
60:	learn: 3.9748196	test: 4.7345714	best: 4.7345714 (60)	total: 3.42s	remaining: 24.7s
70:	learn: 3.0624590	test: 3.8193722	best: 3.8193722 (70)	total: 4.01s	remaining: 24.2s
80:	learn: 2.4905637	test: 3.2331799	best: 3.2331799 (80)	total: 4.56s	remaining: 23.6s
90:	learn: 2.1023208	test: 2.8258620	best: 2.8258620 (90)	total: 5.13s	remaining: 23.1s
100:	learn: 1.8517120	test: 2.5598405	best: 2.5598405 (

In [5]:

print("Step 8: Training HistGradientBoostingRegressor Model with Early Stopping...")

# One-hot encode categorical feature before training
X_train_encoded = pd.get_dummies(X_train, columns=['Battery Health Status'])
X_test_encoded = pd.get_dummies(X_test, columns=['Battery Health Status'])

hist_model = HistGradientBoostingRegressor(max_iter=500, learning_rate=0.05, 
                                           max_depth=5, early_stopping=True, verbose=1)

hist_model.fit(X_train_encoded, y_train)
print("HistGradientBoostingRegressor training completed!")

joblib.dump(hist_model, 'hist_gradient_boosting_model.pkl')
print("HistGradientBoostingRegressor model saved!")


Step 8: Training HistGradientBoostingRegressor Model with Early Stopping...
Binning 0.000 GB of training data: 0.049 s
Binning 0.000 GB of validation data: 0.001 s
Fitting gradient boosted rounds:
Fit 202 trees in 1.288 s, (2447 total leaves)
Time spent computing histograms: 0.254s
Time spent finding best splits:  0.168s
Time spent applying splits:      0.297s
Time spent predicting:           0.017s
HistGradientBoostingRegressor training completed!
HistGradientBoostingRegressor model saved!


In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

# Ensure categorical variables are encoded
le = LabelEncoder()
X_train['Battery Health Status'] = le.fit_transform(X_train['Battery Health Status'])
X_test['Battery Health Status'] = le.transform(X_test['Battery Health Status'])

# Feature Scaling for Input (Standardization)
scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

# Scaling Target Variable (Min-Max Normalization)
scaler_y = MinMaxScaler()
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1))

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train_scaled, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test_scaled, dtype=torch.float32).to(device)

# Define LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, 20, batch_first=True)  # Reduced hidden size to 20
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(20, 1)

    def forward(self, x):
        x, _ = self.lstm(x.unsqueeze(1))  # Adding extra dimension for LSTM input
        x = self.dropout(x[:, -1, :])
        x = self.fc(x)
        return x

# Initialize model, loss, and optimizer
model = LSTMModel(X_train.shape[1]).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 50
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    predictions = model(X_train_tensor)
    loss = criterion(predictions, y_train_tensor)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

print("LSTM Model training completed!")

# Inverse Transform Predictions to Original Scale
y_test_pred = model(X_test_tensor).detach().cpu().numpy()
y_test_pred_original = scaler_y.inverse_transform(y_test_pred)

# Evaluate Model Performance
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

print("Evaluation Metrics:")
print("MAE:", mean_absolute_error(y_test, y_test_pred_original))
print("MSE:", mean_squared_error(y_test, y_test_pred_original))
print("R² Score:", r2_score(y_test, y_test_pred_original))


Epoch 1/50, Loss: 0.26019346714019775
Epoch 2/50, Loss: 0.25680848956108093
Epoch 3/50, Loss: 0.2525625228881836
Epoch 4/50, Loss: 0.2473595142364502
Epoch 5/50, Loss: 0.24594295024871826
Epoch 6/50, Loss: 0.24183568358421326
Epoch 7/50, Loss: 0.23639462888240814
Epoch 8/50, Loss: 0.23461443185806274
Epoch 9/50, Loss: 0.231167271733284
Epoch 10/50, Loss: 0.22802124917507172
Epoch 11/50, Loss: 0.22268757224082947
Epoch 12/50, Loss: 0.21989013254642487
Epoch 13/50, Loss: 0.21563559770584106
Epoch 14/50, Loss: 0.2143423855304718
Epoch 15/50, Loss: 0.20934590697288513
Epoch 16/50, Loss: 0.2070801854133606
Epoch 17/50, Loss: 0.2030351758003235
Epoch 18/50, Loss: 0.20097854733467102
Epoch 19/50, Loss: 0.19615942239761353
Epoch 20/50, Loss: 0.19412775337696075
Epoch 21/50, Loss: 0.1907486915588379
Epoch 22/50, Loss: 0.18761610984802246
Epoch 23/50, Loss: 0.1844903528690338
Epoch 24/50, Loss: 0.1820054054260254
Epoch 25/50, Loss: 0.18030807375907898
Epoch 26/50, Loss: 0.17474864423274994
Epoch

In [10]:
y_test_pred = model(X_test_tensor).detach().cpu().numpy()
y_test_pred_original = scaler_y.inverse_transform(y_test_pred)

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
print("Evaluation Metrics:")
print("MAE:", mean_absolute_error(y_test, y_test_pred_original))
print("MSE:", mean_squared_error(y_test, y_test_pred_original))
print("R² Score:", r2_score(y_test, y_test_pred_original))


Evaluation Metrics:
MAE: 37.94274311237737
MSE: 1965.897445775887
R² Score: -0.7333692336873303


In [75]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from catboost import CatBoostRegressor

# Encode categorical features
le = LabelEncoder()
X_train['Battery Health Status'] = le.fit_transform(X_train['Battery Health Status'])
X_test['Battery Health Status'] = le.transform(X_test['Battery Health Status'])
joblib.dump(le, "label_encoder.pkl")


# Feature Scaling (Standardization)
scaler = joblib.load("scaler.pkl")
print("Scaler loaded successfully!")


# Initialize and Train CatBoost Model
cat_model = CatBoostRegressor(
    iterations=1000,  # More iterations for better learning
    learning_rate=0.05,  # Balanced learning rate
    depth=6,  # Controls complexity
    early_stopping_rounds=50,  # Stops training if no improvement
    verbose=100
)

cat_model.fit(X_train_scaled, y_train, eval_set=(X_test_scaled, y_test))

# Make Predictions
y_test_pred = cat_model.predict(X_test_scaled)

# Evaluate Model Performance
print("Evaluation Metrics:")
print("MAE:", mean_absolute_error(y_test, y_test_pred))
print("MSE:", mean_squared_error(y_test, y_test_pred))
print("R² Score:", r2_score(y_test, y_test_pred))

# Save the model
cat_model.save_model("catboost_ev_model.cbm")
print("CatBoost model saved successfully!")


Scaler loaded successfully!
0:	learn: 31.7661507	test: 32.4599843	best: 32.4599843 (0)	total: 7.28ms	remaining: 7.27s
100:	learn: 1.7271488	test: 2.6819322	best: 2.6819322 (100)	total: 454ms	remaining: 4.04s
200:	learn: 1.0354490	test: 2.0102093	best: 2.0102093 (200)	total: 880ms	remaining: 3.5s
300:	learn: 0.8518520	test: 1.9194681	best: 1.9194681 (300)	total: 1.32s	remaining: 3.06s
400:	learn: 0.7141863	test: 1.8852906	best: 1.8852906 (400)	total: 1.77s	remaining: 2.64s
500:	learn: 0.6223715	test: 1.8689814	best: 1.8689611 (494)	total: 2.2s	remaining: 2.19s
600:	learn: 0.5537909	test: 1.8547366	best: 1.8547366 (600)	total: 2.65s	remaining: 1.76s
700:	learn: 0.4937815	test: 1.8470486	best: 1.8470486 (700)	total: 3.1s	remaining: 1.32s
800:	learn: 0.4413208	test: 1.8439196	best: 1.8439066 (799)	total: 3.56s	remaining: 885ms
900:	learn: 0.3971594	test: 1.8397290	best: 1.8397290 (900)	total: 4s	remaining: 440ms
999:	learn: 0.3594438	test: 1.8352235	best: 1.8349734 (986)	total: 4.42s	remai

In [76]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor

# Load the CatBoost Model
cat_model = CatBoostRegressor()
cat_model.load_model("catboost_ev_model.cbm")
print("CatBoost model loaded successfully!")

# Make Predictions on Test Data
y_test_pred = cat_model.predict(X_test_scaled)

# Evaluate Model Performance
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)

print("Evaluation Metrics (CatBoost):")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"R² Score: {r2:.4f}")

#user input
new_data = pd.DataFrame([{
    'Ride Duration (minutes)': 30,
    'Distance Traveled (km)': 10.5,
    'Battery Start Percentage (%)': 80,
    'Battery End Percentage (%)': 65,
    'Average Speed (km/h)': 40,
    'Elevation (%)': 5,
    'Temperature (°C)': 28,
    'Traffic_Level': 2,  # 1 = Low, 2 = Medium, 3 = High
    'Load Weight (kg)': 75,
    'Energy Consumed (Wh)': 500,
    'Battery Health Status': '0',
    'Efficiency (Wh/km)': 48,
    'Speed_Variation': 2,
    'Temp_Change': 0.5,
    'Battery_Usage': 15
}])


if 'Battery Health Status' in new_data:
    battery_status = new_data.loc[0, 'Battery Health Status']

   
    if battery_status in le.classes_:
        encoded_value = le.transform([battery_status])[0]
    else:
        print(f" Warning: '{battery_status}' not found in LabelEncoder classes. Using default.")
        encoded_value = le.transform([le.classes_[0]])[0]  # Default to first known category

    new_data.loc[0, 'Battery Health Status'] = encoded_value


new_data['Battery Health Status'] = new_data['Battery Health Status'].astype(int)


new_data_scaled = scaler.transform(new_data)


y_pred_new = cat_model.predict(new_data_scaled)

print(f"Predicted EV Range for New Input: {y_pred_new[0]:.2f} km")


CatBoost model loaded successfully!
Evaluation Metrics (CatBoost):
MAE: 0.9503
MSE: 3.3671
R² Score: 0.9970
Predicted EV Range for New Input: 80.42 km
