In [None]:
import numpy as np
import pandas as pd

In [None]:
df=pd.read_csv("/content/electric_vehicles_spec_2025.csv.csv")

In [None]:
print(df["range_km"].min())
print(df["range_km"].max())

135
685


In [None]:
df['km_per_kWh'] = 1000 / df['efficiency_wh_per_km']

In [None]:
df = df.dropna(subset=['model'])
df['model'] = df['model'].str.strip()


<h2>Synthetic Data</h2>

In [None]:
synthetic_rows = []
for model in df['model'].unique():
    model_rows = df[df['model'] == model]
    base_row = model_rows.iloc[0]
    for _ in range(10):
        synthetic = base_row.copy()

        # Add synthetic environmental/driving conditions
        synthetic['SOC_percent'] = round(np.random.uniform(40, 100), 1)
        synthetic['temperature_C'] = round(np.random.uniform(10, 45), 1)
        synthetic['trip_distance_km'] = round(np.random.uniform(5, 250), 1)

        # Compute effective range
        soc = synthetic['SOC_percent'] / 100
        temp_factor = 1 - 0.005 * (synthetic['temperature_C'] - 25)
        synthetic['effective_range_km'] = round(
            synthetic['battery_capacity_kWh'] * synthetic['km_per_kWh'] * soc * temp_factor, 2
        )

        synthetic_rows.append(synthetic)

# Combine all synthetic rows into a new dataset
augmented_df = pd.DataFrame(synthetic_rows)

In [None]:
augmented_df.shape

(4770, 27)

In [None]:
augmented_df.sample(5)

Unnamed: 0,brand,model,top_speed_kmh,battery_capacity_kWh,battery_type,number_of_cells,torque_nm,efficiency_wh_per_km,range_km,acceleration_0_100_s,...,length_mm,width_mm,height_mm,car_body_type,source_url,km_per_kWh,SOC_percent,temperature_C,trip_distance_km,effective_range_km
76,CUPRA,Born 170 kW - 77 kWh,160,77.0,Lithium-ion,288.0,310.0,157,460,7.0,...,4322,1809,1540,Hatchback,https://ev-database.org/car/2234/CUPRA-Born-17...,6.369427,93.1,38.2,8.3,426.47
336,Porsche,Taycan 4S Cross Turismo,240,97.0,Lithium-ion,396.0,710.0,188,510,3.8,...,4974,1967,1409,Station/Estate,https://ev-database.org/car/2111/Porsche-Tayca...,5.319149,55.0,27.5,160.3,280.23
88,Citroen,e-C4 X 54 kWh,150,50.8,Lithium-ion,102.0,260.0,119,335,9.1,...,4600,1834,1525,Sedan,https://ev-database.org/car/3050/Citroen-e-C4-...,8.403361,52.1,10.7,156.7,238.31
43,BMW,i5 M60 xDrive Touring (MY25),230,81.2,Lithium-ion,,795.0,179,425,3.9,...,5060,1900,1505,Station/Estate,https://ev-database.org/car/3117/BMW-i5-M60-xD...,5.586592,76.2,43.3,52.7,314.04
152,Hyundai,IONIQ 5 84 kWh RWD (MY24),185,80.0,Lithium-ion,384.0,350.0,155,450,7.5,...,4655,1890,1605,SUV,https://ev-database.org/car/2236/Hyundai-IONIQ...,6.451613,86.8,26.1,227.5,445.54


In [None]:
features=["model",'battery_capacity_kWh', 'SOC_percent','top_speed_kmh', 'torque_nm',
    'acceleration_0_100_s', 'efficiency_wh_per_km',
    'temperature_C', 'car_body_type', 'drivetrain',"effective_range_km","range_km",'trip_distance_km']

target='km_per_kWh'

In [None]:
final_df=augmented_df[features+[target]]

In [None]:
final_df.sample(2)

Unnamed: 0,model,battery_capacity_kWh,SOC_percent,top_speed_kmh,torque_nm,acceleration_0_100_s,efficiency_wh_per_km,temperature_C,car_body_type,drivetrain,effective_range_km,range_km,trip_distance_km,km_per_kWh
296,Zafira-e Life L2 75 kWh,68.0,94.4,130,270.0,14.2,202,10.4,Small Passenger Van,FWD,340.98,260,37.6,4.950495
459,G6 AWD Performance,87.5,64.9,200,660.0,4.1,159,14.4,SUV,AWD,376.08,475,24.2,6.289308


In [None]:
final_df.to_csv("ev_augmented_10x.csv", index=False)

In [None]:
from sklearn.model_selection import train_test_split,cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import mean_absolute_error,r2_score

In [None]:
x=final_df[features]
y=final_df[target]

In [None]:
numeric = x.select_dtypes(include=[np.number]).columns.tolist()
categorical = ['car_body_type', 'drivetrain','model']

In [None]:
preprocessor=ColumnTransformer([
    ("numeric",StandardScaler(),numeric),
    ("categorical",OneHotEncoder(),categorical)
])

In [None]:
def train_evaluate_model(model,name):
  pipeline=Pipeline([
      ("preprocessor",preprocessor),
      ("model",model)
  ])

  x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)
  pipeline.fit(x_train,y_train)
  y_pred=pipeline.predict(x_test)

  print(f"\n {name} Results")
  print(f"MAE: {mean_absolute_error(y_test,y_pred)}")
  print(f"R2: {r2_score(y_test,y_pred)}")
  mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
  print(f"MAPE: {mape:.2f}%")
  accuracy = 100 - mape
  print(f"Approx. Accuracy: {accuracy:.2f}%")
  return pipeline

In [None]:
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
xgb_model=train_evaluate_model(xgb.XGBRegressor(n_estimators=100),"XGBoost")


 XGBoost Results
MAE: 0.00014443787025530532
R2: 0.9999999607038331
MAPE: 0.00%
Approx. Accuracy: 100.00%


In [None]:
lgb_model=train_evaluate_model(lgb.LGBMRegressor(n_estimators=100,min_gain_to_split=0.0,min_data_in_leaf=1),"LightGBM")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000253 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2575
[LightGBM] [Info] Number of data points in the train set: 3816, number of used features: 498
[LightGBM] [Info] Start training from score 6.340433

 LightGBM Results
MAE: 2.283933349226267e-05
R2: 0.9999999992255335
MAPE: 0.00%
Approx. Accuracy: 100.00%


In [None]:
rf_model=train_evaluate_model(RandomForestRegressor(n_estimators=100),"Random Forest")

ValueError: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

<h2>TabNet</h2>


In [None]:
!pip install pytorch-tabnet --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m99.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m65.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

def train_evaluate_model_2(model, name):

    pipeline=Pipeline([
      ("preprocessor",preprocessor),
  ])

    x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)
    pipeline.fit(x_train,y_train)
    # Convert to NumPy arrays
    X_train_np = x_train.values if hasattr(x_train, "values") else x_train
    X_test_np = x_test.values if hasattr(x_test, "values") else x_test
    y_train_np = y_train.values.reshape(-1, 1)
    y_test_np = y_test.values.reshape(-1, 1)



    # Train the model
    model.fit(
        X_train=X_train_np,
        y_train=y_train_np,
        eval_set=[(X_test_np, y_test_np)],
        eval_metric=['mae'],
        max_epochs=100,
        patience=20,
        batch_size=256,
        virtual_batch_size=128,
    )

    # Predict
    y_pred = model.predict(X_test_np).ravel()

    # Evaluate
    print(f"\n📊 {name} Results")
    print("MAE:", mean_absolute_error(y_test, y_pred))
    print("R² Score:", r2_score(y_test, y_pred))

    return model


In [None]:
from pytorch_tabnet.tab_model import TabNetRegressor

tabnet_model = TabNetRegressor()
tabnet_model = train_evaluate_model_2(tabnet_model, "TabNet")


ValueError: could not convert string to float: 'A290 Electric 220 hp'

In [None]:
from pytorch_tabnet.tab_model import TabNetRegressor

tabnet_model = TabNetRegressor()




In [None]:

# def train_evaluate_model_2(model, name, X_train, X_test, y_train, y_test):
#   y_train_np = y_train.values.reshape(-1, 1)
#   y_test_np = y_test.values.reshape(-1, 1)
#     # Train the model
#   model.fit(
#         X_train=X_train,
#         y_train=y_train.reshape(-1, 1),
#         eval_set=[(X_test, y_test.reshape(-1, 1))],
#         eval_metric=['mae'],
#         max_epochs=100,
#         patience=20,
#         batch_size=256,
#         virtual_batch_size=128,
#     )

#     # Predict
#   y_pred = model.predict(X_test).ravel()

#     # Evaluate
#   print(f"\n📊 {name} Results")
#   print("MAE:", mean_absolute_error(y_test, y_pred))
#   print("R² Score:", r2_score(y_test, y_pred))

#   return model

In [None]:
tabnet_model = train_evaluate_model_2(tabnet_model, "TabNet", x_train, x_test, y_train, y_test)

AttributeError: 'Series' object has no attribute 'reshape'