In [27]:
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score

In [2]:
# specify the file_path
file_path = './Electric_cars_dataset.csv'

cars_df = pd.read_csv(file_path)
cars_df.head()

Unnamed: 0,ID,VIN (1-10),County,City,State,ZIP Code,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Base MSRP,Legislative District,DOL Vehicle ID,Vehicle Location,Electric Utility,Expected Price ($1k)
0,EV33174,5YJ3E1EC6L,Snohomish,LYNNWOOD,WA,98037.0,2020.0,TESLA,MODEL 3,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,308,0,32.0,109821694,POINT (-122.287614 47.83874),PUGET SOUND ENERGY INC,50.0
1,EV40247,JN1AZ0CP8B,Skagit,BELLINGHAM,WA,98229.0,2011.0,NISSAN,LEAF,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,73,0,40.0,137375528,POINT (-122.414936 48.709388),PUGET SOUND ENERGY INC,15.0
2,EV12248,WBY1Z2C56F,Pierce,TACOMA,WA,98422.0,2015.0,BMW,I3,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,81,0,27.0,150627382,POINT (-122.396286 47.293138),BONNEVILLE POWER ADMINISTRATION||CITY OF TACOM...,18.0
3,EV55713,1G1RD6E44D,King,REDMOND,WA,98053.0,2013.0,CHEVROLET,VOLT,Plug-in Hybrid Electric Vehicle (PHEV),Clean Alternative Fuel Vehicle Eligible,38,0,45.0,258766301,POINT (-122.024951 47.670286),PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA),33.9
4,EV28799,1G1FY6S05K,Pierce,PUYALLUP,WA,98375.0,2019.0,CHEVROLET,BOLT EV,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,238,0,25.0,296998138,POINT (-122.321062 47.103797),BONNEVILLE POWER ADMINISTRATION||CITY OF TACOM...,41.78


In [3]:
# general info about the dataSet
cars_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64353 entries, 0 to 64352
Data columns (total 18 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   ID                                                 64353 non-null  object 
 1   VIN (1-10)                                         64353 non-null  object 
 2   County                                             64349 non-null  object 
 3   City                                               64344 non-null  object 
 4   State                                              64342 non-null  object 
 5   ZIP Code                                           64347 non-null  float64
 6   Model Year                                         64346 non-null  float64
 7   Make                                               64349 non-null  object 
 8   Model                                              64340 non-null  object 
 9   Electr

In [4]:
cars_df.describe()

Unnamed: 0,ZIP Code,Model Year,Electric Range,Base MSRP,Legislative District,DOL Vehicle ID
count,64347.0,64346.0,64353.0,64353.0,64184.0,64353.0
mean,98143.452888,2018.186212,106.948985,2524.990754,29.951904,197290500.0
std,2856.064329,2.726742,104.093919,12402.895104,14.661124,106946600.0
min,745.0,1993.0,0.0,0.0,0.0,4385.0
25%,98052.0,2017.0,14.0,0.0,19.0,137286500.0
50%,98121.0,2018.0,73.0,0.0,34.0,175377600.0
75%,98370.0,2021.0,215.0,0.0,43.0,229903900.0
max,99701.0,2022.0,337.0,845000.0,49.0,478934600.0


In [5]:
# Genrating a pandas profiling report
profile_report = ProfileReport(cars_df,title="Electic_cars profiling report",explorative=True)
profile_path = 'Electric_cars.html'
profile_report.to_file(profile_path)
print(f"\n Profile report saved to {profile_path}")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]


 Profile report saved to Electric_cars.html


In [6]:
# According to the profile report there are missing values,skewedness in specific 2 columns
cars_df.isnull().sum()

ID                                                     0
VIN (1-10)                                             0
County                                                 4
City                                                   9
State                                                 11
ZIP Code                                               6
Model Year                                             7
Make                                                   4
Model                                                 13
Electric Vehicle Type                                  0
Clean Alternative Fuel Vehicle (CAFV) Eligibility      0
Electric Range                                         0
Base MSRP                                              0
Legislative District                                 169
DOL Vehicle ID                                         0
Vehicle Location                                     510
Electric Utility                                     722
Expected Price ($1k)           

In [7]:
# checking for duplicated values there are no duplicates according to the report
cars_df.duplicated().sum()

np.int64(0)

In [8]:
# Dropping rows with missing essential values: Model Year, Make, Model
df_cleaned = cars_df.dropna(subset=["Model Year", "Make", "Model","Vehicle Location"])

# Filling missing ZIP Code and Legislative District with mode
df_cleaned["ZIP Code"].fillna(df_cleaned["ZIP Code"].mode()[0], inplace=True)
df_cleaned["Legislative District"].fillna(df_cleaned["Legislative District"].mode()[0], inplace=True)

# Filling categorical columns with "Unknown"
categorical_cols = ["County", "City", "State", "Electric Utility"]
df_cleaned[categorical_cols] = df_cleaned[categorical_cols].fillna("Unknown")

# Converting Expected Price ($1k) to numeric
df_cleaned["Expected Price ($1k)"] = pd.to_numeric(df_cleaned["Expected Price ($1k)"], errors="coerce")

# Converting ZIP Code and Model Year to integers
df_cleaned["ZIP Code"] = df_cleaned["ZIP Code"].astype(int)
df_cleaned["Model Year"] = df_cleaned["Model Year"].astype(int)
df_cleaned.info(), df_cleaned.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
Index: 63819 entries, 0 to 64352
Data columns (total 18 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   ID                                                 63819 non-null  object 
 1   VIN (1-10)                                         63819 non-null  object 
 2   County                                             63819 non-null  object 
 3   City                                               63819 non-null  object 
 4   State                                              63819 non-null  object 
 5   ZIP Code                                           63819 non-null  int64  
 6   Model Year                                         63819 non-null  int64  
 7   Make                                               63819 non-null  object 
 8   Model                                              63819 non-null  object 
 9   Electric Ve

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned["ZIP Code"].fillna(df_cleaned["ZIP Code"].mode()[0], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned["ZIP Code"].fillna(df_cleaned["ZIP Code"].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({c

(None,
 ID                                                   0
 VIN (1-10)                                           0
 County                                               0
 City                                                 0
 State                                                0
 ZIP Code                                             0
 Model Year                                           0
 Make                                                 0
 Model                                                0
 Electric Vehicle Type                                0
 Clean Alternative Fuel Vehicle (CAFV) Eligibility    0
 Electric Range                                       0
 Base MSRP                                            0
 Legislative District                                 0
 DOL Vehicle ID                                       0
 Vehicle Location                                     0
 Electric Utility                                     0
 Expected Price ($1k)                    

In [9]:
# Dropping unnecessary columns
df_cleaned.drop(columns=["ID", "VIN (1-10)", "DOL Vehicle ID", "Vehicle Location"], inplace=True)

# Handling outliers using IQR method for Base MSRP and Electric Range
Q1 = df_cleaned[["Base MSRP", "Electric Range"]].quantile(0.25)
Q3 = df_cleaned[["Base MSRP", "Electric Range"]].quantile(0.75)
IQR = Q3 - Q1

# Define acceptable range for values
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter dataset to remove extreme outliers
df_cleaned = df_cleaned[
    (df_cleaned["Base MSRP"] >= lower_bound["Base MSRP"]) & (df_cleaned["Base MSRP"] <= upper_bound["Base MSRP"]) &
    (df_cleaned["Electric Range"] >= lower_bound["Electric Range"]) & (df_cleaned["Electric Range"] <= upper_bound["Electric Range"])
]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.drop(columns=["ID", "VIN (1-10)", "DOL Vehicle ID", "Vehicle Location"], inplace=True)


In [10]:
# Encoding categorical values
encoder = LabelEncoder()
categorical_features = ["County", "City", "State", "Make", "Model", "Electric Vehicle Type",
                        "Clean Alternative Fuel Vehicle (CAFV) Eligibility", "Electric Utility"]
for col in categorical_features:
    df_cleaned[col] = encoder.fit_transform(df_cleaned[col])
df_cleaned

Unnamed: 0,County,City,State,ZIP Code,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Base MSRP,Legislative District,Electric Utility,Expected Price ($1k)
0,117,263,34,98037,2020,26,50,0,0,308,0,32.0,65,50.000
1,115,37,34,98229,2011,21,47,0,0,73,0,40.0,65,15.000
2,93,463,34,98422,2015,3,40,0,0,81,0,27.0,20,18.000
3,60,383,34,98053,2013,5,83,1,0,38,0,45.0,66,33.900
4,93,372,34,98375,2019,5,12,0,0,238,0,25.0,16,41.780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64348,60,423,34,98144,2020,14,54,0,0,239,0,37.0,52,33.000
64349,93,463,34,98402,2013,28,59,1,2,6,0,27.0,20,13.300
64350,60,423,34,98119,2018,5,12,0,0,238,0,36.0,52,22.857
64351,60,423,34,98115,2017,23,17,1,2,14,0,46.0,52,45.700


In [21]:
# Identifying the targets and features
X = df_cleaned.drop(columns=["Expected Price ($1k)"])
y = df_cleaned["Expected Price ($1k)"]
# splitting our data into training and testing 
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
# Standardizing features for SVM
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Training the model using rbf
svm_model = SVR(kernel="rbf") 
svm_model.fit(X_train_scaled, y_train)

In [23]:
# testing our model
y_pred =svm_model.predict(X_test_scaled)
y_pred
y_pred.shape

(12151,)

In [29]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae,rmse,mse,r2

(5.937262676310653,
 np.float64(14.841702541218512),
 220.27613432201204,
 0.6701681635158361)

In [None]:
# Model evaliuation using mean absolute error,mean squared error and r2 score
# since the model  R² is close to 1, the model explains most of the variance in price.
# MAE is low, the model makes small average errors.
# The RMSE is slightly higher than the MAE hence the model makes some errors
# since mse is squared, it magnifies the effect of larger errors.

In [None]:
# How to improve the model
# Check if important predictors are missing (e.g., mileage).
# Instead of Label Encoding, try One-Hot Encoding or Target Encoding for better representation.
#Using Hyperparameter tuning for SVR: C,Gamma