In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.model_selection import train_test_split  # For splitting data into training and testing sets
from sklearn.preprocessing import StandardScaler, OneHotEncoder  # For scaling numerical data and encoding categorical data
from sklearn.svm import SVR  # For Support Vector Regression 
import xgboost as xgb # For XGBoost Regression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error  # For model evaluation metrics


In [147]:
df = pd.read_csv('Electric_cars_dataset.csv')
df.head()

Unnamed: 0,ID,VIN (1-10),County,City,State,ZIP Code,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Base MSRP,Legislative District,DOL Vehicle ID,Vehicle Location,Electric Utility,Expected Price ($1k)
0,EV33174,5YJ3E1EC6L,Snohomish,LYNNWOOD,WA,98037.0,2020.0,TESLA,MODEL 3,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,308,0,32.0,109821694,POINT (-122.287614 47.83874),PUGET SOUND ENERGY INC,50.0
1,EV40247,JN1AZ0CP8B,Skagit,BELLINGHAM,WA,98229.0,2011.0,NISSAN,LEAF,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,73,0,40.0,137375528,POINT (-122.414936 48.709388),PUGET SOUND ENERGY INC,15.0
2,EV12248,WBY1Z2C56F,Pierce,TACOMA,WA,98422.0,2015.0,BMW,I3,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,81,0,27.0,150627382,POINT (-122.396286 47.293138),BONNEVILLE POWER ADMINISTRATION||CITY OF TACOM...,18.0
3,EV55713,1G1RD6E44D,King,REDMOND,WA,98053.0,2013.0,CHEVROLET,VOLT,Plug-in Hybrid Electric Vehicle (PHEV),Clean Alternative Fuel Vehicle Eligible,38,0,45.0,258766301,POINT (-122.024951 47.670286),PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA),33.9
4,EV28799,1G1FY6S05K,Pierce,PUYALLUP,WA,98375.0,2019.0,CHEVROLET,BOLT EV,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,238,0,25.0,296998138,POINT (-122.321062 47.103797),BONNEVILLE POWER ADMINISTRATION||CITY OF TACOM...,41.78


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64353 entries, 0 to 64352
Data columns (total 18 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   ID                                                 64353 non-null  object 
 1   VIN (1-10)                                         64353 non-null  object 
 2   County                                             64349 non-null  object 
 3   City                                               64344 non-null  object 
 4   State                                              64342 non-null  object 
 5   ZIP Code                                           64347 non-null  float64
 6   Model Year                                         64346 non-null  float64
 7   Make                                               64349 non-null  object 
 8   Model                                              64340 non-null  object 
 9   Electr

In [148]:
feat = df.drop(['ID','County','VIN (1-10)','ZIP Code','Legislative District','DOL Vehicle ID','Vehicle Location'], axis=1,)
feat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64353 entries, 0 to 64352
Data columns (total 11 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   City                                               64344 non-null  object 
 1   State                                              64342 non-null  object 
 2   Model Year                                         64346 non-null  float64
 3   Make                                               64349 non-null  object 
 4   Model                                              64340 non-null  object 
 5   Electric Vehicle Type                              64353 non-null  object 
 6   Clean Alternative Fuel Vehicle (CAFV) Eligibility  64353 non-null  object 
 7   Electric Range                                     64353 non-null  int64  
 8   Base MSRP                                          64353 non-null  int64  
 9   Electr

In [149]:
feat.duplicated().sum()

47757

In [150]:
feat = feat.drop_duplicates()

In [151]:
feat.duplicated().sum()

0

In [152]:
feat.isnull().sum()

City                                                   9
State                                                 11
Model Year                                             7
Make                                                   4
Model                                                 12
Electric Vehicle Type                                  0
Clean Alternative Fuel Vehicle (CAFV) Eligibility      0
Electric Range                                         0
Base MSRP                                              0
Electric Utility                                     507
Expected Price ($1k)                                   0
dtype: int64

In [153]:
# Drop rows with missing values in specific columns
feat = feat.dropna(subset=['State','City','Model', 'Electric Utility','Model Year','Make'])


In [154]:
feat.isnull().sum()

City                                                 0
State                                                0
Model Year                                           0
Make                                                 0
Model                                                0
Electric Vehicle Type                                0
Clean Alternative Fuel Vehicle (CAFV) Eligibility    0
Electric Range                                       0
Base MSRP                                            0
Electric Utility                                     0
Expected Price ($1k)                                 0
dtype: int64

In [155]:
# Convert Expected Price to float
feat["Expected Price ($1k)"] = feat["Expected Price ($1k)"].astype(float)
feat['Expected Price ($1k)'].dtype


dtype('float64')

In [130]:
feat.info()


<class 'pandas.core.frame.DataFrame'>
Index: 16049 entries, 0 to 64352
Data columns (total 11 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   City                                               16049 non-null  object 
 1   State                                              16049 non-null  object 
 2   Model Year                                         16049 non-null  float64
 3   Make                                               16049 non-null  object 
 4   Model                                              16049 non-null  object 
 5   Electric Vehicle Type                              16049 non-null  object 
 6   Clean Alternative Fuel Vehicle (CAFV) Eligibility  16049 non-null  object 
 7   Electric Range                                     16049 non-null  int64  
 8   Base MSRP                                          16049 non-null  int64  
 9   Electric Ut

In [107]:
feat.describe()

Unnamed: 0,Model Year,Electric Range,Base MSRP,Expected Price ($1k)
count,16049.0,16049.0,16049.0,16049.0
mean,2017.823727,90.200012,3969.477849,39.035228
std,2.86305,96.210176,16171.255833,26.949727
min,1993.0,0.0,0.0,0.0
25%,2016.0,18.0,0.0,20.0
50%,2018.0,40.0,0.0,31.5
75%,2020.0,192.0,0.0,50.0
max,2022.0,337.0,845000.0,1100.0


In [156]:
num_feat = feat.select_dtypes(include='number').columns
num_feat = num_feat.drop(['Model Year','Expected Price ($1k)'])
num_feat

Index(['Electric Range', 'Base MSRP'], dtype='object')

In [157]:
# Dealing with outliers


from scipy.stats import zscore

# Calculate Z-scores for all numerical columns
z_scores = df[num_feat].apply(zscore)

# Set the Z-score threshold for detecting outliers
threshold = 3

# Start with the original dataframe and remove outliers progressively
df_no_outliers = feat.copy()

# Remove outliers per column, progressively updating the dataframe
for col in num_feat:
    df_no_outliers = df_no_outliers[(z_scores[col] < threshold) & (z_scores[col] > -threshold)]

# df_no_outliers now contains rows without outliers based on Z-scores

# Print the shape of the DataFrame before and after removing outliers
print("Original shape:", df.shape)
print("Shape after removing outliers:", df_no_outliers.shape)

# The DataFrame df now has outliers removed on a per-column basis

Original shape: (64353, 18)
Shape after removing outliers: (15354, 11)


  df_no_outliers = df_no_outliers[(z_scores[col] < threshold) & (z_scores[col] > -threshold)]
  df_no_outliers = df_no_outliers[(z_scores[col] < threshold) & (z_scores[col] > -threshold)]


In [158]:
feat.head()

Unnamed: 0,City,State,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Base MSRP,Electric Utility,Expected Price ($1k)
0,LYNNWOOD,WA,2020.0,TESLA,MODEL 3,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,308,0,PUGET SOUND ENERGY INC,50.0
1,BELLINGHAM,WA,2011.0,NISSAN,LEAF,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,73,0,PUGET SOUND ENERGY INC,15.0
2,TACOMA,WA,2015.0,BMW,I3,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,81,0,BONNEVILLE POWER ADMINISTRATION||CITY OF TACOM...,18.0
3,REDMOND,WA,2013.0,CHEVROLET,VOLT,Plug-in Hybrid Electric Vehicle (PHEV),Clean Alternative Fuel Vehicle Eligible,38,0,PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA),33.9
4,PUYALLUP,WA,2019.0,CHEVROLET,BOLT EV,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,238,0,BONNEVILLE POWER ADMINISTRATION||CITY OF TACOM...,41.78


In [91]:
cat_feat = feat.select_dtypes(include='object').columns
cat_feat

Index(['City', 'State', 'Make', 'Model', 'Electric Vehicle Type',
       'Clean Alternative Fuel Vehicle (CAFV) Eligibility',
       'Electric Utility'],
      dtype='object')

In [143]:
# Calculate frequency of each city
feat['Model'].value_counts()



Model
LEAF                    1648
MODEL 3                 1280
MODEL S                 1166
VOLT                    1061
BOLT EV                  770
                        ... 
A8 E                       1
LIFE                       1
BENTAYGA                   1
918 SPYDER                 1
PRIUS PLUG-IN HYBRID       1
Name: count, Length: 107, dtype: int64

In [159]:
# Map frequency to each city
frequency = feat['City'].value_counts()
feat['City'] = feat['City'].map(frequency)


In [162]:
feat.head()

Unnamed: 0,City,State,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Base MSRP,Electric Utility,Expected Price ($1k)
0,157,WA,2020.0,TESLA,MODEL 3,0,Clean Alternative Fuel Vehicle Eligible,308,0,PUGET SOUND ENERGY INC,50.0
1,202,WA,2011.0,NISSAN,LEAF,0,Clean Alternative Fuel Vehicle Eligible,73,0,PUGET SOUND ENERGY INC,15.0
2,283,WA,2015.0,BMW,I3,0,Clean Alternative Fuel Vehicle Eligible,81,0,BONNEVILLE POWER ADMINISTRATION||CITY OF TACOM...,18.0
3,210,WA,2013.0,CHEVROLET,VOLT,1,Clean Alternative Fuel Vehicle Eligible,38,0,PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA),33.9
4,222,WA,2019.0,CHEVROLET,BOLT EV,0,Clean Alternative Fuel Vehicle Eligible,238,0,BONNEVILLE POWER ADMINISTRATION||CITY OF TACOM...,41.78


In [161]:
from sklearn.preprocessing import LabelEncoder
# Initialize the LabelEncoder for binary column 
le = LabelEncoder()

# Fit and transform the 'country' column
feat['Electric Vehicle Type'] = le.fit_transform(feat['Electric Vehicle Type'])


In [163]:
feat = pd.get_dummies(feat, columns=['State','Make','Clean Alternative Fuel Vehicle (CAFV) Eligibility'])


In [164]:
feat.head()

Unnamed: 0,City,Model Year,Model,Electric Vehicle Type,Electric Range,Base MSRP,Electric Utility,Expected Price ($1k),State_MT,State_OR,...,Make_SUBARU,Make_TESLA,Make_TH!NK,Make_TOYOTA,Make_VOLKSWAGEN,Make_VOLVO,Make_WHEEGO ELECTRIC CARS,Clean Alternative Fuel Vehicle (CAFV) Eligibility_Clean Alternative Fuel Vehicle Eligible,Clean Alternative Fuel Vehicle (CAFV) Eligibility_Eligibility unknown as battery range has not been researched,Clean Alternative Fuel Vehicle (CAFV) Eligibility_Not eligible due to low battery range
0,157,2020.0,MODEL 3,0,308,0,PUGET SOUND ENERGY INC,50.0,False,False,...,False,True,False,False,False,False,False,True,False,False
1,202,2011.0,LEAF,0,73,0,PUGET SOUND ENERGY INC,15.0,False,False,...,False,False,False,False,False,False,False,True,False,False
2,283,2015.0,I3,0,81,0,BONNEVILLE POWER ADMINISTRATION||CITY OF TACOM...,18.0,False,False,...,False,False,False,False,False,False,False,True,False,False
3,210,2013.0,VOLT,1,38,0,PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA),33.9,False,False,...,False,False,False,False,False,False,False,True,False,False
4,222,2019.0,BOLT EV,0,238,0,BONNEVILLE POWER ADMINISTRATION||CITY OF TACOM...,41.78,False,False,...,False,False,False,False,False,False,False,True,False,False


In [168]:
# Encoding for the remaining categorical columns
em = feat.groupby('Model')['Expected Price ($1k)'].mean()
feat['Model'] = feat['Model'].map(em) 

In [170]:
emap = feat.groupby('Electric Utility')['Expected Price ($1k)'].mean()
feat['Electric Utility'] = feat['Electric Utility'].map(emap)

In [171]:
feat.head()

Unnamed: 0,City,Model Year,Model,Electric Vehicle Type,Electric Range,Base MSRP,Electric Utility,Expected Price ($1k),State_MT,State_OR,...,Make_SUBARU,Make_TESLA,Make_TH!NK,Make_TOYOTA,Make_VOLKSWAGEN,Make_VOLVO,Make_WHEEGO ELECTRIC CARS,Clean Alternative Fuel Vehicle (CAFV) Eligibility_Clean Alternative Fuel Vehicle Eligible,Clean Alternative Fuel Vehicle (CAFV) Eligibility_Eligibility unknown as battery range has not been researched,Clean Alternative Fuel Vehicle (CAFV) Eligibility_Not eligible due to low battery range
0,157,2020.0,58.730469,0,308,0,38.022457,50.0,False,False,...,False,True,False,False,False,False,False,True,False,False
1,202,2011.0,24.624472,0,73,0,38.022457,15.0,False,False,...,False,False,False,False,False,False,False,True,False,False
2,283,2015.0,23.255612,0,81,0,38.407032,18.0,False,False,...,False,False,False,False,False,False,False,True,False,False
3,210,2013.0,20.953401,1,38,0,40.355125,33.9,False,False,...,False,False,False,False,False,False,False,True,False,False
4,222,2019.0,30.433468,0,238,0,36.602546,41.78,False,False,...,False,False,False,False,False,False,False,True,False,False


In [172]:
x = feat.drop(['Expected Price ($1k)'], axis=1)
y = feat['Expected Price ($1k)']

In [174]:
# Split into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


In [177]:
# Fit and transform the data
scaler = StandardScaler()
x_train_sc = scaler.fit_transform(x_train)
x_test_sc = scaler.transform(x_test)

In [178]:
# Initialize the model with some parameters
svr_reg = SVR(kernel='rbf', C=100, epsilon=0.1)

# Fit the model
svr_reg.fit(x_train_sc, y_train)

# Predicting on test data
y_pred_svr = svr_reg.predict(x_test_sc)

# Model evaluation
print("SVR RMSE:", mean_squared_error(y_test, y_pred_svr, squared=False))
print("SVR R2 Score:", r2_score(y_test, y_pred_svr))
print(" SVR MAE:", mean_absolute_error(y_test, y_pred_svr))

SVR RMSE: 8.944882701403833
SVR R2 Score: 0.8657892227922406
 SVR MAE: 3.377440009238063


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR                         # Sensei, I left this running for three hours; still no  output

# Define parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'epsilon': [0.01, 0.05, 0.1, 0.2],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    'kernel': ['rbf', 'linear', 'poly', 'sigmoid']
}

# Initialize SVR model
svr = SVR()

# Grid search with cross-validation
grid_search = GridSearchCV(svr, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(x_train_sc, y_train)

# Best parameters
print("Best parameters:", grid_search.best_params_)
print("Best score:", -grid_search.best_score_)
