In [1]:
from IPython.display import display, HTML
# Inject a CSS override for the `.output_scroll` class: overflow is made visible
# and the max-height cap is removed, so long outputs are not boxed in a scroll area.
no_scroll_css = HTML("<style>.output_scroll {overflow: visible !important; max-height: none !important;}</style>")
display(no_scroll_css)

In [2]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/scikit-learn.
warnings.simplefilter(action='ignore')

In [3]:
# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Read the electric-vehicle registrations CSV (path is relative to the notebook's working directory)
csv_path = 'Electric_Vehicle_Population_Data.csv'
df = pd.read_csv(csv_path)

# Preview the first five rows
df.head(5)

Unnamed: 0,VIN (1-10),County,City,State,Postal Code,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Base MSRP,Legislative District,DOL Vehicle ID,Vehicle Location,Electric Utility,2020 Census Tract
0,2T3YL4DV0E,King,Bellevue,WA,98005.0,2014,TOYOTA,RAV4,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,103.0,0.0,41.0,186450183,POINT (-122.1621 47.64441),PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA),53033020000.0
1,5YJ3E1EB6K,King,Bothell,WA,98011.0,2019,TESLA,MODEL 3,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,220.0,0.0,1.0,478093654,POINT (-122.20563 47.76144),PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA),53033020000.0
2,5UX43EU02S,Thurston,Olympia,WA,98502.0,2025,BMW,X5,Plug-in Hybrid Electric Vehicle (PHEV),Clean Alternative Fuel Vehicle Eligible,40.0,0.0,35.0,274800718,POINT (-122.92333 47.03779),PUGET SOUND ENERGY INC,53067010000.0
3,JTMAB3FV5R,Thurston,Olympia,WA,98513.0,2024,TOYOTA,RAV4 PRIME,Plug-in Hybrid Electric Vehicle (PHEV),Clean Alternative Fuel Vehicle Eligible,42.0,0.0,2.0,260758165,POINT (-122.81754 46.98876),PUGET SOUND ENERGY INC,53067010000.0
4,5YJYGDEE8M,Yakima,Selah,WA,98942.0,2021,TESLA,MODEL Y,Battery Electric Vehicle (BEV),Eligibility unknown as battery range has not b...,0.0,0.0,15.0,236581355,POINT (-120.53145 46.65405),PACIFICORP,53077000000.0


In [5]:
# Print a concise summary of the frame: index range, column names,
# non-null counts per column and each column's dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232230 entries, 0 to 232229
Data columns (total 17 columns):
 #   Column                                             Non-Null Count   Dtype  
---  ------                                             --------------   -----  
 0   VIN (1-10)                                         232230 non-null  object 
 1   County                                             232226 non-null  object 
 2   City                                               232226 non-null  object 
 3   State                                              232230 non-null  object 
 4   Postal Code                                        232226 non-null  float64
 5   Model Year                                         232230 non-null  int64  
 6   Make                                               232230 non-null  object 
 7   Model                                              232230 non-null  object 
 8   Electric Vehicle Type                              232230 non-null  object

In [6]:
# Count the missing (NaN) entries in each column
missing_mask = df.isna()
missing_mask.sum()

VIN (1-10)                                             0
County                                                 4
City                                                   4
State                                                  0
Postal Code                                            4
Model Year                                             0
Make                                                   0
Model                                                  0
Electric Vehicle Type                                  0
Clean Alternative Fuel Vehicle (CAFV) Eligibility      0
Electric Range                                        27
Base MSRP                                             27
Legislative District                                 481
DOL Vehicle ID                                         0
Vehicle Location                                      11
Electric Utility                                       4
2020 Census Tract                                      4
dtype: int64

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,Postal Code,Model Year,Electric Range,Base MSRP,Legislative District,DOL Vehicle ID,2020 Census Tract
count,232226.0,232230.0,232203.0,232203.0,231749.0,232230.0,232226.0
mean,98180.172044,2021.353727,46.755998,803.808973,28.880979,234367100.0,52981770000.0
std,2489.407943,2.994884,84.373596,7246.597102,14.904503,68314180.0,1507814000.0
min,1731.0,1999.0,0.0,0.0,1.0,4385.0,1001020000.0
25%,98052.0,2020.0,0.0,0.0,17.0,203473700.0,53033010000.0
50%,98126.0,2023.0,0.0,0.0,32.0,251271700.0,53033030000.0
75%,98375.0,2023.0,38.0,0.0,42.0,268694300.0,53053070000.0
max,99577.0,2025.0,337.0,845000.0,49.0,479254800.0,56021000000.0


In [8]:
# Class distribution of the target column (number of rows per eligibility label)
cafv_col = 'Clean Alternative Fuel Vehicle (CAFV) Eligibility'
df[cafv_col].value_counts()

Clean Alternative Fuel Vehicle (CAFV) Eligibility
Eligibility unknown as battery range has not been researched    136865
Clean Alternative Fuel Vehicle Eligible                          72847
Not eligible due to low battery range                            22518
Name: count, dtype: int64

In [9]:
# Impute missing values while keeping every column's dtype intact.

# Numeric columns with a meaningful "typical" value: use the mode / median
# of the observed data.
df['Postal Code'] = df['Postal Code'].fillna(df['Postal Code'].mode()[0])
df['Electric Range'] = df['Electric Range'].fillna(df['Electric Range'].median())
df['Base MSRP'] = df['Base MSRP'].fillna(df['Base MSRP'].median())

# Non-numeric columns (County, City, Vehicle Location, Electric Utility, ...):
# mark the gap with an explicit 'Unknown' label.
text_cols = df.select_dtypes(exclude='number').columns
df[text_cols] = df[text_cols].fillna('Unknown')

# Numeric columns that still contain NaN ('Legislative District' and
# '2020 Census Tract' in this dataset). Filling them with the string 'Unknown'
# would silently convert float columns into mixed-type object columns, so a
# numeric sentinel is used instead.
# NOTE(review): -1 is assumed not to be a valid district / tract code — confirm.
MISSING_CODE = -1
remaining_numeric_cols = df.select_dtypes(include='number').columns
df[remaining_numeric_cols] = df[remaining_numeric_cols].fillna(MISSING_CODE)

In [10]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical features.
# Each column gets its own LabelEncoder, kept in `feature_encoders`. Re-fitting
# one shared encoder overwrites its learned classes, which would make it
# impossible to map the integer codes of earlier columns back to their labels.
feature_encoders = {}
for feature_col in ['Make', 'Model', 'Electric Vehicle Type']:
    feature_encoders[feature_col] = LabelEncoder()
    df[feature_col] = feature_encoders[feature_col].fit_transform(df[feature_col])

# Encode the target column. `encoder` remains bound to the encoder fitted on
# the target, so encoder.classes_ / encoder.inverse_transform decode predictions.
target_source_col = 'Clean Alternative Fuel Vehicle (CAFV) Eligibility'
encoder = LabelEncoder()
df['CAFV_Eligibility'] = encoder.fit_transform(df[target_source_col])

# Drop the original target column (reassign instead of mutating in place)
df = df.drop(columns=[target_source_col])

In [11]:
from sklearn.preprocessing import StandardScaler

# Columns to standardise with StandardScaler
numerical_cols = ['Model Year', 'Electric Range', 'Base MSRP']

# Fit the scaler on those columns, then overwrite them with the scaled values.
# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test split, so test rows influence the scaling statistics — consider
# fitting on the training rows only.
scaler = StandardScaler()
scaler.fit(df[numerical_cols])
df[numerical_cols] = scaler.transform(df[numerical_cols])

In [12]:
from sklearn.model_selection import train_test_split

# Separate the encoded target from the predictors (every other column of df).
# NOTE(review): X still contains non-numeric columns such as 'VIN (1-10)',
# 'County' and 'City'; and the target labels are worded in terms of battery
# range, so 'Electric Range' may leak the target — confirm before modelling.
target_name = 'CAFV_Eligibility'
y = df[target_name]
X = df.drop(columns=[target_name])

# Hold out 30% of the rows for testing, with a fixed seed and stratification on y
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)