In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### Load Original Datasets

In [3]:
# Load the Electric Vehicle Population dataset.
# The path uses a forward slash so it resolves on Windows, macOS and Linux
# alike (a backslash is only a path separator on Windows).
df1 = pd.read_csv("asserts/Electric_Vehicle_Population_Data.csv")
df1.head(2)

Unnamed: 0,VIN (1-10),County,City,State,Postal Code,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Base MSRP,Legislative District,DOL Vehicle ID,Vehicle Location,Electric Utility,2020 Census Tract
0,5UXTA6C03P,King,Seattle,WA,98177.0,2023,BMW,X5,Plug-in Hybrid Electric Vehicle (PHEV),Clean Alternative Fuel Vehicle Eligible,30,0,36.0,218985539,POINT (-122.38242499999996 47.77279000000004),CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),53033000000.0
1,1FMCU0EZXN,Yakima,Moxee,WA,98936.0,2022,FORD,ESCAPE,Plug-in Hybrid Electric Vehicle (PHEV),Clean Alternative Fuel Vehicle Eligible,38,0,15.0,197264322,POINT (-120.37951169999997 46.55609000000004),PACIFICORP,53077000000.0


In [4]:
# Load the per-county EV population size history.
# The path uses a forward slash so it resolves on Windows, macOS and Linux
# alike (a backslash is only a path separator on Windows).
df2 = pd.read_csv("asserts/Electric_Vehicle_Population_Size_History_By_County.csv")
df2.head(2)

Unnamed: 0,Date,County,State,Vehicle Primary Use,Battery Electric Vehicles (BEVs),Plug-In Hybrid Electric Vehicles (PHEVs),Electric Vehicle (EV) Total,Non-Electric Vehicle Total,Total Vehicles,Percent Electric Vehicles
0,November 30 2020,St. Mary's,MD,Passenger,1,0,1,215,216,0.46
1,February 28 2021,Sedgwick,KS,Passenger,1,1,2,89,91,2.2


# Cleaning

### Keep Only the Needed Rows and Columns

In [5]:
# Keep only the Washington (WA) and California (CA) rows of each dataset and
# renumber the index from zero afterwards.
target_states = ['WA', 'CA']
df1_wa_ca = df1.loc[df1['State'].isin(target_states)].reset_index(drop=True)
df2_wa_ca = df2.loc[df2['State'].isin(target_states)].reset_index(drop=True)

In [6]:
# Keep only the columns relevant to the insights to be drawn from each dataset.
# .copy() gives each result its own data, independent of the state-filtered frames.

vehicle_columns = ['State', 'County', 'City', 'Model Year', 'Make',
                   'Model', 'Electric Vehicle Type', 'Electric Range', 'Base MSRP']
df1_filtered = df1_wa_ca.loc[:, vehicle_columns].copy()

county_history_columns = ['State', 'County', 'Date', 'Electric Vehicle (EV) Total',
                          'Non-Electric Vehicle Total', 'Battery Electric Vehicles (BEVs)',
                          'Plug-In Hybrid Electric Vehicles (PHEVs)']
df2_filtered = df2_wa_ca.loc[:, county_history_columns].copy()


In [7]:
# Give the verbose source columns short names, then arrange the columns of each
# frame in a fixed order.

df1_new_names = {
    'Model Year': 'ModelYear',
    'Electric Vehicle Type': 'EVType',
    'Electric Range': 'ElectricRange',
    'Base MSRP': 'BasePrice',
}
df1_column_order = ['State', 'County', 'City', 'Make', 'Model',
                    'ModelYear', 'EVType', 'ElectricRange', 'BasePrice']
df1_filtered = df1_filtered.rename(columns=df1_new_names)[df1_column_order]

df2_new_names = {
    'Electric Vehicle (EV) Total': 'EVTotal',
    'Non-Electric Vehicle Total': 'NonEVTotal',
    'Battery Electric Vehicles (BEVs)': 'BEVCount',
    'Plug-In Hybrid Electric Vehicles (PHEVs)': 'PHEVCount',
}
df2_column_order = ['Date', 'State', 'County', 'BEVCount', 'PHEVCount',
                    'EVTotal', 'NonEVTotal']
df2_filtered = df2_filtered.rename(columns=df2_new_names)[df2_column_order]



### Check Filtered Datasets Info

In [8]:
# Preview the first five rows of df1_filtered.
df1_filtered.head(n=5)


Unnamed: 0,State,County,City,Make,Model,ModelYear,EVType,ElectricRange,BasePrice
0,WA,King,Seattle,BMW,X5,2023,Plug-in Hybrid Electric Vehicle (PHEV),30,0
1,WA,Yakima,Moxee,FORD,ESCAPE,2022,Plug-in Hybrid Electric Vehicle (PHEV),38,0
2,WA,King,Seattle,CHEVROLET,BOLT EV,2018,Battery Electric Vehicle (BEV),238,0
3,WA,King,Newcastle,TESLA,MODEL S,2013,Battery Electric Vehicle (BEV),208,69900
4,WA,Kitsap,Bremerton,FORD,C-MAX,2015,Plug-in Hybrid Electric Vehicle (PHEV),19,0


In [9]:
# Preview the first five rows of df2_filtered.
df2_filtered.head(n=5)

Unnamed: 0,Date,State,County,BEVCount,PHEVCount,EVTotal,NonEVTotal
0,April 30 2021,WA,Thurston,0,0,0,56710
1,October 31 2017,WA,Adams,3,3,6,12283
2,July 31 2018,WA,Whatcom,2,0,2,42313
3,August 31 2019,WA,Grays Harbor,0,0,0,20570
4,October 31 2019,WA,Cowlitz,0,0,0,29023


In [10]:
# Print the column dtypes and non-null counts of both frames, with blank lines
# between the two summaries.
for position, frame in enumerate((df1_filtered, df2_filtered)):
    if position > 0:
        print("\n\n")
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143359 entries, 0 to 143358
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   State          143359 non-null  object
 1   County         143357 non-null  object
 2   City           143357 non-null  object
 3   Make           143359 non-null  object
 4   Model          143359 non-null  object
 5   ModelYear      143359 non-null  int64 
 6   EVType         143359 non-null  object
 7   ElectricRange  143359 non-null  int64 
 8   BasePrice      143359 non-null  int64 
dtypes: int64(3), object(6)
memory usage: 9.8+ MB



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7543 entries, 0 to 7542
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Date        7543 non-null   object
 1   State       7543 non-null   object
 2   County      7543 non-null   object
 3   BEVCount    7543 non-null   int64 
 4   PH

### Handling Missing Values for Both Datasets

In [11]:
# Count the missing (NaN) entries in every column of df1_filtered.
missing_values_df1 = df1_filtered.isnull().sum(axis=0)
print("Missing values in df1_filtered:", missing_values_df1, sep="\n")


Missing values in df1_filtered:
State            0
County           2
City             2
Make             0
Model            0
ModelYear        0
EVType           0
ElectricRange    0
BasePrice        0
dtype: int64


In [12]:
# Count the missing (NaN) entries in every column of df2_filtered.
missing_values_df2 = df2_filtered.isna().sum()
# The label names the frame that is actually being inspected.
print("Missing values in df2_filtered:")
print(missing_values_df2)


Missing values in df2_combined_insights:
Date          0
State         0
County        0
BEVCount      0
PHEVCount     0
EVTotal       0
NonEVTotal    0
dtype: int64


In [13]:
# Collect the rows of df1_filtered that have no County or no City value.
location_is_missing = df1_filtered[['County', 'City']].isna().any(axis=1)
missing_county_or_city = df1_filtered.loc[location_is_missing]

print("Rows with missing County or City values:", missing_county_or_city, sep="\n")


Rows with missing County or City values:
       State County City   Make    Model  ModelYear  \
126446    CA    NaN  NaN  TESLA  MODEL 3       2018   
136203    CA    NaN  NaN  TESLA  MODEL 3       2022   

                                EVType  ElectricRange  BasePrice  
126446  Battery Electric Vehicle (BEV)            215          0  
136203  Battery Electric Vehicle (BEV)              0          0  


In [14]:
# Fill missing County and City values with an explicit 'Unknown' placeholder.
placeholder_by_column = {'County': 'Unknown', 'City': 'Unknown'}
df1_filtered = df1_filtered.fillna(value=placeholder_by_column)


### Data Type Conversion

In [15]:
# Parse the Date column into pandas datetime values; the column keeps its position.
df2_filtered = df2_filtered.assign(Date=pd.to_datetime(df2_filtered['Date']))


### Rename the Values in "EVType" Column

In [16]:
# Show how many rows carry each EVType label before the labels are shortened.
evtype_counts = df1_filtered["EVType"].value_counts()
print(evtype_counts)

# Swap the long descriptions for their abbreviations; any other value is left as is.
evtype_abbreviations = {
    'Battery Electric Vehicle (BEV)': 'BEV',
    'Plug-in Hybrid Electric Vehicle (PHEV)': 'PHEV',
}
df1_filtered['EVType'] = df1_filtered['EVType'].replace(evtype_abbreviations)


Battery Electric Vehicle (BEV)            110716
Plug-in Hybrid Electric Vehicle (PHEV)     32643
Name: EVType, dtype: int64


In [18]:
# Write both filtered frames to CSV, omitting the index column.

file_path1 = 'asserts/df1_filtered.csv'
file_path2 = 'asserts/df2_filtered.csv'

for frame, destination in ((df1_filtered, file_path1), (df2_filtered, file_path2)):
    frame.to_csv(destination, index=False)
