In [40]:
import pandas as pd

In [36]:
def describe_categorical_columns(df: pd.DataFrame, dropna: bool = True) -> None:
    """
    Describe the categorical columns of a given DataFrame.

    A column is treated as categorical when its dtype is ``object``, a pandas
    string dtype, or ``category``. For each such column, this function prints:
    1. Unique values in the column.
    2. Count of each unique value.

    Parameters:
    - df (pd.DataFrame): The input DataFrame to be analyzed.
    - dropna (bool): Forwarded to ``Series.value_counts``. With the default
      (True) missing values are left out of the counts, although they can
      still be listed among the unique values; pass False to count them too.

    Returns:
    None. The function prints the results.
    """

    # Iterate over (label, Series) pairs instead of indexing by label, so a
    # frame with repeated column names still yields one Series per column.
    for column, series in df.items():
        dtype = series.dtype

        # Checking only `dtype == 'object'` would skip 'category' columns and
        # columns stored with a dedicated pandas string dtype.
        is_categorical = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )
        if not is_categorical:
            continue

        print(f"Column '{column}':")

        # Print unique values in the column.
        print("Unique Values:")
        print(series.unique())

        # Print value counts for each unique value.
        print("Value Counts:")
        print(series.value_counts(dropna=dropna))

        print()  # Add an empty line for separation.

# To use the function:
# describe_categorical_columns(your_dataframe)


## Initial Inspection of Both Datasets

In [6]:
# Forward slashes are accepted on Windows as well as Linux/macOS, whereas a
# backslash separator only resolves on Windows.
df = pd.read_csv("asserts/Electric_Vehicle_Population_Data.csv")
df.head(5)

Unnamed: 0,VIN (1-10),County,City,State,Postal Code,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Base MSRP,Legislative District,DOL Vehicle ID,Vehicle Location,Electric Utility,2020 Census Tract
0,5UXTA6C03P,King,Seattle,WA,98177.0,2023,BMW,X5,Plug-in Hybrid Electric Vehicle (PHEV),Clean Alternative Fuel Vehicle Eligible,30,0,36.0,218985539,POINT (-122.38242499999996 47.77279000000004),CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),53033000000.0
1,1FMCU0EZXN,Yakima,Moxee,WA,98936.0,2022,FORD,ESCAPE,Plug-in Hybrid Electric Vehicle (PHEV),Clean Alternative Fuel Vehicle Eligible,38,0,15.0,197264322,POINT (-120.37951169999997 46.55609000000004),PACIFICORP,53077000000.0
2,1G1FW6S03J,King,Seattle,WA,98117.0,2018,CHEVROLET,BOLT EV,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,238,0,36.0,168549727,POINT (-122.37275999999997 47.689685000000054),CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),53033000000.0
3,5YJSA1AC0D,King,Newcastle,WA,98059.0,2013,TESLA,MODEL S,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,208,69900,41.0,244891062,POINT (-122.15733999999998 47.487175000000036),PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA),53033030000.0
4,1FADP5CU8F,Kitsap,Bremerton,WA,98312.0,2015,FORD,C-MAX,Plug-in Hybrid Electric Vehicle (PHEV),Not eligible due to low battery range,19,0,26.0,134915000,POINT (-122.65223 47.57192),PUGET SOUND ENERGY INC,53035080000.0


In [35]:
df["Clean Alternative Fuel Vehicle (CAFV) Eligibility"].value_counts().head(5)

Clean Alternative Fuel Vehicle (CAFV) Eligibility
Eligibility unknown as battery range has not been researched    63954
Clean Alternative Fuel Vehicle Eligible                         62149
Not eligible due to low battery range                           17493
Name: count, dtype: int64

In [17]:
# Keep only the rows whose State is 'WA', then show the five most common
# legislative districts among them.
df_wa = df.loc[df["State"].eq('WA')]
df_wa["Legislative District"].value_counts().head(5)

Legislative District
41.0    9553
45.0    8807
48.0    8045
36.0    6270
1.0     6209
Name: count, dtype: int64

In [18]:
df_wa["Electric Utility"].value_counts().head(5)

Electric Utility
PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA)                                      52866
PUGET SOUND ENERGY INC                                                             28419
CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA)                                       26145
BONNEVILLE POWER ADMINISTRATION||PUD NO 1 OF CLARK COUNTY - (WA)                    8255
BONNEVILLE POWER ADMINISTRATION||CITY OF TACOMA - (WA)||PENINSULA LIGHT COMPANY     6351
Name: count, dtype: int64

## Column Dictionary 

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143596 entries, 0 to 143595
Data columns (total 17 columns):
 #   Column                                             Non-Null Count   Dtype  
---  ------                                             --------------   -----  
 0   VIN (1-10)                                         143596 non-null  object 
 1   County                                             143574 non-null  object 
 2   City                                               143574 non-null  object 
 3   State                                              143596 non-null  object 
 4   Postal Code                                        143574 non-null  float64
 5   Model Year                                         143596 non-null  int64  
 6   Make                                               143596 non-null  object 
 7   Model                                              143596 non-null  object 
 8   Electric Vehicle Type                              143596 non-null  object

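- VIN (1-10): The first 10 characters of each vehicle's Vehicle Identification Number (VIN). A full VIN uniquely identifies a vehicle, but this truncated prefix does not: the column holds only 9,311 distinct values across 143,596 rows.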

- County, City, State, Postal Code: These are geographic identifiers that specify where the vehicle is located.

- Model Year: The year the vehicle model was manufactured.

- Make, Model: The brand and the specific model of the vehicle.

- Electric Vehicle Type: Plug-in Hybrid Electric Vehicle (PHEV) or Battery Electric Vehicle (BEV).

- Clean Alternative Fuel Vehicle (CAFV) Eligibility: Indicates whether the vehicle is eligible for any clean alternative fuel vehicle programs or incentives.

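- Electric Range: The range of the vehicle on a single electric charge; the dataset does not state the unit (miles or kilometers). A value of 0 occurs 63,954 times, the same count as the CAFV category "Eligibility unknown as battery range has not been researched", so 0 most likely marks an unknown range rather than a true zero.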

- Base MSRP: Manufacturer's Suggested Retail Price; we may not need it.

- Legislative District: Probably relates to the legislative or governmental district where the vehicle is registered.

- DOL Vehicle ID: Department of Licensing Vehicle ID, another unique identifier for the vehicle.

- Vehicle Location: Appears to be the longitude and latitude coordinates of the vehicle location.

- Electric Utility: The electric utility provider in the vehicle's geographic area.

- 2020 Census Tract: A geographic region defined for the purpose of taking a census. Useful for sociodemographic analysis.


In [19]:
# Flag every row that is an exact repeat of an earlier row in the dataframe
duplicate_rows = df.duplicated()

# Count the flagged rows once and reuse the result in the report below
num_duplicate_rows = duplicate_rows.sum()

print(f"Number of duplicate rows = {num_duplicate_rows}")


Number of duplicate rows = 0


In [20]:
# Tally the null cells in each column and report them one line per column.
missing_values = df.isna().sum()
for column, count in missing_values.items():
    print(f"Column '{column}' has {count} missing values.")


Column 'VIN (1-10)' has 0 missing values.
Column 'County' has 22 missing values.
Column 'City' has 22 missing values.
Column 'State' has 0 missing values.
Column 'Postal Code' has 22 missing values.
Column 'Model Year' has 0 missing values.
Column 'Make' has 0 missing values.
Column 'Model' has 0 missing values.
Column 'Electric Vehicle Type' has 0 missing values.
Column 'Clean Alternative Fuel Vehicle (CAFV) Eligibility' has 0 missing values.
Column 'Electric Range' has 0 missing values.
Column 'Base MSRP' has 0 missing values.
Column 'Legislative District' has 327 missing values.
Column 'DOL Vehicle ID' has 0 missing values.
Column 'Vehicle Location' has 25 missing values.
Column 'Electric Utility' has 22 missing values.
Column '2020 Census Tract' has 22 missing values.


The next cell gives a more comprehensive view of the categorical columns, showing both the unique values and their corresponding counts.

In [37]:
describe_categorical_columns(df)

Column 'VIN (1-10)':
Unique Values:
['5UXTA6C03P' '1FMCU0EZXN' '1G1FW6S03J' ... 'WBY8P2C0XL' 'KMHCX5LD1L'
 'W1KCG5FB2P']
Value Counts:
VIN (1-10)
7SAYGDEE7P    652
7SAYGDEE2P    636
7SAYGDEE6P    632
7SAYGDEE8P    628
7SAYGDEE0P    619
             ... 
WP0BC2Y1XP      1
WA1VABGE1N      1
YV4BR0CK1N      1
WA1G2AFY1L      1
W1KCG5FB2P      1
Name: count, Length: 9311, dtype: int64

Column 'County':
Unique Values:
['King' 'Yakima' 'Kitsap' 'Thurston' 'Snohomish' 'Kittitas' 'Skagit'
 'Island' 'Pend Oreille' 'Walla Walla' 'Chelan' 'Whitman' 'Spokane'
 'Kings' nan 'Mason' 'Clark' 'Cowlitz' 'Jefferson' 'Clallam' 'Benton'
 'Stevens' 'Lewis' 'Pierce' 'Douglas' 'Grant' 'Grays Harbor' 'Klickitat'
 'Whatcom' 'San Juan' 'Skamania' 'Lincoln' 'Pacific' 'Franklin'
 'Wahkiakum' 'Asotin' 'Laramie' 'Adams' 'Okanogan' 'Maricopa' 'Columbia'
 'District of Columbia' 'Ferry' 'San Diego' 'Lake' 'DeKalb' 'Bexar' 'Lee'
 'Santa Clara' 'Stafford' 'Contra Costa' 'Kern' 'Los Angeles' 'Saratoga'
 'Orange' 'Anne Aru

In [14]:
df[['Electric Range', 'Base MSRP']].describe()

Unnamed: 0,Electric Range,Base MSRP
count,143596.0,143596.0
mean,70.495738,1373.386515
std,97.128735,9450.067747
min,0.0,0.0
25%,0.0,0.0
50%,19.0,0.0
75%,111.0,0.0
max,337.0,845000.0


In [22]:
df["Base MSRP"].value_counts().head(5)

Base MSRP
0        140151
69900      1446
31950       401
52900       217
32250       149
Name: count, dtype: int64

In [23]:
df["Electric Range"].value_counts().head(5)

Electric Range
0      63954
215     6472
220     4154
84      4039
238     3550
Name: count, dtype: int64

In [25]:
def _iqr_summary(values: pd.Series, whisker: float = 1.5):
    """Return (q1, q3, iqr, outlier_mask) for a numeric Series.

    A value is marked as an outlier when it lies strictly below
    q1 - whisker * iqr or strictly above q3 + whisker * iqr.
    """
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    outlier_mask = (values < (q1 - whisker * iqr)) | (values > (q3 + whisker * iqr))
    return q1, q3, iqr, outlier_mask


# Extract the required columns 'Electric Range' and 'Base MSRP' from the dataframe
subset_df = df[['Electric Range', 'Base MSRP']]

# Quartiles, IQR and outlier mask for Electric Range
Q1_ER, Q3_ER, IQR_ER, electric_range_mask = _iqr_summary(subset_df['Electric Range'])

# Quartiles, IQR and outlier mask for Base MSRP.
# NOTE: the describe() output above shows both quartiles of Base MSRP are 0,
# so the IQR is 0 and every non-zero price ends up flagged as an outlier.
Q1_BM, Q3_BM, IQR_BM, base_msrp_mask = _iqr_summary(subset_df['Base MSRP'])

# Rows flagged as outliers for each column
outliers_electric_range_iqr = subset_df[electric_range_mask]
outliers_base_msrp_iqr = subset_df[base_msrp_mask]

# Print the outliers for Electric Range
print('Outliers in Electric Range:')
print(outliers_electric_range_iqr['Electric Range'])

# Print the outliers for Base MSRP
print('\nOutliers in Base MSRP:')
print(outliers_base_msrp_iqr['Base MSRP'])

Outliers in Electric Range:
15        293
67        322
71        293
76        289
81        291
         ... 
143473    289
143513    293
143535    291
143587    291
143589    291
Name: Electric Range, Length: 5942, dtype: int64

Outliers in Base MSRP:
3         69900
60        69900
88        69900
181       31950
199       33950
          ...  
143435    36900
143531    54950
143534    34995
143545    54950
143558    69900
Name: Base MSRP, Length: 3445, dtype: int64


In [26]:
# Forward slashes are accepted on Windows as well as Linux/macOS, whereas a
# backslash separator only resolves on Windows.
df1 = pd.read_csv("asserts/Electric_Vehicle_Population_Size_History_By_County.csv")
df1.head(5)

Unnamed: 0,Date,County,State,Vehicle Primary Use,Battery Electric Vehicles (BEVs),Plug-In Hybrid Electric Vehicles (PHEVs),Electric Vehicle (EV) Total,Non-Electric Vehicle Total,Total Vehicles,Percent Electric Vehicles
0,November 30 2020,St. Mary's,MD,Passenger,1,0,1,215,216,0.46
1,February 28 2021,Sedgwick,KS,Passenger,1,1,2,89,91,2.2
2,November 30 2017,New Haven,CT,Passenger,0,1,1,53,54,1.85
3,April 30 2021,Thurston,WA,Truck,0,0,0,56710,56710,0.0
4,October 31 2017,Adams,WA,Passenger,3,3,6,12283,12289,0.05


## Column Dictionary

In [26]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17965 entries, 0 to 17964
Data columns (total 10 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Date                                      17965 non-null  object 
 1   County                                    17886 non-null  object 
 2   State                                     17886 non-null  object 
 3   Vehicle Primary Use                       17965 non-null  object 
 4   Battery Electric Vehicles (BEVs)          17965 non-null  int64  
 5   Plug-In Hybrid Electric Vehicles (PHEVs)  17965 non-null  int64  
 6   Electric Vehicle (EV) Total               17965 non-null  int64  
 7   Non-Electric Vehicle Total                17965 non-null  int64  
 8   Total Vehicles                            17965 non-null  int64  
 9   Percent Electric Vehicles                 17965 non-null  float64
dtypes: float64(1), int64(5), object(4)

- Date: The date of the data entry, formatted as "Month Day Year". Each value appears to be the last day of the month that the counts refer to.

- County: The name of the county.

- State: The state in which the mentioned county is located.

- Vehicle Primary Use: The primary use of the vehicle (e.g., "Passenger," "Truck").

- Battery Electric Vehicles (BEVs): The count of BEVs in the given county, state, and date.

- Plug-In Hybrid Electric Vehicles (PHEVs): The count of PHEVs in the given county, state, and date.

- Electric Vehicle (EV) Total: The total count of all types of electric vehicles.

- Non-Electric Vehicle Total: The count of vehicles that are not electric.

- Total Vehicles: The total number of all vehicles, including both electric and non-electric.

- Percent Electric Vehicles: The percentage of electric vehicles out of the total number of vehicles.


In [27]:
# Flag every row that is an exact repeat of an earlier row in the dataframe
duplicate_rows = df1.duplicated()

# Count the flagged rows once and reuse the result in the report below
num_duplicate_rows = duplicate_rows.sum()

print(f"Number of duplicate rows = {num_duplicate_rows}")


Number of duplicate rows = 0


In [29]:
# Tally the null cells in each column and report them one line per column.
missing_values = df1.isna().sum()
for column, count in missing_values.items():
    print(f"Column '{column}' has {count} missing values.")


Column 'Date' has 0 missing values.
Column 'County' has 79 missing values.
Column 'State' has 79 missing values.
Column 'Vehicle Primary Use' has 0 missing values.
Column 'Battery Electric Vehicles (BEVs)' has 0 missing values.
Column 'Plug-In Hybrid Electric Vehicles (PHEVs)' has 0 missing values.
Column 'Electric Vehicle (EV) Total' has 0 missing values.
Column 'Non-Electric Vehicle Total' has 0 missing values.
Column 'Total Vehicles' has 0 missing values.
Column 'Percent Electric Vehicles' has 0 missing values.


In [38]:
describe_categorical_columns(df1)

Column 'Date':
Unique Values:
['November 30 2020' 'February 28 2021' 'November 30 2017' 'April 30 2021'
 'October 31 2017' 'July 31 2018' 'February 29 2020' 'January 31 2017'
 'August 31 2019' 'October 31 2019' 'September 30 2017'
 'September 30 2020' 'July 31 2021' 'June 30 2018' 'March 31 2018'
 'November 30 2022' 'March 31 2017' 'November 30 2021' 'January 31 2019'
 'April 30 2019' 'June 30 2021' 'September 30 2022' 'July 31 2023'
 'June 30 2017' 'December 31 2018' 'May 31 2017' 'October 31 2022'
 'October 31 2021' 'April 30 2018' 'October 31 2020' 'September 30 2018'
 'February 28 2022' 'May 31 2022' 'July 31 2017' 'April 30 2017'
 'March 31 2021' 'June 30 2023' 'January 31 2023' 'February 28 2023'
 'March 31 2020' 'December 31 2022' 'May 31 2019' 'March 31 2022'
 'April 30 2022' 'January 31 2022' 'May 31 2020' 'November 30 2018'
 'July 31 2022' 'June 30 2022' 'December 31 2020' 'August 31 2021'
 'August 31 2017' 'August 31 2018' 'August 31 2020' 'May 31 2021'
 'December 31 2019' '

In [39]:
df1[['Battery Electric Vehicles (BEVs)', 'Electric Vehicle (EV) Total', "Percent Electric Vehicles"]].describe()


Unnamed: 0,Battery Electric Vehicles (BEVs),Electric Vehicle (EV) Total,Percent Electric Vehicles
count,17965.0,17965.0,17965.0
mean,202.596104,281.354133,3.83709
std,2017.309902,2613.246332,10.886949
min,0.0,0.0,0.0
25%,0.0,1.0,0.32
50%,1.0,1.0,1.09
75%,3.0,4.0,2.56
max,58610.0,73335.0,100.0
