# EV Adoption Forecasting

## Import Required Libraries

In [None]:
# Install the third-party packages used by this notebook into the running kernel's environment.
# NOTE(review): versions are unpinned — consider pinning them for reproducible runs.
# NOTE(review): joblib is imported below but not listed here; presumably it arrives as a
# scikit-learn dependency — confirm.
%pip install pandas numpy matplotlib seaborn scikit-learn

In [None]:
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## Load Dataset

In [None]:
# Load the raw EV population CSV; the path is resolved relative to the
# notebook's working directory.
DATA_PATH = "Electric_Vehicle_Population_By_County.csv"
df = pd.read_csv(DATA_PATH)

## Explore and Understand the Data

In [None]:
# Preview the first rows to get a feel for the columns and their values.
df.head()

The dataset contains 20,819 rows (data points) and 10 columns (features).

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(20819, 10)

In [None]:
# Summarise each column's dtype and non-null count.
df.info()

As the `df.info()` output shows, `Percent Electric Vehicles` is the only numeric column.

In [None]:
# Count the missing values in every column.
df.isna().sum()

The `County` and `State` columns contain missing values.

## Check for outliers in `Percent Electric Vehicles`

In [None]:
# IQR rule on the EV-share column: a value further than 1.5 * IQR outside the
# quartiles is counted as an outlier.
pct_ev = df['Percent Electric Vehicles']

Q1 = pct_ev.quantile(0.25)
Q3 = pct_ev.quantile(0.75)
IQR = Q3 - Q1

# The fences stay as notebook-level names because the capping step reuses them.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
print('lower_bound:', lower_bound)
print('upper_bound:', upper_bound)

# NaN compares False on both sides, so missing values are never flagged.
is_outlier = pct_ev.lt(lower_bound) | pct_ev.gt(upper_bound)
outliers = df.loc[is_outlier]
print("Number of outliers in 'Percent Electric Vehicles':", len(outliers))

lower_bound: -3.5174999999999996

upper_bound: 6.9025

Number of outliers in 'Percent Electric Vehicles': 2476

## Data Preprocessing
### Basic Data Cleaning

In [None]:
# Basic cleaning, written as a single chain so that `df` is rebuilt as a fresh
# frame. The previous version assigned columns onto a boolean-filtered frame,
# which is the pattern that can raise SettingWithCopyWarning; this version also
# no longer mutates the loaded frame's 'Date' column in place.
df = (
    df
    # Unparseable dates are coerced to NaT so they are dropped with the real gaps.
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], errors='coerce'))
    # Drop rows that lack a valid date or an EV total.
    .dropna(subset=['Date', 'Electric Vehicle (EV) Total'])
    # Rows with a missing location are kept and labelled 'Unknown'.
    .assign(
        County=lambda frame: frame['County'].fillna('Unknown'),
        State=lambda frame: frame['State'].fillna('Unknown'),
    )
)

# Confirm that no missing locations remain, then preview the cleaned frame.
print("Missing after fill:")
print(df[['County', 'State']].isnull().sum())
df.head()

Missing after fill:
County    0
State     0

## Handle Outliers: Cap Values at the IQR Bounds

In [None]:
# Cap the EV-share column at the IQR fences: values above the upper fence become
# the upper fence, values below the lower fence become the lower fence, and
# everything else (including NaN) is left unchanged. No rows are dropped.
df['Percent Electric Vehicles'] = df['Percent Electric Vehicles'].clip(
    lower=lower_bound, upper=upper_bound
)

# Sanity check: after capping, nothing should remain outside the fences.
capped_pct = df['Percent Electric Vehicles']
outliers = df.loc[capped_pct.lt(lower_bound) | capped_pct.gt(upper_bound)]
print("Number of outliers in 'Percent Electric Vehicles':", len(outliers))

Number of outliers in 'Percent Electric Vehicles': 0