# EV Adoption Forecasting Project

This notebook covers data loading, preprocessing, feature engineering, model building, evaluation, and forecasting for EV adoption using Random Forest.

## Import Required Libraries

In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## Load Dataset

In [None]:
# Read the EV population CSV (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="Electric_Vehicle_Population_By_County.csv")

## Explore and Understand the Data

In [None]:
# Quick structural overview of the freshly loaded frame.
print("Dataset Shape:", df.shape)

# Only the final expression of a cell is rendered automatically, so the preview
# must be displayed explicitly; as a bare mid-cell expression it was discarded.
display(df.head())

# info() prints its own report (dtypes and non-null counts).
df.info()

# Last expression of the cell: per-column count of missing values.
df.isnull().sum()

### Check for Outliers

In [None]:
# Tukey-fence outlier check on the EV share column: values further than
# 1.5 * IQR outside the first/third quartile are counted as outliers.
pct_ev = df['Percent Electric Vehicles']

Q1, Q3 = pct_ev.quantile(0.25), pct_ev.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

print('lower_bound:', lower_bound)
print('upper_bound:', upper_bound)

# Missing values compare False on both sides, so they are not counted.
is_outlier = pct_ev.lt(lower_bound) | pct_ev.gt(upper_bound)
outliers = df[is_outlier]
print("Number of outliers in 'Percent Electric Vehicles':", len(outliers))

## Data Preprocessing

In [None]:
# Parse dates; unparseable values become NaT and are filtered out below.
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Keep only rows that have both a usable date and an EV total.
# .copy() gives an independent frame so the column assignments that follow
# write to it directly instead of raising SettingWithCopyWarning on a slice.
has_date_and_total = df['Date'].notnull() & df['Electric Vehicle (EV) Total'].notnull()
df = df[has_date_and_total].copy()

df['County'] = df['County'].fillna('Unknown')
df['State'] = df['State'].fillna('Unknown')

cols_to_convert = [
    'Battery Electric Vehicles (BEVs)',
    'Plug-In Hybrid Electric Vehicles (PHEVs)',
    'Electric Vehicle (EV) Total',
    'Non-Electric Vehicle Total',
    'Total Vehicles',
    'Percent Electric Vehicles'
]

for col in cols_to_convert:
    values = df[col]
    if not pd.api.types.is_numeric_dtype(values):
        # Text such as "1,234" would be coerced to NaN by to_numeric and the
        # whole row dropped by dropna() below, so remove thousands separators
        # before parsing.
        values = values.astype(str).str.replace(',', '', regex=False)
    df[col] = pd.to_numeric(values, errors='coerce')

# Cap outliers using the IQR fences computed in the outlier-check cell.
# This runs after the numeric conversion so the comparison is always done on
# numbers; clip() leaves NaN untouched, like the previous nested np.where.
df['Percent Electric Vehicles'] = df['Percent Electric Vehicles'].clip(
    lower=lower_bound, upper=upper_bound
)

# Drop every row that still has a missing value in any column, then persist
# the cleaned data for reuse.
df = df.dropna().reset_index(drop=True)
df.to_csv('preprocessed_ev_data.csv', index=False)
df.head()

## Feature Engineering

In [None]:
# Calendar features derived from the parsed date.
df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month
df['numeric_date'] = df['year'] * 12 + df['month']

# Integer code per county (the fitted encoder is kept in `le`).
le = LabelEncoder()
df['county_encoded'] = le.fit_transform(df['County'])

# All per-county history features below rely on this ordering.
df = df.sort_values(['County', 'Date'])

# Row position within each county.
# NOTE(review): this equals "months since start" only if every county has
# exactly one row per month - confirm against the dataset.
df['months_since_start'] = df.groupby('County').cumcount()

ev_by_county = df.groupby('County')['Electric Vehicle (EV) Total']

# Previous 1-3 observations of the target within the same county.
for lag in [1, 2, 3]:
    df[f'ev_total_lag{lag}'] = ev_by_county.shift(lag)

# Mean of the three observations preceding the current row.
df['ev_total_roll_mean_3'] = ev_by_county.transform(
    lambda s: s.shift(1).rolling(3).mean()
)

# Growth rates are computed on the lag-1 series, i.e. only from values that
# precede the current row. Computing them from the current value leaks the
# target: target == lag1 * (1 + pct_change_1).
prev_ev_by_county = df.groupby('County')['ev_total_lag1']
df['ev_total_pct_change_1'] = prev_ev_by_county.pct_change(periods=1, fill_method=None)
df['ev_total_pct_change_3'] = prev_ev_by_county.pct_change(periods=3, fill_method=None)

# Division by zero yields +/-inf; treat those and missing history as no change.
for pct_col in ['ev_total_pct_change_1', 'ev_total_pct_change_3']:
    df[pct_col] = df[pct_col].replace([np.inf, -np.inf], np.nan).fillna(0)

# Running total per county (includes the current row; not a model feature).
df['cumulative_ev'] = ev_by_county.cumsum()

# Linear-trend slope over the six cumulative values *before* the current row.
# The shift(1) keeps the current target out of the window; rolling(6) only
# evaluates complete windows, and raw=True passes plain ndarrays to polyfit.
df['ev_growth_slope'] = df.groupby('County')['cumulative_ev'].transform(
    lambda s: s.shift(1).rolling(6).apply(
        lambda window: np.polyfit(np.arange(len(window)), window, 1)[0],
        raw=True,
    )
)

# Rows without a full feature history (the start of each county) are dropped.
df = df.dropna().reset_index(drop=True)

## Model Training

In [None]:
features = [
    'months_since_start',
    'county_encoded',
    'ev_total_lag1',
    'ev_total_lag2',
    'ev_total_lag3',
    'ev_total_roll_mean_3',
    'ev_total_pct_change_1',
    'ev_total_pct_change_3',
    'ev_growth_slope',
]

target = 'Electric Vehicle (EV) Total'

# The feature-engineering cell leaves df ordered by County, then Date. An
# unshuffled split on that order would hold out the alphabetically last
# counties instead of the most recent period, so order the rows by time first.
df_chrono = df.sort_values(['Date', 'County'], kind='stable')
X = df_chrono[features]
y = df_chrono[target]

# Chronological hold-out: the final 10% of rows (latest dates) form the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.1)

param_dist = {
    'n_estimators': [100, 150, 200, 250],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['sqrt', 'log2', None]
}

rf = RandomForestRegressor(random_state=42)
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=30,
    scoring='r2',
    # Forward-chaining folds: each validation fold is later in time than the
    # rows it was trained on, unlike plain K-fold on time-ordered data.
    cv=TimeSeriesSplit(n_splits=3),
    n_jobs=-1,
    verbose=1,
    random_state=42
)
random_search.fit(X_train, y_train)

# With the default refit=True the best configuration is refitted on all of X_train.
model = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)

## Model Evaluation

In [None]:
# Score the tuned model on the held-out rows.
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("MAE:", mae)
print("RMSE:", rmse)
print("R2 Score:", r2)

## Save and Test the Model

In [None]:
# Persist the fitted model, then reload it to check that the saved artefact
# can produce a prediction.
joblib.dump(model, 'forecasting_ev_model.pkl')
loaded_model = joblib.load('forecasting_ev_model.pkl')

# Compare ground truth and prediction for the first held-out row.
true_value = y_test.iloc[0]
sample = X_test.iloc[:1]  # one-row DataFrame, as predict() expects 2-D input
predicted_value = loaded_model.predict(sample)[0]

print(f"Actual EVs: {true_value:.2f}, Predicted EVs: {predicted_value:.2f}")