<a href="https://colab.research.google.com/github/VEERLAPATIABHIRAM/AICTE_Internships/blob/main/EV_Adoption_Forecasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the county-level EV population dataset. The path is relative to the
# notebook's current working directory.
df = pd.read_csv("Electric_Vehicle_Population_By_County.csv")

# Initial inspection
print(df.head())
print("\nShape of the dataset:", df.shape)
print("\nInfo of the dataset:")
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called on its own; wrapping it in print() would emit a stray "None".
df.info()
print("\nMissing values per column:")
print(df.isnull().sum())

# Outlier fences for 'Percent Electric Vehicles' using the 1.5 * IQR rule.
# NOTE: the fences are computed on the raw data, i.e. before the rows with an
# unparseable 'Date' or a missing EV total are dropped further down.
Q1 = df['Percent Electric Vehicles'].quantile(0.25)
Q3 = df['Percent Electric Vehicles'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print("\nOutlier boundaries for 'Percent Electric Vehicles':")
print("Lower Bound:", lower_bound)
print("Upper Bound:", upper_bound)

outliers = df[(df['Percent Electric Vehicles'] < lower_bound) |
              (df['Percent Electric Vehicles'] > upper_bound)]
print("Number of outliers before treatment:", outliers.shape[0])

# Parse 'Date'; values that cannot be parsed become NaT (errors='coerce').
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Drop rows whose 'Date' failed to parse or whose EV total is missing.
# The explicit .copy() makes `df` an independent frame, so the column
# assignments below do not raise SettingWithCopyWarning.
df = df.dropna(subset=['Date', 'Electric Vehicle (EV) Total']).copy()

# Fill missing values in 'County' and 'State' with a placeholder label.
df['County'] = df['County'].fillna('Unknown')
df['State'] = df['State'].fillna('Unknown')

print("\nMissing values after filling:")
print(df[['County', 'State']].isnull().sum())

# Cap (winsorize) 'Percent Electric Vehicles' to the IQR fences. Values are
# clipped to the nearest fence rather than removed; NaNs pass through as-is.
df['Percent Electric Vehicles'] = df['Percent Electric Vehicles'].clip(
    lower=lower_bound, upper=upper_bound
)

# Confirm that no value lies outside the fences after capping.
outliers = df[(df['Percent Electric Vehicles'] < lower_bound) |
              (df['Percent Electric Vehicles'] > upper_bound)]
print("Number of outliers after capping:", outliers.shape[0])

# Final head check
print("\nCleaned dataset preview:")
print(df.head())


                Date          County State Vehicle Primary Use  \
0  September 30 2022       Riverside    CA           Passenger   
1   December 31 2022  Prince William    VA           Passenger   
2    January 31 2020          Dakota    MN           Passenger   
3       June 30 2022           Ferry    WA               Truck   
4       July 31 2021         Douglas    CO           Passenger   

  Battery Electric Vehicles (BEVs) Plug-In Hybrid Electric Vehicles (PHEVs)  \
0                                7                                        0   
1                                1                                        2   
2                                0                                        1   
3                                0                                        0   
4                                0                                        1   

  Electric Vehicle (EV) Total Non-Electric Vehicle Total Total Vehicles  \
0                           7                        