# 🚗 EV Insights: Trends & Predictions in Washington's Clean Vehicle Movement
This project explores electric vehicle (EV) adoption across Washington State using public data. The aim is to uncover trends, compare vehicle types, and train a classifier to predict EV categories.

## 🔍 Key Findings
- **Battery Electric Vehicles (BEVs)** have a longer electric range than plug-in hybrids (PHEVs) but tend to carry a higher MSRP.
- **Vehicle registrations** are heavily concentrated in urban counties like King and Snohomish.
- **Electric range** has improved significantly for model years after 2017.
- **Certain utility companies** serve more EVs, indicating potential infrastructure demand.
- **Random Forest Classifier** achieved solid accuracy in predicting EV type based on make, model, and MSRP.

## 🎯 Project Goal
Help the **Washington State Department of Transportation** and stakeholders understand EV adoption trends, price/range tradeoffs, and target areas for charging infrastructure based on real-world vehicle data.

In [None]:
!pip install keplergl

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import patsy
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
import warnings
from keplergl import KeplerGl
from google.colab import output
output.enable_custom_widget_manager()

In [None]:

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import numpy as np

# Predicting Base MSRP using only numerical features (Model Year, Electric Range).
#
# NOTE: this cell appears in the notebook *before* the cell that creates
# `data_analysis`, so on a fresh top-to-bottom run the object does not exist
# yet.  That case is reported explicitly instead of surfacing as a raw
# NameError message.
try:
    # Keep only rows where both predictors and the target are present.
    df_reg = data_analysis.df[['Model Year', 'Electric Range', 'Base MSRP']].dropna()
    if df_reg.empty:
        # Fail with a readable message rather than an opaque error from
        # train_test_split further down.
        raise ValueError(
            "no rows with Model Year, Electric Range and Base MSRP all present"
        )

    # Remove outliers in MSRP (rows above the 99th percentile).
    q_high = df_reg['Base MSRP'].quantile(0.99)
    df_reg = df_reg[df_reg['Base MSRP'] <= q_high]

    X = df_reg[['Model Year', 'Electric Range']]
    y = df_reg['Base MSRP']

    # Fixed random_state keeps the 80/20 split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)

    print("✅ Fixed Linear Regression Completed")
    print("R-squared score:", r2_score(y_test, y_pred))

except NameError:
    print(
        "Error during linear regression: `data_analysis` is not defined yet - "
        "run the cell that creates the DataAnalysis instance first."
    )
except Exception as e:
    # Best-effort notebook cell: report the problem and let later cells run.
    print("Error during linear regression:", e)


In [None]:
# Create the analysis helper and load the EV population CSV from the Colab
# filesystem.
# NOTE(review): `DataAnalysis` is not defined in the visible cells - presumably
# it is defined in a cell that is not part of this export; confirm.
# NOTE(review): the same path is passed to both the constructor and
# `load_data`; verify whether both are required.
data_analysis = DataAnalysis("/content/Electric_Vehicle_Population_Data.csv")
data = data_analysis.load_data("/content/Electric_Vehicle_Population_Data.csv")

In [None]:
# Filter data: select rows via filter_data with column='Model Year', value=2020
# and show the result as the cell output.
filtered_data = data_analysis.filter_data(column='Model Year', value=2020)
filtered_data

In [None]:
# Filter data
# NOTE(review): this cell repeats the previous cell verbatim (same call, same
# arguments); it looks like an accidental duplicate - confirm before removing.
filtered_data = data_analysis.filter_data(column='Model Year', value=2020)
filtered_data

In [None]:
# Print vehicle info
# Called with hard-coded sample arguments (VIN, make, model, range) rather than
# values read from the loaded dataset.
data_analysis.print_vehicle_info('VIN123', 'Tesla', 'Model S', verbose=True, show_range=True, electric_range=350)

In [None]:
# Plot histogram for make
data_analysis.plot_histogram_for_make()

In [None]:
# Plot pair plot
data_analysis.plot_pair_plot()

In [None]:
# Plot heatmap
data_analysis.plot_heatmap()

In [None]:
# Plot scatter plot
data_analysis.plot_scatter_plot()

In [None]:
# Plot vehicle type distribution
data_analysis.plot_vehicle_type_distribution()

In [None]:
# Plot model year distribution
data_analysis.plot_model_year_distribution()

In [None]:
# Plot electric range distribution
data_analysis.plot_electric_range_distribution()

In [None]:
# Plot CAFV eligibility count
data_analysis.plot_cafv_eligibility_count()

In [None]:
# Plot model year vs electric range
data_analysis.plot_model_year_vs_electric_range()

In [None]:
# Plot top counties
data_analysis.plot_top_counties()

In [None]:
# Plot top cities
data_analysis.plot_top_cities()

In [None]:
# Plot top utility companies
data_analysis.plot_top_utility_companies()

In [None]:
# Plot BEV vs PHEV comparison
data_analysis.plot_bev_phev_comparison()

In [None]:
# Plot multivariate analysis
data_analysis.plot_multivariate_analysis()

In [None]:
# Filter by model year
# NOTE(review): looks like it overlaps with the earlier
# filter_data(column='Model Year', value=2020) call - confirm both are needed.
filtered_by_model_year = data_analysis.filter_by_model_year(model_year=2020)
filtered_by_model_year

In [None]:
# Get data matrix
data_matrix = data_analysis.get_data_matrix()
data_matrix

In [None]:
# Get selected data matrix (restricted to the three listed columns)
selected_data_matrix = data_analysis.get_selected_data_matrix(selected_columns=['Make', 'Model Year', 'Electric Range'])
selected_data_matrix

In [None]:
# Get numpy array
numpy_array = data_analysis.get_numpy_array()
numpy_array

In [None]:
# Create brand model column
brand_model_column = data_analysis.create_brand_model_column()
brand_model_column

In [None]:
# Sort by column
sorted_data = data_analysis.sort_by_column(column='Model Year')
sorted_data

In [None]:
# Perform linear regression
# (class-provided method; a standalone regression on MSRP also appears in
# other cells of this notebook)
data_analysis.perform_linear_regression()

In [None]:
# Call the create_keplergl_map method using the instance
data_analysis.create_keplergl_map()

In [None]:
# Train the class-provided Random Forest classifier (implementation is not
# visible in this notebook export).
data_analysis.train_random_forest_classifier()


## 🚀 Model Comparison: Random Forest vs XGBoost

In [None]:
# Train an XGBoost Classifier
# Fits a gradient-boosted model on the features/labels returned by
# DataAnalysis.preprocess_data() and prints accuracy plus a per-class report.
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Preprocess again.
# This cell runs at notebook top level, where `self` does not exist, so the
# method has to be called on the `data_analysis` instance.
X_encoded, y = data_analysis.preprocess_data()

# XGBoost expects class labels encoded as integers 0..n_classes-1 and rejects
# string labels.  Encoding is harmless if `y` is already numeric.
# NOTE(review): assumes `y` is a 1-D label vector - confirm against
# preprocess_data().
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
class_names = [str(cls) for cls in label_encoder.classes_]

# Fixed random_state keeps the 80/20 split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

# `use_label_encoder=False` is kept for older xgboost releases; newer releases
# may only warn that the parameter is unused - TODO confirm installed version.
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

print('XGBoost Accuracy:', accuracy_score(y_test, y_pred_xgb))
# Passing labels + target_names reports the original class names even though
# the model was trained on integer ids.
print(classification_report(
    y_test,
    y_pred_xgb,
    labels=list(range(len(class_names))),
    target_names=class_names,
))

In [None]:
# Optional: XGBoost Feature Importance Plot
# Shows the ten features with the highest importance according to the fitted
# `xgb_model` as a bar chart.
import matplotlib.pyplot as plt
import pandas as pd

importances = xgb_model.feature_importances_

# `preprocessor` is only created by the sampled Random Forest cell further
# down, so it may not exist yet when this cell runs, and even when it does it
# may describe a different feature matrix than the one XGBoost was trained on.
try:
    feat_names = list(preprocessor.get_feature_names_out())
except NameError:
    feat_names = []

# Fall back to positional names when the names are missing or do not line up
# with the importances; building the DataFrame would otherwise raise on the
# length mismatch.
if len(feat_names) != len(importances):
    feat_names = [f"f{i}" for i in range(len(importances))]

feat_imp_df = pd.DataFrame({'Feature': feat_names, 'Importance': importances})
feat_imp_df.sort_values(by='Importance', ascending=False).head(10).plot(kind='bar', x='Feature', y='Importance', figsize=(10, 6))
plt.title('XGBoost - Top 10 Feature Importances')
plt.ylabel('Importance')
plt.tight_layout()
plt.show()

## 📉 Linear Regression on MSRP using Model Year and Electric Range

In [None]:

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import numpy as np

# Baseline regression: estimate Base MSRP from the two numeric predictors
# Model Year and Electric Range, then report R-squared on a held-out 20% split.
try:
    # Work on the three relevant columns only and discard incomplete rows.
    df_reg = data_analysis.df.loc[:, ['Model Year', 'Electric Range', 'Base MSRP']].dropna()

    # Trim the upper tail: rows above the 99th MSRP percentile are dropped.
    q_high = df_reg['Base MSRP'].quantile(0.99)
    df_reg = df_reg.loc[df_reg['Base MSRP'].le(q_high)]

    # Target is the MSRP column; the predictors are whatever remains.
    y = df_reg['Base MSRP']
    X = df_reg.drop(columns='Base MSRP')

    # Fixed seed so the split (and therefore the score) is reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    lr = LinearRegression().fit(X_train, y_train)
    y_pred = lr.predict(X_test)

    print("✅ Fixed Linear Regression Completed")
    print("R-squared score:", r2_score(y_test, y_pred))

except Exception as e:
    # Best-effort notebook cell: report the failure and keep going.
    print("Error during linear regression:", e)


## 🌲 Random Forest Classifier (Improved) with Sampled Data

In [None]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Classify Electric Vehicle Type from model year, make, model, electric range
# and base MSRP, using a random sample of the complete rows.
try:
    df_class = data_analysis.df[['Model Year', 'Make', 'Model', 'Electric Range', 'Base MSRP', 'Electric Vehicle Type']].dropna()

    # Sample to keep memory use and training time manageable.  The sample size
    # is capped at the number of available rows because DataFrame.sample
    # raises ValueError when n exceeds the population (without replacement).
    sample_size = min(10000, len(df_class))
    df_sampled = df_class.sample(n=sample_size, random_state=42)

    X = df_sampled.drop('Electric Vehicle Type', axis=1)
    y = df_sampled['Electric Vehicle Type']

    from sklearn.preprocessing import OneHotEncoder
    from sklearn.compose import ColumnTransformer

    # One-hot encode the categorical columns; the numeric columns are passed
    # through unchanged.  handle_unknown='ignore' avoids errors on categories
    # not seen during fitting.
    cat_features = ['Make', 'Model']
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
        ],
        remainder='passthrough'
    )

    X_encoded = preprocessor.fit_transform(X)

    # Fixed random_state keeps both the split and the forest reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)

    print("✅ Random Forest Classification Completed")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
except Exception as e:
    # Best-effort notebook cell: report the failure and let later cells run.
    print("Error during Random Forest training:", e)
