In [4]:
import cudf
import cupy as cp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from cuml.preprocessing import LabelEncoder, StandardScaler
from cuml.model_selection import train_test_split
from cuml.ensemble import RandomForestClassifier
from cuml.metrics import accuracy_score
from sklearn.metrics import classification_report

In [1]:
def load_and_clean_data(file_path):
    """Read the vehicle CSV, derive coordinates, and drop unused columns.

    The 'Vehicle Location' column is expected to hold text of the form
    'POINT (<longitude> <latitude>)'. Both numbers are parsed into new float
    columns 'Longitude' and 'Latitude' before the raw column is removed.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, passed straight to ``cudf.read_csv``.

    Returns
    -------
    DataFrame
        The loaded frame without the identifier/location columns listed in
        ``columns_to_drop`` and with 'Longitude' and 'Latitude' added.
    """
    columns_to_drop = [
        'VIN (1-10)',
        'DOL Vehicle ID',
        '2020 Census Tract',
        'Vehicle Location',
        'Postal Code',
        'State',
    ]
    df = cudf.read_csv(file_path)

    # Parse the coordinates BEFORE dropping 'Vehicle Location'; doing it the
    # other way round raises KeyError because the source column is gone.
    coords = df['Vehicle Location'].str.extract(r'POINT \(([-\d.]+) ([-\d.]+)\)')
    # Unnamed capture groups come back as columns 0 and 1 (longitude first).
    df['Longitude'] = coords[0].astype(float)
    df['Latitude'] = coords[1].astype(float)

    df = df.drop(columns=columns_to_drop)
    return df

In [2]:
def preprocess_data(df):
    """Add the binary BEV target and label-encoded categorical columns.

    The frame is modified in place (new columns are added to ``df``) and the
    same object is returned for convenience.

    Parameters
    ----------
    df : DataFrame
        Must contain 'Electric Vehicle Type' plus the categorical columns
        'County', 'City', 'Make', 'Model' and 'Electric Utility'.

    Returns
    -------
    tuple
        ``(df, label_encoders)`` where ``label_encoders`` maps each
        categorical column name to the fitted ``LabelEncoder`` used for it.
    """
    # 1 for battery-electric vehicles, 0 for every other vehicle type.
    df['is_BEV'] = (df['Electric Vehicle Type'] == 'Battery Electric Vehicle (BEV)').astype(int)

    categorical_columns = ['County', 'City', 'Make', 'Model', 'Electric Utility']
    label_encoders = {}
    for col in categorical_columns:
        # One encoder per column so each mapping can be inverted later.
        le = LabelEncoder()
        df[f'{col}_encoder'] = le.fit_transform(df[col])
        label_encoders[col] = le
    return df, label_encoders

In [3]:
def plot_cafv_distribution(df):
    """Draw a bar chart of how many rows fall in each CAFV eligibility status.

    Parameters
    ----------
    df : DataFrame
        Must contain the column
        'Clean Alternative Fuel Vehicle (CAFV) Eligibility'.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    eligibility_counts = df['Clean Alternative Fuel Vehicle (CAFV) Eligibility'].value_counts()

    # matplotlib needs host (NumPy/Python) data. A GPU-backed series exposes
    # device arrays that cannot be converted implicitly, so copy the small
    # counts table to the host first; a pandas series passes through as-is.
    if hasattr(eligibility_counts, 'to_pandas'):
        eligibility_counts = eligibility_counts.to_pandas()

    labels = [str(label) for label in eligibility_counts.index.tolist()]
    heights = eligibility_counts.to_numpy()
    positions = range(len(labels))

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(positions, heights)
    ax.set_xticks(list(positions))
    ax.set_xticklabels(labels, rotation=45, ha='right')
    ax.set_xlabel('CAFV eligibility')
    ax.set_ylabel('Number of vehicles')
    ax.set_title('Distribution of CAFV Eligibility Status')
    fig.tight_layout()
    plt.show()