In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.stats import multivariate_normal
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

#################################################################
# 1. BAYESIAN DECISION THEORY CLASSIFIER
#################################################################
class BayesianClassifier:
    """
    Implements Bayesian Decision Theory assuming Gaussian likelihood.
    This is equivalent to Quadratic Discriminant Analysis (QDA).

    Decision Rule:
    Predict class C_k that maximizes P(C_k | x)
    which is proportional to P(x | C_k) * P(C_k)

    P(C_k) = Prior (calculated from class frequency)
    P(x | C_k) = Likelihood (calculated using a multivariate Gaussian)

    Scoring is carried out in log space (log-likelihood + log-prior).
    The argmax is the same as for the product of probabilities, but the
    comparison stays meaningful for samples whose raw densities would
    underflow to 0.0 for every class.

    Attributes set by fit():
        priors:     dict mapping class label -> prior probability
        means:      dict mapping class label -> per-feature mean vector
        covs:       dict mapping class label -> regularized covariance matrix
        classes:    sorted array of the unique class labels
        n_features: number of feature columns seen during fit
    """
    def __init__(self):
        self.priors = {}
        self.means = {}
        self.covs = {}
        self.classes = []
        self.n_features = 0

    def fit(self, X, y):
        """
        Trains the classifier by calculating priors, means, and covariance
        matrices for each class.

        Parameters:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of n_samples class labels.

        Raises:
            ValueError: if a class has fewer than two samples, because a
                sample covariance cannot be estimated from a single point.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, self.n_features = X.shape
        self.classes = np.unique(y)

        # Start from a clean slate so a re-fit never keeps stale classes.
        self.priors = {}
        self.means = {}
        self.covs = {}

        # Add a small epsilon (1e-6) to the diagonal for numerical stability.
        # This prevents "singular" matrices if a feature has no variance.
        epsilon = 1e-6
        regularizer = np.eye(self.n_features) * epsilon

        for c in self.classes:
            # Filter data for the current class
            X_c = X[y == c]
            if len(X_c) < 2:
                raise ValueError(
                    f"Class {c!r} has {len(X_c)} sample(s); at least 2 are "
                    "required to estimate a covariance matrix."
                )

            # 1. Calculate Prior: P(C_k)
            self.priors[c] = len(X_c) / n_samples

            # 2. Calculate Likelihood Parameters: Mean and Covariance
            self.means[c] = np.mean(X_c, axis=0)
            self.covs[c] = np.cov(X_c, rowvar=False) + regularizer

    def predict(self, X):
        """
        Predicts the class for a given set of samples X.

        Parameters:
            X: 2-D array-like of shape (n_samples, n_features).

        Returns:
            NumPy array of n_samples predicted class labels (empty if X is
            empty).

        Raises:
            ValueError: if called with samples before fit().
        """
        X = np.asarray(X)
        n_samples = X.shape[0]
        if n_samples == 0:
            return np.array([])

        classes = np.asarray(self.classes)
        if classes.size == 0:
            raise ValueError(
                "BayesianClassifier must be fitted before calling predict()."
            )

        # One column of unnormalized log-posteriors per class:
        #   log P(x | C_k) + log P(C_k)
        log_posteriors = np.empty((n_samples, classes.size))
        for k, c in enumerate(classes):
            log_likelihood = multivariate_normal.logpdf(
                X, mean=self.means[c], cov=self.covs[c]
            )
            # scipy squeezes a single-sample result to a scalar; restore the
            # per-sample axis before storing it.
            log_posteriors[:, k] = (
                np.reshape(log_likelihood, n_samples) + np.log(self.priors[c])
            )

        # Decision Rule: Choose the class with the highest posterior
        return classes[np.argmax(log_posteriors, axis=1)]

#################################################################
# 2. RUN CLASSIFIER ON ELECTRIC VEHICLE DATASET
#################################################################

# Driver script: load the EV population CSV, clean it, train the custom
# BayesianClassifier on two numeric features, and compare its test accuracy
# with scikit-learn's QuadraticDiscriminantAnalysis.

print("Loading 'Electric_Vehicle_Population_Data.csv'...")
try:
    df = pd.read_csv('Electric_Vehicle_Population_Data.csv')
except FileNotFoundError:
    print("Error: 'Electric_Vehicle_Population_Data.csv' not found.")
    print("Please make sure the file is in the same directory as the script.")
    # `exit()` is an interactive helper injected by the `site` module and
    # would report success; raise SystemExit with a non-zero status instead.
    raise SystemExit(1)

# --- 2.1. Define Target and Features ---
target_col = 'Electric Vehicle Type'
# We need continuous numerical features for a Gaussian model
feature_cols = ['Electric Range', 'Model Year']

print(f"Target (y): {target_col}")
print(f"Features (X): {feature_cols}")

# --- 2.2. Preprocess and Clean the Data ---

# 1. Drop rows with missing values in our key columns
df_clean = df.dropna(subset=feature_cols + [target_col])

# 2. Filter for only the two main EV types
valid_types = ['Battery Electric Vehicle (BEV)', 'Plug-in Hybrid Electric Vehicle (PHEV)']
df_clean = df_clean[df_clean[target_col].isin(valid_types)]

# 3. Remove vehicles with 0 electric range.
# This is often bad data or skews the Gaussian distribution.
df_clean = df_clean[df_clean['Electric Range'] > 0]

print(f"Original data size: {len(df)}")
print(f"Cleaned data size: {len(df_clean)}")

# --- 2.3. Encode Target (y) and Prepare Features (X) ---

# Convert text labels (BEV/PHEV) into numbers (0/1)
le = LabelEncoder()
y = le.fit_transform(df_clean[target_col])
# Original label strings, indexed by their encoded integer value
class_names = le.classes_

# Get features as a NumPy array
X = df_clean[feature_cols].values

print(f"Target classes: {list(enumerate(class_names))}")

# --- 2.4. Split Data ---
# 70/30 split; stratify=y keeps the class proportions equal in both parts,
# and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")
print("---")

# --- 2.5. Train the Custom Bayesian Classifier ---
print("Training custom Bayesian Classifier...")
bc = BayesianClassifier()
bc.fit(X_train, y_train)

# --- 2.6. Evaluate the Custom Model ---
y_pred = bc.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=class_names)

print("\n--- Custom Bayesian Classifier Results ---")
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(report)

# --- 2.7. (Optional) Compare with scikit-learn's QDA ---
# Reference implementation of the same model, fitted on the same split.
print("\n--- Comparing with scikit-learn's QDA ---")
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)
y_pred_qda = qda.predict(X_test)
accuracy_qda = accuracy_score(y_test, y_pred_qda)

print(f"Scikit-learn QDA Accuracy: {accuracy_qda * 100:.2f}%")
print("(Note: Accuracies should be identical or very close.)")

Loading 'Electric_Vehicle_Population_Data.csv'...
Target (y): Electric Vehicle Type
Features (X): ['Electric Range', 'Model Year']
Original data size: 181458
Cleaned data size: 86728
Target classes: [(0, 'Battery Electric Vehicle (BEV)'), (1, 'Plug-in Hybrid Electric Vehicle (PHEV)')]
Training samples: 60709
Testing samples: 26019
---
Training custom Bayesian Classifier...

--- Custom Bayesian Classifier Results ---
Accuracy: 98.30%

Classification Report:
                                        precision    recall  f1-score   support

        Battery Electric Vehicle (BEV)       0.97      1.00      0.98     14173
Plug-in Hybrid Electric Vehicle (PHEV)       1.00      0.97      0.98     11846

                              accuracy                           0.98     26019
                             macro avg       0.98      0.98      0.98     26019
                          weighted avg       0.98      0.98      0.98     26019


--- Comparing with scikit-learn's QDA ---
Scikit-learn 