#### Gaussian Discriminant Analysis for Spam Email Classification

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, roc_curve, precision_recall_curve, auc

In [2]:
# Load Data
email_data = pd.read_csv('./dataset/emails.csv', header=0)

# Remove the First Column
email_data = email_data.drop(email_data.columns[0], axis=1)

# Training Set
train_X = email_data.iloc[0:5000, 0:-1].values.T.astype(np.float64)
train_Y = email_data.iloc[0:5000:, -1].values.astype(np.float64)

# Test Set
test_X = email_data.iloc[5000:, 0:-1].values.T.astype(np.float64)
test_Y = email_data.iloc[5000:, -1].values.astype(np.float64)

# Standardize the Features for Both Training Set and Test Set
scaler = StandardScaler()

train_X = scaler.fit_transform(train_X)
test_X = scaler.transform(test_X)

ValueError: X has 172 features, but StandardScaler is expecting 5000 features as input.

In [None]:
# Settings
data_dim, train_data_size = train_X.shape
test_data_size = test_X.shape[1]
# Swap 0s with 1s and 1s with 0s in Output Set for Summation of Indicator Functions
swapped_train_Y = np.where(train_Y==0, 1, 0) 
swapped_test_Y = np.where(test_Y==0, 1, 0)

In [None]:
# The Parameters of The GDA Model
prior_prob = 1 / train_data_size * np.sum(train_Y) #! φ = 1/n * ΣI(y^(i)=1)
mean_y0 = train_X @ swapped_train_Y / np.sum(swapped_train_Y) #! μ0 = ΣI(y^(i)=0) * x^(i) / ΣI(y^(i)=0)
mean_y1 = train_X @ train_Y / np.sum(train_Y) #! μ0 = ΣI(y^(i)=1) * x^(i) / ΣI(y^(i)=1)

# Reshape means to be column vectors
mean_y0 = mean_y0.reshape(-1, 1)
mean_y1 = mean_y1.reshape(-1, 1)

# Subtract Means from Input X Based on Corresponding Y
X_mu = train_X.copy()
X_mu[:, train_Y == 0] -= mean_y0 #! Σ = 1/n * Σ((x^(i) - μy^(i)) * (x^(i) - μy^(i))^T)
X_mu[:, train_Y == 1] -= mean_y1

covariance_matrix = (X_mu @ X_mu.T) / train_data_size

In [None]:
# Distributions for Two Classes
prob_y0 = 1- prior_prob
prob_y1 = prior_prob

# Calculate the Inverse and Determinant of the Covariance Matrix
covariance_matrix_inv = np.linalg.inv(covariance_matrix)
covariance_matrix_det = np.linalg.det(covariance_matrix)

# Constant Factor for the Multivariate Guassian Distribution
const_factor = 1 / ((2 * np.pi) ** (data_dim / 2) * np.sqrt(covariance_matrix_det))

# Subtract the Means from the Test Set
diff_y0 = test_X - mean_y0
diff_y1 = test_X - mean_y1

# Compute Exponential Part
exp_arg0 = np.sum((diff_y0.T @ covariance_matrix_inv) * diff_y0.T, axis=1)
exp_arg1 = np.sum((diff_y1.T @ covariance_matrix_inv) * diff_y1.T, axis=1)

# Compute the Probability Densities
p_x_given_y0 = const_factor * np.exp(-0.5 * exp_arg0) #! p(x|y=0) = (1 / (2π)^(n/2) * |Σ|^(1/2)) * exp(-1/2 * (x - μ0)^T * Σ^-1 * (x - μ0))
p_x_given_y1 = const_factor * np.exp(-0.5 * exp_arg1) #! p(x|y=1) = (1 / (2π)^(n/2) * |Σ|^(1/2)) * exp(-1/2 * (x - μ1)^T * Σ^-1 * (x - μ1))

In [None]:
p_x_given_y0, p_x_given_y1