In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Fetch the data via the oracle helper; element 0 is used as the training
# matrix and element 1 as the test matrix.
data = oracle.q2_train_test_emnist(23475, "EMNIST/emnist-balanced-train.csv", "EMNIST/emnist-balanced-test.csv")
training_set = data[0]
test_set = data[1]

# Column 0 is taken as the label; every remaining column is a feature.
training_label, training_data = training_set[:, 0], training_set[:, 1:]
test_label, test_data = test_set[:, 0], test_set[:, 1:]

def avg(r):
    """Return the mean of ``r`` taken along axis 0 (one value per column)."""
    column_means = np.mean(r, axis=0)
    return column_means

# Function to Compute Covariance Matrix
def cov(r):
    """Return the covariance matrix of ``r`` with columns treated as variables.

    Delegates to ``np.cov`` with ``rowvar=False`` and NumPy's default
    normalisation; the mean is estimated internally from ``r``.
    """
    covariance = np.cov(r, rowvar=False)
    return covariance

 #Separate Data by Class
# Training rows labelled 9 form class 0; rows labelled 20 form class 1.
training_class0 = training_data[training_label == 9]
training_class1 = training_data[training_label == 20]


# Per-class mean vectors and sample covariance matrices
avg_class0 = avg(training_class0)
avg_class1 = avg(training_class1)

cov_class0 = cov(training_class0)
cov_class1 = cov(training_class1)


# Covariances with 1e-7 added on the diagonal, used for the determinant term.
modified_cov_class0 = cov_class0 + (10**(-7)) * np.identity(cov_class0.shape[0])
modified_cov_class1 = cov_class1 + (10**(-7)) * np.identity(cov_class1.shape[0])


# Pseudo-inverses used in the quadratic forms of the log posterior ratio.
# NOTE(review): these invert the *unregularised* covariances while the
# determinant term below uses the regularised ones -- confirm this mix is
# intended.
C0_inv = np.linalg.pinv(cov_class0)
C1_inv = np.linalg.pinv(cov_class1)

# Retained so the name stays defined for any other cell that reads it; the
# determinant term below no longer depends on this product.
cov_1_cov_0_inv = np.matmul(modified_cov_class1,np.linalg.pinv(modified_cov_class0))

# log|det(C1 C0^-1)| = log|det C1| - log|det C0|.  Taking the two
# log-determinants separately avoids running slogdet on the non-symmetric,
# badly conditioned product matrix (tiny 1e-7 eigenvalues get inverted into
# huge ones), so the value is computed from the two symmetric matrices only.
slogdetcov_1_cov_0_inv = (
    np.linalg.slogdet(modified_cov_class1)[1]
    - np.linalg.slogdet(modified_cov_class0)[1]
)

def log_posterior_ratio(r,p0,p1):
    """
    Return the log posterior ratio of class 0 over class 1 for sample ``r``.

    The value is the sum of four terms built from the module-level
    ``avg_class0``/``avg_class1``, ``C0_inv``/``C1_inv`` and
    ``slogdetcov_1_cov_0_inv``:

    * ``-0.5 * (r - avg_class0)^T C0_inv (r - avg_class0)``
    * ``+0.5 * (r - avg_class1)^T C1_inv (r - avg_class1)``
    * ``log(p0) - log(p1)``
    * ``0.5 * slogdetcov_1_cov_0_inv``

    ``p0`` and ``p1`` are the priors given to class 0 and class 1.
    """

    def quadratic_form(center, inverse_cov):
        # (r - center)^T * inverse_cov * (r - center)
        deviation = r - center
        return np.matmul(np.matmul(deviation.T, inverse_cov), deviation)

    class0_term = -0.5 * quadratic_form(avg_class0, C0_inv)
    class1_term = 0.5 * quadratic_form(avg_class1, C1_inv)
    prior_term = -np.log(p1) + np.log(p0)
    determinant_term = 0.5 * slogdetcov_1_cov_0_inv
    return class0_term + class1_term + prior_term + determinant_term
   
#Modified Bayes Classifier with Reject Option
def modified_bayes_classifier(r, epsilon):
    """
    Label ``r`` as 9, 20 or "reject" from its log posterior ratio.

    Both priors are fixed at 0.5.  The ratio is compared with
    ``log((0.5 + epsilon) / (0.5 - epsilon))`` and its mirrored counterpart:
    at or above the upper bound returns 9, at or below the lower bound
    returns 20, anything strictly in between returns the string "reject".
    """
    equal_prior = 0.5
    score = log_posterior_ratio(r, equal_prior, equal_prior)

    # Guard clauses: confident class-0 decision first, then class 1.
    if score >= np.log((0.5 + epsilon) / (0.5 - epsilon)):
        return 9
    if score <= np.log((0.5 - epsilon) / (0.5 + epsilon)):
        return 20
    return "reject"
    

# Sweep the rejection margin and record, for each epsilon, the error rate
# among accepted samples, the number of rejections and the error count.
epsilon_values = [0.01, 0.1, 0.25, 0.4]
misclassification_losses = []
rejected_samples = []
misclassified_samples = []

for epsilon in epsilon_values:
    rejected = 0
    misclassified = 0
    classified = 0

    for x, true_label in zip(test_data, test_label):
        prediction = modified_bayes_classifier(x, epsilon)

        if prediction == "reject":
            rejected += 1
        else:
            classified += 1
            if prediction != true_label:
                misclassified += 1

    # Loss is measured over accepted samples only; 0 when everything was rejected.
    misclassification_loss = misclassified / classified if classified > 0 else 0
    misclassification_losses.append(misclassification_loss)
    rejected_samples.append(rejected)
    # Keep the count per epsilon: multiplying the loss by `classified` after
    # the loop would reuse the value left over from the last epsilon.
    misclassified_samples.append(misclassified)

for i, epsilon in enumerate(epsilon_values):
    print(f"Epsilon: {epsilon}, Misclassification Loss: {misclassification_losses[i]:.4f}, Rejected Samples: {rejected_samples[i]}, Misclassified Samples: {misclassified_samples[i]}")


# Line chart of the recorded loss for each epsilon in the sweep
plt.figure(figsize=(8, 5))
plt.plot(
    epsilon_values,
    misclassification_losses,
    color='b',
    linestyle='-',
    marker='o',
    label="Misclassification Loss",
)
plt.title("Effect of ϵ on Misclassification Loss")
plt.xlabel("Epsilon (ϵ)")
plt.ylabel("Misclassification Loss")
plt.grid()
plt.legend()
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Fetch the data (oracle.q2_train_test_emnist must already be available);
# element 0 is used as the training matrix and element 1 as the test matrix.
data = oracle.q2_train_test_emnist(23475, "EMNIST/emnist-balanced-train.csv", "EMNIST/emnist-balanced-test.csv")
training_set = data[0]
test_set = data[1]

# Column 0 is taken as the label; every remaining column is a feature.
training_label, training_data = training_set[:, 0], training_set[:, 1:]
test_label, test_data = test_set[:, 0], test_set[:, 1:]

# Full per-class pools: label 9 -> class 0, label 20 -> class 1
training_class0_all = training_data[training_label == 9]
training_class1_all = training_data[training_label == 20]

# Helper functions
def compute_stats(data_subset):
    """Return ``(mean, cov, regularised_cov, pinv_cov)`` for ``data_subset``.

    * mean            -- mean along axis 0
    * cov             -- ``np.cov`` with columns as variables
    * regularised_cov -- ``cov`` with 1e-7 added on the diagonal
    * pinv_cov        -- pseudo-inverse of the *unregularised* ``cov``
    """
    mean_vector = np.mean(data_subset, axis=0)
    covariance = np.cov(data_subset, rowvar=False)
    dimension = covariance.shape[0]
    regularised = covariance + (10**(-7)) * np.identity(dimension)
    pseudo_inverse = np.linalg.pinv(covariance)
    return mean_vector, covariance, regularised, pseudo_inverse

def log_posterior_ratio(x, avg0, avg1, inv_cov0, inv_cov1, slogdet_cov_ratio, p0, p1):
    """Return the log posterior ratio of class 0 over class 1 for ``x``.

    Sum of: ``-0.5 * (x-avg0)^T inv_cov0 (x-avg0)``,
    ``+0.5 * (x-avg1)^T inv_cov1 (x-avg1)``, ``log(p0) - log(p1)`` and
    ``0.5 * slogdet_cov_ratio``.
    """
    offset0 = x - avg0
    offset1 = x - avg1
    quad0 = np.dot(np.dot(offset0.T, inv_cov0), offset0)
    quad1 = np.dot(np.dot(offset1.T, inv_cov1), offset1)
    log_prior_ratio = -np.log(p1) + np.log(p0)
    return -0.5 * quad0 + 0.5 * quad1 + log_prior_ratio + 0.5 * slogdet_cov_ratio

def modified_bayes_classifier(x, avg0, avg1, inv_cov0, inv_cov1, slogdet_cov_ratio, p0, p1, epsilon):
    """Label ``x`` as 9, 20 or "reject" from its log posterior ratio.

    Returns 9 when the ratio is at or above
    ``log((0.5 + epsilon) / (0.5 - epsilon))``, 20 when it is at or below
    ``log((0.5 - epsilon) / (0.5 + epsilon))``, and "reject" otherwise.
    """
    score = log_posterior_ratio(x, avg0, avg1, inv_cov0, inv_cov1, slogdet_cov_ratio, p0, p1)

    # Guard clauses: confident class-0 decision first, then class 1.
    if score >= np.log((0.5 + epsilon) / (0.5 - epsilon)):
        return 9
    if score <= np.log((0.5 - epsilon) / (0.5 + epsilon)):
        return 20
    return "reject"

# Experiment configurations: number of training rows drawn per class and the
# priors (p0 for class 0, p1 for class 1) passed to the classifier.
experiments = [
    {"n_class0": 2400, "n_class1": 1600, "p0": 0.6,  "p1": 0.4},
    {"n_class0": 2400, "n_class1": 600,  "p0": 0.8,  "p1": 0.2},
    {"n_class0": 2400, "n_class1": 267,  "p0": 0.9,  "p1": 0.1},
    {"n_class0": 2400, "n_class1": 24,   "p0": 0.99, "p1": 0.01}
]

epsilon_values = [0.1, 0.25, 0.4]  # Epsilon values

# One list of losses (one entry per epsilon) for each experiment, for plotting
misclassification_results = []

for exp in experiments:
    # Class 0: the first n_class0 rows of the class-0 pool
    train_class0 = training_class0_all[:exp["n_class0"]]

    # Class 1: n_class1 rows drawn at random without replacement.
    # NOTE(review): the draw is unseeded, so results vary between runs.
    j = np.random.choice(len(training_class1_all), size=exp["n_class1"], replace=False)
    train_class1 = training_class1_all[j]

    # Mean, covariance, regularised covariance and pseudo-inverse per class
    avg0, cov0, mod_cov0, inv_cov0 = compute_stats(train_class0)
    avg1, cov1, mod_cov1, inv_cov1 = compute_stats(train_class1)

    # log|det(C1 C0^-1)| = log|det C1| - log|det C0|.  Taking the two
    # log-determinants separately avoids running slogdet on the
    # non-symmetric, badly conditioned product of one regularised covariance
    # with the pseudo-inverse of the other.
    slogdet_cov_ratio = np.linalg.slogdet(mod_cov1)[1] - np.linalg.slogdet(mod_cov0)[1]

    # Loss per epsilon for this experiment
    misclassification_errors = []

    for epsilon in epsilon_values:
        misclassified = 0
        classified = 0

        for x, true_label in zip(test_data, test_label):
            pred = modified_bayes_classifier(x, avg0, avg1, inv_cov0, inv_cov1, slogdet_cov_ratio, exp["p0"], exp["p1"], epsilon)

            if pred != "reject":
                classified += 1
                if pred != true_label:
                    misclassified += 1

        # Loss is measured over accepted samples only; 0 when all were rejected
        misclassification_loss = misclassified / classified if classified > 0 else 0
        misclassification_errors.append(misclassification_loss)

        print(f"Experiment {exp['p0']} vs {exp['p1']} | Epsilon: {epsilon} | Misclassification Loss: {misclassification_loss:.4f}")

    misclassification_results.append(misclassification_errors)

# Draw one loss-vs-epsilon curve per experiment on a shared set of axes
plt.figure(figsize=(8, 6))

# Legend text for each experiment, built from its priors
exp_labels = []
for exp in experiments:
    exp_labels.append(f"p0={exp['p0']}, p1={exp['p1']}")

# Results and labels are in the same order as `experiments`
for i, misclassification_errors in enumerate(misclassification_results):
    curve_label = exp_labels[i]
    plt.plot(epsilon_values, misclassification_errors, marker='o', label=curve_label)

plt.title("Misclassification Loss vs. Epsilon for Different Splits")
plt.xlabel("Epsilon")
plt.ylabel("Misclassification Loss")
plt.grid(True)
plt.legend()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold


# Fetch the data via the oracle helper; only element 0 (the training matrix)
# is used in this cell, element 1 is discarded.
data = oracle.q2_train_test_emnist(23475, "EMNIST/emnist-balanced-train.csv", "EMNIST/emnist-balanced-test.csv")
training_set, _ = data[0], data[1]

# Column 0 is taken as the label; every remaining column is a feature.
training_label, training_data = training_set[:, 0], training_set[:, 1:]

def avg(r):
    """Return the mean of ``r`` taken along axis 0 (one value per column)."""
    column_means = np.mean(r, axis=0)
    return column_means

def cov(r):
    """Return the covariance matrix of ``r`` with columns treated as variables."""
    covariance = np.cov(r, rowvar=False)
    return covariance


# K-fold evaluation of the reject-option classifier at a fixed epsilon.
K = 5
epsilon = 0.25
kf = KFold(n_splits=K, shuffle=True, random_state=42)

# One dict of metrics per evaluated fold
fold_metrics = []

fold = 1
for train_idx, val_idx in kf.split(training_data):
    # Split into training and validation sets for this fold
    X_train = training_data[train_idx]
    y_train = training_label[train_idx]
    X_val = training_data[val_idx]
    y_val = training_label[val_idx]

    # Label 9 -> class 0, label 20 -> class 1
    train_class0 = X_train[y_train == 9]
    train_class1 = X_train[y_train == 20]

    # Skip the fold when either class has no training rows
    # (the fold counter is not advanced for skipped folds).
    if len(train_class0) == 0 or len(train_class1) == 0:
        continue

    avg_class0 = avg(train_class0)
    avg_class1 = avg(train_class1)

    cov_class0 = cov(train_class0)
    cov_class1 = cov(train_class1)

    # Covariances with 1e-7 added on the diagonal, used for the determinant term
    modified_cov_class0 = cov_class0 + (10**(-7)) * np.identity(cov_class0.shape[0])
    modified_cov_class1 = cov_class1 + (10**(-7)) * np.identity(cov_class1.shape[0])

    # Pseudo-inverses of the original covariance matrices.
    # NOTE(review): the quadratic forms use the unregularised covariances
    # while the determinant term uses the regularised ones -- confirm this
    # mix is intended.
    C0_inv = np.linalg.pinv(cov_class0)
    C1_inv = np.linalg.pinv(cov_class1)

    # log|det(C1 C0^-1)| = log|det C1| - log|det C0|.  Taking the two
    # log-determinants separately avoids running slogdet on the
    # non-symmetric, badly conditioned product of one regularised covariance
    # with the pseudo-inverse of the other.
    slogdetcov_1_cov_0_inv = (
        np.linalg.slogdet(modified_cov_class1)[1]
        - np.linalg.slogdet(modified_cov_class0)[1]
    )

    # For this experiment, assume equal priors
    p0_val = 0.5
    p1_val = 0.5

    def fold_log_posterior_ratio(r, p0, p1):
        """
        Computes the log posterior ratio (class 0 over class 1) for sample r
        from the current fold's means, pseudo-inverses and determinant term.
        """
        dif_class0 = r - avg_class0
        t_1 = -0.5 * np.matmul(np.matmul(dif_class0.T, C0_inv), dif_class0)

        dif_class1 = r - avg_class1
        t_2 = 0.5 * np.matmul(np.matmul(dif_class1.T, C1_inv), dif_class1)

        t_3 = -np.log(p1) + np.log(p0)
        t_4 = 0.5 * slogdetcov_1_cov_0_inv
        return t_1 + t_2 + t_3 + t_4

    def fold_modified_bayes_classifier(r, epsilon):
        """
        Classifies sample r using the Modified Bayes Classifier with reject option.
        Returns 9 if the ratio is high, 20 if low, or "reject" otherwise.
        """
        ratio = fold_log_posterior_ratio(r, p0_val, p1_val)
        threshold_high = np.log((0.5 + epsilon) / (0.5 - epsilon))
        threshold_low  = np.log((0.5 - epsilon) / (0.5 + epsilon))

        if ratio >= threshold_high:
            return 9
        elif ratio <= threshold_low:
            return 20
        else:
            return "reject"

    true_labels = []       # true labels for classified samples
    predicted_labels = []  # classifier predictions (9 or 20)
    num_rejected = 0       # count rejected samples

    for x, label in zip(X_val, y_val):
        pred = fold_modified_bayes_classifier(x, epsilon)
        if pred == "reject":
            num_rejected += 1
        else:
            true_labels.append(label)
            predicted_labels.append(pred)

    # Confusion counts with label 20 as the positive class.
    # NOTE(review): a true label other than 9 or 20 falls through every
    # branch below yet still counts in the accuracy denominator -- confirm
    # the data only contains these two labels.
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    for t, p in zip(true_labels, predicted_labels):
        if t == 20 and p == 20:
            TP += 1
        elif t == 9 and p == 9:
            TN += 1
        elif t == 9 and p == 20:
            FP += 1
        elif t == 20 and p == 9:
            FN += 1

    # Compute performance metrics over the non-rejected samples
    recall    = TP / (TP + FN) if (TP + FN) > 0 else 0
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    accuracy  = (TP + TN) / len(true_labels) if len(true_labels) > 0 else 0
    f1_score  = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    # Store metrics for this fold
    fold_metrics.append({
        "fold": fold,
        "TP": TP, "TN": TN, "FP": FP, "FN": FN,
        "recall": recall,
        "precision": precision,
        "accuracy": accuracy,
        "f1_score": f1_score,
        "num_rejected": num_rejected,
        "num_classified": len(true_labels)
    })

    print(f"Fold {fold} results:")
    print(f"  Confusion Matrix: TP = {TP}, TN = {TN}, FP = {FP}, FN = {FN}")
    print(f"  Recall:    {recall:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Accuracy:  {accuracy:.4f}")
    print(f"  F1 Score:  {f1_score:.4f}")
    print(f"  Rejected Samples: {num_rejected}")
    print(f"  Classified Samples: {len(true_labels)}\n")

    fold += 1
