Train a model using the Gaussian Naive Bayes (NB) classifier

File Paths:  
- training data: data/sonar_train.data
- validation data: data/sonar_valid.data
- test data: data/sonar_test.data

In [4]:
import numpy as np

# Read a comma-separated data file and store its contents in X and Y.
def read_data(filename):
    """Load a comma-separated dataset from *filename*.

    Every non-blank line holds the feature values followed by an integer
    class label in the last column. Blank lines are skipped.

    Args:
        filename: Path of the file to read. It is decoded as ``utf-8-sig`` so
            a leading byte-order mark does not end up in the first value.

    Returns:
        A tuple ``(X, Y)`` of NumPy arrays: ``X`` has one row of float
        features per data line and ``Y`` holds the matching integer labels.

    Raises:
        ValueError: If a feature or label cannot be parsed as a number.
    """
    features = []
    labels = []
    # The context manager closes the file even when parsing raises.
    with open(filename, 'r', encoding='utf-8-sig') as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # A blank line (e.g. a trailing newline) carries no sample.
                continue
            *x_fields, y_field = line.split(',')
            features.append([float(value) for value in x_fields])
            labels.append(int(y_field))

    return np.array(features), np.array(labels)

# Per-feature mean of a set of samples.
def compute_mean(partition_set):
    """Return the mean of every feature (column) of *partition_set*.

    Args:
        partition_set: Array-like with one sample per row and one feature
            per column.

    Returns:
        The column-wise means, i.e. the average taken along axis 0.
    """
    return np.mean(partition_set, axis=0)

# Per-feature standard deviation of a set of samples.
def compute_sd(partition_set):
    """Return the standard deviation of every feature (column).

    Args:
        partition_set: Array-like with one sample per row and one feature
            per column.

    Returns:
        The column-wise standard deviations taken along axis 0, using
        ``np.std`` with its default settings (population form, ``ddof=0``).
    """
    return np.std(partition_set, axis=0)

# Estimate the class priors and the mean and sd of each feature in each class.
def train(X, Y):
    """Fit the Gaussian naive Bayes parameters from labelled data.

    Args:
        X: 2-D array of training samples, one row per sample and one column
            per feature.
        Y: Array of class labels, one per row of ``X``.

    Returns:
        A tuple ``(classes, prob_classes, mean_matrix, sd_matrix)``:
        ``classes`` are the sorted unique labels, ``prob_classes`` the share
        of training samples carrying each label, and ``mean_matrix`` /
        ``sd_matrix`` hold one row per class and one column per feature.
    """
    num_data, _ = X.shape
    classes, class_count = np.unique(Y, return_counts=True)

    # Prior of each class: its fraction of the training samples.
    prob_classes = class_count / num_data

    # Rows of X belonging to each label, in the same order as `classes`.
    partitions = [X[np.where(Y == label)[0]] for label in classes]

    # row = classes, col = features
    mean_matrix = np.array([compute_mean(subset) for subset in partitions])
    sd_matrix = np.array([compute_sd(subset) for subset in partitions])

    return classes, prob_classes, mean_matrix, sd_matrix

# Compute the Gaussian likelihood p(fi|yi) (the plain density, not its log).
def compute_likelihood(x, mean, sd, min_sd=1e-9):
    """Evaluate the normal probability density at *x*.

    Args:
        x: Feature value (scalar or array).
        mean: Mean of the feature within the class.
        sd: Standard deviation of the feature within the class.
        min_sd: Lower bound applied to ``sd``. A feature that is constant
            within a class has ``sd == 0``, which would otherwise divide by
            zero; with the floor the density becomes a very narrow peak
            around ``mean`` instead.

    Returns:
        The density ``1 / (sd * sqrt(2*pi)) * exp(-(x - mean)**2 / (2 * sd**2))``
        evaluated with the floored ``sd``.
    """
    sd = np.maximum(sd, min_sd)
    coefficient = 1 / (sd * np.sqrt(2 * np.pi))
    exponent = -((x - mean) ** 2) / (2 * (sd ** 2))
    return coefficient * np.exp(exponent)


# Predict y for a new x.
def prediction(x, classes, prob_classes, mean_matrix, sd_matrix, min_sd=1e-9):
    """Return the class with the largest posterior for the sample *x*.

    The score of class ``yi`` is
    ``log p(yi) + sum_f log p(x_f | yi)`` with a Gaussian ``p(x_f | yi)``.
    Working with logarithms keeps the ranking of the original product
    ``p(yi) * p(f1|yi) * p(f2|yi) * ...`` but avoids floating-point
    underflow: multiplying many tiny densities collapses every posterior to
    0.0, after which the first class would win regardless of the data.

    Args:
        x: Feature vector of the sample to classify.
        classes: Class labels, one per row of the parameter matrices.
        prob_classes: Prior probability of each class.
        mean_matrix: Per-class feature means (row = class, col = feature).
        sd_matrix: Per-class feature standard deviations, same layout.
        min_sd: Lower bound applied to the standard deviations so that a
            feature that is constant within a class does not divide by zero.

    Returns:
        The label from ``classes`` with the highest log-posterior. On ties
        the earliest class wins.
    """
    x = np.asarray(x, dtype=float)
    best_y = classes[0]
    best_log_posterior = float('-inf')

    for i, label in enumerate(classes):
        sd = np.maximum(sd_matrix[i], min_sd)
        # Log of the Gaussian density of every feature, computed at once.
        log_likelihoods = (
            -np.log(sd * np.sqrt(2 * np.pi))
            - ((x - mean_matrix[i]) ** 2) / (2 * (sd ** 2))
        )
        log_posterior = np.log(prob_classes[i]) + np.sum(log_likelihoods)

        # Strict comparison keeps the first class when scores are equal.
        if log_posterior > best_log_posterior:
            best_log_posterior = log_posterior
            best_y = label

    return best_y

# Load the training split and the test split used for evaluation.
train_X, train_Y = read_data('data/sonar_train.data')
X, Y = read_data('data/sonar_test.data')

# Fit the model parameters on the training split only.
classes, prob_classes, mean_matrix, sd_matrix = train(train_X, train_Y)

# Classify every test sample and count the predictions that match the label.
total_data = X.shape[0]
correct_predictions = 0
for x, y in zip(X, Y):
    pred_y = prediction(x, classes, prob_classes, mean_matrix, sd_matrix)
    correct_predictions += int(pred_y == y)

# Report the share of correct predictions as a percentage.
accuracy = (correct_predictions / total_data) * 100
print("accuracy:", accuracy, "%")

accuracy: 69.23076923076923 %
