In [1]:
%matplotlib inline

In [127]:
import numpy as np

training_spam = np.loadtxt(open("data/training_spam.csv"), delimiter=",").astype(int)
print("Shape of the spam training data set:", training_spam.shape)
print(training_spam)

Shape of the spam training data set: (1000, 55)
[[1 0 0 ... 0 0 0]
 [0 0 1 ... 1 0 0]
 [0 0 0 ... 1 0 0]
 ...
 [0 0 0 ... 0 0 1]
 [1 1 1 ... 1 1 0]
 [1 0 0 ... 1 1 1]]


In [128]:
testing_spam = np.loadtxt(open("data/testing_spam.csv"), delimiter=",").astype(int)
print("Shape of the spam testing data set:", testing_spam.shape)
print(testing_spam)

Shape of the spam testing data set: (500, 55)
[[1 0 0 ... 1 1 1]
 [1 1 0 ... 1 1 1]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 1 0 0]]


In [133]:
# This skeleton code simply classifies every input as ham
#
# Here you can see there is a parameter k that is unused, the
# point is to show you how you could set up your own. You can
# also see a train method that does nothing here
# but your classifier would probably do the main work here. 
# Modify this code as much as you like so long as the 
# accuracy test in the cell below runs

class MyClassifier:
    def __init__(self, k):
        # K shall be used to define the number of classes in the classification problem
        self.k = k
        # Declaring an array called log_class_priors which will hold the log of the class priors
        self.log_class_priors = np.array([])
        # Declaring an array called theta which will hold the log of the class conditional feature likelihoods
        self.theta = np.array([])
        
    def estimate_log_class_priors(self, data):
        """
        Given a data set with binary response variable (0s and 1s), 
        calculate the logarithm of the empirical class priors,
        that is, the logarithm of the proportions of 0s and 1s:
            log(p(C=0)) and log(p(C=1))

        :param data: a numpy array of length n_samples
                     that contains the binary response (coded as 0s and 1s).

        :return log_class_priors: a numpy array of length two
        """
        ### YOUR CODE HERE...

        # Defining np array
        out = np.array([])
        
        # Finding the probability of each class in range (0, k-1)
        for i in range(self.k):
            out = np.append(out, (np.count_nonzero(data == i)/len(data)))
        
        # Getting the log of each element in out  
        out = np.log(out)

        # Returning the np array
        return out
    
    def estimate_log_class_conditional_likelihoods(self, input_data, labels, alpha=1.0):
        """
        Given input_data of binary features (words) and labels 
        (binary response variable (0s and 1s)), calculate the logarithm 
        of the empirical class-conditional likelihoods, that is,
        log(P(w_i | c)) for all features w_i and both classes (c in {0, 1}).

        Assume a multinomial feature distribution and use Laplace smoothing
        if alpha > 0.

        :param input_data: a two-dimensional numpy-array with shape = [n_samples, n_features]
                           contains binary features (words)
        :param labels: a numpy array of length n_samples 
                       contains response variable

        :return theta:
            a numpy array of shape = [2, n_features]. theta[j, i] corresponds to the
            logarithm of the probability of feature i appearing in a sample belonging 
            to class j.
        """
        ### YOUR CODE HERE...

        # Getting the magnitude of the dimensions
        n_samples = input_data.shape[0]
        n_features = input_data.shape[1]
        
        # Declaring the arrays for the calculations
        classes = []
        class_conditionals = []
        log_class_conditionals = []
        theta = []
        
        # Looping through each of the classes
        for i in range(self.k):
            # Isolating all the lines in input_data where class label == i, where i is in the range (0, k-1)
            # np.squeeze() removes all the redundant dimensions
            classes.append(np.squeeze(input_data[np.where(labels == i), :]))
            # Calculating the empircal conditional liklihoods for class = i (c == i)
            class_conditionals.append(((np.count_nonzero(classes[i], axis=0)) + alpha) / ((np.count_nonzero(labels == i)) + n_features*alpha))
            # Finding the log of class_conditionals[i]
            log_class_conditionals.append(np.log(class_conditionals[i]))
            # Appending each of the class conditional likelihoods to an array called 'theta'
            theta.append(log_class_conditionals[i])
            
        # Converting theta to a numpy array and returning it
        return np.array(theta)
        
    def train(self, train_data, train_labels):
        
        # Calculating the log of the probability of each class
        self.log_class_priors = self.estimate_log_class_priors(train_labels)
        # Calculating the log of the class conditional feature likelihoods
        # Remember to change alpha to optimise the training process
        self.theta = self.estimate_log_class_conditional_likelihoods(train_data, train_labels, alpha=100)
        
        # Remember to remove this line when you copy it back into the main file
        return self.log_class_priors, self.theta
        
    def predict(self, test_data):
        """
        Given a new data set with binary features, predict the corresponding
        response for each instance (row) of the new_data set.

        :param new_data: a two-dimensional numpy-array with shape = [n_test_samples, n_features].
        :param log_class_priors: a numpy array of length 2.
        :param log_class_conditional_likelihoods: a numpy array of shape = [2, n_features].
            theta[j, i] corresponds to the logarithm of the probability of feature i appearing
            in a sample belonging to class j.
        :return class_predictions: a numpy array containing the class predictions for each row
            of new_data.
        """
        ### YOUR CODE HERE...

        class_predictions = np.array([])

        # Looping through each row in new_data
        for i, row in enumerate(test_data):
            # Summing the log_class_conditional_likelihoods for all the features in the dataset
            row_times_class_conditionals = row*self.theta
            row_class_conditional_likelihoods_sum = np.sum(row_times_class_conditionals, axis=1)

            # Adding the row_class_conditional_likelihoods_sum to the log_class_priors
            log_total = row_class_conditional_likelihoods_sum + self.log_class_priors
            # Finding the class which has the highest probability and appending it to class predictions
            class_predictions = np.append(class_predictions, np.argmax(log_total, axis=0))

        return class_predictions

# Change k to define the number of classes in the classification problem
spam_classifier = MyClassifier(k=2)
spam_classifier.train(train_data=training_spam[:, 1:], train_labels=training_spam[:, 0])

(array([-0.48939034, -0.94933059]),
 array([[-3.3738029 , -3.62650526, -3.10696769, -4.08655855, -3.22521552,
         -3.60792887, -3.9831802 , -3.76003665, -3.67124115, -3.39837416,
         -3.78169815, -2.80727624, -3.50872222, -3.85749198, -3.99214887,
         -3.62650526, -3.60181264, -3.50872222, -2.6081093 , -3.9831802 ,
         -2.96833779, -4.04771872, -3.94808888, -3.9831802 , -2.8995607 ,
         -3.09956025, -3.12952504, -3.42867951, -3.50872222, -3.454655  ,
         -3.62027471, -3.71124648, -3.58368526, -3.7044668 , -3.43382091,
         -3.39341137, -3.08490797, -3.99214887, -3.57178036, -3.62650526,
         -3.85749198, -3.5368931 , -3.60181264, -3.62027471, -3.06688947,
         -3.38355908, -3.99214887, -3.78902419, -3.3450928 , -2.61717966,
         -3.45993206, -3.07766156, -3.60792887, -3.67779855],
        [-3.15803777, -3.26974176, -2.86125093, -3.99993021, -2.79690124,
         -3.10268767, -3.11429322, -3.18690575, -3.29273127, -3.0179224 ,
         -3.29

In [134]:
my_classifier = MyClassifier(k=2)

# You can use this cell to check whether the returned objects of your function are of the right data type.
log_class_priors = my_classifier.estimate_log_class_priors(training_spam[:, 0])
print("result", log_class_priors)

# Check length
assert(len(log_class_priors) == 2)

# Check whether the returned object is a numpy.ndarray
assert(isinstance(log_class_priors, np.ndarray))

# Check wehther the values of this numpy.array are floats.
assert(log_class_priors.dtype == float)

# Check wehther the values are both negative (the logarithm of a probability 0 < p < 1 should be negative).
assert(np.all(log_class_priors < 0))

result [-0.48939034 -0.94933059]


In [135]:
# You can use this cell to check whether the returned objects of your function are of the right data type.
log_class_conditional_likelihoods = my_classifier.estimate_log_class_conditional_likelihoods(
    training_spam[:, 1:], training_spam[:, 0], alpha=1.0)
print(log_class_conditional_likelihoods)

# Check data type(s)
assert(isinstance(log_class_conditional_likelihoods, np.ndarray))

# Check shape of numpy array
assert(log_class_conditional_likelihoods.shape == (2, 54))

# Check data type of array elements
assert(log_class_conditional_likelihoods.dtype == float)

[[-1.82996121 -2.39191618 -1.36699161 -5.80964287 -1.56114762 -2.34390696
  -3.93784069 -2.78921798 -2.513806   -1.87781723 -2.86520389 -0.92684094
  -2.10834089 -3.17058554 -4.0178834  -2.39191618 -2.32840278 -2.10834089
  -0.66214839 -3.93784069 -1.15568252 -4.71103058 -3.6695767  -3.93784069
  -1.05605267 -1.35529557 -1.40292362 -1.93844185 -2.10834089 -1.99193054
  -2.37565566 -2.63158904 -2.28328234 -2.61096975 -1.94891315 -1.86806106
  -1.33230605 -4.0178834  -2.2542948  -2.39191618 -3.17058554 -2.17205671
  -2.32840278 -2.37565566 -1.30429301 -1.8488297  -4.0178834  -2.89187213
  -1.77540223 -0.67384443 -2.00298038 -1.3210065  -2.34390696 -2.53249813]
 [-1.09861229 -1.29325433 -0.6423075  -4.14313473 -0.55171061 -1.00764051
  -1.02644984 -1.14740245 -1.33545468 -0.87410912 -1.33545468 -0.59598343
  -1.26073114 -2.19722458 -2.08171169 -0.69088217 -0.95324644 -1.17638999
  -0.2368424  -1.61170806 -0.33647224 -3.1446059  -1.24485779 -1.03918887
  -3.60413823 -3.8918203  -5.39589769

In [136]:
# You can use this cell to check whether the returned objects of your function are of the right data type.
log_class_priors, log_class_conditional_likelihoods = my_classifier.train(training_spam[:, 1:], training_spam[:, 0])
print("log class priors")
print(log_class_priors)

# Check length
assert(len(log_class_priors) == 2)

# Check whether the returned object is a numpy.ndarray
assert(isinstance(log_class_priors, np.ndarray))

# Check wehther the values of this numpy.array are floats.
assert(log_class_priors.dtype == float)

# Check wehther the values are both negative (the logarithm of a probability 0 < p < 1 should be negative).
assert(np.all(log_class_priors < 0))

print("log class conditional likelihoods")
print(log_class_conditional_likelihoods)

# Check data type(s)
assert(isinstance(log_class_conditional_likelihoods, np.ndarray))

# Check shape of numpy array
assert(log_class_conditional_likelihoods.shape == (2, 54))

# Check data type of array elements
assert(log_class_conditional_likelihoods.dtype == float)

log class priors
[-0.48939034 -0.94933059]
log class conditional likelihoods
[[-3.3738029  -3.62650526 -3.10696769 -4.08655855 -3.22521552 -3.60792887
  -3.9831802  -3.76003665 -3.67124115 -3.39837416 -3.78169815 -2.80727624
  -3.50872222 -3.85749198 -3.99214887 -3.62650526 -3.60181264 -3.50872222
  -2.6081093  -3.9831802  -2.96833779 -4.04771872 -3.94808888 -3.9831802
  -2.8995607  -3.09956025 -3.12952504 -3.42867951 -3.50872222 -3.454655
  -3.62027471 -3.71124648 -3.58368526 -3.7044668  -3.43382091 -3.39341137
  -3.08490797 -3.99214887 -3.57178036 -3.62650526 -3.85749198 -3.5368931
  -3.60181264 -3.62027471 -3.06688947 -3.38355908 -3.99214887 -3.78902419
  -3.3450928  -2.61717966 -3.45993206 -3.07766156 -3.60792887 -3.67779855]
 [-3.15803777 -3.26974176 -2.86125093 -3.99993021 -2.79690124 -3.10268767
  -3.11429322 -3.18690575 -3.29273127 -3.0179224  -3.29273127 -2.82855856
  -3.25172325 -3.66615703 -3.6264167  -2.89504831 -3.06865792 -3.20378379
  -2.56081071 -3.43226068 -2.63750333 