# Q2 (a,b)

In [1]:
import collections
import numpy as np
import util
import svm

In [2]:
def get_words(message):
    """Get the normalized list of words from a message string.

    This function should split a message into words, normalize them, and return
    the resulting list. For splitting, you should split on spaces. For normalization,
    you should convert everything to lowercase.

    Args:
        message: A string containing an SMS message

    Returns:
       The list of normalized words from the message.
    """

    # *** START CODE HERE ***
    string_list = message.split()
    return [s.lower() for s in string_list]
    # *** END CODE HERE ***

In [3]:
def create_dictionary(messages):
    """Create a dictionary mapping words to integer indices.

    This function should create a dictionary of word to indices using the provided
    training messages. Use get_words to process each message.

    Rare words are often not useful for modeling. Please only add words to the dictionary
    if they occur in at least five messages.

    Args:
        messages: A list of strings containing SMS messages

    Returns:
        A python dict mapping words to integers.
    """
    temp_dict = {}
    res_dict = {}
    for msgs in messages:
        slist = get_words(msgs)
        for x in set(slist):
            if x in temp_dict:
                temp_dict[x] += 1
            else:
                temp_dict[x] = 1
        

    idx = 0
    for string, count in temp_dict.items():
        if count > 4:
            res_dict[string] = idx
            idx += 1

    return res_dict


In [4]:
def transform_text(messages, word_dictionary):
    """Transform a list of text messages into a numpy array for further processing.

    This function should create a numpy array that contains the number of times each word
    of the vocabulary appears in each message. 
    Each row in the resulting array should correspond to each message 
    and each column should correspond to a word of the vocabulary.

    Use the provided word dictionary to map words to column indices. Ignore words that
    are not present in the dictionary. Use get_words to get the words for a message.

    Args:
        messages: A list of strings where each string is an SMS message.
        word_dictionary: A python dict mapping words to integers.

    Returns:
        A numpy array marking the words present in each message.
        Where the component (i,j) is the number of occurrences of the
        j-th vocabulary word in the i-th message.
    """
    m = len(messages)
    n = len(word_dictionary)
    res = np.zeros((m, n), dtype=int)
    for i in range(m):
        slist = get_words(messages[i])
        for word in slist:
            if word in word_dictionary:
                column_index = word_dictionary[word]
                res[i, column_index] += 1
    return res


In [6]:
def fit_naive_bayes_model(matrix, labels):
    """Fit a naive bayes model.

    This function should fit a Naive Bayes model given a training matrix and labels.

    The function should return the state of that model.

    Feel free to use whatever datatype you wish for the state of the model.

    Args:
        matrix: A numpy array containing word counts for the training data
        labels: The binary (0 or 1) labels for that training data

    Returns: The trained model
    """

    # *** START CODE HERE ***
    nexamples,nwords = matrix.shape
    phiy = (np.sum(labels))/len(labels)
    phi0 = np.array((matrix[labels == 0]).sum(axis=0),dtype = float) + 1
    phi1 = np.array((matrix[labels == 1]).sum(axis=0),dtype = float) + 1
    phi0 /= phi0.sum()
    phi1 /= phi1.sum()
    model = {"phi0":phi0,"phi1":phi1,"phiy":phiy}
    return model
    # *** END CODE HERE ***

In [7]:
def predict_from_naive_bayes_model(model, matrix):
    """Use a Naive Bayes model to compute predictions for a target matrix.

    This function should be able to predict on the models that fit_naive_bayes_model
    outputs.

    Args:
        model: A trained model from fit_naive_bayes_model
        matrix: A numpy array containing word counts

    Returns: A numpy array containg the predictions from the model
    """
    # *** START CODE HERE ***
    py1 = np.log(model['phiy'])
    py0 = np.log(1 - py1)
    logphi0 = np.log(model['phi0'])
    logphi1 = np.log(model['phi1'])
    p1 = np.sum(matrix * logphi1, axis = 1) + py1
    p0 = np.sum(matrix * logphi0, axis = 1) + py0
    output = (p1>p0)
    return output
    # *** END CODE HERE ***

In [9]:
messages , labels = util.load_spam_dataset("spam_val.tsv")

In [11]:
labels

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,

In [12]:
resdict = create_dictionary(messages)

In [13]:
resdict

{'in': 0,
 'as': 1,
 'but': 2,
 '4': 3,
 'said': 4,
 'he': 5,
 'me': 6,
 'pretty': 7,
 'long': 8,
 'thk': 9,
 'i': 10,
 'good': 11,
 'am': 12,
 'going': 13,
 'to': 14,
 'time': 15,
 'the': 16,
 'do': 17,
 'u': 18,
 "that's": 19,
 'first': 20,
 'love': 21,
 'you': 22,
 ':)': 23,
 'if': 24,
 "i'm": 25,
 'will': 26,
 'later': 27,
 'class.': 28,
 'lor.': 29,
 'ok': 30,
 'very': 31,
 'really': 32,
 'is': 33,
 'and': 34,
 'out': 35,
 'way': 36,
 'not': 37,
 'a': 38,
 'are': 39,
 'this': 40,
 'over': 41,
 'of': 42,
 'your': 43,
 'get': 44,
 'have': 45,
 'by': 46,
 'anything': 47,
 'that': 48,
 'so': 49,
 'my': 50,
 'on': 51,
 'no': 52,
 'its': 53,
 'r': 54,
 '3': 55,
 'den': 56,
 'people': 57,
 'at': 58,
 'go': 59,
 'we': 60,
 'cos': 61,
 'da': 62,
 'watching': 63,
 'lor...': 64,
 'has': 65,
 'about': 66,
 'want': 67,
 'more': 68,
 'person': 69,
 'know': 70,
 'put': 71,
 'some': 72,
 'had': 73,
 'still': 74,
 'they': 75,
 'why': 76,
 'sure': 77,
 'then': 78,
 'take': 79,
 'it': 80,
 'when': 8

In [15]:
matrix = transform_text(messages,resdict)

In [17]:
import pandas as pd
pd.DataFrame(matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,247,248,249,250,251,252,253,254,255,256
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
552,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
553,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
554,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
555,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
model = fit_naive_bayes_model(matrix,labels)

In [19]:
out = predict_from_naive_bayes_model(model,matrix)

In [22]:
(out == labels).sum()

548

In [24]:
548/557 * 100

98.38420107719928