# Notebook imports

In [117]:
import pandas as pd
import numpy as np


# Constants

In [118]:
# Number of distinct token ids; sizes the word-id columns of the full matrix.
VOCAB_SIZE = 2500
# Sparse-format inputs, read below with np.loadtxt.
TRAINING_DATA_FILE = 'SpamData/02_Training/train-data.txt'
TEST_DATA_FILE = 'SpamData/02_Training/test-data.txt'

# Training outputs: per-token probabilities written with np.savetxt.
TOKEN_SPAM_PROB_FILE = 'SpamData/03_Testing/prob-spam.txt'
TOKEN_HAM_PROB_FILE = 'SpamData/03_Testing/prob-nonspam.txt'
TOKEN_ALL_PROB_FILE = 'SpamData/03_Testing/prob-all-tokens.txt'

# Test-preparation outputs: the full test feature matrix and its labels.
TEST_FEATURE_MATRIX = 'SpamData/03_Testing/test-features.txt'
TEST_TARGET_FILE = 'SpamData/03_Testing/test-target.txt'

# Read and Load features from txt files to numpy arrays

In [119]:
# Both files hold space-delimited integers and load as 2-D integer arrays.
sparse_train_data = np.loadtxt(TRAINING_DATA_FILE, delimiter = ' ', dtype = int)

In [120]:
sparse_test_data = np.loadtxt(TEST_DATA_FILE, delimiter = ' ', dtype = int)
# Peek at the first five sparse rows. make_full_matrix (below) reads the
# columns, by default, as document id, word id, category, occurrence count.
sparse_train_data[:5]

array([[ 0,  2,  1,  1],
       [ 0,  3,  1,  2],
       [ 0,  4,  1,  1],
       [ 0,  7,  1,  3],
       [ 0, 11,  1,  1]])

In [None]:
# These are counts of sparse rows, not of emails.
print('Number of rows in training data = ', sparse_train_data.shape[0])
print('Number of rows in testing data = ', sparse_test_data.shape[0])
sparse_train_data[-5:]

In [None]:
# Column 0 is the document id, so its distinct values count the emails.
print('Number of emails in the training file = ', np.unique(sparse_train_data[:,0]).size)

### How to create an empty dataframe

In [None]:
# Column labels: two metadata columns followed by one column per token id.
col = ['DOC_ID'] + ['CATEGORY'] + list(range(0,VOCAB_SIZE))
col[:5]

In [None]:
len(col)

In [None]:
# One row label per distinct document id in column 0 of the sparse data.
index_names = np.unique(sparse_train_data[:,0])
index_names

In [None]:
# No data is passed, so every cell of the new frame starts out as NaN.
full_train_data= pd.DataFrame(index = index_names, columns = col)

In [None]:
full_train_data.head()

In [None]:
# Replace the NaN placeholders with 0, modifying the frame in place.
full_train_data.fillna(value = 0, inplace = True)
full_train_data.head()

# Create a full matrix from a sparse matrix

In [None]:
def make_full_matrix(sparse_matrix, nr_words, doc_idx=0, word_idx=1, cat_idx=2, freq_idx=3):
    """
    Form a full matrix from a sparse matrix. Return a pandas dataframe.

    Keyword arguments:
    sparse_matrix -- 2-D numpy array with one (document, word) entry per row
    nr_words -- size of the vocabulary. Total number of tokens.
    doc_idx -- position of the document id in the sparse matrix. Default: 1st column
    word_idx -- position of the word id in the sparse matrix. Default: 2nd column
    cat_idx -- position of the label (spam is 1, nonspam is 0). Default: 3rd column
    freq_idx -- position of occurrence of word in sparse matrix. Default: 4th column

    Returns:
    A dataframe indexed by document id (index name 'DOC_ID') holding a
    'CATEGORY' column followed by one integer column per word id in
    range(nr_words). Words absent from a document are 0. If the same
    (document, word) pair appears in several rows, the last row wins.
    """
    # Size the vocabulary from the nr_words argument rather than a global, so
    # the function works for any vocabulary size the caller passes in.
    column_names = ['CATEGORY'] + list(range(nr_words))
    # Honour doc_idx here as well; the document id is not always in column 0.
    doc_id_names = np.unique(sparse_matrix[:, doc_idx])
    # Pre-fill with integer zeros instead of building an all-NaN object frame
    # and calling fillna, which relies on implicit dtype downcasting.
    full_matrix = pd.DataFrame(0, index=doc_id_names, columns=column_names)
    # The row labels already are the document ids, so naming the index gives
    # the same result as filling a DOC_ID column and calling set_index on it.
    full_matrix.index.name = 'DOC_ID'

    for row in sparse_matrix:
        doc_nr = row[doc_idx]
        full_matrix.at[doc_nr, 'CATEGORY'] = row[cat_idx]
        full_matrix.at[doc_nr, row[word_idx]] = row[freq_idx]

    return full_matrix
    

In [None]:
%%time
# Expand the sparse training rows into one row per email; this rebinds the
# full_train_data name used by the empty-dataframe walkthrough above.
full_train_data = make_full_matrix(sparse_train_data, VOCAB_SIZE)

In [None]:
 full_train_data

# Training the Naive Bayes Model
## Calculating the probability of spam

In [None]:
# CATEGORY is 1 for spam and 0 for nonspam, so its sum divided by its size is
# the fraction of training emails labelled spam.
prob_spam = full_train_data.CATEGORY.sum()/full_train_data.CATEGORY.size
prob_spam#The probability of spam messages

# Total number of words/tokens

In [None]:
# Keep every column except the label; what remains are the per-token counts.
full_train_features = full_train_data.loc[:,full_train_data.columns != 'CATEGORY']
full_train_features

In [None]:
# Row-wise sum: total token occurrences in each email.
email_length = full_train_features.sum(axis=1)
email_length.shape[0]

In [None]:
email_length[:5]

In [None]:
# Total token occurrences across the whole training set.
total_wc = email_length.sum()
total_wc

# Number of tokens in spam and ham emails

In [None]:
# Per-email token totals restricted to the emails labelled spam.
spam_length = email_length[full_train_data.CATEGORY == 1]
spam_length.shape

In [None]:
# Total token occurrences over all spam emails.
spam_wc = spam_length.sum()
spam_wc

In [None]:
# Per-email token totals restricted to the emails labelled nonspam.
ham_length = email_length[full_train_data.CATEGORY == 0]
ham_length.shape

In [None]:
email_length.shape[0] - spam_length.shape[0] - ham_length.shape[0]#Should be 0 when every email is labelled 0 or 1

In [None]:
# Total token occurrences over all nonspam emails.
ham_wc = ham_length.sum()
ham_wc

In [None]:
# Should be 0: the spam and ham totals together make up the overall total.
ham_wc + spam_wc - total_wc

In [None]:
# Average email length per class: class token total over class email count.
print('Average number of words in spam emails {:.0f}'.format(spam_wc/spam_length.shape[0]))
print('Average number of words in ham emails {:.0f}'.format(ham_wc/ham_length.shape[0]))

# Summing the tokens occurring in spam

In [None]:
full_train_features.shape

In [None]:
# Token-count rows of the emails labelled spam.
train_spam_tokens = full_train_features.loc[full_train_data.CATEGORY == 1]
train_spam_tokens

In [None]:
train_spam_tokens.shape

In [None]:
# Column-wise sum: how often each token occurs across all spam emails.
summed_spam_tokens = train_spam_tokens.sum(axis = 0) + 1#Laplace smoothing: add 1 so no token has a zero count
summed_spam_tokens.shape

# Summing the tokens occurring in ham

In [None]:
# Token-count rows of the emails labelled nonspam.
train_ham_tokens = full_train_features.loc[full_train_data.CATEGORY == 0]
train_ham_tokens

In [None]:
train_ham_tokens.shape

In [None]:
# Column-wise sum: how often each token occurs across all nonspam emails.
summed_ham_tokens = train_ham_tokens.sum(axis = 0) + 1#Laplace smoothing: add 1 so no token has a zero count
summed_ham_tokens.shape

In [None]:
summed_ham_tokens.tail()

In [None]:
train_ham_tokens[2499].sum() + 1#Should match the entry for token 2499 in the tail() output above

## P(Token | Spam) = Probability that a token occurs given that the email is spam

In [None]:
# Smoothed per-token spam count over the spam token total. VOCAB_SIZE is added
# to the denominator to balance the +1 added to each token's count.
prob_token_spam = summed_spam_tokens/ (spam_wc + VOCAB_SIZE)
prob_token_spam

# Play a sound file through pygame's mixer.
# NOTE(review): 'musicname.mp3' looks like a placeholder path — confirm.
from pygame import mixer
file = 'musicname.mp3'
mixer.init()
mixer.music.load(file)
# play is a function and has to be called; the bare attribute reference
# evaluated the function object and started no playback.
mixer.music.play()

In [None]:
# Sanity check: expected to be 1 when there are VOCAB_SIZE token columns.
prob_token_spam.sum()

## P(Token | Ham) = Probability that a token occurs given that the email is Non-spam

In [None]:
# Smoothed per-token nonspam count over the nonspam token total. VOCAB_SIZE is
# added to the denominator to balance the +1 added to each token's count.
prob_token_ham = summed_ham_tokens/ (ham_wc + VOCAB_SIZE)
prob_token_ham

In [None]:
# Sanity check: expected to be 1 when there are VOCAB_SIZE token columns.
prob_token_ham.sum()

## P(Token) = Probability that token occurs

In [None]:
prob_token_all = full_train_features.sum(axis= 0)/ total_wc

In [None]:
prob_token_all

In [None]:
prob_tokens_all.sum()

# Save the trained model

In [None]:
# Write the three per-token probability series to their text files.
np.savetxt(TOKEN_SPAM_PROB_FILE, prob_token_spam)
np.savetxt(TOKEN_HAM_PROB_FILE, prob_token_ham)
np.savetxt(TOKEN_ALL_PROB_FILE, prob_token_all)

# Preparing Test Data

In [None]:
%%time
full_test_data = make_full_matrix(sparse_test_data, VOCAB_SIZE)

In [None]:
X_test = full_test_data.loc[:,full_test_data.columns != 'CATEGORY']
Y_test = full_test_data.CATEGORY

In [None]:
np.savetxt(TEST_TARGET_FILE, Y_test)
np.savetxt(TEST_FEATURE_MATRIX, X_test)