# Imports 

In [52]:
import pandas as pd
import numpy as np

%matplotlib inline

# Constants

In [53]:
TRAINING_DATA_PATH = 'SpamData/02_Training/training-data'
TEST_DATA_PATH = 'SpamData/02_Training/test-data'

TOKEN_SPAM_PROB_PATH = 'SpamData/03_Testing/prob-spam.txt'
TOKEN_HAM_PROB_PATH = 'SpamData/03_Testing/prob-nonspam.txt'
TOKEN_ALL_PROB_PATH = 'SpamData/03_Testing/prob-all-token.txt'

TEST_TARGET_PATH = 'SpamData/03_Testing/Test-target.txt'
TEST_FEATURES_MATRIX_PATH = 'SpamData/03_Testing/test-features.txt'


VOCAB_SIZE = 2500

### importing data form .txt file to a numpy array

In [54]:
sparse_train_data = np.loadtxt(TRAINING_DATA_PATH, dtype=int ,delimiter=' ')

In [55]:
sparse_test_data = np.loadtxt(TEST_DATA_PATH, dtype=int ,delimiter=' ')

In [56]:
sparse_test_data[:5]

array([[8, 1, 2, 1],
       [8, 1, 3, 4],
       [8, 1, 4, 2],
       [8, 1, 5, 1],
       [8, 1, 6, 2]])

In [57]:
print('Nr of rows in train data is:', sparse_train_data.shape[0])
print('Nr of rows in test data is:', sparse_test_data.shape[0])

Nr of rows in train data is: 265428
Nr of rows in test data is: 110856


In [58]:
print('Nr of emails in train data is:', np.unique(sparse_train_data[:, 0]).size)
print('Nr of emails in test data is:', np.unique(sparse_test_data[:, 0]).size)

Nr of emails in train data is: 4018
Nr of emails in test data is: 1722


# Function to make full matrix from sparse matrix

In [59]:
p = pd.DataFrame(columns=['docid'] + ['label'] + list(range(0, 4)), index=list(range(0,5)))
p.fillna(value=0, inplace=True)
p.at[4, 'docid'] = 6
p.at[4, 'label'] = 1
p.at[2, 2] = 1
p

Unnamed: 0,docid,label,0,1,2,3
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,1,0
3,0,0,0,0,0,0
4,6,1,0,0,0,0


In [60]:
# column = ['DOC_ID'] + ['Category'] + list(range(VOCAB_SIZE))
# row = list(range(np.unique(sparse_train_data[:, 0]).size))
# sample_matrix = pd.DataFrame(columns=column, index=row)
# sample_matrix.fillna(value=0, inplace=True)


In [61]:
def create_full_matrix(sparse_matrix, nr_words, doc_idx=0, cat_idx=1, word_idx=2, freq_idx=3):
    '''
    takes a sparse matrix and gives a full matrix in form of DataFrame.
    Keyword Argument:
    sparse_matrix: numpy array
    nr_words: size of vocabulary. Total no. of tokenized word
    doc_idx: position of documnet-id in sparse matrix. default pos = 1st coumn
    cat_idx: position of category (spam matrix is 1, non-spam is 0). default pos = 2nd coumn
    word_idx: position of word-id in sparse matrix. default pos = 3rd coumn
    freq_idx: position of occurence in sparse matrix. default pos = 4th coumn
    '''
    column = ['DOC_ID'] + ['Category'] + list(range(nr_words))
    row = np.unique(sparse_matrix[:, 0])
    full_matrix = pd.DataFrame(columns=column, index=row)
    full_matrix.fillna(value=0, inplace=True)
    
    for i in range(sparse_matrix.shape[0]):
        nr_doc = sparse_matrix[i][doc_idx]
        nr_label = sparse_matrix[i][cat_idx]
        nr_word_id = sparse_matrix[i][word_idx]
        nr_occurence = sparse_matrix[i][freq_idx]
        
        full_matrix.at[nr_doc, 'DOC_ID'] = nr_doc
        full_matrix.at[nr_doc, 'Category'] = nr_label
        full_matrix.at[nr_doc, nr_word_id] = nr_occurence
    
    full_matrix.set_index('DOC_ID', inplace=True)
    return full_matrix

In [62]:
%%time
full_train_data = create_full_matrix(sparse_train_data, VOCAB_SIZE)

Wall time: 16.8 s


In [63]:
full_train_data.dropna(axis=1, inplace=True)
full_train_data

Unnamed: 0_level_0,Category,0,1,2,3,4,5,6,7,8,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,0,0,1,2,1,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0
1,1,7,1,2,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,6,1,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,6,0,0,2,4,0,3,14,0,...,0,0,0,0,0,0,0,0,0,0
4,1,5,1,2,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5792,0,3,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5795,0,2,2,0,1,0,0,2,1,0,...,0,0,0,0,0,0,0,0,0,0
5796,0,1,9,1,0,0,1,2,1,1,...,0,1,0,0,0,0,0,0,0,0
5797,0,1,1,1,0,0,1,2,0,0,...,0,0,0,0,0,0,0,0,0,0


## Training the Naive Bayes Model

### Calculating probability of spam

In [64]:
full_train_data.Category.value_counts()[1]

1251

In [65]:
prob_spam = full_train_data.Category.value_counts()[1]/full_train_data.shape[0]
print('Probability of spam is:', prob_spam)

Probability of spam is: 0.3113489298158288


### Calculating total numbers of tokens/words2

In [66]:
full_train_feature = full_train_data.loc[:, full_train_data.columns != 'Category']
full_train_feature.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,1,2,1,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,1,2,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,1,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6,0,0,2,4,0,3,14,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1,2,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [67]:
email_length = full_train_feature.sum(axis=1)
email_length.shape

(4018,)

In [68]:
email_length

DOC_ID
0        87
1        53
2        40
3       183
4        43
       ... 
5792     18
5795     30
5796    228
5797     57
5798    264
Length: 4018, dtype: int64

In [69]:
total_wc = email_length.sum()
total_wc

450820

### Calculate total number of spam and non-spam tokens/words 

In [70]:
spam_email_length = email_length[full_train_data.Category == 1]
spam_email_length.shape

(1251,)

In [71]:
ham_email_length = email_length[full_train_data.Category == 0]
ham_email_length.shape

(2767,)

In [72]:
spam_wc = spam_email_length.sum()
print('total number of spam word are ', spam_wc)

total number of spam word are  195862


In [73]:
nonspam_wc = total_wc - spam_wc
print('total number of non-spam word are ',nonspam_wc)

total number of non-spam word are  254958


In [74]:
print('the average words in spam msg are {:.0f}'.format(np.mean(spam_email_length)))
print('the average words in spam msg are {:.0f}'.format(np.mean(ham_email_length)))

the average words in spam msg are 157
the average words in spam msg are 92


### Occurence of Spam and non-spam words/tokens

In [75]:
train_spam_token = full_train_feature.loc[full_train_data.Category == 1]
train_spam_token.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,1,2,1,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,1,2,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,1,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6,0,0,2,4,0,3,14,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1,2,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [76]:
summed_spam_token = train_spam_token.sum(axis=0) + 1 #laplace smoothing
summed_spam_token.shape

(2500,)

In [77]:
summed_spam_token

0       2314
1       1012
2       1415
3       2122
4       1443
        ... 
2495      29
2496      19
2497       2
2498       2
2499       2
Length: 2500, dtype: int64

In [78]:
nonspam_length = full_train_feature.loc[full_train_data.Category == 0]
nonspam_length.shape

(2767, 2500)

In [79]:
summed_ham_token = nonspam_length.sum(axis=0) + 1 #laplace smoothing
summed_ham_token

0       5408
1       2675
2       2068
3        909
4       1663
        ... 
2495      16
2496      12
2497      18
2498      29
2499      34
Length: 2500, dtype: int64

## P(Token|Spam)-Probability of token occurs given the email is spam 

In [80]:
prob_token_spam = summed_spam_token / (spam_wc + VOCAB_SIZE) # 2500 should be added according to laplace smoothing

In [81]:
prob_token_spam.sum()

1.0

## P(Token|ham)-Probability of token occurs given the email is nonspam 

In [82]:
prob_token_ham = summed_ham_token / (nonspam_wc + VOCAB_SIZE)

In [83]:
prob_token_ham.sum()

1.0

## P(Token)-Probability of token occurs 

In [84]:
prob_token = full_train_feature.sum(axis=0)/total_wc
prob_token.sum()

1.0

## Saving the trained model

In [85]:
np.savetxt(TOKEN_SPAM_PROB_PATH, prob_token_spam)
np.savetxt(TOKEN_HAM_PROB_PATH, prob_token_ham)
np.savetxt(TOKEN_ALL_PROB_PATH, prob_token)

## Prepare Test Data

In [91]:
sparse_test_data.shape

(110856, 4)

In [87]:
%%time
full_test_data = create_full_matrix(sparse_test_data, VOCAB_SIZE)

Wall time: 7.33 s


In [88]:
full_test_data

Unnamed: 0_level_0,Category,0,1,2,3,4,5,6,7,8,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,1,0,0,1,4,2,1,2,4,1,...,0,0,0,0,0,0,0,0,0,0
12,1,6,1,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
14,1,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,1,0,2,1,1,2,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0
17,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5789,0,5,5,2,2,1,2,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5790,0,2,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5791,0,0,4,0,2,4,3,3,1,4,...,0,0,0,0,0,0,0,0,0,0
5793,0,1,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [89]:
test_feature = full_test_data.loc[:, full_test_data.columns != 'Category']
test_target = full_test_data.Category
test_target

DOC_ID
8       1
12      1
14      1
15      1
17      1
       ..
5789    0
5790    0
5791    0
5793    0
5794    0
Name: Category, Length: 1722, dtype: int64

In [90]:
# saving to txt file
np.savetxt(TEST_FEATURES_MATRIX_PATH, test_feature)
np.savetxt(TEST_TARGET_PATH, test_target)