# Download packages

In [2]:
! pip install nltk
! python -m nltk.downloader all

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_esp.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipp

# Download spam email detection dataset

In [0]:
! curl http://nlp.cs.aueb.gr/software_and_datasets/Enron-Spam/preprocessed/enron1.tar.gz --output enron1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1760k  100 1760k    0     0   201k      0  0:00:08  0:00:08 --:--:--  357k


In [0]:
! tar -xf enron1.tar.gz enron1

In [0]:
! ls -1 enron1/ham/*.txt | wc -l # print the number of non-spam emails

3672


In [0]:
! ls -1 enron1/spam/*.txt | wc -l # print the number of spam emails

1500


In [0]:
! cat enron1/ham/0007.1999-12-14.farmer.ham.txt # print an example of non-spam (ham) email

Subject: mcmullen gas for 11 / 99
jackie ,
since the inlet to 3 river plant is shut in on 10 / 19 / 99 ( the last day of
flow ) :
at what meter is the mcmullen gas being diverted to ?
at what meter is hpl buying the residue gas ? ( this is the gas from teco ,
vastar , vintage , tejones , and swift )
i still see active deals at meter 3405 in path manager for teco , vastar ,
vintage , tejones , and swift
i also see gas scheduled in pops at meter 3404 and 3405 .
please advice . we need to resolve this as soon as possible so settlement
can send out payments .
thanks

# Load dataset
- `glob`: it finds all the pathnames matching a specified pattern according to the rules used by the Unix shell, although results are returned in arbitrary order.
```
import glob
glob.glob('./[0-9].*')
['./1.gif', './2.txt']
```


In [0]:
import glob, os

# init
"""
emails: a set of email
labels: a set of label representing whetere the gien email is spam or ham
  - spam: 1
  - ham: 0
"""
emails, labels = [], []
parition = 0

In [0]:
# load spam dataset
file_path = 'enron1/spam'

for fname in glob.glob(os.path.join(file_path, '*.txt')):
    with open(fname, 'r', encoding='ISO-8859-1') as f: # [!important] check encofing format
      emails.append(f.read())
      labels.append(1)

file_path = 'enron1/ham'
for fname in glob.glob(os.path.join(file_path, '*.txt')):
    with open(fname, 'r', encoding='ISO-8859-1') as f: 
      emails.append(f.read())
      labels.append(0)

In [0]:
print('# of emails = {}\n# of labels = {}'.format(len(emails), len(labels)))

# Data pre-processing

Now, we are going to pre-process our corpus.

In [0]:
print(emails[1]) # Before pre-processing

Subject: via - gra pro will get you hard consumption
as you already saw , there is new , better via - gra
in the market . it is called via - gra pro , and it
is significally beter and has better influence like
you never imagined . enter now to expreience more .
finally alien dimension should be as complex as the city itself they ll rebuild stuff after a while .
tom hanks warren beatty annette bening and dustin hoffman all turned out at feinstein s in la for pal carole bayer sager new york daily news ny .
great site ! used this site to do an online class assignment very helpful can t wait to start teaching and have more time to browse .
lex kung mababaw lang tingin ko sa pagkakaibigan natin sasabihin ko friend mo siya ? hindi kita friend pero hindi .
celly hi just blog hopping to wish you a very lovely happy valentines day ! have fun take care xoxo celly .
please note nbsp links are to the webpages where these tracks are available - nbsp nbsp no audio files are linked directly or are ho

  - remove number and punctuation
  - remove name entity
  - remove stopword
  - lemmatization

In [0]:
# remove number and punctuation 
def letters_only(word):
  return word.isalpha()

In [0]:
# remove name entity
from nltk.corpus import names
all_names = set(names.words())

In [0]:
# lemmaization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [0]:
# remove stopword
# -> See the next section: vectorization

In [0]:
# put all together to clean texts
def clean_text(doc):
  cleaned_doc = []
  for word in doc.split(' '): # split doc. by blank (' ')
    word = word.lower() # ABD -> abd
  
    if letters_only(word) and word not in all_names and len(word) > 2: # remove number and punc. and name entity
      cleaned_doc.append(lemmatizer.lemmatize(word))
  return ' '.join(cleaned_doc) 

cleaned_emails = [clean_text(doc) for doc in emails]

In [0]:
cleaned_emails[1] # After pre-processing

'real teen love make sex ton video teen girl erotic angel sweet european girl nasty blowjob face cumshots teen defloration rough throat fuck hot college lesbian anal sex coeds group hardcore and more only fresh and exclusive and video content teen mega portal inside your email'

- Vectorization

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
# this code will return term-document matrix
cv = CountVectorizer(stop_words='english', max_features=500)
term_docs = cv.fit_transform(cleaned_emails)
print(term_docs[5])

  (0, 210)	1
  (0, 493)	1
  (0, 69)	1


- How we interpret the reutrn values?

  - `type(term_docs) # scipy.sparse.csr.csr_matrix`
  -  example: 
  ```
  (0, 287)	1 
  (row index, column (term) index) term frequency
  ```
  - print the corresponding terms
  ```
  cv.get_feature_names()[287]
  >>> 'new'
  ```

In [0]:
feature_names = cv.get_feature_names() # ['w1', 'w2', ...., 'wn']
feature_mapping = cv.vocabulary_ # {'word', 'column index'}

# Build and train naive Bayes model 
Now, it's your turn to build the naive Bayes model by yourself!

In [0]:
def get_label_index(labels):
  """
  groupby document indices on label (spam: 1 or ham: 0)
  @args:
    - labels (list): a list of cl ordered by document indices
  @return:
    - dictiontory, with class as key, a coressponding list of indices as value
  """
  from collections import defaultdict
  label_index = defaultdict(list)
  for index, label in enumerate(labels):
    label_index[label].append(index)
  return label_index

label_index = get_label_index(labels)
# label_index = { 1: [0, ...., 1499], 
#                  0: [1500, ....., 5172]}
# label_index

In [0]:
def get_prior(label_index):
  """
  compute class prior probability of spam (or ham)
  @args:
    - label_index (grouped sample indices by class)
  @return:
    - dictionary, with class labels as key, corresponding prior as the value
  """
  prior = {label: len(index) for label, index in label_index.items()}
  ######################## 
  # write your code here #
  ########################
  sum = 0
  # print(prior)
  for i in prior.values():
    sum += i

  for idx in prior.keys():
    prior[idx]/=sum
  
  return prior
prior = get_prior(label_index)
print(prior)
# >> {1: 0.2900232018561485, 0: 0.7099767981438515}

{1: 0.2900232018561485, 0: 0.7099767981438515}


In [0]:
import numpy as np
def get_likelihood(term_docs, label_index, smoothing=0):
  """
  compute likelihood based on traning samples
  ex. P(spam|email) when postrior probaility P(email|spam)
  @args:
    - term_doc_matrix (sparse matrix): document-word vector
      >> ex. <5172x500 sparse matrix of type '<class 'numpy.int64'>'
	            with 95288 stored elements in Compressed Sparse Row format>
    - label_index (dictiontory): with class as key, a coressponding list of indices as value
      >> ex. { 1: [0, ...., 1499], 0: [1500, ....., 5172]}
    - smoothing (interger: additive Laplace smoothing
  @return
    - dictiontory, with class as key, coressponding prob. P(feature|class) vector as value
  """
  # print(term_docs)
  likelihood = {}
  likelihood[0]=[]
  likelihood[1]=[]
  x = term_docs.toarray()
  number_of_spam = 0
  number_of_ham = 0
  for col in range(len(x[0])):
    for row in range(len(x)):
      if x[row][col] != 0:
        if row < len(label_index[1]):
          number_of_spam+=x[row][col]
        else:
          number_of_ham+=x[row][col]
  total_smoothing = len(x[0]) * 27.1
  # 스팸 계산
  for col in range(len(x[0])):
    s = 0
    for row in range(len(label_index[1])):
      s += x[row][col]
    likelihood[1].append((s+smoothing)/(number_of_spam+total_smoothing))
  # 안 스팸 계산
  for col in range(len(x[0])):
    s = 0
    for row in range(len(label_index[1]),len(x)):
      s += x[row][col]
    likelihood[0].append((s+smoothing)/(number_of_ham+total_smoothing))
  likelihood[1] = np.array(likelihood[1])
  likelihood[0] = np.array(likelihood[0])
  return likelihood

In [0]:
smoothing = 1
likelihood = get_likelihood(term_docs, label_index, smoothing)

In [0]:
print(likelihood[0].shape)
likelihood

(500,)


{0: array([1.15338776e-03, 9.99602722e-04, 8.39409978e-04, 7.30478912e-04,
        8.97079366e-05, 2.49900680e-04, 1.81978957e-03, 8.20186849e-04,
        1.38406531e-03, 2.01202086e-03, 8.52225398e-04, 7.30478912e-04,
        1.08290295e-03, 1.03804898e-03, 1.60192744e-04, 7.17663493e-04,
        9.16302495e-04, 2.05687483e-03, 1.08290295e-03, 5.76693878e-05,
        9.93195012e-04, 1.00601043e-03, 1.42251157e-03, 2.12095193e-03,
        1.93512835e-03, 1.53144263e-03, 5.06209071e-04, 2.96676962e-03,
        5.25432200e-04, 6.27955556e-04, 5.74130794e-03, 1.80697415e-03,
        4.10093424e-04, 1.11494150e-03, 1.27513424e-03, 7.04848073e-04,
        6.59994105e-04, 4.61355102e-04, 5.44655329e-04, 1.28794966e-03,
        8.13779139e-04, 1.92231293e-05, 4.48539683e-04, 8.45817688e-04,
        1.18542630e-03, 1.90949751e-03, 7.43294332e-04, 1.10853379e-03,
        7.04848073e-04, 9.86787302e-04, 1.03804898e-03, 8.07371429e-04,
        6.08732427e-04, 3.13977778e-04, 2.69123810e-04, 1.255

In [0]:
import math
from decimal import Decimal
def get_posterior(term_doc_matrix, prior, likelihood):
  """
  compute posterior based on traning samples
  @args:
    - term_doc_matrix (sparse matrix)
    - prior (dictionary) with class label as key, corresponding prior as the value
    - likelihood (dictionary) with label as key, corresponding conditional probability vector as value
  @return
    - dictiontory, with class as key, corresponding posterior as value
  { 1: [0, ...., 1499], 0: [1500, ....., 5172]}
  """
  x = term_doc_matrix.toarray()
  posteriors = []
  # 스팸일 확률 계산
  for row in range(len(x)):
    prob_of_spam = 0
    prob_of_ham = 0
    y = {}
    for col in range(len(x[0])):
      if x[row][col] != 0:
        prob_of_spam+=Decimal(f'{math.log10(likelihood[1][col])}')
        prob_of_ham+=Decimal(f'{math.log10(likelihood[0][col])}')
        # prob_of_spam+=Decimal(f'{math.log10(likelihood[1][col]) * x[row][col]}')
    prob_of_spam += Decimal(f'{math.log10(prior[1])}')
    prob_of_ham += Decimal(f'{math.log10(prior[0])}')
    sum_of_prob = Decimal(f'{(10**prob_of_ham) + (10**prob_of_spam)}')
    y[1] = float(10**prob_of_spam / sum_of_prob)
    y[0] = float(10**prob_of_ham / sum_of_prob)
    posteriors.append(y)
  return posteriors

In [0]:
posteriors = get_posterior(term_docs, prior, likelihood)

In [0]:
emails_test = [
    '''Subject: flat screens
    hello ,
    please call or contact regarding the other flat screens requested .
    trisha tlapek - eb 3132 b
    michael sergeev - eb 3132 a
    also the sun blocker that was taken away from eb 3131 a .
    trisha should two monitors also michael .
    thanks
    kevin moore''',
    '''Subject: having problems in bed ? we can help !
    cialis allows men to enjoy a fully normal sex life without having to plan the sexual act .
    if we let things terrify us , life will not be worth living .
    brevity is the soul of lingerie .
    suspicion always haunts the guilty mind .''',
]

In [0]:
cleaned_test = [clean_text(doc) for doc in emails_test]
term_docs_test = cv.transform(cleaned_test)
posteriors_test = get_posterior(term_docs_test, prior, likelihood)
print(posteriors_test)

[{1: 0.009492669676634903, 0: 0.9905073303233651}, {1: 0.9646806124872326, 0: 0.03531938751276739}]


# Train naive Bayes model

In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(cleaned_emails, labels, test_size=0.33, random_state=486)
print(len(X_train), len(Y_train), len(X_test), len(Y_test))

3465 3465 1707 1707


In [0]:
term_docs_train = cv.fit_transform(X_train) # get counter vector for X_train
label_index = get_label_index(Y_train)
prior = get_prior(label_index)
likelihood = get_likelihood(term_docs_train, label_index, smoothing)

In [0]:
term_docs_test = cv.transform(X_test)
posterior = get_posterior(term_docs_test, prior, likelihood)

In [0]:
correct = 0.0
for pred, actual in zip(posterior, Y_test):
    if actual == 1:
        if pred[1] >= 0.5:
            correct += 1
    elif pred[0] > 0.5:
        correct += 1

In [0]:
print('The accuracy on {0} testing samples is: {1:.1f}%'.format(len(Y_test), correct/len(Y_test)*100))

The accuracy on 1707 testing samples is: 72.7%


# Classification Report

Use `classification_report` in `scikit-learn` to report recall, precision, f1-score and accuracy of your model.

In [0]:
# before, you should make a list that has predicted labels of your model.
prediction = []
for i in posterior:
  if i[1] >= i[0]:
    a = 1
  else:
    a = 0
  prediction.append(a)
######################## 
# write your code here #
########################

In [0]:
from sklearn.metrics import classification_report
report = classification_report(Y_test, prediction)
print(report)

              precision    recall  f1-score   support

           0       0.73      1.00      0.84      1228
           1       0.93      0.03      0.06       479

    accuracy                           0.73      1707
   macro avg       0.83      0.51      0.45      1707
weighted avg       0.78      0.73      0.62      1707

