## Downloading Dataset and Installing Requirements

In [1]:
!wget http://www.aueb.gr/users/ion/data/lingspam_public.tar.gz --no-check-certificate

--2023-10-31 20:49:51--  http://www.aueb.gr/users/ion/data/lingspam_public.tar.gz
Resolving www.aueb.gr (www.aueb.gr)... 195.251.255.156
Connecting to www.aueb.gr (www.aueb.gr)|195.251.255.156|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://pages.aueb.gr/users/ion/data/lingspam_public.tar.gz [following]
--2023-10-31 20:49:52--  http://pages.aueb.gr/users/ion/data/lingspam_public.tar.gz
Resolving pages.aueb.gr (pages.aueb.gr)... 195.251.255.230
Connecting to pages.aueb.gr (pages.aueb.gr)|195.251.255.230|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://www2.aueb.gr/users/ion/data/lingspam_public.tar.gz [following]
--2023-10-31 20:49:53--  https://www2.aueb.gr/users/ion/data/lingspam_public.tar.gz
Resolving www2.aueb.gr (www2.aueb.gr)... 195.251.255.230
Connecting to www2.aueb.gr (www2.aueb.gr)|195.251.255.230|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request 

In [None]:
!tar -xzvf lingspam_public.tar.gz

In [285]:
from sklearn.model_selection import train_test_split
import os
import re
import string
import numpy as np
import nltk
from nltk.corpus import stopwords
from collections import Counter
nltk.download('stopwords')
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score, precision_score
from sklearn.svm import SVC

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data Preprocessing and Feature Selection Using Information Gain

In [230]:
def clean_data(data):
  data = data.lower()
  data = re.sub(r'\d+', '', data)
  data = data.translate(str.maketrans('','', string.punctuation))
  data = data.replace('subject', '')
  stop_words = set(stopwords.words('english'))
  data = ' '.join([word for word in data.split() if word not in stop_words])
  data = ' '.join([word for word in data.split() if len(word)>1])
  data = re.sub(' +', ' ', data)
  data = data.replace('\n','')
  data = data.replace('\t','')
  data = data.strip()
  return data

In [231]:
data_path = 'lingspam_public/lemm_stop'
emails, labels = [], []
all_words = ''
total_words_in_spam = ''
total_words_in_ham = ''

# Load emails
for root, dirs, files in os.walk(data_path):
    if 'part10' in dirs:
        dirs.remove('part10')
    print(files)
    for file in files:
        if file.endswith('.txt'):
            with open(os.path.join(root, file), 'r', encoding='latin1') as f:
                text = clean_data(f.read())
                all_words = all_words + text
                emails.append(text)
                # Mark spam emails
                labels.append(1 if 'sp' in file else 0)
                if 'sp' in file:
                  total_words_in_spam+=text
                else:
                  total_words_in_ham+=text

[]
['6-11msg1.txt', '6-1127msg1.txt', '6-160msg3.txt', '6-109msg1.txt', '6-175msg1.txt', '6-1103msg1.txt', 'spmsga56.txt', 'spmsga73.txt', '6-151msg2.txt', '6-1146msg3.txt', '6-1122msg1.txt', '6-112msg1.txt', '6-1112msg1.txt', '6-189msg2.txt', 'spmsga71.txt', '6-199msg3.txt', '6-1104msg1.txt', '6-232msg2.txt', '6-162msg2.txt', '6-166msg3.txt', '6-16msg1.txt', '6-1130msg1.txt', '6-1056msg3.txt', '6-1117msg1.txt', '6-1109msg1.txt', '6-186msg3.txt', '6-1065msg1.txt', '6-1156msg1.txt', '6-21msg1.txt', '6-1116msg1.txt', '6-121msg1.txt', '6-1082msg1.txt', 'spmsga44.txt', '6-1118msg3.txt', '6-110msg3.txt', '6-1138msg3.txt', '6-1095msg2.txt', '6-173msg1.txt', '6-205msg3.txt', 'spmsga67.txt', 'spmsga45.txt', '6-1155msg2.txt', 'spmsga42.txt', '6-176msg1.txt', '6-205msg1.txt', '6-163msg1.txt', '6-1111msg1.txt', '6-120msg1.txt', 'spmsga53.txt', 'spmsga49.txt', 'spmsga58.txt', '6-151msg1.txt', '6-15msg2.txt', '6-1139msg1.txt', '6-187msg1.txt', '6-197msg1.txt', 'spmsga48.txt', '6-1125msg5.txt', '6-2

In [232]:
test_data_path = 'lingspam_public/lemm_stop/part10'
test_emails = []
test_labels = []
for filename in os.listdir(test_data_path):
    file_path = os.path.join(test_data_path, filename)
    with open(file_path, 'r') as file:
        text = clean_data(file.read())
        test_emails.append(text)
        test_labels.append(1 if 'sp' in filename else 0)

In [233]:
total_spam = labels.count(1)
total_ham = labels.count(0)

In [234]:
print(f"There are {total_spam} spam emails and {total_ham} ham emails in part1 to part9")

There are 432 spam emails and 2170 ham emails in part1 to part9


In [235]:
all_words_len = len(set(all_words.split(" ")))

In [28]:
print(f"There are total {all_words_len} words in all emails")

There are total 50412 words in all emails


In [236]:
p = total_ham/(total_spam + total_ham)
HC = -p*np.log(p) - (1-p)*np.log(1-p)
print('HC:', HC)

HC: 0.4495288323617157


In [None]:
IG = {}

word_counts = {'spam': {}, 'ham': {}}

for idx, email in enumerate(emails):
    label = 'spam' if labels[idx] == 1 else 'ham'
    seen = set()
    for word in email.split(" "):
        if word not in word_counts[label]:
            word_counts['spam'][word] = 0
            word_counts['ham'][word] = 0
        if word not in seen:
          word_counts[label][word] += 1
          seen.add(word)


for word in all_words.split(" "):

  spam_count = word_counts['spam'].get(word,0)
  ham_count = word_counts['ham'].get(word,0)

  n_with_word = spam_count + ham_count    # number of emails with term
  n_without_word = len(emails) - n_with_word        # number of emails without term

  # use smoothing for terms never occur
  if n_with_word == 0:
    spam_count += 1
    ham_count += 1
    n_with_word = spam_count + ham_count
    n_without_word = len(emails) - n_with_word

  if n_without_word == 0:   # all the email contain this term
    continue

  H_term = (-1) * (((total_spam-spam_count)/len(emails)) * (np.log((total_spam-spam_count)/n_without_word))
                   + (spam_count/len(emails)) * (np.log(spam_count/n_with_word))
                   + ((total_ham-ham_count)/len(emails)) * (np.log((total_ham-ham_count)/n_without_word))
                   + (ham_count/len(emails)) * (np.log(ham_count/n_with_word)))    # entropy(uncertainty) after this term

  ig = HC - H_term             # information gain
  IG[word] = [spam_count, ham_count, ig]

In [238]:
# Create a DataFrame from the IG dictionary
data_frame = pd.DataFrame(list(IG.items()), columns=['Word', 'Values'])
data_frame[['Num spam with word', 'Num ham with word', 'IG']] = pd.DataFrame(data_frame['Values'].tolist(), index=data_frame.index)
data_frame.drop(columns=['Values'], inplace=True)

# Sort the DataFrame based on IG values in descending order
sorted_frame = data_frame.sort_values(by='IG', ascending=False)

df = sorted_frame.set_index('Word')
df[0:10]

Unnamed: 0_level_0,Num spam with word,Num ham with word,IG
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
language,7,1447,0.14171
remove,185,22,0.117082
free,251,129,0.114876
university,17,1276,0.100511
money,169,58,0.082261
click,116,13,0.070129
market,132,41,0.064122
business,143,67,0.060185
today,144,80,0.056067
advertise,92,10,0.054907


In [239]:
df_1 = df.iloc[0:10]
df_2 = df.iloc[0:100]
df_3 = df.iloc[0:1000]

## Bernoulli NB classifier with binary features

In [249]:
def bernoulli_nb(words, df, spam_count, ham_count):

    p_spam = (spam_count) / (spam_count + ham_count)
    p_ham = (ham_count) / (spam_count + ham_count)
    p_words_in_spam, p_words_in_ham = 1, 1

    all_words_set = set(words.split(" "))
    for idx, row in df.iterrows():
      word, spam, ham = idx, row['Num spam with word'], row['Num ham with word']
      p_word_given_spam = (spam+1)/(spam_count+2)
      p_word_given_ham = (ham+1)/(ham_count+2)

      if word in all_words_set:
        p_words_in_spam*=p_word_given_spam
        p_words_in_ham*=p_word_given_ham
      else:
        p_words_in_spam *= (1-p_word_given_spam)
        p_words_in_ham *= (1-p_word_given_ham)

    p_s_given_words = p_spam * p_words_in_spam
    p_h_given_words = p_ham * p_words_in_ham

    return p_s_given_words > p_h_given_words

## Multinomial NB with binary features

In [247]:
def multinomial_nb(words, df, spam_count, ham_count):
  p_spam = (spam_count) / (spam_count + ham_count)
  p_ham = (ham_count) / (spam_count + ham_count)
  p_words_in_spam, p_words_in_ham = 1, 1
  all_words_set = set(words.split(" "))

  spam_words = total_words_in_spam.split(" ")
  ham_words = total_words_in_ham.split(" ")
  corpus = set(all_words.split(" "))

  for idx, row in df.iterrows():
    word, spam, ham = idx, row['Num spam with word'], row['Num ham with word']

    p_word_given_spam = (spam+1)/(spam_count + len(corpus))
    p_word_given_ham = (ham+1)/(ham_count + len(corpus))

    if word in all_words_set:
        p_words_in_spam*=p_word_given_spam
        p_words_in_ham*=p_word_given_ham

  p_s_given_words = p_spam * p_words_in_spam
  p_h_given_words = p_ham * p_words_in_ham

  return p_s_given_words > p_h_given_words

## Multinomial NB with term frequency (TF) features

In [251]:
def multinomial_nb_with_tf(words, df, spam_count, ham_count):

  spam_words = total_words_in_spam.split(" ")
  ham_words = total_words_in_ham.split(" ")
  corpus = set(all_words.split(" "))
  spam_dict = Counter(spam_words)
  ham_dict = Counter(ham_words)

  p_spam = (spam_count) / (spam_count + ham_count)
  p_ham = (ham_count) / (spam_count + ham_count)
  p_words_in_spam, p_words_in_ham = 1, 1

  word_counts = Counter(words.split(" "))

  top_words = list(df.index)

  for word, count in word_counts.items():
        if word not in top_words:
            continue

        count_s = spam_dict.get(word, 0)
        count_h = ham_dict.get(word, 0)

        p_word_given_spam = (count_s + 1) / (len(spam_words) + 2)
        p_word_given_ham = (count_h + 1) / (len(ham_words) + 2)

        p_words_in_spam *= (p_word_given_spam**count)
        p_words_in_ham *= (p_word_given_ham**count)

  p_s_given_words = p_spam * p_words_in_spam
  p_h_given_words = p_ham * p_words_in_ham

  return p_s_given_words > p_h_given_words

## Testing Naive Bayes Classifiers

### N = 10

In [252]:
preds_bernoulli = []
preds_multinomial_nb = []
preds_multinomial_nb_tf = []
for email in test_emails:
  preds_bernoulli.append(bernoulli_nb(email, df_1, total_spam, total_ham))
  preds_multinomial_nb.append(multinomial_nb(email, df_1, total_spam, total_ham))
  preds_multinomial_nb_tf.append(multinomial_nb_with_tf(email, df_1, total_spam, total_ham))

print(f"Bernoulli Naive Bayes Classifier: recall_score={recall_score(test_labels,preds_bernoulli)} and preceision_score={precision_score(test_labels,preds_bernoulli)}")
print(f"Multionomial Naive Bayes Classifier with binary features: recall_score={recall_score(test_labels,preds_multinomial_nb)} and preceision_score={precision_score(test_labels,preds_multinomial_nb)}")
print(f"Multinomial Naive Bayes Classifier with term frequency features: recall_score={recall_score(test_labels,preds_multinomial_nb_tf)} and preceision_score={precision_score(test_labels,preds_multinomial_nb_tf)}")

Bernoulli Naive Bayes Classifier: recall_score=0.6326530612244898 and preceision_score=0.8857142857142857
Multionomial Naive Bayes Classifier with binary features: recall_score=0.5714285714285714 and preceision_score=0.875
Bernoulli Naive Bayes Classifier with term frequency features: recall_score=0.8163265306122449 and preceision_score=0.851063829787234


### N=100

In [253]:
preds_bernoulli = []
preds_multinomial_nb = []
preds_multinomial_nb_tf = []
for email in test_emails:
  preds_bernoulli.append(bernoulli_nb(email, df_2, total_spam, total_ham))
  preds_multinomial_nb.append(multinomial_nb(email, df_2, total_spam, total_ham))
  preds_multinomial_nb_tf.append(multinomial_nb_with_tf(email, df_2, total_spam, total_ham))

print(f"Bernoulli Naive Bayes Classifier: recall_score={recall_score(test_labels,preds_bernoulli)} and preceision_score={precision_score(test_labels,preds_bernoulli)}")
print(f"Multionomial Naive Bayes Classifier with binary features: recall_score={recall_score(test_labels,preds_multinomial_nb)} and preceision_score={precision_score(test_labels,preds_multinomial_nb)}")
print(f"Multinomial Naive Bayes Classifier with term frequency features: recall_score={recall_score(test_labels,preds_multinomial_nb_tf)} and preceision_score={precision_score(test_labels,preds_multinomial_nb_tf)}")

Bernoulli Naive Bayes Classifier: recall_score=0.673469387755102 and preceision_score=0.9705882352941176
Multionomial Naive Bayes Classifier with binary features: recall_score=0.8163265306122449 and preceision_score=0.975609756097561
Bernoulli Naive Bayes Classifier with term frequency features: recall_score=0.9795918367346939 and preceision_score=0.8


### N = 1000

In [254]:
preds_bernoulli = []
preds_multinomial_nb = []
preds_multinomial_nb_tf = []
for email in test_emails:
  preds_bernoulli.append(bernoulli_nb(email, df_3, total_spam, total_ham))
  preds_multinomial_nb.append(multinomial_nb(email, df_3, total_spam, total_ham))
  preds_multinomial_nb_tf.append(multinomial_nb_with_tf(email, df_3, total_spam, total_ham))

print(f"Bernoulli Naive Bayes Classifier: recall_score={recall_score(test_labels,preds_bernoulli)} and preceision_score={precision_score(test_labels,preds_bernoulli)}")
print(f"Multionomial Naive Bayes Classifier with binary features: recall_score={recall_score(test_labels,preds_multinomial_nb)} and preceision_score={precision_score(test_labels,preds_multinomial_nb)}")
print(f"Multinomial Naive Bayes Classifier with term frequency features: recall_score={recall_score(test_labels,preds_multinomial_nb_tf)} and preceision_score={precision_score(test_labels,preds_multinomial_nb_tf)}")

Bernoulli Naive Bayes Classifier: recall_score=0.5510204081632653 and preceision_score=1.0
Multionomial Naive Bayes Classifier with binary features: recall_score=0.2857142857142857 and preceision_score=1.0
Bernoulli Naive Bayes Classifier with term frequency features: recall_score=0.673469387755102 and preceision_score=0.9705882352941176


## SVM Classifier
### Using Binary Features

In [281]:
def load_data_from_path(base_path, features):
    data = {}
    for file in os.listdir(base_path):
        with open(os.path.join(base_path, file), 'r') as f:
            text = clean_data(f.read())
        unique_words = set(text.split(" "))
        feature_values = [1 if feature in unique_words else 0 for feature in features]
        class_label = 1 if file.startswith('sp') else 0
        feature_values.append(class_label)
        data[file] = feature_values
    return data

def get_data(df):
    features = list(df.index)
    base_path = 'lingspam_public/lemm_stop/'
    train_data = {}
    for i in range(1, 10):
        train_data.update(load_data_from_path(os.path.join(base_path, f'part{i}'), features))
    return train_data

def get_test(features):
    return load_data_from_path('lingspam_public/lemm_stop/part10', features[:-1])

### N = 10

In [282]:
features = list(df_1.index) + ['class']

train_data_dict = get_data(df_1)
train_data_df = pd.DataFrame.from_dict(train_data_dict, orient='index', columns=features)

test_data_dict = get_test(features)
test_data_df = pd.DataFrame.from_dict(test_data_dict, orient='index', columns=features)

X_train = train_data_df.drop('class', axis=1)
y_train = train_data_df['class']

X_test = test_data_df.drop('class', axis=1)
y_test = test_data_df['class']

# Model training and prediction
model = SVC(kernel='linear')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print(f"Precision: {precision} and Recall: {recall}")

Precision: 0.8888888888888888 and Recall: 0.8163265306122449


### N = 100

In [283]:
features = list(df_2.index) + ['class']

train_data_dict = get_data(df_2)
train_data_df = pd.DataFrame.from_dict(train_data_dict, orient='index', columns=features)

test_data_dict = get_test(features)
test_data_df = pd.DataFrame.from_dict(test_data_dict, orient='index', columns=features)

X_train = train_data_df.drop('class', axis=1)
y_train = train_data_df['class']

X_test = test_data_df.drop('class', axis=1)
y_test = test_data_df['class']

# Model training and prediction
model = SVC(kernel='linear')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print(f"Precision: {precision} and Recall: {recall}")

Precision: 0.9333333333333333 and Recall: 0.8571428571428571


### N = 1000

In [284]:
features = list(df_3.index) + ['class']

train_data_dict = get_data(df_3)
train_data_df = pd.DataFrame.from_dict(train_data_dict, orient='index', columns=features)

test_data_dict = get_test(features)
test_data_df = pd.DataFrame.from_dict(test_data_dict, orient='index', columns=features)

X_train = train_data_df.drop('class', axis=1)
y_train = train_data_df['class']

X_test = test_data_df.drop('class', axis=1)
y_test = test_data_df['class']

# Model training and prediction
model = SVC(kernel='linear')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print(f"Precision: {precision} and Recall: {recall}")

Precision: 1.0 and Recall: 0.9183673469387755


# Final Observations:

## For N = 10 features:

Bernoulli NB Classifier: recall_score=0.6326530612244898, preceision_score=0.8857142857142857

Multionomial NB Classifier with binary features: recall = 0.5714285714285714, preceision = 0.875

Multinomial NB Classifier with term frequency features: recall = 0.8163265306122449, preceision = 0.851063829787234

SVM Classifier: recall = 0.8163265306122449, precision = 0.8888888888888888


## For N = 100 features:
Bernoulli Naive Bayes Classifier: recall_score=0.673469387755102 and preceision_score=0.9705882352941176

Multionomial Naive Bayes Classifier with binary features: recall_score=0.8163265306122449 and preceision_score=0.975609756097561

Multinomial Naive Bayes Classifier with term frequency features: recall_score=0.9795918367346939 and preceision_score=0.8

SVM Classifier: recall = 0.8571428571428571, precision = 0.9333333333333333

## For N = 1000 features:
Bernoulli Naive Bayes Classifier: recall_score=0.5510204081632653 and preceision_score=1.0

Multionomial Naive Bayes Classifier with binary features: recall_score=0.2857142857142857 and preceision_score=1.0

Multinomial Naive Bayes Classifier with term frequency features: recall_score=0.673469387755102 and preceision_score=0.9705882352941176

SVM Classifier: recall = 0.9183673469387755, precision = 1.0
