In [1]:
import pandas as pd
import numpy as np
import os
np.random.seed(42)

# Data Exploration

In [2]:
index_path = './trec06p/label/index'
data_dir = './trec06p/data/'
with open(index_path) as f:
    index_list = f.readlines()

index_dict = {}
for index in index_list:
    index = index.split()
    index_dict[index[1]] = 1 if index[0] == 'spam' else 0

corpus = []
labels = []
for key, value in index_dict.items():
    with open(os.path.join(data_dir, key)) as f:
        try:
            corpus.append(f.read())
            labels.append(value)
        except:
            pass

data = pd.DataFrame({'label': labels, 'text': corpus})
data.head()

Unnamed: 0,label,text
0,0,Received: from rodan.UU.NET by aramis.rutgers....
1,1,Received: from unknown (HELO groucho.cs.psu.ed...
2,1,Received:\n\tfrom 24-151-178-89.dhcp.kgpt.tn.c...
3,0,Received: from psuvax1.cs.psu.edu ([130.203.2....
4,1,Received: from 201-1-198-159.dsl.telesp.net.br...


In [3]:
emails = [
    {"text": "Win big prizes now!", "label": "spam"},
    {"text": "Important information about your account", "label": "spam"},
    {"text": "Dear friend, I have a business proposal for you", "label": "spam"},
    {"text": "This is a normal email, nothing suspicious", "label": "not spam"},
    {"text": "Another normal message", "label": "not spam"},
    {"text": "Limited time offer, buy now and save big!", "label": "spam"}
]
emails_df = pd.DataFrame(emails, columns=["label", "text"])
emails_df['label'] = emails_df['label'].apply(lambda x: 1 if x == 'spam' else 0)
data = emails_df

In [4]:
data.label.value_counts()

1    4
0    2
Name: label, dtype: int64

### Converting to lower case

In [5]:
# data['text'] = data['text'].apply(lambda x: x.lower())

### Data balancing

In [6]:
# balanced_data = data.groupby('label').apply(lambda x: x.sample(n=10000, replace=True)).reset_index(drop=True)
# balanced_data.label.value_counts()

balanced_data = data

### Train test split

In [7]:
train_df = balanced_data.sample(frac=0.7)
test_df = balanced_data.drop(train_df.index)

In [8]:
train_df

Unnamed: 0,label,text
0,1,Win big prizes now!
1,1,Important information about your account
5,1,"Limited time offer, buy now and save big!"
2,1,"Dear friend, I have a business proposal for you"


# Feature Engineering

"earn money,” “act now,” “click here,” “buy now,” “limited time offer,” “get rich quick,” “earn extra cash,” “make money fast,” “guaranteed,” “winner,” “bonus,” and “urgent.

In [9]:
def extract_features(text: str):
    # List of keywords to check in the text
    keywords = [
        # 'FREE', 'free', 'earn money', 'act now', 'click here', 'buy now',
        # 'limited time offer', 'get rich quick', 'earn extra cash', 'make money fast',
        # 'guaranteed', 'winner', 'bonus', 'urgent', 'credit card', 'lowest price',
        # 'amazing', 'incredible deal', 'no cost', 'risk free', 'special promotion',
        # 'exclusive offer', 'million dollars', 'once in a lifetime', 'password',
        # 'account suspended', 'confidentiality', 'discount',
        # 'win', 'winner', 'cash', 'prize', 'exclusive', 'urgent', 'important', 
        'free', 'act now', 'offer', 'credit', 'cheap', 'bonus', 'click', 'apply', 'buy', 'limited', 'guaranteed', 'save'
    ]

    # Create a dictionary of features based on the presence of keywords
    features = {f'has_{keyword.replace(" ", "_").lower()}': text.lower().count(keyword.lower()) > 0 for keyword in keywords}

    return features


# Pre-processing

In [10]:
def preprocess(X):
    features = X.iloc[:,0].apply(extract_features).apply(pd.Series)
    X = pd.concat([X, features], axis=1).drop('text', axis=1)
    return X

X_train = preprocess(train_df.drop('label', axis=1))
X_test = preprocess(test_df.drop('label', axis=1))
y_train = train_df['label']
y_test = test_df['label']

In [11]:
X_train.sum()

has_free          0
has_act_now       0
has_offer         1
has_credit        0
has_cheap         0
has_bonus         0
has_click         0
has_apply         0
has_buy           1
has_limited       1
has_guaranteed    0
has_save          1
dtype: int64

In [15]:
X_train

Unnamed: 0,has_free,has_act_now,has_offer,has_credit,has_cheap,has_bonus,has_click,has_apply,has_buy,has_limited,has_guaranteed,has_save
0,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False
5,False,False,True,False,False,False,False,False,True,True,False,True
2,False,False,False,False,False,False,False,False,False,False,False,False


# Modelling

In [12]:
from naive_bayes import BernoulliNB

myBNB = BernoulliNB(log_likelihood=True)
myBNB.fit(X_train, y_train)
y_pred = myBNB.predict(X_test)
myBNB.score(X_test, y_test)

Model Parameters:
Log Likelihood:  True


0.0

In [13]:
y_test

3    0
4    0
Name: label, dtype: int64

In [14]:
y_pred

[1, 1]