In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import accuracy_score
np.random.seed(42)

# Data Exploration

In [2]:
index_path = './trec06p/label/index'
data_dir = './trec06p/data/'
with open(index_path) as f:
    index_list = f.readlines()

index_dict = {}
for index in index_list:
    index = index.split()
    index_dict[index[1]] = 1 if index[0] == 'spam' else 0

corpus = []
labels = []
for key, value in index_dict.items():
    with open(os.path.join(data_dir, key)) as f:
        try:
            corpus.append(f.read())
            labels.append(value)
        except:
            pass

raw_data = pd.DataFrame({'label': labels, 'text': corpus})
raw_data.head()

Unnamed: 0,label,text
0,0,Received: from rodan.UU.NET by aramis.rutgers....
1,1,Received: from unknown (HELO groucho.cs.psu.ed...
2,1,Received:\n\tfrom 24-151-178-89.dhcp.kgpt.tn.c...
3,0,Received: from psuvax1.cs.psu.edu ([130.203.2....
4,1,Received: from 201-1-198-159.dsl.telesp.net.br...


### Toy dataset (没什么用，拿来测试模型能不能正常跑)

In [3]:
toy = [
    {"text": "Win big prizes now!", "label": "spam"},
    {"text": "Important information about your account", "label": "spam"},
    {"text": "Dear friend, I have a business proposal for you", "label": "spam"},
    {"text": "This is a normal email, nothing suspicious", "label": "not spam"},
    {"text": "Another normal message", "label": "not spam"},
    {"text": "Limited time offer, buy now and save big!", "label": "spam"}
]
toy_df = pd.DataFrame(toy, columns=["label", "text"])
toy_df['label'] = toy_df['label'].apply(lambda x: 1 if x == 'spam' else 0)
# data = toy_df # uncomment this line to use the toy dataset

# Doing things to the dataset

### 1.1 Converting to lower case

In [4]:
data = raw_data.copy()
data['text'] = data['text'].apply(lambda x: x.lower())

### 1.2 Data balancing （发现没有balance效果更好)

In [5]:
# data = data.groupby('label').apply(lambda x: x.sample(n=10000, replace=True)).reset_index(drop=True)
data.label.value_counts()

1    20030
0    12371
Name: label, dtype: int64

### 2. Train test split

In [6]:
def train_test_split(data: pd.DataFrame, test_size=0.3):
    train_df = data.sample(frac=1-test_size)
    test_df = data.drop(train_df.index)
    return train_df.reset_index(drop=True), test_df.reset_index(drop=True)

train_df, test_df = train_test_split(data, test_size=0.3)

# Feature Engineering

Using just a bunch of keywords

In [7]:
def get_features_bernoulli(text: str):
    keywords = [
        'FREE', 'free', 'earn money', 'act now', 'click here', 'buy now',
        'limited time offer', 'get rich quick', 'earn extra cash', 'make money fast',
        'guaranteed', 'winner', 'bonus', 'urgent', 'credit card', 'lowest price',
        'amazing', 'incredible deal', 'no cost', 'risk free', 'special promotion',
        'exclusive offer', 'million dollars', 'once in a lifetime', 'password',
        'account suspended', 'confidentiality', 'discount',
        'win', 'winner', 'cash', 'prize', 'exclusive', 'urgent', 'important', 
        'free', 'act now', 'offer', 'credit', 'cheap', 'bonus', 'click', 'apply', 'buy', 'limited', 'guaranteed', 'save'
    ]

    features = {f'has_{keyword.replace(" ", "_")}': text.lower().count(keyword.lower()) > 0 for keyword in keywords}
    return features

def get_features_multinomial(text: str):
    keywords = [
        'FREE', 'free', 'earn money', 'act now', 'click here', 'buy now',
        'limited time offer', 'get rich quick', 'earn extra cash', 'make money fast',
        'guaranteed', 'winner', 'bonus', 'urgent', 'credit card', 'lowest price',
        'amazing', 'incredible deal', 'no cost', 'risk free', 'special promotion',
        'exclusive offer', 'million dollars', 'once in a lifetime', 'password',
        'account suspended', 'confidentiality', 'discount',
        'win', 'winner', 'cash', 'prize', 'exclusive', 'urgent', 'important', 
        'free', 'act now', 'offer', 'credit', 'cheap', 'bonus', 'click', 'apply', 'buy', 'limited', 'guaranteed', 'save'
    ]

    features = {f'count_{keyword.replace(" ", "_")}': text.lower().count(keyword.lower()) for keyword in keywords}
    return features


Using sklearn's tfidf vectorizer

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

vectorizer = TfidfVectorizer(max_features=1000, min_df=5, max_df=0.7)

# Pre-processing

In [9]:
def preprocess(X, func):
    features = X.iloc[:,0].apply(func).apply(pd.Series)
    X = pd.concat([X, features], axis=1).drop('text', axis=1)
    return X

In [10]:
y_train = train_df['label']
y_test = test_df['label']

BernoulliNB

In [11]:
X_train_bern = preprocess(train_df.drop('label', axis=1), get_features_bernoulli)
X_test_bern = preprocess(test_df.drop('label', axis=1), get_features_bernoulli)
X_train_bern.head()

Unnamed: 0,has_FREE,has_free,has_earn_money,has_act_now,has_click_here,has_buy_now,has_limited_time_offer,has_get_rich_quick,has_earn_extra_cash,has_make_money_fast,...,has_exclusive,has_important,has_offer,has_credit,has_cheap,has_click,has_apply,has_buy,has_limited,has_save
0,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


MultinomialNB (my features)

In [12]:
X_train_multi = preprocess(train_df.drop('label', axis=1), get_features_multinomial)
X_test_multi = preprocess(test_df.drop('label', axis=1), get_features_multinomial)
X_train_multi.head()

Unnamed: 0,count_FREE,count_free,count_earn_money,count_act_now,count_click_here,count_buy_now,count_limited_time_offer,count_get_rich_quick,count_earn_extra_cash,count_make_money_fast,...,count_exclusive,count_important,count_offer,count_credit,count_cheap,count_click,count_apply,count_buy,count_limited,count_save
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


MultinomialNB (tfidf features)

In [13]:
def preprocess_tfidf(X, vectorizer):
    features = vectorizer.transform(X['text'])
    return features

# Fit the vectorizer on the training data only
vectorizer.fit(train_df['text'])

X_train_tfidf = preprocess_tfidf(train_df, vectorizer)
X_test_tfidf = preprocess_tfidf(test_df, vectorizer)

# Modelling

### 1. MultinomialNB with Tfidf

In [14]:
from naive_bayes import MultinomialNB

multi = MultinomialNB()
multi.fit(X_train_tfidf, y_train)
y_test_multi_tfidf = multi.predict(X_test_tfidf)
accuracy_score(y_test_multi_tfidf, y_test)

0.9265432098765433

### 1.2 MultinomialNB with no Tfidf and just some keywords

In [15]:
from naive_bayes import MultinomialNB

multi = MultinomialNB()
model = multi.fit(X_train_multi, y_train)
y_test_multi = multi.predict(X_test_multi)
accuracy_score(y_test_multi, y_test)

0.6309670781893004

### 2. BernoulliNB

In [16]:
from naive_bayes import BernoulliNB
# from baseline import BernoulliNB

bern = BernoulliNB()
bern.fit(X_train_bern, y_train)
y_pred_bern = bern.predict(X_test_bern)
accuracy_score(y_test, y_pred_bern)

0.6358024691358025

### 3. Sklearn baseline for comparison

In [17]:
from sklearn.naive_bayes import BernoulliNB as sk_BernoulliNB
from sklearn.naive_bayes import MultinomialNB as sk_MultinomialNB

bern_sk = sk_BernoulliNB()
bern_sk.fit(X_train_bern, y_train)
y_pred_sk = bern_sk.predict(X_test_bern)
print(accuracy_score(y_test, y_pred_sk))

multi_sk = sk_MultinomialNB()
multi_sk.fit(X_train_tfidf, y_train)
y_pred_sk = multi_sk.predict(X_test_tfidf)
print(accuracy_score(y_test, y_pred_sk))

0.6356995884773663
0.9265432098765433


For BernoulliNB, my model performs slightly better than sklearn's.
FOr MultinomialNB, my model performs identically to sklearn's

We'll be using MultinomialNB from now on, since it has the best results.

# Question 1: 5%, 50%, 100% of dataset for training

In [18]:
for size in [0.05, 0.5, 1]:
    print("Train size:", size)
    
    train_df, test_df = train_test_split(data, test_size=1-size)

    if len(test_df) == 0:
        test_df = data.copy()

    X_train = preprocess_tfidf(train_df, vectorizer)
    X_test = preprocess_tfidf(test_df, vectorizer)
    y_train = train_df['label']
    y_test = test_df['label']

    multi = MultinomialNB()
    multi.fit(X_train, y_train)
    y_pred = multi.predict(X_test)
    print(f"MultinomialNB accuracy for train size {size}: {accuracy_score(y_pred, y_test)}\n")

    # print("\n")
    

Train size: 0.05
MultinomialNB accuracy for train size 0.05: 0.9126409148500698
Train size: 0.5
MultinomialNB accuracy for train size 0.5: 0.9300043207209432
Train size: 1
MultinomialNB accuracy for train size 1: 0.9309589210209561


# K-Fold cross validation

In [19]:
from sklearn.model_selection import KFold

k = 5
iteration = 1
kf = KFold(n_splits=k, shuffle=True, random_state=42)

for train_index, test_index in kf.split(data):
    train_df = data.iloc[train_index]
    test_df = data.iloc[test_index]

    X_train = preprocess_tfidf(train_df, vectorizer)
    X_test = preprocess_tfidf(test_df, vectorizer)
    y_train = train_df['label']
    y_test = test_df['label']

    multi = MultinomialNB()
    multi.fit(X_train, y_train)
    y_pred = multi.predict(X_test)
    print(f"Iteration {iteration}: {accuracy_score(y_pred, y_test)}")
    iteration += 1


    

Iteration 1: 0.9305662706372473
Iteration 2: 0.9308641975308642
Iteration 3: 0.9325617283950617
Iteration 4: 0.9313271604938271
Iteration 5: 0.9253086419753086
