Todo: Create a class for a classifier/model, call train method to update the weights (class attributes), predict should also be a method

In [1]:
import pandas as pd
import numpy as np
from naive_bayes import *
import os

### Data Processing

In [2]:
index_path = './trec06p/label/index'
data_dir = './trec06p/data/'
with open(index_path) as f:
    index_list = f.readlines()

index_dict = {}
for index in index_list:
    index = index.split()
    index_dict[index[1]] = 1 if index[0] == 'spam' else 0

corpus = []
labels = []
for key, value in index_dict.items():
    with open(os.path.join(data_dir, key)) as f:
        try:
            corpus.append(f.read())
            labels.append(value)
        except:
            pass

data = pd.DataFrame({'label': labels, 'text': corpus})
data.head()

Unnamed: 0,label,text
0,0,Received: from rodan.UU.NET by aramis.rutgers....
1,1,Received: from unknown (HELO groucho.cs.psu.ed...
2,1,Received:\n\tfrom 24-151-178-89.dhcp.kgpt.tn.c...
3,0,Received: from psuvax1.cs.psu.edu ([130.203.2....
4,1,Received: from 201-1-198-159.dsl.telesp.net.br...


In [3]:
data.label.value_counts()

1    20030
0    12371
Name: label, dtype: int64

Looks like we need to do some balancing

In [4]:
balanced_data = data.groupby('label').apply(lambda x: x.sample(n=10000)).reset_index(drop=True)
balanced_data.label.value_counts()

0    10000
1    10000
Name: label, dtype: int64

Train test split

In [5]:
train_df = balanced_data.sample(frac=0.8)
test_df = balanced_data.drop(train_df.index)

### Feature Engineering

"earn money,” “act now,” “click here,” “buy now,” “limited time offer,” “get rich quick,” “earn extra cash,” “make money fast,” “guaranteed,” “winner,” “bonus,” and “urgent.

In [6]:

def extract_features(text: str):
    features = {
        'has_FREE': text.count('FREE') > 0,
        'has_earn_money': text.count('earn money') > 0,
        'has_act_now': text.count('act now') > 0,
        'has_click_here': text.count('click here') > 0,
        'has_buy_now': text.count('buy now') > 0,
        'has_limited_time_offer': text.count('limited time offer') > 0,
        'has_get_rich_quick': text.count('get rich quick') > 0,
        'has_earn_extra_cash': text.count('earn extra cash') > 0,
        'has_make_money_fast': text.count('make money fast') > 0,
        'has_guaranteed': text.count('guaranteed') > 0,
        'has_winner': text.count('winner') > 0,
        'has_bonus': text.count('bonus') > 0,
        'has_urgent': text.count('urgent') > 0
    }
    return features

def preprocess(X):
    features = X.iloc[:,0].apply(extract_features).apply(pd.Series)
    X = pd.concat([X, features], axis=1)
    return X

X_train = preprocess(train_df.drop('label', axis=1))
X_test = preprocess(test_df.drop('label', axis=1))
y_train = train_df['label']
y_test = test_df['label']

### Modelling

In [7]:
from naive_bayes import *

In [8]:
myBNB = bernoulliNB()
myBNB.fit(X_train, y_train)
y_pred = myBNB.predict(X_test)
myBNB.score(X_test, y_test)

0.51875