In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
import re, string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import *

In [18]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [42]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.pipeline import Pipeline

In [39]:
from sklearn.naive_bayes import MultinomialNB
import xgboost as xgb

In [3]:
# Read the raw SMS dataset, drop every column that contains a missing value,
# and give the two columns that remain readable names.
df = (
    pd.read_csv("data/spam.csv", encoding='latin-1')
    .dropna(axis=1, how="any")
)
df.columns = ['target', 'message']

df.head()

Unnamed: 0,target,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Message length measured as the number of single-space-separated pieces
df['message_len'] = [len(msg.split(' ')) for msg in df['message']]

In [5]:
# Largest message length (in space-separated pieces) found in the dataset
max(df['message_len'])

171

### Pre-process data

In [6]:
# Stop-word vocabulary: NLTK's English list extended with a few chat
# abbreviations. The corpus must be available locally; if it is missing,
# run nltk.download('stopwords') once.
add_words = ['hehe', 'im', 'c', 'u']
stop_words = stopwords.words('english') + add_words

# English Snowball stemmer shared by the preprocessing function
stemmer = SnowballStemmer('english')

def preprocess_text(text):
    """Normalise one raw message into a string of stemmed tokens.

    Steps, in order: lower-case the input; strip bracketed spans, URLs and
    angle-bracket tags; delete punctuation; turn line breaks into spaces;
    delete every token that contains a digit; drop tokens found in the
    module-level ``stop_words``; stem what is left with the module-level
    ``stemmer``.

    Parameters
    ----------
    text : object
        The raw message. It is passed through ``str()`` first, so non-string
        values are processed via their string form.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces (an empty
        string when nothing survives).
    """
    text = str(text).lower()

    # Raw strings keep the regex escapes intact and avoid Python's
    # "invalid escape sequence" warnings for the bracket, \S, \w and \d escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # A line break separates words: replace it with a space instead of
    # deleting it, otherwise the neighbouring words are fused together.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)

    # Split on any whitespace run so the removals above do not leave empty
    # tokens (and doubled spaces) behind.
    # NOTE(review): punctuation is stripped before this filter, so any
    # stop-word entry that contains an apostrophe can no longer match
    # (e.g. "don't" has already become "dont") - confirm whether intended.
    tokens = [word for word in text.split() if word not in stop_words]

    return ' '.join(stemmer.stem(word) for word in tokens)

In [7]:
# Run every raw message through the cleaning function and preview the result
df['message_clean'] = [preprocess_text(raw_message) for raw_message in df['message']]
df.head()

Unnamed: 0,target,message,message_len,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",20,go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,6,ok lar joke wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28,free entri wkli comp win fa cup final tkts m...
3,ham,U dun say so early hor... U c already then say...,11,dun say earli hor alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",13,nah dont think goe usf live around though


In [13]:
# Turn the string labels into integer class codes for the classifiers
le = LabelEncoder().fit(df['target'])
df['target_encoded'] = le.transform(df['target'])

df.head()

Unnamed: 0,target,message,message_len,message_clean,target_encoded
0,ham,"Go until jurong point, crazy.. Available only ...",20,go jurong point crazi avail bugi n great world...,0
1,ham,Ok lar... Joking wif u oni...,6,ok lar joke wif oni,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28,free entri wkli comp win fa cup final tkts m...,1
3,ham,U dun say so early hor... U c already then say...,11,dun say earli hor alreadi say,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",13,nah dont think goe usf live around though,0


In [22]:
# Model input is the cleaned text; the label is the integer-encoded target
X, y = df['message_clean'], df['target_encoded']

print(len(X), len(y))

5572 5572


In [23]:
# Hold out a test set. No test_size is passed, so scikit-learn's default
# proportion applies; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

print(f"{len(X_train)} {len(y_train)}")
print(f"{len(X_test)} {len(y_test)}")

4179 4179
1393 1393


In [28]:
# Bag-of-words vocabulary: unigrams and bigrams, capped at 100 features,
# learned from the training messages only.
vect = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=100,
)
vect.fit(X_train)

CountVectorizer(max_features=100, ngram_range=(1, 2), stop_words='english')

In [29]:
# Encode both splits as document-term count matrices using the fitted vocabulary
X_train_dtm, X_test_dtm = (vect.transform(messages) for messages in (X_train, X_test))

In [31]:
# TF-IDF: learn the IDF weights from the training counts, then re-weight them.
# Within the visible notebook this matrix is only inspected; the models below
# are trained on X_train_dtm or on pipelines that start from raw text.
tfidf_transformer = TfidfTransformer().fit(X_train_dtm)
X_train_tfidf = tfidf_transformer.transform(X_train_dtm)


In [32]:
# Show the summary of the TF-IDF matrix (shape, dtype, stored elements)
X_train_tfidf

<4179x100 sparse matrix of type '<class 'numpy.float64'>'
	with 9160 stored elements in Compressed Sparse Row format>

In [33]:
# TODO (later): try GloVe word embeddings as an alternative text representation

### Naive Bayes, Bag of Words

In [40]:
# Multinomial Naive Bayes with default settings, trained on the raw
# document-term counts (not the TF-IDF matrix)
NB = MultinomialNB()
NB.fit(X_train_dtm, y_train)

MultinomialNB()

In [59]:
# Hard class predictions for the held-out count matrix
y_pred = NB.predict(X_test_dtm)
# Second predict_proba column = predicted probability of class 1
y_pred_prob = NB.predict_proba(X_test_dtm)[:,1]

In [52]:
# Share of test messages whose predicted class matches the true class
print(metrics.accuracy_score(y_test, y_pred))

0.9361091170136396


In [53]:
# Confusion matrix: rows are the true classes, columns the predicted classes
print(metrics.confusion_matrix(y_test, y_pred))

[[1180   22]
 [  67  124]]


In [54]:
# Per-class precision, recall and F1 for the bag-of-words Naive Bayes predictions
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.98      0.96      1202
           1       0.85      0.65      0.74       191

    accuracy                           0.94      1393
   macro avg       0.90      0.82      0.85      1393
weighted avg       0.93      0.94      0.93      1393



In [61]:
# ROC AUC is computed from the class-1 probabilities, not the hard predictions
print(metrics.roc_auc_score(y_test, y_pred_prob))

0.9340475298586126


### Naive Bayes, TF-IDF

In [63]:
# Raw text -> token counts -> TF-IDF weights -> multinomial Naive Bayes,
# wrapped as a single estimator so every step is fitted on training data only.
# NOTE: unlike `vect` above, this CountVectorizer uses its default settings
# (no stop-word list, n-gram range or feature cap is passed), so the two
# Naive Bayes results differ in more than the TF-IDF weighting.
pipeline = Pipeline([('bow', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('model', MultinomialNB())])

In [70]:
# Fit every pipeline step on the training messages
pipeline.fit(X_train, y_train)

# Overwrites y_pred / y_pred_prob from the previous model with this
# pipeline's class predictions and class-1 probabilities
y_pred = pipeline.predict(X_test)
y_pred_prob = pipeline.predict_proba(X_test)[:,1]

In [71]:
# Per-class precision, recall and F1 for whichever predictions y_pred currently holds
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1202
           1       1.00      0.71      0.83       191

    accuracy                           0.96      1393
   macro avg       0.98      0.85      0.90      1393
weighted avg       0.96      0.96      0.96      1393



In [72]:
# ROC AUC from the class-1 probabilities currently held in y_pred_prob
print(metrics.roc_auc_score(y_test, y_pred_prob))

0.9745624656985303


### XGBoost, TF-IDF

In [80]:
# Same text -> counts -> TF-IDF front end, now feeding a gradient-boosted
# tree classifier. Rebinds `pipeline`, replacing the Naive Bayes pipeline.
pipeline = Pipeline(steps=[
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', xgb.XGBClassifier(
        n_estimators=100,
        max_depth=7,
        learning_rate=0.1,
        eval_metric='auc',
        use_label_encoder=False,
    )),
])

In [81]:
# Fit the XGBoost pipeline on the training messages (the fitted estimator is displayed)
pipeline.fit(X_train, y_train)

Pipeline(steps=[('bow', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('model',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, enable_categorical=False,
                               eval_metric='auc', gamma=0, gpu_id=-1,
                               importance_type=None, interaction_constraints='',
                               learning_rate=0.1, max_delta_step=0, max_depth=7,
                               min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=4, num_parallel_tree=1, predictor='auto',
                               random_state=0, reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=1, subsample=1,
                               tree_method='exact', use_label_encoder=False,
                  

In [82]:
# Class predictions and class-1 probabilities from the XGBoost pipeline;
# these overwrite the values produced by the earlier models
y_pred = pipeline.predict(X_test)
y_pred_prob = pipeline.predict_proba(X_test)[:,1]

In [83]:
# Per-class precision, recall and F1 for whichever predictions y_pred currently holds
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1202
           1       0.98      0.77      0.87       191

    accuracy                           0.97      1393
   macro avg       0.97      0.89      0.92      1393
weighted avg       0.97      0.97      0.97      1393



In [85]:
# ROC AUC from the class-1 probabilities currently held in y_pred_prob
print(metrics.roc_auc_score(y_test, y_pred_prob))

0.977243425007187
