# Spam Detection

In [4]:
# In the spam detection problem, there are 2 classes: C1 which is the no-spam (ham) class and C2 which is the spam class. 
# X is essentially each email present in the training data. 
# To convert X into a machine-readable form (number), we basically need to convert X into a vector. 
# We achieve it by the following way:

# Create an ordered list of all the words in the vocabulary. 
# For instance, suppose we have the following words in the vocabulary: [lottery, how, won, offer, thanks, the, you].
# To convert an email into a vector, map out the number of times each word occurs in that email. 
# For instance, consider the following email: you won the lottery. 
# The vector form of the above email would be [1, 0, 1, 0, 0, 1, 1].

# Now that we have mapped each email into a vector, we can apply the Naive Bayes algorithm on the data. 
# Observe that in the above process, we assumed that each word is produced independent of each other 
# and we discarded the ordering of words in the email. 
# This exactly is the “Naive” assumption and that’s how we plan to apply the Naive Bayes algorithm to this problem.

In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import string
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

In [6]:
# Any results you write to the current directory are saved as output.
data = pd.read_csv('spam.csv',encoding='latin-1')

data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [7]:
data = data.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
data = data.rename(columns={"v1":"class", "v2":"text"})
data.head()

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
data['length'] = data['text'].apply(len)
data.head()

Unnamed: 0,class,text,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [9]:
# check the length of each text messages to see whether it is correlated with the text classified as a spam or not.

In [10]:
def pre_process(text):
    
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = [word for word in text.split() if word.lower() not in stopwords.words('english')]
    words = ""
    for i in text:
            stemmer = SnowballStemmer("english")
            words += (stemmer.stem(i))+" "
    return words

In [11]:
textFeatures = data['text'].copy()
textFeatures = textFeatures.apply(pre_process)
vectorizer = TfidfVectorizer("english")
features = vectorizer.fit_transform(textFeatures)

features_train, features_test, labels_train, labels_test = train_test_split(features, data['class'], test_size=0.3, random_state=111)

In [12]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB(alpha=0.2)
mnb.fit(features_train, labels_train)
prediction = mnb.predict(features_test)
accuracy_score(labels_test,prediction)

0.9850478468899522

In [13]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(labels_test, prediction))

[[1429   11]
 [  14  218]]


In [14]:
from sklearn.metrics import classification_report
target_names = ['spam', 'ham']
print(classification_report(labels_test, prediction, target_names=target_names))

             precision    recall  f1-score   support

       spam       0.99      0.99      0.99      1440
        ham       0.95      0.94      0.95       232

avg / total       0.98      0.99      0.99      1672



# Building Machine Learning Classifiers: Explore Random Forest model with grid-search

In [16]:
# Build our own Grid-search
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [17]:
def train_RF(n_est, depth):
    rf = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1)
    rf_model = rf.fit(features_train, labels_train)
    labels_pred = rf_model.predict(features_test)
    precision, recall, fscore, support = score(labels_test, labels_pred, pos_label='spam', average='binary')
    print('Est: {} / Depth: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
        n_est, depth, round(precision, 3), round(recall, 3),
        round((labels_pred==labels_test).sum() / len(labels_pred), 3)))

In [18]:
for n_est in [10, 50, 100]:
    for depth in [10, 20, 30, None]:
        train_RF(n_est, depth)

Est: 10 / Depth: 10 ---- Precision: 1.0 / Recall: 0.276 / Accuracy: 0.9
Est: 10 / Depth: 20 ---- Precision: 1.0 / Recall: 0.491 / Accuracy: 0.929
Est: 10 / Depth: 30 ---- Precision: 1.0 / Recall: 0.634 / Accuracy: 0.949
Est: 10 / Depth: None ---- Precision: 0.979 / Recall: 0.797 / Accuracy: 0.969
Est: 50 / Depth: 10 ---- Precision: 1.0 / Recall: 0.147 / Accuracy: 0.882
Est: 50 / Depth: 20 ---- Precision: 1.0 / Recall: 0.534 / Accuracy: 0.935
Est: 50 / Depth: 30 ---- Precision: 1.0 / Recall: 0.69 / Accuracy: 0.957
Est: 50 / Depth: None ---- Precision: 1.0 / Recall: 0.828 / Accuracy: 0.976
Est: 100 / Depth: 10 ---- Precision: 1.0 / Recall: 0.22 / Accuracy: 0.892
Est: 100 / Depth: 20 ---- Precision: 1.0 / Recall: 0.504 / Accuracy: 0.931
Est: 100 / Depth: 30 ---- Precision: 1.0 / Recall: 0.69 / Accuracy: 0.957
Est: 100 / Depth: None ---- Precision: 1.0 / Recall: 0.828 / Accuracy: 0.976


# Building Machine Learning Classifiers: Explore Gradient Boosting model with grid-search

In [21]:
from sklearn.ensemble import GradientBoostingClassifier

In [22]:
print(dir(GradientBoostingClassifier))
print(GradientBoostingClassifier())

['_SUPPORTED_LOSS', '__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_cache', '_abc_negative_cache', '_abc_negative_cache_version', '_abc_registry', '_check_initialized', '_check_params', '_clear_state', '_decision_function', '_estimator_type', '_fit_stage', '_fit_stages', '_get_param_names', '_init_decision_function', '_init_state', '_is_initialized', '_make_estimator', '_resize_state', '_staged_decision_function', '_validate_estimator', '_validate_y', 'apply', 'decision_function', 'feature_importances_', 'fit', 'get_params', 'n_features', 'predict', 'predict_log_proba', 'predict_proba', 'score', 's

In [23]:
def train_GB(est, max_depth, lr):
    gb = GradientBoostingClassifier(n_estimators=est, max_depth=max_depth, learning_rate=lr)
    gb_model = gb.fit(features_train, labels_train)
    labels_pred = gb_model.predict(features_test)
    precision, recall, fscore, train_support = score(labels_test, labels_pred, pos_label='spam', average='binary')
    print('Est: {} / Depth: {} / LR: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
        est, max_depth, lr, round(precision, 3), round(recall, 3), 
        round((labels_pred==labels_test).sum()/len(labels_pred), 3)))

In [24]:
for n_est in [50, 100, 150]:
    for max_depth in [3, 7, 11, 15]:
        for lr in [0.01, 0.1, 1]:
            train_GB(n_est, max_depth, lr)

  'precision', 'predicted', average, warn_for)


Est: 50 / Depth: 3 / LR: 0.01 ---- Precision: 0.0 / Recall: 0.0 / Accuracy: 0.861
Est: 50 / Depth: 3 / LR: 0.1 ---- Precision: 0.958 / Recall: 0.685 / Accuracy: 0.952
Est: 50 / Depth: 3 / LR: 1 ---- Precision: 0.877 / Recall: 0.797 / Accuracy: 0.956
Est: 50 / Depth: 7 / LR: 0.01 ---- Precision: 1.0 / Recall: 0.017 / Accuracy: 0.864
Est: 50 / Depth: 7 / LR: 0.1 ---- Precision: 0.895 / Recall: 0.806 / Accuracy: 0.96
Est: 50 / Depth: 7 / LR: 1 ---- Precision: 0.885 / Recall: 0.832 / Accuracy: 0.962
Est: 50 / Depth: 11 / LR: 0.01 ---- Precision: 1.0 / Recall: 0.004 / Accuracy: 0.862
Est: 50 / Depth: 11 / LR: 0.1 ---- Precision: 0.887 / Recall: 0.815 / Accuracy: 0.96
Est: 50 / Depth: 11 / LR: 1 ---- Precision: 0.9 / Recall: 0.819 / Accuracy: 0.962
Est: 50 / Depth: 15 / LR: 0.01 ---- Precision: 1.0 / Recall: 0.004 / Accuracy: 0.862
Est: 50 / Depth: 15 / LR: 0.1 ---- Precision: 0.913 / Recall: 0.819 / Accuracy: 0.964
Est: 50 / Depth: 15 / LR: 1 ---- Precision: 0.87 / Recall: 0.81 / Accuracy: 