In [74]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [75]:
# The SMS Spam Collection file is tab-separated and has no header row, so the two
# column names are supplied here. The path is relative to the working directory.
df = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['label', 'sms_message'])

# Show the first five rows as a quick sanity check
df.head(5)

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Preprocessing

In [76]:
# Convert the values in the 'label' column to numerical values (ham -> 0, spam -> 1).
# Series.map turns every value missing from the mapping into NaN, so re-running this
# cell on the already-encoded column would wipe out all labels. Skip the mapping once
# the column holds only the encoded values, which makes the cell safe to re-run.
label_codes = {'ham': 0, 'spam': 1}
if not df['label'].isin(list(label_codes.values())).all():
    df['label'] = df['label'].map(label_codes)

In [77]:
# Re-inspect the first rows now that the 'label' column has been mapped to numbers
df.head()

Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [78]:
# (number of rows, number of columns) of the full dataset
df.shape

(5572, 2)

## Train and Test Sets

In [79]:
# Hold out a test set. The split proportions are train_test_split's defaults, and
# random_state=1 fixes which rows land on each side.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df['sms_message'], df['label'], random_state=1
)

# Report the size of each piece
print(f'Number of rows in the total set: {len(df)}')
print(f'Number of rows in the training set: {len(X_train)}')
print(f'Number of rows in the test set: {len(X_test)}')

Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393


## Applying Bag of Words

In [80]:
# Instantiate the CountVectorizer (bag-of-words encoder) with its default settings
count_vector = CountVectorizer()

# Fit on the training messages only and return their document-term matrix.
# Fitting on X_train alone keeps the test messages out of the learned vocabulary.
training_data = count_vector.fit_transform(X_train)

# Encode the test messages with the already-fitted vectorizer (transform only, no refit)
testing_data = count_vector.transform(X_test)

## Machine Learning Models

In [81]:
# Import the Bagging, RandomForest, and AdaBoost Classifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier , AdaBoostClassifier

In [82]:
# Seed shared by the stochastic ensembles below so that re-running the notebook
# reproduces the same scores; 1 matches the random_state used for the train/test split.
RANDOM_STATE = 1

# Instantiate a Multinomial Naive Bayes classifier with default values
# (it has no random component, so it takes no seed)
naive_bayes = MultinomialNB()

# Instantiate a BaggingClassifier with:
# 200 weak learners (n_estimators), a fixed seed, and everything else as default values
bagging = BaggingClassifier(n_estimators=200, random_state=RANDOM_STATE)


# Instantiate a RandomForestClassifier with:
# 200 weak learners (n_estimators), a fixed seed, and everything else as default values
randomforest = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE)

# Instantiate an AdaBoostClassifier with:
# 300 weak learners (n_estimators), a learning_rate of 0.2, and a fixed seed
adaboost = AdaBoostClassifier(n_estimators=300, learning_rate=0.2, random_state=RANDOM_STATE)

In [83]:
# Train every classifier on the same bag-of-words training matrix and labels.
# Fit the Naive Bayes classifier to the training data
naive_bayes.fit(training_data, y_train)
# Fit the BaggingClassifier to the training data
bagging.fit(training_data, y_train)

# Fit the RandomForestClassifier to the training data
randomforest.fit(training_data, y_train)

# Fit the AdaBoostClassifier to the training data.
# This is the cell's last expression, so the notebook echoes the fitted estimator's repr.
adaboost.fit(training_data, y_train)

AdaBoostClassifier(learning_rate=0.2, n_estimators=300)

In [84]:
# Score the held-out test matrix with each fitted classifier, in the same order as
# the names on the left: Naive Bayes, Bagging, Random Forest, AdaBoost.
(naivebayes_predictions,
 bagging_predictions,
 randomforest_predictions,
 adaboost_predictions) = (
    classifier.predict(testing_data)
    for classifier in (naive_bayes, bagging, randomforest, adaboost)
)

In [85]:
def print_metrics(y_true, preds, model_name=None):
    '''
    Print the accuracy, precision, recall, and F1 score of a set of predictions.

    INPUT:
    y_true - the y values that are actually true in the dataset (NumPy array or pandas series)
    preds - the predictions for those values from some model (NumPy array or pandas series)
    model_name - (str - optional) a name associated with the model if you would like to add it to the print statements

    OUTPUT:
    None - prints the accuracy, precision, recall, and F1 score, followed by blank lines
    '''
    # Identity check: `== None` can be hijacked by a custom __eq__, `is None` cannot.
    if model_name is None:
        labels = ('Accuracy score: ',
                  'Precision score: ',
                  'Recall score: ',
                  'F1 score: ')
    else:
        labels = ('Accuracy score for ' + model_name + ' :',
                  'Precision score ' + model_name + ' :',
                  'Recall score ' + model_name + ' :',
                  'F1 score ' + model_name + ' :')

    # Same order as the labels above; each metric is evaluated right before it is printed.
    metrics = (accuracy_score, precision_score, recall_score, f1_score)
    for label, metric in zip(labels, metrics):
        print(label, format(metric(y_true, preds)))

    # Blank lines separate one model's report from the next.
    print('\n\n')

In [86]:
# Print Naive Bayes scores (labelled as such so they are not mistaken for the Bagging results)
print_metrics(y_test, naivebayes_predictions, 'NaiveBayes')

# Print Bagging scores
print_metrics(y_test, bagging_predictions, 'Bagging')

# Print Random Forest scores
print_metrics(y_test, randomforest_predictions, 'RandomForest')

# Print AdaBoost scores
print_metrics(y_test, adaboost_predictions, 'AdaBoost')

Accuracy score for Baggi : 0.9885139985642498
Precision score Baggi : 0.9720670391061452
Recall score Baggi : 0.9405405405405406
F1 score Baggi : 0.9560439560439562



Accuracy score for Bagging : 0.9741564967695621
Precision score Bagging : 0.9116022099447514
Recall score Bagging : 0.8918918918918919
F1 score Bagging : 0.9016393442622951



Accuracy score for RandomForest : 0.9842067480258435
Precision score RandomForest : 1.0
Recall score RandomForest : 0.8810810810810811
F1 score RandomForest : 0.9367816091954023



Accuracy score for AdaBoost : 0.9770279971284996
Precision score AdaBoost : 0.9693251533742331
Recall score AdaBoost : 0.8540540540540541
F1 score AdaBoost : 0.9080459770114943





### Spam detection needs high precision (few legitimate messages flagged as spam). RandomForest has the highest precision score.