### Import libraries

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.svm import SVC
import tests as t
%matplotlib inline

### Wrangling

In [22]:
# Read the tab-separated SMS collection without treating any row as a header,
# then give the two positional columns (0 and 1) descriptive names.
df = pd.read_csv(
    '../Data Files/SMSSpamCollection',
    sep='\t',
    header=None,
).rename(columns={0: 'label', 1: 'sms_message'})

def convert(x):
    '''
    Encode an SMS label as an integer.

    Returns 0 when the label equals 'ham' and 1 for any other value
    (in this dataset, the spam messages).
    '''
    return 0 if x == 'ham' else 1

# Swap every text label for its integer code produced by convert()
df['label'] = df['label'].apply(convert)

# Hold out part of the messages for testing; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(df['sms_message'], df['label'], random_state=1)

# Bag-of-words encoder for the raw message text
count_vector = CountVectorizer()

# Fit the encoder on the training messages only and encode them in the same call
X_train = count_vector.fit_transform(X_train)

# Encode the test messages with the already-fitted encoder.
# The encoder is deliberately NOT re-fitted on the test data.
X_test = count_vector.transform(X_test)

### Instantiating the models

In [24]:
# The three ensemble models below are stochastic (bootstrap sampling / random
# feature selection). Each one gets a fixed random_state so that a fresh
# "Restart & Run All" reproduces the same scores; the value matches the
# random_state already used for train_test_split.

# Instantiate a BaggingClassifier with:
# 200 weak learners (n_estimators), a fixed seed, and everything else as default values
bagging_model = BaggingClassifier(n_estimators=200, random_state=1)


# Instantiate a RandomForestClassifier with:
# 200 weak learners (n_estimators), a fixed seed, and everything else as default values
rfr_model = RandomForestClassifier(n_estimators=200, random_state=1)

# Instantiate an AdaBoostClassifier with:
# 300 weak learners (n_estimators), a learning_rate of 0.2, and a fixed seed
ada_model = AdaBoostClassifier(n_estimators=300, learning_rate=0.2, random_state=1)

# Instantiate SVM Classifier
svm_model = SVC(gamma='auto', C=10)

# Instantiate Naive Bayes
naive_bayes = MultinomialNB()

### Fitting the models

In [25]:
# Train the Bagging, Random Forest, AdaBoost and SVM models, in that order,
# on the training features and training labels
for _estimator in (bagging_model, rfr_model, ada_model, svm_model):
    _estimator.fit(X_train, y_train)

# Naive Bayes is trained last and left as the cell's final expression
naive_bayes.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [30]:
# The models you fit above were fit on which data?

# Candidate answers, one per letter
a, b, c = 'X_train', 'X_test', 'y_train'
d, e, f = 'y_test', 'training_data', 'testing_data'

# Keep only the names of the values that were passed to the models' fit()
# calls above: the training features (a) and the training labels (c).
models_fit_on = set((a, c))

# Checks your solution - don't change this
t.test_one(models_fit_on)

AttributeError: module 'test' has no attribute 'test_one'

### Make Predictions

In [26]:
# Score the held-out test matrix with every fitted model.
# Order: Bagging, Random Forest, AdaBoost, Naive Bayes, SVM.
(bagging_predictions,
 rfr_predictions,
 ada_predictions,
 naive_predictions,
 svm_predictions) = [
    fitted.predict(X_test)
    for fitted in (bagging_model, rfr_model, ada_model, naive_bayes, svm_model)
]

In [27]:
def print_metrics(y_true, preds, model_name=None):
    '''
    Print accuracy, precision, recall and F1 score for a set of predictions.

    INPUT:
    y_true - the y values that are actually true in the dataset (NumPy array or pandas series)
    preds - the predictions for those values from some model (NumPy array or pandas series)
    model_name - (str - optional) a name associated with the model; when given,
                 it is included in every printed line

    OUTPUT:
    None - prints the four scores, one per line, followed by blank lines
    '''
    # (label used without a model name, label prefix used with one, metric function).
    # The accuracy prefix contains "for" while the others do not; this mirrors
    # the text the notebook has always printed, so the output stays unchanged.
    metrics = (
        ('Accuracy score', 'Accuracy score for ', accuracy_score),
        ('Precision score', 'Precision score ', precision_score),
        ('Recall score', 'Recall score ', recall_score),
        ('F1 score', 'F1 score ', f1_score),
    )

    for plain_label, named_prefix, metric in metrics:
        value = format(metric(y_true, preds))
        # Identity check: None is a singleton, so `is` is the correct comparison
        if model_name is None:
            print(plain_label + ': ', value)
        else:
            print(named_prefix + model_name + ' :', value)

    # Blank lines separate one model's report from the next
    print('\n\n')

# Report the test-set scores of each model, one report per call

# Bagging
print_metrics(y_true=y_test, preds=bagging_predictions, model_name='Bagging Classifier')

# Random Forest
print_metrics(y_true=y_test, preds=rfr_predictions, model_name='Random Forest Classifier')

# AdaBoost
print_metrics(y_true=y_test, preds=ada_predictions, model_name='Adaboost Classifier')

# Naive Bayes
print_metrics(y_true=y_test, preds=naive_predictions, model_name='Naive Bayes Classifier')

# SVM
print_metrics(y_true=y_test, preds=svm_predictions, model_name='SVM Classifier')

Accuracy score for Bagging Classifier : 0.9741564967695621
Precision score Bagging Classifier : 0.9116022099447514
Recall score Bagging Classifier : 0.8918918918918919
F1 score Bagging Classifier : 0.9016393442622951



Accuracy score for Random Forest Classifier : 0.9806173725771715
Precision score Random Forest Classifier : 1.0
Recall score Random Forest Classifier : 0.8540540540540541
F1 score Random Forest Classifier : 0.9212827988338192



Accuracy score for Adaboost Classifier : 0.9770279971284996
Precision score Adaboost Classifier : 0.9693251533742331
Recall score Adaboost Classifier : 0.8540540540540541
F1 score Adaboost Classifier : 0.9080459770114943



Accuracy score for Naive Bayes Classifier : 0.9885139985642498
Precision score Naive Bayes Classifier : 0.9720670391061452
Recall score Naive Bayes Classifier : 0.9405405405405406
F1 score Naive Bayes Classifier : 0.9560439560439562



Accuracy score for SVM Classifier : 0.9468772433596554
Precision score SVM Classifier : 0.9

- 'We have imbalanced classes, which metric do we definitely not want to use?': accuracy
- 'We really want to make sure the positive cases are all caught even if that means we identify some negatives as positives': recall (penalizes false negatives)
- 'When we identify something as positive, we want to be sure it is truly positive': precision
- 'We care equally about identifying positive and negative cases': f1-score 

##### Use the answers you found to the previous questions, then match the model that did best for each metric
a = "naive-bayes"
b = "bagging"
c = "random-forest"
d = 'ada-boost'
e = "svm"

- 'We have imbalanced classes, which metric do we definitely not want to use?': a,
- 'We really want to make sure the positive cases are all caught even if that means we identify some negatives as positives': a,
- 'When we identify something as positive, we want to be sure it is truly positive': c, 
- 'We care equally about identifying positive and negative cases': a  
