In [2]:
import nltk
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

import matplotlib.pyplot as plt
import re
import os
import gensim
import pandas as pd
import numpy as np

from sklearn.naive_bayes import GaussianNB # import naive bayes
from sklearn.naive_bayes import MultinomialNB# import naive bayes
from sklearn.tree import DecisionTreeClassifier # import Decision Tree
from sklearn.ensemble import RandomForestClassifier # import random forest

# Import File

In [3]:
data = pd.read_csv("Spam Email.csv", usecols=["CATEGORY", "MESSAGE"])

# Data Preprocessing (taken directly from Lab 3)

In [4]:
# remove non alphabetical characters
remove_non_alphabets = lambda x: re.sub(r'[^a-zA-Z]',' ',x)

# tokenn alphabets-only list
tokenize = lambda x: word_tokenize(x)

# assign Porter Stemmer to a lambda function to run on each line of value
Porter_Stemmer = PorterStemmer()
stem = lambda w: [ Porter_Stemmer.stem(x) for x in w ]

# assign lemmatizer to a lambda function to run on each line of value
lemmatizer = WordNetLemmatizer()
lemmatizer_2 = lambda x: [ lemmatizer.lemmatize(word) for word in x ]

# apply all above methods to the MESSAGE column
data['MESSAGE'] = data['MESSAGE'].apply(remove_non_alphabets)
data['MESSAGE'] = data['MESSAGE'].apply(tokenize)
data['MESSAGE'] = data['MESSAGE'].apply(stem)
data['MESSAGE'] = data['MESSAGE'].apply(lemmatizer_2)
data['MESSAGE'] = data['MESSAGE'].apply(lambda x: ' '.join(x))
data.head()

Unnamed: 0,CATEGORY,MESSAGE
0,1,dear homeown interest rate are at their lowest...
1,1,attent thi is a must for all comput user new s...
2,1,thi is a multi part messag in mime format next...
3,1,import inform the new domain name are final av...
4,1,thi is the bottom line If you can give away CD...


# Split data into simple training and test sets, per the HW instructions (taken directly from Lab 3)

In [5]:
# split into 30 percent test data and 70 percent training data
train_corpus, test_corpus, train_labels, test_labels = train_test_split(data["MESSAGE"], data["CATEGORY"], test_size=0.3)

# Creation of features for Machine Learning models (taken directly from Lab 3)

## Bag of Words ("BoW" for short; "binary", in the words of the homework assignment)

In [6]:
# build bag of words features' vectorizer and get features
BoW_vectorizer=CountVectorizer(min_df=1, ngram_range=(1,1))
BoW_train_features = BoW_vectorizer.fit_transform(train_corpus)
BoW_test_features = BoW_vectorizer.transform(test_corpus)

## TF-IDF

In [7]:
# build tfidf features' vectorizer and get features
tfidf_vectorizer=TfidfVectorizer(min_df=1, norm='l2', smooth_idf=True, use_idf=True, ngram_range=(1,1))

tfidf_train_features = tfidf_vectorizer.fit_transform(train_corpus)  
tfidf_test_features = tfidf_vectorizer.transform(test_corpus)

## Word2Vec ("frequency", in the words of the homework assignment)

In [8]:
# tokenize documents for word2vec
tokenized_train = [nltk.word_tokenize(text)
                   for text in train_corpus]
tokenized_test = [nltk.word_tokenize(text)
                   for text in test_corpus]  

# build word2vec model                   
wv_model = gensim.models.Word2Vec(tokenized_train,
                               vector_size=200,                          #set the size or dimension for the word vectors 
                               window=60,                        #specify the length of the window of words taken as context
                               min_count=10)                   #ignores all words with total frequency lower than 10

def average_word_vectors(words, model, vocabulary, num_features):
    
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model.wv[word])
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector 
   

def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index_to_key)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

# averaged word vector features from word2vec
avg_wv_train_features = averaged_word_vectorizer(corpus=tokenized_train,
                                                 model=wv_model,
                                                 num_features=200)                   
avg_wv_test_features = averaged_word_vectorizer(corpus=tokenized_test,
                                                model=wv_model,
                                                num_features=200)

# Define a function for training and testing models (taken directly from Lab 3, with a minor modification)

In [9]:
# define a function that trains the model, performs predictions and evaluates the predictions
def train_predict_model(classifier, train_features, train_labels, test_features):
    
    # build model    
    classifier.fit(train_features, train_labels)
    
    # predict using model and return predictions
    predictions = classifier.predict(test_features) 
    return predictions

# Run Models on Bag of Words Features

## Naive Bayes model ("NB" for short)

### Train and test model (taken directly from Lab 3)

In [12]:
# assign naive bayes function to an object
mnb = MultinomialNB()

# predict and evaluate naive bayes
mnb_BoW_predictions = train_predict_model(classifier = mnb, train_features = BoW_train_features, train_labels = train_labels,
                                           test_features = BoW_test_features)

### Analyze Confusion Matrix

In [13]:
#Count the predicted one and zeros and the actual ones and zeros
#confusion_matrix(y_true, y_pred)

#Note from sci-k itlearn.org:
#Thus in binary classification, the count of true negatives is M[0,0], false negatives is M[1,0], 
    #true positives is M[1,1], and false positives is M[0,1] .
NB_BoW_confusion_matrix = confusion_matrix(test_labels, mnb_BoW_predictions)

NB_BoW_true_positive = NB_BoW_confusion_matrix[1,1]
NB_BoW_true_negative = NB_BoW_confusion_matrix[0,0]
NB_BoW_false_positive = NB_BoW_confusion_matrix[0,1]
NB_BoW_false_negative = NB_BoW_confusion_matrix[1,0]

print('This is the Confusion Matrix for the Naive Bayes model with Bag of Words features:\n', NB_BoW_confusion_matrix)

#Calculate positive precision
NB_BoW_positive_precision = round(NB_BoW_true_positive / (NB_BoW_true_positive + NB_BoW_false_positive), 2)
print('\nThe positive precision for the Naive Bayes model with Bag of Words features is', NB_BoW_positive_precision)

#Calculate negative precision
NB_BoW_negative_precision = round(NB_BoW_true_negative / (NB_BoW_true_negative + NB_BoW_false_negative), 2)
print('The negative precision for the Naive Bayes model with Bag of Words features is', NB_BoW_negative_precision)

#Calculate positive recall
NB_BoW_positive_recall = round(NB_BoW_true_positive / (NB_BoW_true_positive + NB_BoW_false_negative), 2)
print('The positive recall for the Naive Bayes model with Bag of Words features is', NB_BoW_positive_recall)

#Calculate negative recall
NB_BoW_negative_recall = round(NB_BoW_true_negative / (NB_BoW_true_negative + NB_BoW_false_positive), 2)
print('The negative recall for the Naive Bayes model with Bag of Words features is', NB_BoW_negative_recall)

This is the Confusion Matrix for the Naive Bayes model with Bag of Words features:
 [[1164    6]
 [ 130  439]]

The positive precision for the Naive Bayes model with Bag of Words features is 0.99
The negative precision for the Naive Bayes model with Bag of Words features is 0.9
The positive recall for the Naive Bayes model with Bag of Words features is 0.77
The negative recall for the Naive Bayes model with Bag of Words features is 0.99


## Decision Tree ("DT" for short)

### Train and test model (taken directly from Lab 3)

In [14]:
# assign decision tree function to an object
DT = DecisionTreeClassifier()

# predict and evaluate decision tree
DT_BoW_predictions = train_predict_model(classifier=DT, train_features=BoW_train_features, train_labels=train_labels,
                                         test_features=BoW_test_features)

### Analyze Confusion Matrix

In [15]:
#Count the predicted one and zeros and the actual ones and zeros
#confusion_matrix(y_true, y_pred)

#Note from sci-k itlearn.org:
#Thus in binary classification, the count of true negatives is M[0,0], false negatives is M[1,0], 
    #true positives is M[1,1], and false positives is M[0,1] .
DT_BoW_confusion_matrix = confusion_matrix(test_labels, DT_BoW_predictions)

DT_BoW_true_positive = DT_BoW_confusion_matrix[1,1]
DT_BoW_true_negative = DT_BoW_confusion_matrix[0,0]
DT_BoW_false_positive = DT_BoW_confusion_matrix[0,1]
DT_BoW_false_negative = DT_BoW_confusion_matrix[1,0]

print('This is the Confusion Matrix for the Decision Tree model with Bag of Words features:\n', DT_BoW_confusion_matrix)

#Calculate positive precision
DT_BoW_positive_precision = round(DT_BoW_true_positive / (DT_BoW_true_positive + DT_BoW_false_positive), 2)
print('\nThe positive precision for the Decision Tree model with Bag of Words features is', DT_BoW_positive_precision)

#Calculate negative precision
DT_BoW_negative_precision = round(DT_BoW_true_negative / (DT_BoW_true_negative + DT_BoW_false_negative), 2)
print('The negative precision for the Decision Tree model with Bag of Words features is', DT_BoW_negative_precision)

#Calculate positive recall
DT_BoW_positive_recall = round(DT_BoW_true_positive / (DT_BoW_true_positive + DT_BoW_false_negative), 2)
print('The positive recall for the Decision Tree model with Bag of Words features is', DT_BoW_positive_recall)

#Calculate negative recall
DT_BoW_negative_recall = round(DT_BoW_true_negative / (DT_BoW_true_negative + DT_BoW_false_positive), 2)
print('The negative recall for the Decision Tree model with Bag of Words features is', DT_BoW_negative_recall)

This is the Confusion Matrix for the Decision Tree model with Bag of Words features:
 [[1125   45]
 [  40  529]]

The positive precision for the Decision Tree model with Bag of Words features is 0.92
The negative precision for the Decision Tree model with Bag of Words features is 0.97
The positive recall for the Decision Tree model with Bag of Words features is 0.93
The negative recall for the Decision Tree model with Bag of Words features is 0.96


## Random Forest model ("RF" for short)

### Train and test model (taken directly from Lab 3)

In [16]:
# assign random forest function to an object
RF = RandomForestClassifier(criterion="entropy")

# predict and evaluate random forest
RF_BoW_predictions = train_predict_model(classifier=RF, train_features=BoW_train_features, train_labels=train_labels,
                                         test_features=BoW_test_features)

### Analyze Confusion Matrix

In [17]:
#Count the predicted one and zeros and the actual ones and zeros
#confusion_matrix(y_true, y_pred)

#Note from sci-k itlearn.org:
#Thus in binary classification, the count of true negatives is M[0,0], false negatives is M[1,0], 
    #true positives is M[1,1], and false positives is M[0,1] .
RF_BoW_confusion_matrix = confusion_matrix(test_labels, RF_BoW_predictions)

RF_BoW_true_positive = RF_BoW_confusion_matrix[1,1]
RF_BoW_true_negative = RF_BoW_confusion_matrix[0,0]
RF_BoW_false_positive = RF_BoW_confusion_matrix[0,1]
RF_BoW_false_negative = RF_BoW_confusion_matrix[1,0]

print('This is the Confusion Matrix for the Random Forest model with Bag of Words features:\n', RF_BoW_confusion_matrix)

#Calculate positive precision
RF_BoW_positive_precision = round(RF_BoW_true_positive / (RF_BoW_true_positive + RF_BoW_false_positive), 2)
print('\nThe positive precision for the Random Forest model with Bag of Words features is', RF_BoW_positive_precision)

#Calculate negative precision
RF_BoW_negative_precision = round(RF_BoW_true_negative / (RF_BoW_true_negative + RF_BoW_false_negative), 2)
print('The negative precision for the Random Forest model with Bag of Words features is', RF_BoW_negative_precision)

#Calculate positive recall
RF_BoW_positive_recall = round(RF_BoW_true_positive / (RF_BoW_true_positive + RF_BoW_false_negative), 2)
print('The positive recall for the Random Forest model with Bag of Words features is', RF_BoW_positive_recall)

#Calculate negative recall
RF_BoW_negative_recall = round(RF_BoW_true_negative / (RF_BoW_true_negative + RF_BoW_false_positive), 2)
print('The negative recall for the Random Forest model with Bag of Words features is', RF_BoW_negative_recall)

This is the Confusion Matrix for the Random Forest model with Bag of Words features:
 [[1163    7]
 [  42  527]]

The positive precision for the Random Forest model with Bag of Words features is 0.99
The negative precision for the Random Forest model with Bag of Words features is 0.97
The positive recall for the Random Forest model with Bag of Words features is 0.93
The negative recall for the Random Forest model with Bag of Words features is 0.99


# Run Models on TF-IDF Features

## Naive Bayes model ("NB" for short)

### Train and test model (taken directly from Lab 3)

In [18]:
# assign naive bayes function to an object
mnb = MultinomialNB()

# predict and evaluate naive bayes
mnb_tfidf_predictions = train_predict_model(classifier = mnb, train_features = tfidf_train_features, train_labels = train_labels,
                                           test_features = tfidf_test_features)

### Analyze Confusion Matrix

In [19]:
#Count the predicted one and zeros and the actual ones and zeros
#confusion_matrix(y_true, y_pred)

#Note from sci-k itlearn.org:
#Thus in binary classification, the count of true negatives is M[0,0], false negatives is M[1,0], 
    #true positives is M[1,1], and false positives is M[0,1] .
NB_tfidf_confusion_matrix = confusion_matrix(test_labels, mnb_tfidf_predictions)

NB_tfidf_true_positive = NB_tfidf_confusion_matrix[1,1]
NB_tfidf_true_negative = NB_tfidf_confusion_matrix[0,0]
NB_tfidf_false_positive = NB_tfidf_confusion_matrix[0,1]
NB_tfidf_false_negative = NB_tfidf_confusion_matrix[1,0]

print('This is the Confusion Matrix for the Naive Bayes model with TF-IDF features:\n', NB_tfidf_confusion_matrix)

#Calculate positive precision
NB_tfidf_positive_precision = round(NB_tfidf_true_positive / (NB_tfidf_true_positive + NB_tfidf_false_positive), 2)
print('\nThe positive precision for the Naive Bayes model with TF-IDF features is', NB_tfidf_positive_precision)

#Calculate negative precision
NB_tfidf_negative_precision = round(NB_tfidf_true_negative / (NB_tfidf_true_negative + NB_tfidf_false_negative), 2)
print('The negative precision for the Naive Bayes model with TF-IDF features is', NB_tfidf_negative_precision)

#Calculate positive recall
NB_tfidf_positive_recall = round(NB_tfidf_true_positive / (NB_tfidf_true_positive + NB_tfidf_false_negative), 2)
print('The positive recall for the Naive Bayes model with TF-IDF features is', NB_tfidf_positive_recall)

#Calculate negative recall
NB_tfidf_negative_recall = round(NB_tfidf_true_negative / (NB_tfidf_true_negative + NB_tfidf_false_positive), 2)
print('The negative recall for the Naive Bayes model with TF-IDF features is', NB_tfidf_negative_recall)

This is the Confusion Matrix for the Naive Bayes model with TF-IDF features:
 [[1166    4]
 [ 203  366]]

The positive precision for the Naive Bayes model with TF-IDF features is 0.99
The negative precision for the Naive Bayes model with TF-IDF features is 0.85
The positive recall for the Naive Bayes model with TF-IDF features is 0.64
The negative recall for the Naive Bayes model with TF-IDF features is 1.0


## Decision Tree ("DT" for short)

### Train and test model (taken directly from Lab 3)

In [20]:
# assign decision tree function to an object
DT = DecisionTreeClassifier()

# predict and evaluate decision tree
DT_tfidf_predictions = train_predict_model(classifier=DT, train_features=tfidf_train_features, train_labels=train_labels,
                                         test_features=tfidf_test_features)

### Analyze Confusion Matrix

In [21]:
#Count the predicted one and zeros and the actual ones and zeros
#confusion_matrix(y_true, y_pred)

#Note from sci-k itlearn.org:
#Thus in binary classification, the count of true negatives is M[0,0], false negatives is M[1,0], 
    #true positives is M[1,1], and false positives is M[0,1] .
DT_tfidf_confusion_matrix = confusion_matrix(test_labels, DT_tfidf_predictions)

DT_tfidf_true_positive = DT_tfidf_confusion_matrix[1,1]
DT_tfidf_true_negative = DT_tfidf_confusion_matrix[0,0]
DT_tfidf_false_positive = DT_tfidf_confusion_matrix[0,1]
DT_tfidf_false_negative = DT_tfidf_confusion_matrix[1,0]

print('This is the Confusion Matrix for the Decision Tree model with TF-IDF features:\n', DT_tfidf_confusion_matrix)

#Calculate positive precision
DT_tfidf_positive_precision = round(DT_tfidf_true_positive / (DT_tfidf_true_positive + DT_tfidf_false_positive), 2)
print('\nThe positive precision for the Decision Tree model with TF-IDF features is', DT_tfidf_positive_precision)

#Calculate negative precision
DT_tfidf_negative_precision = round(DT_tfidf_true_negative / (DT_tfidf_true_negative + DT_tfidf_false_negative), 2)
print('The negative precision for the Decision Tree model with TF-IDF features is', DT_tfidf_negative_precision)

#Calculate positive recall
DT_tfidf_positive_recall = round(DT_tfidf_true_positive / (DT_tfidf_true_positive + DT_tfidf_false_negative), 2)
print('The positive recall for the Decision Tree model with TF-IDF features is', DT_tfidf_positive_recall)

#Calculate negative recall
DT_tfidf_negative_recall = round(DT_tfidf_true_negative / (DT_tfidf_true_negative + DT_tfidf_false_positive), 2)
print('The negative recall for the Decision Tree model with TF-IDF features is', DT_tfidf_negative_recall)

This is the Confusion Matrix for the Decision Tree model with TF-IDF features:
 [[1133   37]
 [  42  527]]

The positive precision for the Decision Tree model with TF-IDF features is 0.93
The negative precision for the Decision Tree model with TF-IDF features is 0.96
The positive recall for the Decision Tree model with TF-IDF features is 0.93
The negative recall for the Decision Tree model with TF-IDF features is 0.97


## Random Forest model ("RF" for short)

### Train and test model (taken directly from Lab 3)

In [22]:
# assign random forest function to an object
RF = RandomForestClassifier(criterion="entropy")

# predict and evaluate random forest
RF_tfidf_predictions = train_predict_model(classifier=RF, train_features=tfidf_train_features, train_labels=train_labels,
                                         test_features=tfidf_test_features)

### Analyze Confusion Matrix

In [23]:
#Count the predicted one and zeros and the actual ones and zeros
#confusion_matrix(y_true, y_pred)

#Note from sci-k itlearn.org:
#Thus in binary classification, the count of true negatives is M[0,0], false negatives is M[1,0], 
    #true positives is M[1,1], and false positives is M[0,1] .
RF_tfidf_confusion_matrix = confusion_matrix(test_labels, RF_tfidf_predictions)

RF_tfidf_true_positive = RF_tfidf_confusion_matrix[1,1]
RF_tfidf_true_negative = RF_tfidf_confusion_matrix[0,0]
RF_tfidf_false_positive = RF_tfidf_confusion_matrix[0,1]
RF_tfidf_false_negative = RF_tfidf_confusion_matrix[1,0]

print('This is the Confusion Matrix for the Random Forest model with TF-IDF features:\n', RF_tfidf_confusion_matrix)

#Calculate positive precision
RF_tfidf_positive_precision = round(RF_tfidf_true_positive / (RF_tfidf_true_positive + RF_tfidf_false_positive), 2)
print('\nThe positive precision for the Random Forest model with TF-IDF features is', RF_tfidf_positive_precision)

#Calculate negative precision
RF_tfidf_negative_precision = round(RF_tfidf_true_negative / (RF_tfidf_true_negative + RF_tfidf_false_negative), 2)
print('The negative precision for the Random Forest model with TF-IDF features is', RF_tfidf_negative_precision)

#Calculate positive recall
RF_tfidf_positive_recall = round(RF_tfidf_true_positive / (RF_tfidf_true_positive + RF_tfidf_false_negative), 2)
print('The positive recall for the Random Forest model with TF-IDF features is', RF_tfidf_positive_recall)

#Calculate negative recall
RF_tfidf_negative_recall = round(RF_tfidf_true_negative / (RF_tfidf_true_negative + RF_tfidf_false_positive), 2)
print('The negative recall for the Random Forest model with TF-IDF features is', RF_tfidf_negative_recall)

This is the Confusion Matrix for the Random Forest model with TF-IDF features:
 [[1164    6]
 [  46  523]]

The positive precision for the Random Forest model with TF-IDF features is 0.99
The negative precision for the Random Forest model with TF-IDF features is 0.96
The positive recall for the Random Forest model with TF-IDF features is 0.92
The negative recall for the Random Forest model with TF-IDF features is 0.99


# Run Models on Word2Vec Features

## Naive Bayes model ("NB" for short) (Please note, per TA instructions, we should use the GaussianNB Python package for Word2Vec)

### Train and test model (taken directly from Lab 3)

In [24]:
# assign naive bayes function to an object
gnb = GaussianNB()

# predict and evaluate naive bayes
gnb_wv_predictions = train_predict_model(classifier = gnb, train_features = avg_wv_train_features, train_labels = train_labels,
                                           test_features = avg_wv_test_features)

### Analyze Confusion Matrix

In [25]:
#Count the predicted one and zeros and the actual ones and zeros
#confusion_matrix(y_true, y_pred)

#Note from sci-k itlearn.org:
#Thus in binary classification, the count of true negatives is M[0,0], false negatives is M[1,0], 
    #true positives is M[1,1], and false positives is M[0,1] .
NB_wv_confusion_matrix = confusion_matrix(test_labels, gnb_wv_predictions)

NB_wv_true_positive = NB_wv_confusion_matrix[1,1]
NB_wv_true_negative = NB_wv_confusion_matrix[0,0]
NB_wv_false_positive = NB_wv_confusion_matrix[0,1]
NB_wv_false_negative = NB_wv_confusion_matrix[1,0]

print('This is the Confusion Matrix for the Naive Bayes model with Word2Vec features:\n', NB_wv_confusion_matrix)

#Calculate positive precision
NB_wv_positive_precision = round(NB_wv_true_positive / (NB_wv_true_positive + NB_wv_false_positive), 2)
print('\nThe positive precision for the Naive Bayes model with Word2Vec features is', NB_wv_positive_precision)

#Calculate negative precision
NB_wv_negative_precision = round(NB_wv_true_negative / (NB_wv_true_negative + NB_wv_false_negative), 2)
print('The negative precision for the Naive Bayes model with Word2Vec features is', NB_wv_negative_precision)

#Calculate positive recall
NB_wv_positive_recall = round(NB_wv_true_positive / (NB_wv_true_positive + NB_wv_false_negative), 2)
print('The positive recall for the Naive Bayes model with Word2Vec features is', NB_wv_positive_recall)

#Calculate negative recall
NB_wv_negative_recall = round(NB_wv_true_negative / (NB_wv_true_negative + NB_wv_false_positive), 2)
print('The negative recall for the Naive Bayes model with Word2Vec features is', NB_wv_negative_recall)

This is the Confusion Matrix for the Naive Bayes model with Word2Vec features:
 [[1125   45]
 [ 102  467]]

The positive precision for the Naive Bayes model with Word2Vec features is 0.91
The negative precision for the Naive Bayes model with Word2Vec features is 0.92
The positive recall for the Naive Bayes model with Word2Vec features is 0.82
The negative recall for the Naive Bayes model with Word2Vec features is 0.96


## Decision Tree ("DT" for short)

### Train and test model (taken directly from Lab 3)

In [26]:
# assign decision tree function to an object
DT = DecisionTreeClassifier()

# predict and evaluate decision tree
DT_wv_predictions = train_predict_model(classifier=DT, train_features=avg_wv_train_features, train_labels=train_labels,
                                         test_features=avg_wv_test_features)

### Analyze Confusion Matrix

In [27]:
#Count the predicted one and zeros and the actual ones and zeros
#confusion_matrix(y_true, y_pred)

#Note from sci-k itlearn.org:
#Thus in binary classification, the count of true negatives is M[0,0], false negatives is M[1,0], 
    #true positives is M[1,1], and false positives is M[0,1] .
DT_wv_confusion_matrix = confusion_matrix(test_labels, DT_wv_predictions)

DT_wv_true_positive = DT_wv_confusion_matrix[1,1]
DT_wv_true_negative = DT_wv_confusion_matrix[0,0]
DT_wv_false_positive = DT_wv_confusion_matrix[0,1]
DT_wv_false_negative = DT_wv_confusion_matrix[1,0]

print('This is the Confusion Matrix for the Decision Tree model with Word2Vec features:\n', DT_wv_confusion_matrix)

#Calculate positive precision
DT_wv_positive_precision = round(DT_wv_true_positive / (DT_wv_true_positive + DT_wv_false_positive), 2)
print('\nThe positive precision for the Decision Tree model with Word2Vec features is', DT_wv_positive_precision)

#Calculate negative precision
DT_wv_negative_precision = round(DT_wv_true_negative / (DT_wv_true_negative + DT_wv_false_negative), 2)
print('The negative precision for the Decision Tree model with Word2Vec features is', DT_wv_negative_precision)

#Calculate positive recall
DT_wv_positive_recall = round(DT_wv_true_positive / (DT_wv_true_positive + DT_wv_false_negative), 2)
print('The positive recall for the Decision Tree model with Word2Vec features is', DT_wv_positive_recall)

#Calculate negative recall
DT_wv_negative_recall = round(DT_wv_true_negative / (DT_wv_true_negative + DT_wv_false_positive), 2)
print('The negative recall for the Decision Tree model with Word2Vec features is', DT_wv_negative_recall)

This is the Confusion Matrix for the Decision Tree model with Word2Vec features:
 [[1152   18]
 [  25  544]]

The positive precision for the Decision Tree model with Word2Vec features is 0.97
The negative precision for the Decision Tree model with Word2Vec features is 0.98
The positive recall for the Decision Tree model with Word2Vec features is 0.96
The negative recall for the Decision Tree model with Word2Vec features is 0.98


## Random Forest model ("RF" for short)

### Train and test model (taken directly from Lab 3)

In [28]:
# assign random forest function to an object
RF = RandomForestClassifier(criterion="entropy")

# predict and evaluate random forest
RF_wv_predictions = train_predict_model(classifier=RF, train_features=avg_wv_train_features, train_labels=train_labels,
                                         test_features=avg_wv_test_features)

### Analyze Confusion Matrix

In [29]:
#Count the predicted one and zeros and the actual ones and zeros
#confusion_matrix(y_true, y_pred)

#Note from sci-k itlearn.org:
#Thus in binary classification, the count of true negatives is M[0,0], false negatives is M[1,0], 
    #true positives is M[1,1], and false positives is M[0,1] .
RF_wv_confusion_matrix = confusion_matrix(test_labels, RF_wv_predictions)

RF_wv_true_positive = RF_wv_confusion_matrix[1,1]
RF_wv_true_negative = RF_wv_confusion_matrix[0,0]
RF_wv_false_positive = RF_wv_confusion_matrix[0,1]
RF_wv_false_negative = RF_wv_confusion_matrix[1,0]

print('This is the Confusion Matrix for the Random Forest model with Word2Vec features:\n', RF_wv_confusion_matrix)

#Calculate positive precision
RF_wv_positive_precision = round(RF_wv_true_positive / (RF_wv_true_positive + RF_wv_false_positive), 2)
print('\nThe positive precision for the Random Forest model with Word2Vec features is', RF_wv_positive_precision)

#Calculate negative precision
RF_wv_negative_precision = round(RF_wv_true_negative / (RF_wv_true_negative + RF_wv_false_negative), 2)
print('The negative precision for the Random Forest model with Word2Vec features is', RF_wv_negative_precision)

#Calculate positive recall
RF_wv_positive_recall = round(RF_wv_true_positive / (RF_wv_true_positive + RF_wv_false_negative), 2)
print('The positive recall for the Random Forest model with Word2Vec features is', RF_wv_positive_recall)

#Calculate negative recall
RF_wv_negative_recall = round(RF_wv_true_negative / (RF_wv_true_negative + RF_wv_false_positive), 2)
print('The negative recall for the Random Forest model with Word2Vec features is', RF_wv_negative_recall)

This is the Confusion Matrix for the Random Forest model with Word2Vec features:
 [[1161    9]
 [  15  554]]

The positive precision for the Random Forest model with Word2Vec features is 0.98
The negative precision for the Random Forest model with Word2Vec features is 0.99
The positive recall for the Random Forest model with Word2Vec features is 0.97
The negative recall for the Random Forest model with Word2Vec features is 0.99


# Calculating the business costs of email mis-classification.
#### Per the rules laid out in the assignment, mis-classifying spam to non-spam (a false negative) costs 5 and mis-classifying non-spam to spam (a false positive) costs 100.

In [31]:
false_negative_cost = 5
false_positive_cost = 100

## Models with Bag of Words features

### Naive Bayes

In [33]:
NB_BoW_false_negative_cost = NB_BoW_false_negative * false_negative_cost
NB_BoW_false_positive_cost = NB_BoW_false_positive * false_positive_cost

NB_BoW_total_cost = NB_BoW_false_negative_cost + NB_BoW_false_positive_cost

print('The Naive Bayes model with Bag of Words features has', NB_BoW_false_negative, 'false negatives and', NB_BoW_false_positive,
     'false positives. At a cost of 5 units per false negative and 100 units per false positive, this model costs the business', NB_BoW_total_cost,'units.')

The Naive Bayes model with Bag of Words features has 130 false negatives and 6 false positives. At a cost of 5 units per false negative and 100 units per false positive, this model costs the business 1250 units.


### Decision Tree

In [34]:
DT_BoW_false_negative_cost = DT_BoW_false_negative * false_negative_cost
DT_BoW_false_positive_cost = DT_BoW_false_positive * false_positive_cost

DT_BoW_total_cost = DT_BoW_false_negative_cost + DT_BoW_false_positive_cost

print('The Decision Tree model with Bag of Words features has', DT_BoW_false_negative, 'false negatives and', DT_BoW_false_positive,
     'false positives. At a cost of 5 units per false negative and 100 units per false positive, this model costs the business', DT_BoW_total_cost,'units.')

The Decision Tree model with Bag of Words features has 40 false negatives and 45 false positives. At a cost of 5 units per false negative and 100 units per false positive, this model costs the business 4700 units.


### Random Forest

In [35]:
RF_BoW_false_negative_cost = RF_BoW_false_negative * false_negative_cost
RF_BoW_false_positive_cost = RF_BoW_false_positive * false_positive_cost

RF_BoW_total_cost = RF_BoW_false_negative_cost + RF_BoW_false_positive_cost

print('The Random Forest model with Bag of Words features has', RF_BoW_false_negative, 'false negatives and', RF_BoW_false_positive,
     'false positives. At a cost of 5 units per false negative and 100 units per false positive, this model costs the business', RF_BoW_total_cost,'units.')

The Random Forest model with Bag of Words features has 42 false negatives and 7 false positives. At a cost of 5 units per false negative and 100 units per false positive, this model costs the business 910 units.


## Models with TF-IDF features

### Naive Bayes

In [36]:
NB_tfidf_false_negative_cost = NB_tfidf_false_negative * false_negative_cost
NB_tfidf_false_positive_cost = NB_tfidf_false_positive * false_positive_cost

NB_tfidf_total_cost = NB_tfidf_false_negative_cost + NB_tfidf_false_positive_cost

print('The Naive Bayes model with TF-IDF features has', NB_tfidf_false_negative, 'false negatives and', NB_tfidf_false_positive,
     'false positives. At a cost of 5 units per false negative and 100 units per false positive, this model costs the business', NB_tfidf_total_cost,'units.')

The Naive Bayes model with TF-IDF features has 203 false negatives and 4 false positives. At a cost of 5 units per false negative and 100 units per false positive, this model costs the business 1415 units.


### Decision Tree

In [37]:
DT_tfidf_false_negative_cost = DT_tfidf_false_negative * false_negative_cost
DT_tfidf_false_positive_cost = DT_tfidf_false_positive * false_positive_cost

DT_tfidf_total_cost = DT_tfidf_false_negative_cost + DT_tfidf_false_positive_cost

print('The Decision Tree model with TF-IDF features has', DT_tfidf_false_negative, 'false negatives and', DT_tfidf_false_positive,
     'false positives. At a cost of 5 units per false negative and 100 units per false positive, this model costs the business', DT_tfidf_total_cost,'units.')

The Decision Tree model with TF-IDF features has 42 false negatives and 37 false positives. At a cost of 5 units per false negative and 100 units per false positive, this model costs the business 3910 units.


### Random Forest

In [38]:
RF_tfidf_false_negative_cost = RF_tfidf_false_negative * false_negative_cost
RF_tfidf_false_positive_cost = RF_tfidf_false_positive * false_positive_cost

RF_tfidf_total_cost = RF_tfidf_false_negative_cost + RF_tfidf_false_positive_cost

print('The Random Forest model with TF-IDF features has', RF_tfidf_false_negative, 'false negatives and', RF_tfidf_false_positive,
     'false positives. At a cost of 5 units per false negative and 100 units per false positive, this model costs the business', RF_tfidf_total_cost,'units.')

The Random Forest model with TF-IDF features has 46 false negatives and 6 false positives. At a cost of 5 units per false negative and 100 units per false positive, this model costs the business 830 units.


## Models with Word2Vec Features

### Naive Bayes

In [39]:
NB_wv_false_negative_cost = NB_wv_false_negative * false_negative_cost
NB_wv_false_positive_cost = NB_wv_false_positive * false_positive_cost

NB_wv_total_cost = NB_wv_false_negative_cost + NB_wv_false_positive_cost

print('The Naive Bayes model with Word2Vec features has', NB_wv_false_negative, 'false negatives and', NB_wv_false_positive,
     'false positives. At a cost of 5 units per false negative and 100 units per false positive, this model costs the business', NB_wv_total_cost,'units.')

The Naive Bayes model with Word2Vec features has 102 false negatives and 45 false positives. At a cost of 5 units per false negative and 100 units per false positive, this model costs the business 5010 units.


### Decision Tree

In [40]:
DT_wv_false_negative_cost = DT_wv_false_negative * false_negative_cost
DT_wv_false_positive_cost = DT_wv_false_positive * false_positive_cost

DT_wv_total_cost = DT_wv_false_negative_cost + DT_wv_false_positive_cost

print('The Decision Tree model with Word2Vec features has', DT_wv_false_negative, 'false negatives and', DT_wv_false_positive,
     'false positives. At a cost of 5 units per false negative and 100 units per false positive, this model costs the business', DT_wv_total_cost,'units.')

The Decision Tree model with Word2Vec features has 25 false negatives and 18 false positives. At a cost of 5 units per false negative and 100 units per false positive, this model costs the business 1925 units.


### Random Forest

In [41]:
RF_wv_false_negative_cost = RF_wv_false_negative * false_negative_cost
RF_wv_false_positive_cost = RF_wv_false_positive * false_positive_cost

RF_wv_total_cost = RF_wv_false_negative_cost + RF_wv_false_positive_cost

print('The Random Forest model with Word2Vec features has', RF_wv_false_negative, 'false negatives and', RF_wv_false_positive,
     'false positives. At a cost of 5 units per false negative and 100 units per false positive, this model costs the business', RF_wv_total_cost,'units.')

The Random Forest model with Word2Vec features has 15 false negatives and 9 false positives. At a cost of 5 units per false negative and 100 units per false positive, this model costs the business 975 units.


## Summary table of costs

In [42]:
model_names = ['Naive Bayes', 'Decision Tree', 'Random Forest']
bag_of_words_costs = [NB_BoW_total_cost, DT_BoW_total_cost, RF_BoW_total_cost]
tfidf_costs = [NB_tfidf_total_cost, DT_tfidf_total_cost, RF_tfidf_total_cost]
word2vec_costs = [NB_wv_total_cost, DT_wv_total_cost, RF_wv_total_cost]

table_data = ({'Model Names':model_names, 'Costs to Business: Bag of Words Features':bag_of_words_costs,
              'Costs to Business: TF-IDF Features':tfidf_costs, 'Costs to Business: Word2Vec Features':word2vec_costs})

costs_table = pd.DataFrame(table_data)

costs_table.head()

Unnamed: 0,Model Names,Costs to Business: Bag of Words Features,Costs to Business: TF-IDF Features,Costs to Business: Word2Vec Features
0,Naive Bayes,1250,1415,5010
1,Decision Tree,4700,3910,1925
2,Random Forest,910,830,975
