### EmailSpamCorpora

In [1]:
import os
import sys
import random
import nltk
import numpy as np
import pandas as pd
from collections import Counter
from nltk.corpus import stopwords
from statistics import mean
from nltk.collocations import *
from nltk.util import ngrams
#import sklearn packages for building classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV,train_test_split,StratifiedKFold,cross_val_score,learning_curve
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

##### Variables utilized across the program

In [2]:
datasetPath = '/Users/rashmichakravarthy/Desktop/NLP/NLP Project/FinalProjectData/EmailSpamCorpora/corpus'
emailDoc = []
punctuations = set([",", ".", "@", "#", "%", "^", "&", "*", "(", ")", "_", "-", "=", "+", "{", "}", "[", "]",
             ":", ";", "'" '"', "<", ">", "?", "/"])
nltkStopwords = nltk.corpus.stopwords.words('english')
moreStopwords = ['could','would','might','must','need','sha','wo','y',"'s","'d","'ll","'t","'m","'re","'ve"]
stopwords = nltkStopwords + moreStopwords
bigramMeasures = nltk.collocations.BigramAssocMeasures()
predScores_wordVectors = []
vocabSize = []
wordToidx = {}
features = []
labels = []

##### Common utitlity functions

In [3]:
#functions defined to read spam and ham files
def process_files_SpamHam(dirPath,limitStr):
    spamTexts = []
    hamTexts = []
    #converting the limit argument to an int from a string
    limit = int(limitStr)

    #starting lists for spam and ham email texts
    os.chdir(dirPath)
    #processing files in directory that end in .txt up to a limit, assuming sufficient randomization of emails 
    for file in os.listdir("./spam"):
        if (file.endswith(".txt")) and (len(spamTexts) < limit):
            #opening file for reading entire file into a string
            f = open("./spam/"+file, 'r', encoding="latin-1")
            spamTexts.append (f.read())
            f.close()
    for file in os.listdir("./ham"):
        if (file.endswith(".txt")) and (len(hamTexts) < limit):
            #opening file for reading entire file into a string
            f = open("./ham/"+file, 'r', encoding="latin-1")
            hamTexts.append (f.read())
            f.close()

    #printing number of emails read
    print ("Number of spam files:",len(spamTexts))
    print ("Number of ham files:",len(hamTexts))

    #adding all spam texts
    for spam in spamTexts:
        tokens = nltk.word_tokenize(spam)
        emailDoc.append((tokens, 'spam'))
    #adding all regular emails
    for ham in hamTexts:
        tokens = nltk.word_tokenize(ham)
        emailDoc.append((tokens, 'ham'))

    #randomizing the list
    random.shuffle(emailDoc)

    #printing few token lists
    for email in emailDoc[:4]:
        print (email)
        
#function defined to removing punctuation
def remove_punctuations(emailDoc):
    emailDoc_without_punctutations = []
    for email in emailDoc:
        emailDoc_without_punctutations.append(([token for token in email[0] if token not in punctuations], email[1]))
    return emailDoc_without_punctutations

In [4]:
#funtion defined to get word features 
def get_word_features(emails, num):
    allWord_list = []
    for email in emails:
        allWord_list.extend(email[0])
    allWords = nltk.FreqDist(allWord_list)
    wordItems = allWords.most_common(num)
    wordFeatures = [word for (word, count) in wordItems]
    return wordFeatures

In [5]:
#funtion defined to get TF-IDF score
def get_tfidf_scores(emails, num):
    vectorizer = TfidfVectorizer()
    emailCorpus = []
    for email in emailDoc:
        listToStr = ' '.join([str(element) for element in email[0]])
        emailCorpus.append(listToStr)
    v = vectorizer.fit_transform(emailCorpus)
    feature = vectorizer.get_feature_names()
    wordFeatures = feature[:num]
    return wordFeatures

In [6]:
#funtion defined to get unigram bag of words
def get_bag_words(email, wordFeatures, Bool=True):
    emailWords = email
    features = {}
    for word in wordFeatures:
        if Bool:
            features[word] = (word in set(emailWords))
        else:
            features[word] = emailWords.count(word)
    return features

In [7]:
#funtion defined to get bigram bag of words
def get_bigram_bag_words(email, bigramFeatures, Bool=True):
    emailBigrams = [" ".join(bigram) for bigram in ngrams(email, 2)]
    features = {}
    for bigram in bigramFeatures:
        if Bool:
            features[bigram] = (bigram in set(emailBigrams))
        else:
            features[bigram] = emailBigrams.count(bigram)
    return features

In [8]:
#funtion defined to remove stopwords 
def remove_stopwords(emailDoc, stopwords):
    emailDoc_without_stopwords = []
    for email in emailDoc:
        emailDoc_without_stopwords.append(([token for token in email[0] if token not in stopwords], email[1]))
    return emailDoc_without_stopwords

In [9]:
def cross_validation_accuracy_evaluation_metrics(foldNums, featureSets):
    subsetSize = int(len(featureSets)/foldNums)
    print('Each fold size:', subsetSize)
    accuracyList = []
    gold = []
    predicted = []
    #iterating over each fold
    for f in range(foldNums):
        trainingRound = featureSets[:(f*subsetSize)] + featureSets[((f+1)*subsetSize):]
        testingRound = featureSets[(f*subsetSize):][:subsetSize]
        #training utilizizing trainingRound
        classifier = nltk.NaiveBayesClassifier.train(trainingRound)
        #evaluating accuracy against testingRound 
        accuracyRound = nltk.classify.accuracy(classifier, testingRound)
        print("Accuracy of Fold {}: {}".format(f, accuracyRound))
        accuracyList.append(accuracyRound)
        goldList = []
        predictedList = []
        for (features, label) in testingRound:
                gold.append(label)
                predicted.append(classifier.classify(features))
    #finding the mean accuracy against all rounds
    print ('Average accuracy', sum(accuracyList) / foldNums)
    #getting a list of labels
    labels = list(set(gold))
    #depicting each list having values (for each label)
    recallList = []
    precisionList = []
    F1List = []
    for label in labels:
        #comparing gold and predicted lists for each label while computing their values
        TP = TN = FP = FN = 0
        for i, value in enumerate(gold):
            if value == label and predicted[i] == label:  TP += 1
            if value != label and predicted[i] != label:  TN += 1
            if value != label and predicted[i] == label:  FP += 1
            if value == label and predicted[i] != label:  FN += 1
        #utilizing these to compute precision,recall and F1
        precision = TP / (TP + FN)
        recall = TP / (TP + FP)
        recallList.append(recall)
        precisionList.append(precision)
        F1List.append( 2 * (recall * precision) / (recall + precision))

    #evaluated measures represented in a tabular form (with each row per label)
    print('\tPrecision\tRecall\t\tF1')
    # printing measures for each label
    for i, label in enumerate(labels):
        print(label, '\t', "{:8f}".format(precisionList[i]),"{:12f}".format(recallList[i]), "{:12f}".format(F1List[i]))

In [10]:
#Custom Tokenizer
def custom_tokenizers(email):
    tokenizerPattern = r'''(?x)
                        [a-z]+(?:['\-][a-z]+)+
                        |[a-z]+
                        |\$\d+
                     '''
    return nltk.regexp_tokenize(email.lower().replace(" ' ", "'").replace("$ ", "$").replace(" - ", "-"), tokenizerPattern)

In [11]:
#functions defined to generate custom tokens from emailDoc
def generate_custom_tokens(emailDoc):
    newEmail_custom_tokens = []
    for email in emailDoc:
        listToStr = ' '.join([str(element) for element in email[0]])
        newEmail_custom_tokens.append((custom_tokenizers(listToStr), email[1]))
    return newEmail_custom_tokens

In [12]:
def nltk_features_to_dataframe(features):
    POS_negMap = {"spam": 1, "ham": 0}
    firstPass = True
    df = None
    for feature in features:
        if firstPass:
            df = {f: [] for f in feature[0].keys()}
            df["label"] = []
            firstPass = False
        df["label"].append(POS_negMap[feature[1]])
        for f, value in feature[0].items():
            df[f].append(value)
    return pd.DataFrame(df)

In [13]:
#function defined to evaluate utilizing Scikit-learn Model 
def get_evaluation_metrics_sklearn(reference, hypothesis):
    labels = set(reference)
    precisionList = []
    recallList = []
    F1List = []
    correct = 0
    for label in labels:
        TP = TN = FP = FN = 0
        for index, value in enumerate(reference):
            if value == label and hypothesis[index] == label:
                TP += 1
                correct == 1
            elif value == label and hypothesis[index] != label: FN += 1
            elif value != label and hypothesis[index] == label: FP += 1
            else:
                TN += 1
                correct += 1
        precision = TP / (TP + FN)
        recall = TP / (TP + FP)
        recallList.append(recall)
        precisionList.append(precision)
        F1List.append(2 * (precision * recall ) / (precision + recall))
    print("Accuracy: {}".format(round(float(correct) / float(len(reference)), 3)))
    print("\n\tPrecision\tRecall\t\tF1")
    for index, label in enumerate(labels):
        print("{}\t{}\t\t{}\t\t{}".format(label, round(precisionList[index], 3),
                                          round(recallList[index], 3), round(F1List[index], 3)))
    print(confusion_matrix(reference, hypothesis))

In [14]:
def text_to_vector(text):
    wordVector = np.zeros(vocabSize)
    for word in text.split(" "):
        if wordToidx.get(word) is None:
            continue
        else:
            wordVector[wordToidx.get(word)] += 1
    return np.array(wordVector)

In [15]:
#fitting the data into the models
def train(clf, features, targets):
    clf.fit(features, targets)

def predict(clf, features):
    return (clf.predict(features))

### STEP - 1
##### Process spam/ham files 

In [16]:
process_files_SpamHam(datasetPath, 4000)

Number of spam files: 1500
Number of ham files: 3672
(['Subject', ':', 'feeling', 'down', 'about', 'the', 'slze', 'of', 'your', 'johnson', '.', '.', '.', 'rtxyj', 'nxfgr', 'ktrqr', 'abtyw', 'ifpyc', 'edvve', 'smrzz', 'ejwah', 'mgjdq', 'gfsae', 'gnydw', 'mexzx', 'vsdbr', 'lubbp', 'bvdkf', 'otdmbipdtc', 'viukj', 'dnyuv', 'bwekh', 'ctqpm', 'qywdu', 'ywipb', 'stcuy', 'fbnzx', 'slcxz', 'exanh', 'cpxqw', 'rpjiw', 'hbqcu', 'pifce', 'qypyl', 'hntql', 'uignp', 'fpsus', 'wrgcr', 'ymkqh', 'nkzzv', 'wmkmp', 'eoqlt', 'lthje', 'jttsb', 'uhmrq', 'sjkct', 'pqhop', 'gsnoq', 'otvhj', 'ujcrh', 'iagpn', 'baqhr', 'ajdhs', 'ntznk', 'uuzqc', 'kkesa', 'eocwz', 'vaous'], 'spam')
(['Subject', ':', '?', '?', '?', '?', '?', '?', '?', '?', 'erp', '!', '?', '?', '?', '?', '?', '?', '?', '?', '½', '?', '?', '?', '?', '?', '?', '?', '?', '?', 'erp', '?', '?', '·', '?', '?', '?', '£', '½', '£', '?', 'erp', '+', '²', '?', '?', '?', '+', '°', '?', '?', '«', '?', '?', '?', '?', '»', '?', '?', '½', '?', '¨', '?', '?', '?'

### STEP - 2
##### Term Frequency - Inverse Document Frequency (TF-IDF) & 1k features

In [17]:
wordFeatures = get_tfidf_scores(emailDoc, 1000)
bagOfWords_features = [(get_bag_words(email, wordFeatures), label) for (email, label) in emailDoc]
cross_validation_accuracy_evaluation_metrics(10, bagOfWords_features)

Each fold size: 517
Accuracy of Fold 0: 0.6634429400386848
Accuracy of Fold 1: 0.6634429400386848
Accuracy of Fold 2: 0.6711798839458414
Accuracy of Fold 3: 0.7272727272727273
Accuracy of Fold 4: 0.7253384912959381
Accuracy of Fold 5: 0.7137330754352031
Accuracy of Fold 6: 0.6731141199226306
Accuracy of Fold 7: 0.6673114119922631
Accuracy of Fold 8: 0.7137330754352031
Accuracy of Fold 9: 0.6344294003868471
Average accuracy 0.6852998065764024
	Precision	Recall		F1
ham 	 0.628984     0.897047     0.739472
spam 	 0.823215     0.475347     0.602686


##### Utilizing built-in NLTK tokenizer and 1k features to get a baseline

In [18]:
wordFeatures = get_word_features(emailDoc, 1000)
bagOfWords_features = [(get_bag_words(email, wordFeatures), label) for (email, label) in emailDoc]
cross_validation_accuracy_evaluation_metrics(10, bagOfWords_features)

Each fold size: 517
Accuracy of Fold 0: 0.9226305609284333
Accuracy of Fold 1: 0.9342359767891683
Accuracy of Fold 2: 0.9226305609284333
Accuracy of Fold 3: 0.9458413926499033
Accuracy of Fold 4: 0.9458413926499033
Accuracy of Fold 5: 0.9361702127659575
Accuracy of Fold 6: 0.9187620889748549
Accuracy of Fold 7: 0.9245647969052224
Accuracy of Fold 8: 0.9206963249516441
Accuracy of Fold 9: 0.9284332688588007
Average accuracy 0.929980657640232
	Precision	Recall		F1
ham 	 0.901662     0.999698     0.948152
spam 	 0.999333     0.805810     0.892198


##### Changing the size of feature set with TF-IDF scores

In [19]:
for size in [500, 1000, 2500, 3000, 5500, 7000, 10000]:
    print("\n---------------------------------------------------------------")
    print("\n\tResults for varying size of vocabulary {}".format(size))
    print("\n---------------------------------------------------------------")
    wordFeatures = get_tfidf_scores(emailDoc, size)
    bagOfWords_features = [(get_bag_words(email, wordFeatures), label) for (email, label) in emailDoc]
    cross_validation_accuracy_evaluation_metrics(10, bagOfWords_features)


---------------------------------------------------------------

	Results for varying size of vocabulary 500

---------------------------------------------------------------
Each fold size: 517
Accuracy of Fold 0: 0.6692456479690522
Accuracy of Fold 1: 0.6518375241779497
Accuracy of Fold 2: 0.6634429400386848
Accuracy of Fold 3: 0.7001934235976789
Accuracy of Fold 4: 0.7137330754352031
Accuracy of Fold 5: 0.7156673114119922
Accuracy of Fold 6: 0.6808510638297872
Accuracy of Fold 7: 0.6479690522243714
Accuracy of Fold 8: 0.7021276595744681
Accuracy of Fold 9: 0.6382978723404256
Average accuracy 0.6783365570599613
	Precision	Recall		F1
ham 	 0.633342     0.880015     0.736575
spam 	 0.788526     0.467563     0.587037

---------------------------------------------------------------

	Results for varying size of vocabulary 1000

---------------------------------------------------------------
Each fold size: 517
Accuracy of Fold 0: 0.6634429400386848
Accuracy of Fold 1: 0.6634429400386848


##### Changing the size of feature set in baseline model

In [20]:
for size in [500, 1000, 2500, 3000, 5500, 7000, 10000]:
    print("\n---------------------------------------------------------------")
    print("\n\tResults for varying size of vocabulary {}".format(size))
    print("\n---------------------------------------------------------------")
    wordFeatures = get_word_features(emailDoc, size)
    bagOfWords_features = [(get_bag_words(email, wordFeatures), label) for (email, label) in emailDoc]
    cross_validation_accuracy_evaluation_metrics(10, bagOfWords_features)


---------------------------------------------------------------

	Results for varying size of vocabulary 500

---------------------------------------------------------------
Each fold size: 517
Accuracy of Fold 0: 0.9187620889748549
Accuracy of Fold 1: 0.9148936170212766
Accuracy of Fold 2: 0.9110251450676983
Accuracy of Fold 3: 0.9264990328820116
Accuracy of Fold 4: 0.9361702127659575
Accuracy of Fold 5: 0.9168278529980658
Accuracy of Fold 6: 0.8955512572533849
Accuracy of Fold 7: 0.9052224371373307
Accuracy of Fold 8: 0.90715667311412
Accuracy of Fold 9: 0.9129593810444874
Average accuracy 0.9145067698259188
	Precision	Recall		F1
ham 	 0.882048     0.997228     0.936109
spam 	 0.993996     0.774831     0.870836

---------------------------------------------------------------

	Results for varying size of vocabulary 1000

---------------------------------------------------------------
Each fold size: 517
Accuracy of Fold 0: 0.9226305609284333
Accuracy of Fold 1: 0.9342359767891683
Ac

##### Removing punctuations

In [21]:
remove_punctuations_emailDoc = remove_punctuations(emailDoc)
wordFeatures = get_word_features(remove_punctuations_emailDoc, 1000)
bagOfWords_features = [(get_bag_words(email, wordFeatures), label) for (email, label) in remove_punctuations_emailDoc]
cross_validation_accuracy_evaluation_metrics(10, bagOfWords_features)

Each fold size: 517
Accuracy of Fold 0: 0.9323017408123792
Accuracy of Fold 1: 0.9400386847195358
Accuracy of Fold 2: 0.9264990328820116
Accuracy of Fold 3: 0.9458413926499033
Accuracy of Fold 4: 0.9458413926499033
Accuracy of Fold 5: 0.9381044487427466
Accuracy of Fold 6: 0.9323017408123792
Accuracy of Fold 7: 0.9226305609284333
Accuracy of Fold 8: 0.9245647969052224
Accuracy of Fold 9: 0.9284332688588007
Average accuracy 0.9336557059961315
	Precision	Recall		F1
ham 	 0.906837     0.999700     0.951007
spam 	 0.999333     0.814130     0.897275


##### Removing stopwords

In [22]:
remove_stopwords_emailDoc = remove_stopwords(emailDoc, stopwords)
wordFeatures = get_word_features(remove_stopwords_emailDoc, 1000)
bagOfWords_features = [(get_bag_words(email, wordFeatures), label) for (email, label) in remove_stopwords_emailDoc]
cross_validation_accuracy_evaluation_metrics(10, bagOfWords_features)

Each fold size: 517
Accuracy of Fold 0: 0.9284332688588007
Accuracy of Fold 1: 0.9381044487427466
Accuracy of Fold 2: 0.9245647969052224
Accuracy of Fold 3: 0.9516441005802708
Accuracy of Fold 4: 0.9477756286266924
Accuracy of Fold 5: 0.9342359767891683
Accuracy of Fold 6: 0.9148936170212766
Accuracy of Fold 7: 0.9245647969052224
Accuracy of Fold 8: 0.9342359767891683
Accuracy of Fold 9: 0.9284332688588007
Average accuracy 0.9326885880077368
	Precision	Recall		F1
ham 	 0.906020     0.999099     0.950286
spam 	 0.997999     0.812602     0.895808


##### Removing punctuations & stopwords

In [23]:
remove_punctuations_emailDoc = remove_punctuations(emailDoc)
remove_stopwords_emailDoc = remove_stopwords(remove_punctuations_emailDoc, stopwords)
wordFeatures = get_word_features(remove_stopwords_emailDoc, 1000)
bagOfWords_features = [(get_bag_words(email, wordFeatures), label) for (email, label) in remove_stopwords_emailDoc]
cross_validation_accuracy_evaluation_metrics(10, bagOfWords_features)

Each fold size: 517
Accuracy of Fold 0: 0.9361702127659575
Accuracy of Fold 1: 0.9361702127659575
Accuracy of Fold 2: 0.9245647969052224
Accuracy of Fold 3: 0.9555125725338491
Accuracy of Fold 4: 0.9516441005802708
Accuracy of Fold 5: 0.9400386847195358
Accuracy of Fold 6: 0.9226305609284333
Accuracy of Fold 7: 0.9323017408123792
Accuracy of Fold 8: 0.9226305609284333
Accuracy of Fold 9: 0.9284332688588007
Average accuracy 0.9350096711798839
	Precision	Recall		F1
ham 	 0.909561     0.998803     0.952096
spam 	 0.997332     0.818281     0.898978


##### Custom tokenizer on emailDoc

In [54]:
custom_tokenizer_emailDoc = generate_custom_tokens(emailDoc)
wordFeatures = get_word_features(custom_tokenizer_emailDoc, 1000)
bagOfWords_features = [(get_bag_words(email, wordFeatures), label) for (email, label) in custom_tokenizer_emailDoc]
cross_validation_accuracy_evaluation_metrics(10, bagOfWords_features)

Each fold size: 517
Accuracy of Fold 0: 0.9342359767891683
Accuracy of Fold 1: 0.941972920696325
Accuracy of Fold 2: 0.9226305609284333
Accuracy of Fold 3: 0.9593810444874274
Accuracy of Fold 4: 0.9458413926499033
Accuracy of Fold 5: 0.941972920696325
Accuracy of Fold 6: 0.9342359767891683
Accuracy of Fold 7: 0.9303675048355899
Accuracy of Fold 8: 0.9226305609284333
Accuracy of Fold 9: 0.9342359767891683
Average accuracy 0.936750483558994
	Precision	Recall		F1
ham 	 0.912558     0.998212     0.953465
spam 	 0.995997     0.823043     0.901298


##### Custom tokenizer on emailDoc while having punctuations removed

In [55]:
custom_tokenizer_emailDoc = generate_custom_tokens(emailDoc)
remove_punctuations_emailDoc = remove_punctuations(custom_tokenizer_emailDoc)
wordFeatures = get_word_features(custom_tokenizer_emailDoc, 1000)
bagOfWords_features = [(get_bag_words(email, wordFeatures), label) for (email, label) in remove_punctuations_emailDoc]
cross_validation_accuracy_evaluation_metrics(10, bagOfWords_features)

Each fold size: 517
Accuracy of Fold 0: 0.9342359767891683
Accuracy of Fold 1: 0.941972920696325
Accuracy of Fold 2: 0.9226305609284333
Accuracy of Fold 3: 0.9593810444874274
Accuracy of Fold 4: 0.9458413926499033
Accuracy of Fold 5: 0.941972920696325
Accuracy of Fold 6: 0.9342359767891683
Accuracy of Fold 7: 0.9303675048355899
Accuracy of Fold 8: 0.9226305609284333
Accuracy of Fold 9: 0.9342359767891683
Average accuracy 0.936750483558994
	Precision	Recall		F1
ham 	 0.912558     0.998212     0.953465
spam 	 0.995997     0.823043     0.901298


##### Custom tokenizer on emailDoc while having stopwords removed

In [56]:
custom_tokenizer_emailDoc = generate_custom_tokens(emailDoc)
remove_stopwords_emailDoc = remove_stopwords(custom_tokenizer_emailDoc, stopwords)
wordFeatures = get_word_features(custom_tokenizer_emailDoc, 1000)
bagOfWords_features = [(get_bag_words(email, wordFeatures), label) for (email, label) in remove_stopwords_emailDoc]
cross_validation_accuracy_evaluation_metrics(10, bagOfWords_features)

Each fold size: 517
Accuracy of Fold 0: 0.9361702127659575
Accuracy of Fold 1: 0.9439071566731141
Accuracy of Fold 2: 0.9226305609284333
Accuracy of Fold 3: 0.9555125725338491
Accuracy of Fold 4: 0.9439071566731141
Accuracy of Fold 5: 0.941972920696325
Accuracy of Fold 6: 0.9342359767891683
Accuracy of Fold 7: 0.9323017408123792
Accuracy of Fold 8: 0.9245647969052224
Accuracy of Fold 9: 0.9323017408123792
Average accuracy 0.9367504835589943
	Precision	Recall		F1
ham 	 0.913648     0.997027     0.953518
spam 	 0.993329     0.824474     0.901059


##### Custom tokenizer on emailDoc while having punctuations & stopwords removed

In [57]:
custom_tokenizer_emailDoc = generate_custom_tokens(emailDoc)
remove_punctuations_emailDoc = remove_punctuations(custom_tokenizer_emailDoc)
remove_stopwords_emailDoc = remove_stopwords(remove_punctuations_emailDoc, stopwords)
word_features = get_word_features(remove_stopwords_emailDoc, 1000)
bagOfWords_features = [(get_bag_words(email, wordFeatures), label) for (email, label) in remove_stopwords_emailDoc]
cross_validation_accuracy_evaluation_metrics(10, bagOfWords_features)

Each fold size: 517
Accuracy of Fold 0: 0.9361702127659575
Accuracy of Fold 1: 0.9439071566731141
Accuracy of Fold 2: 0.9226305609284333
Accuracy of Fold 3: 0.9555125725338491
Accuracy of Fold 4: 0.9439071566731141
Accuracy of Fold 5: 0.941972920696325
Accuracy of Fold 6: 0.9342359767891683
Accuracy of Fold 7: 0.9323017408123792
Accuracy of Fold 8: 0.9245647969052224
Accuracy of Fold 9: 0.9323017408123792
Average accuracy 0.9367504835589943
	Precision	Recall		F1
ham 	 0.913648     0.997027     0.953518
spam 	 0.993329     0.824474     0.901059


##### Bigrams

In [28]:
emailTokens = [token for email in emailDoc for token in email[0]]
emailBigram_finder = BigramCollocationFinder.from_words(emailTokens)
email_bigrams_freq = emailBigram_finder.score_ngrams(bigramMeasures.raw_freq)
email_bigrams_pmi = emailBigram_finder.score_ngrams(bigramMeasures.pmi)

In [29]:
email_bigrams_freqList = [" ".join(bigram[0]) for bigram in email_bigrams_freq[:1000]]
email_bigrams_pmiList = [" ".join(bigram[0]) for bigram in email_bigrams_pmi[:1000]]
print("\n---------------------------------------------------------------")
print("\n\tFrequency".format(1000))
print("\n---------------------------------------------------------------")
bagOfWords_features = [(get_bigram_bag_words(email, email_bigrams_freqList), label) for (email, label) in emailDoc]
cross_validation_accuracy_evaluation_metrics(10, bagOfWords_features)
print("\n---------------------------------------------------------------")
print("\n\tPMI".format(1000))
print("\n---------------------------------------------------------------")
bagOfWords_features = [(get_bigram_bag_words(email, email_bigrams_pmiList), label) for (email, label) in emailDoc]
cross_validation_accuracy_evaluation_metrics(10, bagOfWords_features)


---------------------------------------------------------------

	Frequency

---------------------------------------------------------------
Each fold size: 517
Accuracy of Fold 0: 0.8646034816247582
Accuracy of Fold 1: 0.8239845261121856
Accuracy of Fold 2: 0.8529980657640233
Accuracy of Fold 3: 0.8762088974854932
Accuracy of Fold 4: 0.8646034816247582
Accuracy of Fold 5: 0.8646034816247582
Accuracy of Fold 6: 0.8375241779497099
Accuracy of Fold 7: 0.8491295938104448
Accuracy of Fold 8: 0.8568665377176016
Accuracy of Fold 9: 0.839458413926499
Average accuracy 0.8529980657640233
	Precision	Recall		F1
ham 	 0.795696     0.996588     0.884883
spam 	 0.993329     0.665029     0.796683

---------------------------------------------------------------

	PMI

---------------------------------------------------------------
Each fold size: 517
Accuracy of Fold 0: 0.7388781431334622
Accuracy of Fold 1: 0.7524177949709865
Accuracy of Fold 2: 0.758220502901354
Accuracy of Fold 3: 0.73114119922630

##### Bigrams having punctuations removed

In [30]:
remove_punctuations_emailDoc = remove_punctuations(emailDoc)
emailTokens = [token for email in remove_punctuations_emailDoc for token in email[0]]
emailBigram_finder = BigramCollocationFinder.from_words(emailTokens)
email_bigrams_freq = emailBigram_finder.score_ngrams(bigramMeasures.raw_freq)
email_bigrams_pmi = emailBigram_finder.score_ngrams(bigramMeasures.pmi)

In [31]:
email_bigrams_freqList = [" ".join(bigram[0]) for bigram in email_bigrams_freq[:1000]]
email_bigrams_pmiList = [" ".join(bigram[0]) for bigram in email_bigrams_pmi[:1000]]
print("\n---------------------------------------------------------------")
print("\n\tFrequency".format(1000))
print("\n---------------------------------------------------------------")
bagOfWords_features = [(get_bigram_bag_words(email, email_bigrams_freqList), label) for (email, label) in remove_punctuations_emailDoc]
cross_validation_accuracy_evaluation_metrics(10, bagOfWords_features)
print("\n---------------------------------------------------------------")
print("\n\tPMI".format(1000))
print("\n---------------------------------------------------------------")
bagOfWords_features = [(get_bigram_bag_words(email, email_bigrams_pmiList), label) for (email, label) in remove_punctuations_emailDoc]
cross_validation_accuracy_evaluation_metrics(10, bagOfWords_features)


---------------------------------------------------------------

	Frequency

---------------------------------------------------------------
Each fold size: 517
Accuracy of Fold 0: 0.8646034816247582
Accuracy of Fold 1: 0.851063829787234
Accuracy of Fold 2: 0.8355899419729207
Accuracy of Fold 3: 0.851063829787234
Accuracy of Fold 4: 0.8897485493230174
Accuracy of Fold 5: 0.8646034816247582
Accuracy of Fold 6: 0.8491295938104448
Accuracy of Fold 7: 0.8336557059961315
Accuracy of Fold 8: 0.8471953578336557
Accuracy of Fold 9: 0.8529980657640233
Average accuracy 0.8539651837524177
	Precision	Recall		F1
ham 	 0.797603     0.995918     0.885796
spam 	 0.991995     0.666816     0.797533

---------------------------------------------------------------

	PMI

---------------------------------------------------------------
Each fold size: 517
Accuracy of Fold 0: 0.7350096711798839
Accuracy of Fold 1: 0.7427466150870407
Accuracy of Fold 2: 0.7543520309477756
Accuracy of Fold 3: 0.72727272727272

##### Bigrams having stopwords removed

In [32]:
remove_stopwords_emailDoc = remove_stopwords(emailDoc, stopwords)
emailTokens = [token for email in remove_stopwords_emailDoc for token in email[0]]
emailBigram_finder = BigramCollocationFinder.from_words(emailTokens)
email_bigrams_freq = emailBigram_finder.score_ngrams(bigramMeasures.raw_freq)
email_bigrams_pmi = emailBigram_finder.score_ngrams(bigramMeasures.pmi)

In [33]:
email_bigrams_freqList = [" ".join(bigram[0]) for bigram in email_bigrams_freq[:1000]]
email_bigrams_pmiList = [" ".join(bigram[0]) for bigram in email_bigrams_pmi[:1000]]
print("\n---------------------------------------------------------------")
print("\n\tFrequency".format(1000))
print("\n---------------------------------------------------------------")
bagOfWords_features = [(get_bigram_bag_words(email, email_bigrams_freqList), label) for (email, label) in remove_stopwords_emailDoc]
cross_validation_accuracy_evaluation_metrics(10, bagOfWords_features)
print("\n---------------------------------------------------------------")
print("\n\tPMI".format(1000))
print("\n---------------------------------------------------------------")
bagOfWords_features = [(get_bigram_bag_words(email, email_bigrams_pmiList), label) for (email, label) in remove_stopwords_emailDoc]
cross_validation_accuracy_evaluation_metrics(10, bagOfWords_features)


---------------------------------------------------------------

	Frequency

---------------------------------------------------------------
Each fold size: 517
Accuracy of Fold 0: 0.8588007736943907
Accuracy of Fold 1: 0.816247582205029
Accuracy of Fold 2: 0.8297872340425532
Accuracy of Fold 3: 0.839458413926499
Accuracy of Fold 4: 0.8452611218568665
Accuracy of Fold 5: 0.8471953578336557
Accuracy of Fold 6: 0.8181818181818182
Accuracy of Fold 7: 0.8201160541586073
Accuracy of Fold 8: 0.8375241779497099
Accuracy of Fold 9: 0.8123791102514507
Average accuracy 0.832495164410058
	Precision	Recall		F1
ham 	 0.766276     0.997164     0.866605
spam 	 0.994663     0.634738     0.774948

---------------------------------------------------------------

	PMI

---------------------------------------------------------------
Each fold size: 517
Accuracy of Fold 0: 0.7330754352030948
Accuracy of Fold 1: 0.7446808510638298
Accuracy of Fold 2: 0.7601547388781431
Accuracy of Fold 3: 0.729206963249516

##### Bigrams having punctuations & stopwords removed

In [34]:
remove_punctuations_emailDoc = remove_punctuations(emailDoc)
remove_stopwords_emailDoc = remove_stopwords(remove_punctuations_emailDoc, stopwords)
emailTokens = [token for email in remove_stopwords_emailDoc for token in email[0]]
emailBigram_finder = BigramCollocationFinder.from_words(emailTokens)
email_bigrams_freq = emailBigram_finder.score_ngrams(bigramMeasures.raw_freq)
email_bigrams_pmi = emailBigram_finder.score_ngrams(bigramMeasures.pmi)

In [35]:
email_bigrams_freqList = [" ".join(bigram[0]) for bigram in email_bigrams_freq[:1000]]
email_bigrams_pmiList = [" ".join(bigram[0]) for bigram in email_bigrams_pmi[:1000]]
print("\n---------------------------------------------------------------")
print("\n\tFrequency".format(1000))
print("\n---------------------------------------------------------------")
bagOfWords_features = [(get_bigram_bag_words(email, email_bigrams_freqList), label) for (email, label) in remove_stopwords_emailDoc]
cross_validation_accuracy_evaluation_metrics(10, bagOfWords_features)
print("\n---------------------------------------------------------------")
print("\n\tPMI".format(1000))
print("\n---------------------------------------------------------------")
bagOfWords_features = [(get_bigram_bag_words(email, email_bigrams_pmiList), label) for (email, label) in remove_stopwords_emailDoc]
cross_validation_accuracy_evaluation_metrics(10, bagOfWords_features)


---------------------------------------------------------------

	Frequency

---------------------------------------------------------------
Each fold size: 517
Accuracy of Fold 0: 0.8588007736943907
Accuracy of Fold 1: 0.8123791102514507
Accuracy of Fold 2: 0.8104448742746615
Accuracy of Fold 3: 0.8317214700193424
Accuracy of Fold 4: 0.8607350096711799
Accuracy of Fold 5: 0.8433268858800773
Accuracy of Fold 6: 0.816247582205029
Accuracy of Fold 7: 0.8143133462282398
Accuracy of Fold 8: 0.8413926499032882
Accuracy of Fold 9: 0.7988394584139265
Average accuracy 0.8288201160541586
	Precision	Recall		F1
ham 	 0.759466     0.999283     0.863024
spam 	 0.998666     0.628992     0.771848

---------------------------------------------------------------

	PMI

---------------------------------------------------------------
Each fold size: 517
Accuracy of Fold 0: 0.7330754352030948
Accuracy of Fold 1: 0.7408123791102514
Accuracy of Fold 2: 0.7524177949709865
Accuracy of Fold 3: 0.7272727272727

##### Bigrams with custom tokenizers

In [36]:
custom_tokenizer_emailDoc = generate_custom_tokens(emailDoc)
emailTokens = [token for email in custom_tokenizer_emailDoc for token in email[0]]
emailBigram_finder = BigramCollocationFinder.from_words(emailTokens)
email_bigrams_freq = emailBigram_finder.score_ngrams(bigramMeasures.raw_freq)
email_bigrams_pmi = emailBigram_finder.score_ngrams(bigramMeasures.pmi)

In [37]:
email_bigrams_freqList = [" ".join(bigram[0]) for bigram in email_bigrams_freq[:1000]]
email_bigrams_pmiList = [" ".join(bigram[0]) for bigram in email_bigrams_pmi[:1000]]
print("\n---------------------------------------------------------------")
print("\n\tFrequency".format(1000))
print("\n---------------------------------------------------------------")
bagOfWords_features = [(get_bigram_bag_words(email, email_bigrams_freqList), label) for (email, label) in custom_tokenizer_emailDoc]
cross_validation_accuracy_evaluation_metrics(10, bagOfWords_features)
print("\n---------------------------------------------------------------")
print("\n\tPMI".format(1000))
print("\n---------------------------------------------------------------")
bagOfWords_features = [(get_bigram_bag_words(email, email_bigrams_pmiList), label) for (email, label) in custom_tokenizer_emailDoc]
cross_validation_accuracy_evaluation_metrics(10, bagOfWords_features)


---------------------------------------------------------------

	Frequency

---------------------------------------------------------------
Each fold size: 517
Accuracy of Fold 0: 0.8704061895551257
Accuracy of Fold 1: 0.8665377176015474
Accuracy of Fold 2: 0.8471953578336557
Accuracy of Fold 3: 0.8626692456479691
Accuracy of Fold 4: 0.8878143133462283
Accuracy of Fold 5: 0.8704061895551257
Accuracy of Fold 6: 0.8452611218568665
Accuracy of Fold 7: 0.8433268858800773
Accuracy of Fold 8: 0.8684719535783365
Accuracy of Fold 9: 0.8607350096711799
Average accuracy 0.8622823984526112
	Precision	Recall		F1
ham 	 0.810951     0.993990     0.893189
spam 	 0.987992     0.680920     0.806206

---------------------------------------------------------------

	PMI

---------------------------------------------------------------
Each fold size: 517
Accuracy of Fold 0: 0.7369439071566731
Accuracy of Fold 1: 0.7485493230174082
Accuracy of Fold 2: 0.7620889748549323
Accuracy of Fold 3: 0.729206963249

##### Bigrams with custom tokenizers while having stopwords removed 

In [63]:
custom_tokenizer_emailDoc = generate_custom_tokens(emailDoc)
remove_stopwords_emailDoc = remove_stopwords(custom_tokenizer_emailDoc, stopwords)
emailTokens = [token for email in custom_tokenizer_emailDoc for token in email[0]]
emailBigram_finder = BigramCollocationFinder.from_words(emailTokens)
email_bigrams_freq = emailBigram_finder.score_ngrams(bigramMeasures.raw_freq)
email_bigrams_pmi = emailBigram_finder.score_ngrams(bigramMeasures.pmi)

In [64]:
email_bigrams_freqList = [" ".join(bigram[0]) for bigram in email_bigrams_freq[:1000]]
email_bigrams_pmiList = [" ".join(bigram[0]) for bigram in email_bigrams_pmi[:1000]]
print("\n---------------------------------------------------------------")
print("\n\tFrequency".format(1000))
print("\n---------------------------------------------------------------")
bagOfWords_features = [(get_bigram_bag_words(email, email_bigrams_freqList), label) for (email, label) in remove_stopwords_emailDoc]
cross_validation_accuracy_evaluation_metrics(10, bagOfWords_features)
print("\n---------------------------------------------------------------")
print("\n\tPMI".format(1000))
print("\n---------------------------------------------------------------")
bagOfWords_features = [(get_bigram_bag_words(email, email_bigrams_pmiList), label) for (email, label) in remove_stopwords_emailDoc]
cross_validation_accuracy_evaluation_metrics(10, bagOfWords_features)


---------------------------------------------------------------

	Frequency

---------------------------------------------------------------
Each fold size: 517
Accuracy of Fold 0: 0.8143133462282398
Accuracy of Fold 1: 0.7524177949709865
Accuracy of Fold 2: 0.7369439071566731
Accuracy of Fold 3: 0.7524177949709865
Accuracy of Fold 4: 0.7872340425531915
Accuracy of Fold 5: 0.793036750483559
Accuracy of Fold 6: 0.7620889748549323
Accuracy of Fold 7: 0.7562862669245648
Accuracy of Fold 8: 0.7891682785299806
Accuracy of Fold 9: 0.7524177949709865
Average accuracy 0.76963249516441
	Precision	Recall		F1
ham 	 0.676382     0.998793     0.806562
spam 	 0.997999     0.557377     0.715276

---------------------------------------------------------------

	PMI

---------------------------------------------------------------
Each fold size: 517
Accuracy of Fold 0: 0.7369439071566731
Accuracy of Fold 1: 0.7485493230174082
Accuracy of Fold 2: 0.7620889748549323
Accuracy of Fold 3: 0.729206963249516

##### Combining features

In [58]:
wordFeatures = get_word_features(emailDoc, 5000)
baseline5k_freq = [(get_bag_words(email, wordFeatures, False), label) for (email, label) in emailDoc]
cross_validation_accuracy_evaluation_metrics(10, baseline5k_freq)

Each fold size: 517
Accuracy of Fold 0: 0.9593810444874274
Accuracy of Fold 1: 0.9458413926499033
Accuracy of Fold 2: 0.9535783365570599
Accuracy of Fold 3: 0.9671179883945842
Accuracy of Fold 4: 0.9381044487427466
Accuracy of Fold 5: 0.9458413926499033
Accuracy of Fold 6: 0.9497098646034816
Accuracy of Fold 7: 0.9613152804642167
Accuracy of Fold 8: 0.9516441005802708
Accuracy of Fold 9: 0.9361702127659575
Average accuracy 0.9508704061895553
	Precision	Recall		F1
ham 	 0.955598     0.974715     0.965062
spam 	 0.939293     0.896244     0.917264


In [59]:
emailTokens = [token for email in emailDoc for token in email[0]]
emailBigram_finder = BigramCollocationFinder.from_words(emailTokens)
email_bigrams_freq = emailBigram_finder.score_ngrams(bigramMeasures.raw_freq)
email_bigrams_freqList = [" ".join(bigram[0]) for bigram in email_bigrams_freq[:10000]]
bigram10k_bool = [(get_bigram_bag_words(email, email_bigrams_freqList), label) for (email, label) in emailDoc]
cross_validation_accuracy_evaluation_metrics(10, bagOfWords_features)

Each fold size: 517
Accuracy of Fold 0: 0.9361702127659575
Accuracy of Fold 1: 0.9439071566731141
Accuracy of Fold 2: 0.9226305609284333
Accuracy of Fold 3: 0.9555125725338491
Accuracy of Fold 4: 0.9439071566731141
Accuracy of Fold 5: 0.941972920696325
Accuracy of Fold 6: 0.9342359767891683
Accuracy of Fold 7: 0.9323017408123792
Accuracy of Fold 8: 0.9245647969052224
Accuracy of Fold 9: 0.9323017408123792
Average accuracy 0.9367504835589943
	Precision	Recall		F1
ham 	 0.913648     0.997027     0.953518
spam 	 0.993329     0.824474     0.901059


In [60]:
baseline5k_bigram10k_joined = []
for i in range(len(baseline5k_freq)):
    joinedFeatures = {feature: value for feature, value in baseline5k_freq[i][0].items()}
    for feature, value in bigram10k_bool[i][0].items(): joinedFeatures[feature] = value
    baseline5k_bigram10k_joined.append((joinedFeatures, baseline5k_freq[i][1]))
cross_validation_accuracy_evaluation_metrics(10, baseline5k_bigram10k_joined)

Each fold size: 517
Accuracy of Fold 0: 0.9671179883945842
Accuracy of Fold 1: 0.9748549323017408
Accuracy of Fold 2: 0.9632495164410058
Accuracy of Fold 3: 0.97678916827853
Accuracy of Fold 4: 0.9806576402321083
Accuracy of Fold 5: 0.97678916827853
Accuracy of Fold 6: 0.9690522243713733
Accuracy of Fold 7: 0.9535783365570599
Accuracy of Fold 8: 0.9516441005802708
Accuracy of Fold 9: 0.9574468085106383
Average accuracy 0.9671179883945842
	Precision	Recall		F1
ham 	 0.953963     0.999715     0.976303
spam 	 0.999333     0.898620     0.946304


##### Additional experiments - changing the size of common words within Custom Tokenizer Model

In [43]:
for n in [500, 1000, 2500, 3000, 5500, 7000, 10000]:
    print("\n---------------------------------------------------------------")
    print("\n\tResults for varying size of vocabulary {}".format(size))
    print("\n---------------------------------------------------------------")    
    custom_tokenizer_emailDoc = generate_custom_tokens(emailDoc)
    wordFeatures = get_word_features(custom_tokenizer_emailDoc, 1000)
    bagOfWords_features = [(get_bag_words(email, wordFeatures), label) for (email, label) in custom_tokenizer_emailDoc]
    cross_validation_accuracy_evaluation_metrics(10, bagOfWords_features)    


---------------------------------------------------------------

	Results for varying size of vocabulary 10000

---------------------------------------------------------------
Each fold size: 517
Accuracy of Fold 0: 0.9342359767891683
Accuracy of Fold 1: 0.941972920696325
Accuracy of Fold 2: 0.9226305609284333
Accuracy of Fold 3: 0.9593810444874274
Accuracy of Fold 4: 0.9458413926499033
Accuracy of Fold 5: 0.941972920696325
Accuracy of Fold 6: 0.9342359767891683
Accuracy of Fold 7: 0.9303675048355899
Accuracy of Fold 8: 0.9226305609284333
Accuracy of Fold 9: 0.9342359767891683
Average accuracy 0.936750483558994
	Precision	Recall		F1
ham 	 0.912558     0.998212     0.953465
spam 	 0.995997     0.823043     0.901298

---------------------------------------------------------------

	Results for varying size of vocabulary 10000

---------------------------------------------------------------
Each fold size: 517
Accuracy of Fold 0: 0.9342359767891683
Accuracy of Fold 1: 0.941972920696325
A

##### Scikit-learn

##### Baseline model with 5k features, frequency counts

In [44]:
df_baseline5k_freq = nltk_features_to_dataframe(baseline5k_freq)
x = df_baseline5k_freq.drop("label", 1)
y = df_baseline5k_freq["label"]
clf = MultinomialNB()
predicted_y = cross_val_predict(clf, x, y, cv=10)
get_evaluation_metrics_sklearn(y, predicted_y)

Accuracy: 0.946

	Precision	Recall		F1
0	0.937		0.986		0.961
1	0.967		0.863		0.912
[[3441  231]
 [  49 1451]]


##### Baseline model with 10k features, frequency counts

In [62]:
df_bigram10k_bool = nltk_features_to_dataframe(bigram10k_bool)
x = df_bigram10k_bool.drop("label", 1)
y = df_bigram10k_bool["label"]
clf = MultinomialNB()
predicted_y = cross_val_predict(clf, x, y, cv=10)
get_evaluation_metrics_sklearn(y, predicted_y)

Accuracy: 0.95

	Precision	Recall		F1
0	0.96		0.969		0.965
1	0.925		0.905		0.915
[[3526  146]
 [ 113 1387]]


##### Analysis using additional classifiers

In [45]:
EmailCorpus = []
vocabSize = []
for email in emailDoc:
    listToStr = ' '.join([str(element) for element in email[0]])
    element = 0 if email[1] == 'ham' else 1
    EmailCorpus.append((listToStr, email[1]))
text = []
label = []
for dt in EmailCorpus:
    text.append(dt[0])
    label.append(dt[1])
totalCount = Counter()
for i in range(len(text)):
    for word in text[i].split(" "):
        totalCount[word] += 1
#sorting in descending order (i.e., word with the highest frequency appears first)
vocab = sorted(totalCount, key=totalCount.get, reverse=True)
vocabSize = len(vocab)
wordToidx = {}
#print vocab_size
for idx, word in enumerate(vocab):
    wordToidx[word] = idx
#converting all titles to vectors
wordVectors = np.zeros((len(text), len(vocab)), dtype=np.int_)
for idx,data in enumerate(text):
    wordVectors[idx] = text_to_vector(data)
wordVectors.shape
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(text)
vectors.shape
features = vectors

In [46]:
#splitting the dataset into train and test sets
x_train, x_test, y_train, y_test = train_test_split(features, label, test_size=0.15, random_state=111)

In [65]:
#initializing multiple classifier models
svm = SVC(kernel='sigmoid', gamma=1.0)
knn = KNeighborsClassifier(n_neighbors=49)
mnb = MultinomialNB(alpha=0.2)
dtc = DecisionTreeClassifier(min_samples_split=7, random_state=111)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators=31, random_state=111)
clfs = {'Support Vector' : svm,'K-Nearest Neighbors' : knn, 'Naive Bayes': mnb, 'Decision Tree': dtc, 'Logistic Regression': lrc, 'Random Forest': rfc}

In [66]:
for key,value in clfs.items():
    train(value, x_train, y_train)
    prediction = predict(value, x_test)
    predScores_wordVectors.append((key, [accuracy_score(y_test , prediction)]))
predScores_wordVectors

[('Support Vector', [0.9871134020618557]),
 ('K-Neareast Neighbors', [0.9432989690721649]),
 ('Naive Bayes', [0.9677835051546392]),
 ('Decision Tree', [0.9484536082474226]),
 ('Logistic Regression', [0.9548969072164949]),
 ('Random Forest', [0.9690721649484536]),
 ('Support Vector', [0.9871134020618557]),
 ('K-Nearest Neighbors', [0.9432989690721649]),
 ('Naive Bayes', [0.9677835051546392]),
 ('Decision Tree', [0.9484536082474226]),
 ('Logistic Regression', [0.9548969072164949]),
 ('Random Forest', [0.9690721649484536])]