In [3]:
import os
import numpy as np
from collections import Counter
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
import pandas as pd
from sklearn.metrics import r2_score
from sklearn import metrics


# Load the tab-separated message corpus.
# NOTE(review): no `header=None` is passed, so pandas treats the first line of
# the file as column names -- confirm the file really has a header row.
df = pd.read_csv("./Files/messages.txt", sep='\t')

# Select columns by position explicitly. Indexing with integer labels
# (df[[1]]) only worked through an implicit positional fallback that current
# pandas no longer performs when the column labels are strings.
X = df.iloc[:, [1]]
y = df.iloc[:, [0]].replace({'ham': 0, 'spam': 1})  # Spam will be 1 and ham 0

# Randomly shuffle the rows to limit possible bias (for instance a sorted
# dataset).
n_sample = len(X)
np.random.seed(0)  # Fixed seed so the results are the same on each computer
order = np.random.permutation(n_sample)
# Convert to NumPy arrays so both can be reordered with the same permutation.
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
X = X.values
y = y.values
X = X[order]
y = y[order]

# 80/20 partition: the first 80% of the shuffled rows are used for training,
# the remaining 20% for testing.
n_train = int(.8 * n_sample)

# Each slice is converted back to pandas and stacked into a Series, which is
# the form the rest of the program iterates over.
X_train = pd.DataFrame(X[:n_train]).stack()
y_train = pd.DataFrame(y[:n_train]).stack()
X_test = pd.DataFrame(X[n_train:]).stack()
y_test = pd.DataFrame(y[n_train:]).stack()


def make_Dictionary(X, max_words=3000):
    """Build the vocabulary of the most frequent words found in *X*.

    Every line is split on whitespace. A token is kept only if it is purely
    alphabetic and longer than one character; everything else is ignored.

    Args:
        X: Iterable of strings (one message per item).
        max_words: Maximum number of vocabulary entries to return.
            Defaults to 3000.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent, containing at most ``max_words`` entries.
    """
    # Filter while counting instead of materialising every token in a list
    # and deleting the unwanted keys afterwards.
    counts = Counter(
        word
        for line in X
        for word in line.split()
        if word.isalpha() and len(word) > 1
    )
    return counts.most_common(max_words)

def extract_features(X, vocabulary=None, n_features=3000):
    """Turn each message in *X* into a row of per-word occurrence counts.

    Column ``i`` of the result corresponds to the ``i``-th entry of the
    vocabulary; the cell holds how many times that word occurs in the
    message. Words that are not in the vocabulary are ignored.

    Args:
        X: Sized iterable of strings (one message per item).
        vocabulary: Sequence of ``(word, count)`` pairs as returned by
            ``make_Dictionary``; only the word is used. When ``None`` the
            module-level ``dictionary`` is used. Words are assumed to be
            unique within the vocabulary.
        n_features: Number of columns of the returned matrix. Defaults to
            3000 and must be at least ``len(vocabulary)``.

    Returns:
        A ``numpy`` array of shape ``(len(X), n_features)`` filled with
        word counts (zeros where a word is absent).
    """
    if vocabulary is None:
        # Keep the original behaviour for existing callers, which rely on
        # the global vocabulary built before this function is called.
        vocabulary = dictionary

    # Build the word -> column lookup once, instead of scanning the whole
    # vocabulary for every single token of every message.
    column_of = {entry[0]: column for column, entry in enumerate(vocabulary)}

    features_matrix = np.zeros((len(X), n_features))
    for doc_id, line in enumerate(X):
        # Count each distinct word of the message in a single pass.
        for word, occurrences in Counter(line.split()).items():
            column = column_of.get(word)
            if column is not None:
                features_matrix[doc_id, column] = occurrences
    return features_matrix

# Vocabulary (word, frequency) built from the training messages only
dictionary = make_Dictionary(X_train)

# Word-count feature matrix for the training messages
train_matrix = extract_features(X_train)

# The three classifiers being compared: a linear SVM and two Naive Bayes
# variants, all fitted on the same training features and labels
model1 = LinearSVC()
model2 = MultinomialNB()
model3 = GaussianNB()
for classifier in (model1, model2, model3):
    classifier.fit(train_matrix, y_train)

# Predict spam/ham for the held-out messages with every model
test_matrix = extract_features(X_test)
result1, result2, result3 = (
    classifier.predict(test_matrix)
    for classifier in (model1, model2, model3)
)

# Quick check of the results: one confusion matrix per model
for predicted in (result1, result2, result3):
    print(confusion_matrix(y_test, predicted))

def _print_evaluation(title, predicted):
    """Print the evaluation summary of one model against ``y_test``.

    Prints the title, the r2 score, the confusion matrix and the
    classification report, in that order.
    """
    print(title)
    print("r2 score : =", r2_score(y_test, predicted))
    # Confusion matrix
    print("Confusion Matrix :", metrics.confusion_matrix(y_test, predicted))
    # Classification report (precision / recall / f1-score / support)
    print('Verification : \n', metrics.classification_report(y_test, predicted))


# Evaluation of model 1
_print_evaluation("\n Model 1 : Linear SVC : ", result1)

# Evaluation of model 2
_print_evaluation("\n Model 2 : MultinomialNB : ", result2)

# Evaluation of model 3
_print_evaluation("\n Model 3 : GaussianNB : ", result3)


[[887   2]
 [ 16  95]]
[[878  11]
 [ 15  96]]
[[728 161]
 [ 12  99]]

 Model 1 : Linear SVC : 
r2 score : = 0.817590368771
Confusion Matrix : [[887   2]
 [ 16  95]]
Verification : 
              precision    recall  f1-score   support

          0       0.98      1.00      0.99       889
          1       0.98      0.86      0.91       111

avg / total       0.98      0.98      0.98      1000


 Model 2 : MultinomialNB : 
r2 score : = 0.736519421559
Confusion Matrix : [[878  11]
 [ 15  96]]
Verification : 
              precision    recall  f1-score   support

          0       0.98      0.99      0.99       889
          1       0.90      0.86      0.88       111

avg / total       0.97      0.97      0.97      1000


 Model 3 : GaussianNB : 
r2 score : = -0.753159233474
Confusion Matrix : [[728 161]
 [ 12  99]]
Verification : 
              precision    recall  f1-score   support

          0       0.98      0.82      0.89       889
          1       0.38      0.89      0.53       11