In [71]:
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix


In [65]:
# Pickled inputs, resolved relative to the notebook's working directory.
words_file = r"word_data.pkl"         # features: the text documents
authors_file = r"email_authors.pkl"   # labels: one author per document

In [69]:
def data_preprocessing(words_file,authors_file):
    """Load the pickled corpus, split it, and build selected TF-IDF features.

    Parameters
    ----------
    words_file : str
        Path to a pickle file holding the documents (the features).
    authors_file : str
        Path to a pickle file holding the labels for those documents.

    Returns
    -------
    tuple
        ``(features_train_transformed, features_test_transformed,
        labels_train, labels_test)``. The two feature entries are the dense
        results of ``.toarray()`` after TF-IDF vectorization and feature
        selection; the two label entries come straight from the split.
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserializing -- only point this at files from a trusted source.
    # Context managers guarantee the handles are closed even if loading fails.
    with open(authors_file, "rb") as authors_handle:
        authors = pickle.load(authors_handle)

    with open(words_file, "rb") as words_handle:
        words_data = pickle.load(words_handle)

    # Hold out 10% of the samples for testing; fixed seed for reproducibility.
    features_train, features_test, labels_train, labels_test = train_test_split(
        words_data, authors, test_size=0.1, random_state=42
    )

    # Text vectorization: go from strings to numeric TF-IDF features.
    # The vocabulary is fitted on the training split only, then reused as-is
    # for the test split.
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    features_train_tfidf = vectorizer.fit_transform(features_train)
    features_test_tfidf = vectorizer.transform(features_test)

    # Feature selection: text is very high dimensional, so keep only the top
    # 10% of features as scored by f_classif on the training split.
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_tfidf, labels_train)

    features_train_transformed = selector.transform(features_train_tfidf).toarray()
    features_test_transformed = selector.transform(features_test_tfidf).toarray()

    return features_train_transformed,features_test_transformed,labels_train,labels_test


""" Training models with all 3 types of naive Bayes algorithm"""

def GaussianNB_algo(features_train_transformed, labels_train, labels_test,
                    features_test_transformed=None):
    """Fit a Gaussian naive Bayes classifier and return its test accuracy.

    Parameters
    ----------
    features_train_transformed : training feature matrix passed to ``fit``.
    labels_train : training labels passed to ``fit``.
    labels_test : true labels the predictions are scored against.
    features_test_transformed : optional test feature matrix passed to
        ``predict``. When omitted, the module-level variable of the same name
        is used, which is what this function implicitly relied on before the
        parameter existed.

    Returns
    -------
    The value returned by ``accuracy_score`` for the test predictions.

    Raises
    ------
    NameError
        If no test features are passed and no module-level
        ``features_test_transformed`` exists.
    """
    if features_test_transformed is None:
        # Backward-compatible fallback for existing three-argument callers.
        try:
            features_test_transformed = globals()["features_test_transformed"]
        except KeyError:
            raise NameError(
                "features_test_transformed was not passed and is not defined "
                "at module level"
            ) from None

    clf = GaussianNB()
    clf.fit(features_train_transformed, labels_train)
    predicted_result = clf.predict(features_test_transformed)

    # confusion_matrix(labels_test, predicted_result) gives a per-class
    # breakdown of the same predictions if more detail is needed.
    # accuracy_score expects (y_true, y_pred).
    GaussianNB_accuracy = accuracy_score(labels_test, predicted_result)
    return GaussianNB_accuracy

def MultinomialNB_algo(features_train_transformed, labels_train, labels_test,
                       features_test_transformed=None):
    """Fit a multinomial naive Bayes classifier and return its test accuracy.

    Parameters
    ----------
    features_train_transformed : training feature matrix passed to ``fit``.
    labels_train : training labels passed to ``fit``.
    labels_test : true labels the predictions are scored against.
    features_test_transformed : optional test feature matrix passed to
        ``predict``. When omitted, the module-level variable of the same name
        is used, which is what this function implicitly relied on before the
        parameter existed.

    Returns
    -------
    The value returned by ``accuracy_score`` for the test predictions.

    Raises
    ------
    NameError
        If no test features are passed and no module-level
        ``features_test_transformed`` exists.
    """
    if features_test_transformed is None:
        # Backward-compatible fallback for existing three-argument callers.
        try:
            features_test_transformed = globals()["features_test_transformed"]
        except KeyError:
            raise NameError(
                "features_test_transformed was not passed and is not defined "
                "at module level"
            ) from None

    clf = MultinomialNB()
    clf.fit(features_train_transformed, labels_train)
    predicted_result = clf.predict(features_test_transformed)

    # accuracy_score expects (y_true, y_pred).
    MultinomialNB_accuracy = accuracy_score(labels_test, predicted_result)
    return MultinomialNB_accuracy

def BernoulliNB_algo(features_train_transformed, labels_train, labels_test,
                     features_test_transformed=None):
    """Fit a Bernoulli naive Bayes classifier and return its test accuracy.

    Parameters
    ----------
    features_train_transformed : training feature matrix passed to ``fit``.
    labels_train : training labels passed to ``fit``.
    labels_test : true labels the predictions are scored against.
    features_test_transformed : optional test feature matrix passed to
        ``predict``. When omitted, the module-level variable of the same name
        is used, which is what this function implicitly relied on before the
        parameter existed.

    Returns
    -------
    The value returned by ``accuracy_score`` for the test predictions.

    Raises
    ------
    NameError
        If no test features are passed and no module-level
        ``features_test_transformed`` exists.
    """
    if features_test_transformed is None:
        # Backward-compatible fallback for existing three-argument callers.
        try:
            features_test_transformed = globals()["features_test_transformed"]
        except KeyError:
            raise NameError(
                "features_test_transformed was not passed and is not defined "
                "at module level"
            ) from None

    clf = BernoulliNB()
    clf.fit(features_train_transformed, labels_train)
    predicted_result = clf.predict(features_test_transformed)

    # accuracy_score expects (y_true, y_pred).
    BernoulliNB_accuracy = accuracy_score(labels_test, predicted_result)
    return BernoulliNB_accuracy
        

In [67]:
# Run the full preprocessing pipeline once; the four results feed every model below.
(
    features_train_transformed,
    features_test_transformed,
    labels_train,
    labels_test,
) = data_preprocessing(words_file, authors_file)

In [68]:
# Train and score each naive Bayes variant on the same train/test split.
GaussianNB_accuracy = GaussianNB_algo(
    features_train_transformed, labels_train, labels_test
)
MultinomialNB_accuracy = MultinomialNB_algo(
    features_train_transformed, labels_train, labels_test
)
BernoulliNB_accuracy = BernoulliNB_algo(
    features_train_transformed, labels_train, labels_test
)

# One accuracy per line, in the order Gaussian, Multinomial, Bernoulli.
print(GaussianNB_accuracy, MultinomialNB_accuracy, BernoulliNB_accuracy, sep="\n")

0.9732650739476678
0.9829351535836177
0.9078498293515358
