## ML Classifiers to identify user from emails notebook

## email_preprocess.py
This file reads the email texts from word_data.pkl and the corresponding author labels from email_authors.pkl. For simplicity, Chris is labeled 1 and Sara is labeled 0. The preprocess function loads the data and splits it into a training set and a testing set. It then vectorizes the email text from strings into lists of numbers using TF-IDF, and finally applies feature selection (SelectPercentile), keeping the top 10 percent of features for both the training and the testing data.

In [1]:
#!/usr/bin/python

import pickle
import cPickle
import numpy

from sklearn import cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif



def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"):
    """ 
        this function takes a pre-made list of email texts (by default word_data.pkl)
        and the corresponding authors (by default email_authors.pkl) and performs
        a number of preprocessing steps:
            -- splits into training/testing sets (10% testing)
            -- vectorizes into tfidf matrix
            -- selects/keeps most helpful features
        after this, the feaures and labels are put into numpy arrays, which play nice with sklearn functions
        4 objects are returned:
            -- training/testing features
            -- training/testing labels
    """

    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    authors_file_handler = open(authors_file, "r")
    authors = pickle.load(authors_file_handler)
    authors_file_handler.close()

    words_file_handler = open(words_file, "r")
    word_data = cPickle.load(words_file_handler)
    words_file_handler.close()

    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)



    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed  = vectorizer.transform(features_test)



    ### feature selection, because text is super high dimensional and 
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed  = selector.transform(features_test_transformed).toarray()

    ### info on the data
    print "no. of Chris training emails:", sum(labels_train)
    print "no. of Sara training emails:", len(labels_train)-sum(labels_train)
    
    return features_train_transformed, features_test_transformed, labels_train, labels_test



## classifier_author_id.py

In [2]:
#!/usr/bin/python

""" 
    This is the code to accompany the Lesson 1 (Naive Bayes) mini-project. 
    Use a Naive Bayes Classifier to identify emails by their authors
    authors and labels:
    Sara has label 0
    Chris has label 1
"""
    
import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score


### preprocess() hands back four objects, in this order:
###   training features, testing features, training labels, testing labels
### (it also prints how many training emails belong to each author)
(features_train, features_test,
 labels_train, labels_test) = preprocess()


no. of Chris training emails: 7936
no. of Sara training emails: 7884


### Naive Bayes Classifier

In [3]:
clf=GaussianNB()
t0 = time()
clf.fit(features_train,labels_train)
t1=time()
print "Training time:", round(time()-t0, 3), "s"
pred=clf.predict(features_test)
print "Prediction time:", round(time()-t1, 3), "s"
print "Accuracy Score",accuracy_score(labels_test,pred)

Training time: 1.34 s
Prediction time: 0.121 s
Accuracy Score 0.973265073948


### SVM Classifier

In [4]:
from sklearn.svm import SVC

clf = SVC(kernel="linear")
t0 = time()
clf.fit(features_train,labels_train)
print "Training time:", round(time()-t0, 3), "s"
t1=time()
pred=clf.predict(features_test)
print "Prediction time:", round(time()-t1, 3), "s"
print "Accuracy Score",accuracy_score(labels_test,pred)


Training time: 273.123 s
Prediction time: 19.424 s
Accuracy Score 0.984072810011


### SVC kernel='rbf' and C=10000

In [5]:
clf = SVC(kernel="rbf",C=10000)
t0 = time()
clf.fit(features_train,labels_train)
print "Training time:", round(time()-t0, 3), "s"
t1=time()
pred=clf.predict(features_test)
print "Prediction time:", round(time()-t1, 3), "s"
print "Accuracy Score",accuracy_score(labels_test,pred)

Training time: 124.737 s
Prediction time: 12.521 s
Accuracy Score 0.990898748578


In [None]:
clf = SVC(kernel="rbf",C=10)
t0 = time()
clf.fit(features_train,labels_train)
print "Training time:", round(time()-t0, 3), "s"
t1=time()
pred=clf.predict(features_test)
print "Prediction time:", round(time()-t1, 3), "s"
print "Accuracy Score",accuracy_score(labels_test,pred)

### Decision_Tree

In [4]:
from sklearn import tree
clf=tree.DecisionTreeClassifier(min_samples_split=40)
t0 = time()
clf.fit(features_train,labels_train)
print "Training time:", round(time()-t0, 3), "s"
t1=time()
pred=clf.predict(features_test)
print "Prediction time:", round(time()-t1, 3), "s"
print "Accuracy Score",accuracy_score(labels_test,pred)

Training time: 51.661 s
Prediction time: 0.015 s
Accuracy Score 0.977246871445


In [8]:
# Length of the first training row, i.e. how many features each email has
# after the feature-selection step.
print len(features_train[0])


3785


In [10]:
# Show the feature vector of the second training email (index 1).
print (features_train[1])


[ 0.  0.  0. ...,  0.  0.  0.]
