# Enron Person Of Interest Prediction - Code

### Importing Libraries and Setting Notebook Environment

In [94]:
# General
import sys
import pickle
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt

# sklearn - general
from sklearn.decomposition import PCA
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.cross_validation import train_test_split
from sklearn.feature_selection import SelectFromModel, SelectKBest
from sklearn.metrics import accuracy_score,precision_recall_fscore_support

# sklearn -models 
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

#sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

pd.options.display.max_rows = 199
pd.options.display.max_columns = 99

%matplotlib inline

### Importing Data

In [95]:
data_dict = pickle.load(open('final_project_dataset.pkl'))

### Remove Bad Samples

In [96]:
bad_keys = ['TOTAL','LOCKHART EUGENE E','THE TRAVEL AGENCY IN THE PARK']
for i in bad_keys:
    if i in data_dict:
        del data_dict[i]

### Fill NaNs

In [45]:
df = pd.DataFrame.from_dict(data_dict,orient='index')
df = df.replace('NaN',-999)

In [46]:
df.shape

(143, 21)

### Add New Features

In [47]:
df['message_ratio'] = df['to_messages']/df['from_messages']
df['poi_from_ratio'] = df['from_poi_to_this_person']/df['from_messages']
df['poi_to_ratio'] = df['from_this_person_to_poi']/df['to_messages']
df['deferred_ratio'] = df['deferral_payments']/df['total_payments']

### Splitting Data For Testing

In [48]:
target = df['poi'].astype(int)
predictors = df.drop(['poi','email_address'],axis=1)
x = np.array(predictors).tolist()
y = np.array(target).tolist()

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state=42)

### Feature Selection - Studying Performance Using Decision Tree

In [49]:
clf = RandomForestClassifier(max_features='sqrt',n_estimators=50)
clf.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [50]:
pred = clf.predict(x_test)

In [51]:
clf.feature_importances_

array([ 0.03984233,  0.01931425,  0.00770656,  0.05722558,  0.08517278,
        0.08726777,  0.05949108,  0.05969221,  0.        ,  0.0552391 ,
        0.06734616,  0.00777698,  0.03358317,  0.04082677,  0.06120087,
        0.        ,  0.04427055,  0.03816835,  0.03446505,  0.08142809,
        0.05712535,  0.02332294,  0.03953406])

In [52]:
features = DataFrame()
features['Feature_Names'] = predictors.columns
features['Feature_Importance'] = clf.feature_importances_

In [53]:
features.sort(columns='Feature_Importance', ascending=False)

Unnamed: 0,Feature_Names,Feature_Importance
5,bonus,0.087268
4,exercised_stock_options,0.085173
19,message_ratio,0.081428
10,expenses,0.067346
14,from_this_person_to_poi,0.061201
7,shared_receipt_with_poi,0.059692
6,restricted_stock,0.059491
3,total_payments,0.057226
20,poi_from_ratio,0.057125
9,total_stock_value,0.055239


In [54]:
precision_recall_fscore_support(y_test,pred)

(array([ 0.9047619,  1.       ]),
 array([ 1. ,  0.2]),
 array([ 0.95      ,  0.33333333]),
 array([38,  5]))

### Feature Selection - Studying Performance Using PCA + SelectKBest

In [57]:
pca = PCA()
selection = SelectKBest()
combined_features = FeatureUnion([('pca',pca),('univ_select',selection)])

clf = LogisticRegression(class_weight='balanced')
pipeline = Pipeline([('features', combined_features),('logistic',clf)])
param_grid = dict(features__pca__n_components__k=[12,16,20],
                 features__univ_select__k=[8,12,16,20],
                 logistic__C = [.01, .1, 1, 10, 100, 10000, 100000000000L])

# Uncomment Out Below To Run GridSearch CV over hyperparameters.  Note this takes
# a long time.
gridsearch = GridSearchCV(pipeline, param_grid=param_grid, verbose=10, scoring='precision')
gridsearch.fit(x_train,y_train)

Fitting 3 folds for each of 84 candidates, totalling 252 fits
[CV] logistic__C=0.01, features__pca__n_components__k=12, features__univ_select__k=8 


AttributeError: 'NoneType' object has no attribute 'set_params'

We run tester.py using these conditions.  We also edit poi_id to represent the data and features included in this notebook, and run tester.py from the resulting pickled files.  Finally, we manually split the dataset into financial versus email data, and running a number of iterations against these subsets before finally settling on the emails subset as described in the README.  The final feature set as describe contains :  'to_messages', 'shared_recipet_with_poi', 'from_messages', 'from_this_person_to_poi', 'from_poi_to_this_person', 'message_ratio', 'poi_from_ratio', 'poi_to_ratio'.

### Resplitting Data Under New Feature List

In [58]:
predictors = predictors.drop(['salary',
                            'deferral_payments',
                            'total_payments',
                            'exercised_stock_options',
                            'bonus',
                            'restricted_stock',
                            'restricted_stock_deferred',
                            'total_stock_value',
                            'expenses',
                            'loan_advances',
                            'other',
                            'director_fees',
                            'deferred_income',
                            'deferred_ratio',
                            'long_term_incentive'],axis=1)

In [59]:
x = np.array(predictors).tolist()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state=42)

## Modeling

Now we begin modeling with our reduced feature set.  Note that the follow code uses a simple train test split to improve on performance (70/30) split, but this is not the method used on poi_id.py and tester.py.  As such accuracies and precision/recall may differ.  To get the results closest to that reported in the README.md, please uncomment the relevant code under poi_id.py for the model in question.  Note that some models may take some time to run.

### Logistic Regression

##### Final Model

In [60]:
clf = LogisticRegression(class_weight='balanced',
                        n_jobs=-1,
                        C=100000000000L,
                        penalty='l2',
                        random_state=42)

In [61]:
clf.fit(x_train,y_train)

LogisticRegression(C=100000000000L, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [62]:
pred = clf.predict(x_test)
pred

array([1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [63]:
precision_recall_fscore_support(y_test,pred)

(array([ 0.86486486,  0.        ]),
 array([ 0.84210526,  0.        ]),
 array([ 0.85333333,  0.        ]),
 array([38,  5]))

##### GridSearchCV for Logistic Regression

In [74]:
param_grid = {'C': [.1,1,100,10000,100000000000L],
             'penalty': ['l1','l2']}
clf = LogisticRegression(class_weight='balanced',n_jobs=-1)
clf = GridSearchCV(clf, param_grid=param_grid)

In [75]:
clf.fit(x_train,y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.1, 1, 100, 10000, 100000000000L]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [78]:
clf.best_params_

{'C': 1, 'penalty': 'l2'}

In [76]:
pred = clf.predict(x_test)
pred

array([1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [77]:
precision_recall_fscore_support(y_test,pred)

(array([ 0.86486486,  0.        ]),
 array([ 0.84210526,  0.        ]),
 array([ 0.85333333,  0.        ]),
 array([38,  5]))

### Random Forest

In [79]:
param_grid = {'max_depth': [3, None],
              'min_samples_split': [5, 10],
              'min_samples_leaf': [5, 10],
              'criterion' :['gini', 'entropy']}

clf = RandomForestClassifier()
clf = GridSearchCV(clf, param_grid=param_grid)

In [80]:
clf.fit(x_train,y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'min_samples_split': [5, 10], 'criterion': ['gini', 'entropy'], 'max_depth': [3, None], 'min_samples_leaf': [5, 10]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [81]:
clf.best_params_

{'criterion': 'gini',
 'max_depth': 3,
 'min_samples_leaf': 5,
 'min_samples_split': 5}

In [82]:
pred = clf.predict(x_test)
pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [83]:
precision_recall_fscore_support(y_test,pred)

  'precision', 'predicted', average, warn_for)


(array([ 0.88372093,  0.        ]),
 array([ 1.,  0.]),
 array([ 0.9382716,  0.       ]),
 array([38,  5]))

### Naive Bayes

In [87]:
clf = GaussianNB()
clf.fit(x_train,y_train)
pred = clf.predict(x_test)
pred

array([1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0])

In [88]:
precision_recall_fscore_support(y_test,pred)

(array([ 0.88888889,  0.125     ]),
 array([ 0.63157895,  0.4       ]),
 array([ 0.73846154,  0.19047619]),
 array([38,  5]))

### AdaBoost

In [92]:
clf = AdaBoostClassifier()
clf.fit(x_train,y_train)
pred = clf.predict(x_test)
pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [93]:
precision_recall_fscore_support(y_test,pred)

(array([ 0.875,  0.   ]),
 array([ 0.92105263,  0.        ]),
 array([ 0.8974359,  0.       ]),
 array([38,  5]))