# Enron Person Of Interest Prediction - Code

### Importing Libraries and Setting Notebook Environment

In [1]:
# General
import sys
import pickle
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt

# sklearn - general
from sklearn.decomposition import PCA
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.cross_validation import train_test_split
from sklearn.feature_selection import SelectFromModel, SelectKBest
from sklearn.metrics import accuracy_score,precision_recall_fscore_support

# sklearn -models 
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

#sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

pd.options.display.max_rows = 199
pd.options.display.max_columns = 99

%matplotlib inline

### Importing Data

In [2]:
data_dict = pickle.load(open('final_project_dataset.pkl'))

### Remove Bad Samples

In [3]:
bad_keys = ['TOTAL','LOCKHART EUGENE E','THE TRAVEL AGENCY IN THE PARK']
for i in bad_keys:
    if i in data_dict:
        del data_dict[i]

### Fill NaNs

In [4]:
df = pd.DataFrame.from_dict(data_dict,orient='index')
df = df.replace('NaN',0)
df.head(2)

Unnamed: 0,salary,to_messages,deferral_payments,total_payments,exercised_stock_options,bonus,restricted_stock,shared_receipt_with_poi,restricted_stock_deferred,total_stock_value,expenses,loan_advances,from_messages,other,from_this_person_to_poi,poi,director_fees,deferred_income,long_term_incentive,email_address,from_poi_to_this_person
ALLEN PHILLIP K,201955,2902,2869717,4484442,1729541,4175000,126027,1407,-126027,1729541,13868,0,2195,152,65,False,0,-3081055,304805,phillip.allen@enron.com,47
BADUM JAMES P,0,0,178980,182466,257817,0,0,0,0,257817,3486,0,0,0,0,False,0,0,0,0,0


In [5]:
count_missing = DataFrame(len(df) - df.count())
perc_missing = DataFrame((len(df) - df.count())/len(df))
frames = [count_missing, perc_missing]
missing = pd.concat(frames, axis=1)
missing.columns = ['Count of Nulls','Percentage Null']

In [6]:
missing

Unnamed: 0,Count of Nulls,Percentage Null
salary,0,0
to_messages,0,0
deferral_payments,0,0
total_payments,0,0
exercised_stock_options,0,0
bonus,0,0
restricted_stock,0,0
shared_receipt_with_poi,0,0
restricted_stock_deferred,0,0
total_stock_value,0,0


### Add New Features

In [7]:
df['message_ratio'] = df['to_messages']/df['from_messages']
df['poi_from_ratio'] = df['from_poi_to_this_person']/df['from_messages']
df['poi_to_ratio'] = df['from_this_person_to_poi']/df['to_messages']
df['deferred_ratio'] = df['deferral_payments']/df['total_payments']
df = df.replace('inf',0)

### Splitting Data For Testing

In [8]:
target = df['poi'].astype(int)
predictors = df.drop(['poi','email_address'],axis=1)
x = np.array(predictors).tolist()
y = np.array(target).tolist()

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state=42)

### Feature Selection - Studying Performance Using Decision Tree

In [9]:
clf = RandomForestClassifier(max_features='sqrt',n_estimators=50)
clf.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [10]:
pred = clf.predict(x_test)

In [11]:
clf.feature_importances_

array([ 0.05567616,  0.03715432,  0.00693899,  0.03456433,  0.07149913,
        0.13454434,  0.06756891,  0.04220263,  0.        ,  0.05371447,
        0.05823669,  0.00896989,  0.02759337,  0.04388803,  0.0536756 ,
        0.00047619,  0.05912373,  0.05842684,  0.03119722,  0.04405664,
        0.06359699,  0.03204981,  0.01484571])

In [12]:
features = DataFrame()
features['Feature_Names'] = predictors.columns
features['Feature_Importance'] = clf.feature_importances_

In [13]:
features.sort(columns='Feature_Importance', ascending=False)

Unnamed: 0,Feature_Names,Feature_Importance
5,bonus,0.134544
4,exercised_stock_options,0.071499
6,restricted_stock,0.067569
20,poi_from_ratio,0.063597
16,deferred_income,0.059124
17,long_term_incentive,0.058427
10,expenses,0.058237
0,salary,0.055676
9,total_stock_value,0.053714
14,from_this_person_to_poi,0.053676


In [14]:
precision_recall_fscore_support(y_test,pred)

(array([ 0.9047619,  1.       ]),
 array([ 1. ,  0.2]),
 array([ 0.95      ,  0.33333333]),
 array([38,  5]))

### Feature Selection - Studying Performance Using PCA + SelectKBest

In [15]:
pca = PCA()
selection = SelectKBest()
combined_features = FeatureUnion([('pca',pca),('univ_select',selection)])

clf = LogisticRegression(class_weight='balanced')
pipeline = Pipeline([('features', combined_features),('logistic',clf)])
param_grid = dict(features__pca__n_components__k=[12,16,20],
                 features__univ_select__k=[8,12,16,20],
                 logistic__C = [.01, .1, 1, 10, 100, 10000, 100000000000L])

# Uncomment Out Below To Run GridSearch CV over hyperparameters.  Note this takes
# a long time.
# gridsearch = GridSearchCV(pipeline, param_grid=param_grid, verbose=10, scoring='precision')
# gridsearch.fit(x_train,y_train)

We run tester.py using these conditions.  We also edit poi_id to represent the data and features included in this notebook, and run tester.py from the resulting pickled files.  Finally, we manually split the dataset into financial versus email data, and running a number of iterations against these subsets before finally settling on the emails subset as described in the README.  The final feature set as describe contains :  'to_messages', 'shared_recipet_with_poi', 'from_messages', 'from_this_person_to_poi', 'from_poi_to_this_person', 'message_ratio', 'poi_from_ratio', 'poi_to_ratio'.

### Resplitting Data Under New Feature List

In [16]:
predictors = predictors.drop(['salary',
                            'deferral_payments',
                            'total_payments',
                            'exercised_stock_options',
                            'bonus',
                            'restricted_stock',
                            'restricted_stock_deferred',
                            'total_stock_value',
                            'expenses',
                            'loan_advances',
                            'other',
                            'director_fees',
                            'deferred_income',
                            'deferred_ratio',
                            'long_term_incentive'],axis=1)

In [17]:
x = np.array(predictors).tolist()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state=42)

## Modeling

Now we begin modeling with our reduced feature set.  

**Note that the follow code uses a simple train test split to improve on performance (70/30), but this is not the method used on poi_id.py and tester.py.  This is largely sample code for the purposes of the notebook and to demonstrate the processes used to find the optimal model.  As such accuracies and precision/recall may differ.  To get the results closest to that reported in the README.md, please uncomment the relevant code under poi_id.py for the model in question.  Note that some models may take some time to run.

### Logistic Regression

##### Final Model

In [18]:
clf = LogisticRegression(class_weight='balanced',
                        n_jobs=-1,
                        C=100000000000L,
                        penalty='l2',
                        random_state=42)

In [19]:
clf.fit(x_train,y_train)

LogisticRegression(C=100000000000L, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [20]:
pred = clf.predict(x_test)
pred

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [21]:
precision_recall_fscore_support(y_test,pred)

(array([ 0.89189189,  0.16666667]),
 array([ 0.86842105,  0.2       ]),
 array([ 0.88      ,  0.18181818]),
 array([38,  5]))

##### GridSearchCV for Logistic Regression

In [22]:
param_grid = {'C': [.1,1,100,10000,100000000000L],
             'penalty': ['l1','l2']}
clf = LogisticRegression(class_weight='balanced',n_jobs=-1)
clf = GridSearchCV(clf, param_grid=param_grid)

In [23]:
clf.fit(x_train,y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.1, 1, 100, 10000, 100000000000L]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [24]:
clf.best_params_

{'C': 100, 'penalty': 'l2'}

In [25]:
pred = clf.predict(x_test)
pred

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [26]:
precision_recall_fscore_support(y_test,pred)

(array([ 0.89189189,  0.16666667]),
 array([ 0.86842105,  0.2       ]),
 array([ 0.88      ,  0.18181818]),
 array([38,  5]))

### Random Forest

In [27]:
param_grid = {'max_depth': [3, None],
              'min_samples_split': [5, 10],
              'min_samples_leaf': [5, 10],
              'criterion' :['gini', 'entropy']}

clf = RandomForestClassifier(max_features='sqrt')
clf = GridSearchCV(clf, param_grid=param_grid)

In [28]:
clf.fit(x_train,y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'min_samples_split': [5, 10], 'criterion': ['gini', 'entropy'], 'max_depth': [3, None], 'min_samples_leaf': [5, 10]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [29]:
clf.best_params_

{'criterion': 'gini',
 'max_depth': 3,
 'min_samples_leaf': 5,
 'min_samples_split': 5}

In [30]:
pred = clf.predict(x_test)
pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [30]:
precision_recall_fscore_support(y_test,pred)

  'precision', 'predicted', average, warn_for)


(array([ 0.88372093,  0.        ]),
 array([ 1.,  0.]),
 array([ 0.9382716,  0.       ]),
 array([38,  5]))

### Naive Bayes

In [31]:
clf = GaussianNB()
clf.fit(x_train,y_train)
pred = clf.predict(x_test)
pred

array([1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0])

In [32]:
precision_recall_fscore_support(y_test,pred)

(array([ 0.89655172,  0.14285714]),
 array([ 0.68421053,  0.4       ]),
 array([ 0.7761194 ,  0.21052632]),
 array([38,  5]))

### AdaBoost

In [33]:
clf = AdaBoostClassifier()
clf.fit(x_train,y_train)
pred = clf.predict(x_test)
pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [34]:
precision_recall_fscore_support(y_test,pred)

(array([ 0.875,  0.   ]),
 array([ 0.92105263,  0.        ]),
 array([ 0.8974359,  0.       ]),
 array([38,  5]))