In [1]:
#!/usr/bin/python

import sys
import pickle
import pandas as pd
import numpy
import time
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.cross_validation import train_test_split
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import matplotlib.pyplot



In [120]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
# features_list is a list of my selected features
# finance_features has finance features I'm interested in
# email_features is a list of email features
features_list = ['poi', 'bonus', 'expenses', 'bon_plus_expenses', 'bon_sal_ratio', \
                'to_msg_ratio', 'from_msg_ratio']
all_features = ['poi', 'salary', 'bonus', 'long_term_incentive', \
                'deferred_income', 'expenses', 'total_payments', \
                'exercised_stock_options', 'restricted_stock', 'other', 'to_messages', \
                'email_address', 'from_poi_to_this_person', 'from_messages', \
                'from_this_person_to_poi', 'shared_receipt_with_poi', 'to_msg_ratio', \
                'from_msg_ratio', 'bon_plus_expenses', 'bon_sal_ratio'] 


### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

df = pd.DataFrame.from_records(list(data_dict.values()))
employees = pd.Series(list(data_dict.keys()))

# set the index of df to be the employees series:
df.set_index(employees, inplace=True)

In [121]:
# Find how many missing values are in my selected features
for feature in all_features:
    try:
        print "Number of missing values in " + str(feature) + ": " + str(df[feature].value_counts(dropna=False)[0])
    except:
        print "Created feature: ", str(feature)

Number of missing values in poi: 128
Number of missing values in salary: 51
Number of missing values in bonus: 64
Number of missing values in long_term_incentive: 80
Number of missing values in deferred_income: 97
Number of missing values in expenses: 51
Number of missing values in total_payments: 21
Number of missing values in exercised_stock_options: 44
Number of missing values in restricted_stock: 36
Number of missing values in other: 53
Number of missing values in to_messages: 60
Number of missing values in email_address: 35
Number of missing values in from_poi_to_this_person: 12
Number of missing values in from_messages: 60
Number of missing values in from_this_person_to_poi: 20
Number of missing values in shared_receipt_with_poi: 60
Created feature:  to_msg_ratio
Created feature:  from_msg_ratio
Created feature:  bon_plus_expenses
Created feature:  bon_sal_ratio


In [4]:
# Create another working dataframe to make new features 

df_new = df.apply(lambda x: pd.to_numeric(x, errors='coerce')).copy()

In [5]:
# from_msg_ratio is ratio messages received from poi to total messages received
df_new['to_msg_ratio'] = df_new.from_this_person_to_poi.divide(df_new.to_messages, axis = 'index')

# create to_msg_ratio by dividing from_this_person_to_poi from to_messages
df_new['from_msg_ratio'] = df_new.from_poi_to_this_person.divide(df_new.from_messages, axis = 'index')

# create a new feature by adding expenses and bonus together
df_new['bon_plus_expenses'] = df_new['bonus'].add(df_new['expenses'], axis = 'index')
# new feature of bonus to salary ratio

df_new['bon_sal_ratio'] = df_new['bonus'].divide(df_new['salary'], axis = 'index')
# new feature of bonus to expenses ratio

In [6]:
# Fill NaN with 0 where operations created NaN in some rows
df_new.fillna(0, inplace = True)

In [7]:
# create a dictionary from the dataframe
df_dict = df_new.to_dict('index')

In [8]:
# after you create features, the column names will be your new features
# create a list of column names:
new_features_list = df_new.columns.values
new_features_list

array(['bonus', 'deferral_payments', 'deferred_income', 'director_fees',
       'email_address', 'exercised_stock_options', 'expenses',
       'from_messages', 'from_poi_to_this_person',
       'from_this_person_to_poi', 'loan_advances', 'long_term_incentive',
       'other', 'poi', 'restricted_stock', 'restricted_stock_deferred',
       'salary', 'shared_receipt_with_poi', 'to_messages',
       'total_payments', 'total_stock_value', 'to_msg_ratio',
       'from_msg_ratio', 'bon_plus_expenses', 'bon_sal_ratio'], dtype=object)

In [9]:
### Task 2: Remove outliers

# From the mini project, we have to remove the one outlier called "TOTAL" as 
# a spreadsheet quirk

df_new.drop(['TOTAL'], inplace=True)

In [10]:
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = df_dict

In [101]:
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, all_features, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [12]:
### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.

In [102]:
# First one tried is RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 25, min_samples_split = 3, 
                            random_state = 2)
rfc = rfc.fit(features, labels)

In [103]:
# Also trying a decision tree classifier because tree classifiers make sense here

dc = DecisionTreeClassifier()
dc = dc.fit(features, labels)


In [107]:
print dc.feature_importances_
print len(dc.feature_importances_)

[ 0.0543682   0.17958795  0.          0.          0.17095488  0.16475213
  0.05858439  0.12036185  0.08072355  0.          0.          0.08721566
  0.          0.0073359   0.          0.07611549  0.        ]
17


In [109]:
print len(all_features[1:])

17


In [104]:
# This function appends the feature and according importance value from tree
# classifier to a list to view more neatly
rfc_impt = []
dc_impt = []

def input_impt(impt_list, features_list, impts):
    for i in range(len(impts)):
        impt_list.append( (features_list[i], impts[i]) )
    
    impt_list.sort(key = lambda tup: tup[1], reverse = True)
    
    return impt_list

In [111]:
input_impt(rfc_impt, all_features[1:], rfc.feature_importances_)

[('from_msg_ratio', 0.12432918588048668),
 ('exercised_stock_options', 0.086950074327646162),
 ('expenses', 0.086910670190534431),
 ('bonus', 0.086910670190534431),
 ('other', 0.077036946137277351),
 ('to_msg_ratio', 0.076479073598330199),
 ('expenses', 0.076479073598330199),
 ('to_msg_ratio', 0.075163248835125443),
 ('restricted_stock', 0.072475551142175521),
 ('from_msg_ratio', 0.070543645125709681),
 ('total_payments', 0.070543645125709681),
 ('bonus', 0.055297306532976577),
 ('salary', 0.055297306532976577),
 ('to_messages', 0.046670639275239983),
 ('from_poi_to_this_person', 0.042629847217495909),
 ('shared_receipt_with_poi', 0.041730511427719739),
 ('bon_plus_expenses', 0.040213544323756123),
 ('long_term_incentive', 0.040213544323756123),
 ('bon_sal_ratio', 0.037894992123548028),
 ('deferred_income', 0.037894992123548028),
 ('from_messages', 0.036437919216726129),
 ('from_this_person_to_poi', 0.029236844645252093),
 ('email_address', 0.0)]

In [112]:
input_impt(dc_impt, all_features[1:], dc.feature_importances_)

[('bonus', 0.17958795431428407),
 ('expenses', 0.17095487555925451),
 ('total_payments', 0.16475213325607024),
 ('restricted_stock', 0.12036184946367158),
 ('from_poi_to_this_person', 0.087215660542432197),
 ('other', 0.080723551736432433),
 ('to_msg_ratio', 0.076115485564304475),
 ('exercised_stock_options', 0.058584388907908139),
 ('salary', 0.054368203974503197),
 ('from_this_person_to_poi', 0.0073358966811391214),
 ('long_term_incentive', 0.0),
 ('deferred_income', 0.0),
 ('to_messages', 0.0),
 ('email_address', 0.0),
 ('from_messages', 0.0),
 ('shared_receipt_with_poi', 0.0),
 ('from_msg_ratio', 0.0)]

In [16]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

In [17]:
# straified cv for parameters, 100 fold and shuffled
best_cv = StratifiedShuffleSplit(n_splits = 100, random_state=42) 

In [18]:
# look up parameters to use for cross validation here 
# random_state is to bring consistency to results
# results to best_params_ was variable before adding that parameter
# If you uncomment to run these lines of code, it may take a while
# Added start and end times to see how long this all takes because
# this exhaustive method has been taking forever

start_gridcv_rfc = time.time()
rbc_param_grid = {'n_estimators': [1,2, 3, 10, 100], 
                 'min_samples_split': [2, 3, 5],
                 'random_state': [2],
                 'max_features': [1, 2, 3],
                 'max_depth' : [2, 3, 5, 10, 50],
                 'min_samples_leaf': [1, 2, 3, 10]
                 }

grid_cv_rfc = GridSearchCV(estimator = rfc, param_grid = rbc_param_grid, cv = best_cv,
                          n_jobs = 5, scoring = 'f1')
grid_cv_rfc.fit(features, labels)
end_gridcv_rfc = time.time()
print "Minutes elapsed: " + str((float(end_gridcv_rfc - start_gridcv_rfc) / 60))

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Minutes elapsed: 97.7768248836


In [19]:
# gridsearchcv for decisiontreeclassifier
# The idea is the same as for randomforestclassifier

start_gridcv_dc = time.time()

dc_param_grid = {'min_samples_split' : [2, 3, 4, 5, 10, 50],
                 'max_features' : [1, 2, 3, 4, 'auto', 'sqrt', 'log2'],
                 'min_samples_leaf': [1, 2, 3, 10, 20],
                'random_state' : [2]
                }
grid_cv_dc = GridSearchCV(estimator = dc, param_grid = dc_param_grid, cv = best_cv,
                         n_jobs = 5, scoring = 'f1')
grid_cv_dc.fit(features, labels)

end_gridcv_dc = time.time()

print "Minutes elapsed: " + str((float(end_gridcv_dc - start_gridcv_dc) / 60))

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Minutes elapsed: 1.51779116789


In [20]:
# kmeans cross validation

kf_rbc = KFold(n_splits = 10, shuffle=True, random_state = 2)
kf_rbc.split(features, labels)
print kf_rbc

KFold(n_splits=10, random_state=2, shuffle=True)


In [21]:
#for train_indicies, test_indicies in kf_rbc:
#    features_train = [data[ii] for ii in train_indicies]

In [22]:
print classification_report(labels_train, grid_cv_dc.best_estimator_.predict(features_train))

             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00        70
        1.0       1.00      1.00      1.00        11

avg / total       1.00      1.00      1.00        81



In [23]:
print classification_report(labels_test, grid_cv_dc.best_estimator_.predict(features_test))

             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00        29
        1.0       1.00      1.00      1.00         7

avg / total       1.00      1.00      1.00        36



In [31]:
print classification_report(labels_train, grid_cv_rfc.best_estimator_.predict(features_train))

             precision    recall  f1-score   support

        0.0       0.95      1.00      0.97        70
        1.0       1.00      0.64      0.78        11

avg / total       0.95      0.95      0.95        81



In [32]:
print classification_report(labels_test, grid_cv_rfc.best_estimator_.predict(features_test))

             precision    recall  f1-score   support

        0.0       0.88      0.97      0.92        29
        1.0       0.75      0.43      0.55         7

avg / total       0.85      0.86      0.85        36



In [33]:
grid_cv_rfc.best_params_

{'max_depth': 50,
 'max_features': 1,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 3,
 'random_state': 2}

In [115]:
# Assign clf to classifer chosen after testing

clf = RandomForestClassifier(min_samples_split = 5, n_estimators = 3,
                            random_state = 2, max_depth = 50, min_samples_leaf = 1,
                            max_features = 1)

#clf = make_pipeline(selection, clf)
clf.fit(features, labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=50, max_features=1, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=3, n_jobs=1,
            oob_score=False, random_state=2, verbose=0, warm_start=False)

In [28]:
grid_cv_dc.best_params_

{'max_features': 3,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'random_state': 2}

In [117]:
clf = DecisionTreeClassifier(min_samples_split = 2, random_state = 2,
                            max_features = 3, min_samples_leaf = 1)

clf.fit(features, labels)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=3, max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=2, splitter='best')

In [118]:
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)
