In [1]:
#!/usr/bin/python

import sys
import pickle
import pandas as pd
import numpy
import time
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.cross_validation import train_test_split
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import matplotlib.pyplot



In [2]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
# features_list is a list of my selected features
# finance_features has finance features I'm interested in
# email_features is a list of email features
features_list = ['poi', 'bonus', 'expenses', 'bon_plus_expenses', 'bon_sal_ratio', \
                'to_msg_ratio', 'from_msg_ratio']
finance_features = ['poi', 'salary', 'bonus', 'long_term_incentive', \
                 'deferred_income', 'expenses', 'total_payments', \
                 'exercised_stock_options', 'restricted_stock', 'other'] 
email_features = ['poi', 'to_messages', 'email_address', 
                 'from_poi_to_this_person', 'from_messages', \
                 'from_this_person_to_poi', 'shared_receipt_with_poi', \
                 'to_msg_ratio', 'from_msg_ratio']

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

df = pd.DataFrame.from_records(list(data_dict.values()))
employees = pd.Series(list(data_dict.keys()))

# set the index of df to be the employees series:
df.set_index(employees, inplace=True)

In [3]:
# Find how many missing values are in my selected features
for feature in features_list:
    try:
        print "Number of missing values in " + str(feature) + ": " + str(df[feature].value_counts(dropna=False)[0])
    except:
        print "Created feature: ", str(feature)

Number of missing values in poi: 128
Number of missing values in bonus: 64
Number of missing values in expenses: 51
Created feature:  bon_plus_expenses
Created feature:  bon_sal_ratio
Created feature:  to_msg_ratio
Created feature:  from_msg_ratio


In [4]:
# Create another working dataframe to make new features 

df_new = df.apply(lambda x: pd.to_numeric(x, errors='coerce')).copy()

In [5]:
# from_msg_ratio is ratio messages received from poi to total messages received
df_new['to_msg_ratio'] = df_new.from_this_person_to_poi.divide(df_new.to_messages, axis = 'index')

# create to_msg_ratio by dividing from_this_person_to_poi from to_messages
df_new['from_msg_ratio'] = df_new.from_poi_to_this_person.divide(df_new.from_messages, axis = 'index')

# create a new feature by adding expenses and bonus together
df_new['bon_plus_expenses'] = df_new['bonus'].add(df_new['expenses'], axis = 'index')
# new feature of bonus to salary ratio

df_new['bon_sal_ratio'] = df_new['bonus'].divide(df_new['salary'], axis = 'index')
# new feature of bonus to expenses ratio

In [6]:
# Fill NaN with 0 where operations created NaN in some rows
df_new.fillna(0, inplace = True)

In [7]:
# create a dictionary from the dataframe
df_dict = df_new.to_dict('index')

In [8]:
# after you create features, the column names will be your new features
# create a list of column names:
new_features_list = df_new.columns.values
new_features_list

array(['bonus', 'deferral_payments', 'deferred_income', 'director_fees',
       'email_address', 'exercised_stock_options', 'expenses',
       'from_messages', 'from_poi_to_this_person',
       'from_this_person_to_poi', 'loan_advances', 'long_term_incentive',
       'other', 'poi', 'restricted_stock', 'restricted_stock_deferred',
       'salary', 'shared_receipt_with_poi', 'to_messages',
       'total_payments', 'total_stock_value', 'to_msg_ratio',
       'from_msg_ratio', 'bon_plus_expenses', 'bon_sal_ratio'], dtype=object)

In [9]:
### Task 2: Remove outliers

# From the mini project, we have to remove the one outlier called "TOTAL" as 
# a spreadsheet quirk

df_new.drop(['TOTAL'], inplace=True)

In [10]:
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = df_dict

In [11]:
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [12]:
### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.

In [13]:
# First one tried is RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 25, min_samples_split = 3, 
                            random_state = 2)
rfc = rfc.fit(features, labels)

print rfc.feature_importances_

[ 0.15654285  0.23536453  0.13178955  0.13761229  0.15467439  0.18401639]


In [14]:
# Also trying a decision tree classifier because tree classifiers make sense here

dc = DecisionTreeClassifier()
dc = dc.fit(features, labels)

print dc.feature_importances_

[ 0.02553311  0.46042969  0.38345358  0.01896745  0.03282828  0.07878788]


In [15]:
# using selectkbest to do feature selection

selection = SelectKBest(chi2, k = 2)

In [16]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

In [None]:
# straified cv for parameters, 100 fold and shuffled
best_cv = StratifiedShuffleSplit(n_splits = 100, random_state=42) 

In [None]:
# look up parameters to use for cross validation here 
# random_state is to bring consistency to results
# results to best_params_ was variable before adding that parameter
# If you uncomment to run these lines of code, it may take a while
# Added start and end times to see how long this all takes because
# this exhaustive method has been taking forever

start_gridcv_rfc = time.time()
rbc_param_grid = {'n_estimators': [1,2, 3, 10, 100], 
                 'min_samples_split': [2, 3, 5],
                 'random_state': [2],
                 'max_features': [1, 2, 3],
                 'max_depth' : [2, 3, 5, 10, 50],
                 'min_samples_leaf': [1, 2, 3, 10]
                 }

grid_cv_rfc = GridSearchCV(estimator = rfc, param_grid = rbc_param_grid, cv = best_cv,
                          n_jobs = 5, scoring = 'f1')
grid_cv_rfc.fit(features, labels)
end_gridcv_rfc = time.time()
print "Minutes elapsed: " + str((float(end_gridcv_rfc - start_gridcv_rfc) / 60))

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [None]:
# gridsearchcv for decisiontreeclassifier

start_gridcv_dc = time.time()

dc_param_grid = {'min_samples_split' : [2, 3, 4, 5, 10, 50],
                 'max_features' : [1, 2, 3, 4, 'auto', 'sqrt', 'log2'],
                 'min_samples_leaf': [1, 2, 3, 10, 20],
                'random_state' : [2]
                }
grid_cv_dc = GridSearchCV(estimator = dc, param_grid = dc_param_grid, cv = best_cv,
                         n_jobs = 5, scoring = 'f1')
grid_cv_dc.fit(features, labels)

end_gridcv_dc = time.time()

print "Minutes elapsed: " + str((float(end_gridcv_dc - start_gridcv_dc) / 60))

In [None]:
# kmeans cross validation

kf_rbc = KFold(n_splits = 10, shuffle=True, random_state = 2)
kf_rbc.split(features, labels)
print kf_rbc

In [None]:
#for train_indicies, test_indicies in kf_rbc:
#    features_train = [data[ii] for ii in train_indicies]

In [None]:
print classification_report(labels_train, grid_cv_dc.best_estimator_.predict(features_train))

In [None]:
print classification_report(labels_test, grid_cv_dc.best_estimator_.predict(features_test))

In [None]:
#print classification_report(labels_train, grid_cv_rfc.best_estimator_.predict(features_train))

In [None]:
#print classification_report(labels_test, grid_cv_rfc.best_estimator_.predict(features_test))

In [None]:
#grid_cv_rfc.best_params_

In [None]:
# Assign clf to classifer chosen after testing

clf = RandomForestClassifier(min_samples_split = 5, n_estimators = 3,
                            random_state = 2, max_depth = 50, min_samples_leaf = 1,
                            max_features = 1)

clf = make_pipeline(selection, clf)
clf.fit(features, labels)

In [None]:
grid_cv_dc.best_params_

In [None]:
clf = DecisionTreeClassifier(min_samples_split = 2, random_state = 2,
                            max_features = 3, min_samples_leaf = 1)

clf.fit(features, labels)

In [None]:
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)
