In [1]:
#!/usr/bin/python

import sys
import pickle
import pandas as pd
import numpy
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data



In [58]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
# features_list is a list of my selected features
# finance_features has finance features I'm interested in
# email_features is a list of email features
features_list = ['poi', 'bonus', 'expenses', 'bon_plus_expenses', \
                'to_msg_ratio', 'from_msg_ratio']
finance_features = ['poi', 'salary', 'bonus', 'long_term_incentive', \
                 'deferred_income', 'expenses', 'total_payments', \
                 'exercised_stock_options', 'restricted_stock', 'other'] 
email_features = ['poi', 'to_messages', 'email_address', 
                 'from_poi_to_this_person', 'from_messages', \
                 'from_this_person_to_poi', 'shared_receipt_with_poi', \
                 'to_msg_ratio', 'from_msg_ratio']

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

df = pd.DataFrame.from_records(list(data_dict.values()))
employees = pd.Series(list(data_dict.keys()))

# set the index of df to be the employees series:
df.set_index(employees, inplace=True)

In [59]:
# Find how many missing values are in my selected features
for feature in features_list:
    try:
        print "Number of missing values in " + str(feature) + ": " + str(df[feature].value_counts(dropna=False)[0])
    except:
        print "Created feature: ", str(feature)

Number of missing values in poi: 128
Number of missing values in bonus: 64
Number of missing values in expenses: 51
Created feature:  bon_plus_expenses
Created feature:  to_msg_ratio
Created feature:  from_msg_ratio


In [60]:
# Create another working dataframe to make new features 

df_new = df.apply(lambda x: pd.to_numeric(x, errors='coerce')).copy()

df_new

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,...,long_term_incentive,other,poi,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
METTS MARK,600000.0,,,,,,94299.0,29.0,38.0,1.0,...,,1740.0,False,585062.0,,365788.0,702.0,807.0,1061827.0,585062.0
BAXTER JOHN C,1200000.0,1295738.0,-1386055.0,,,6680544.0,11200.0,,,,...,1586055.0,2660303.0,False,3942714.0,,267102.0,,,5634343.0,10623258.0
ELLIOTT STEVEN,350000.0,,-400729.0,,,4890344.0,78552.0,,,,...,,12961.0,False,1788391.0,,170941.0,,,211725.0,6678735.0
CORDES WILLIAM R,,,,,,651850.0,,12.0,10.0,0.0,...,,,False,386335.0,,,58.0,764.0,,1038185.0
HANNON KEVIN P,1500000.0,,-3117011.0,,,5538001.0,34039.0,32.0,32.0,21.0,...,1617011.0,11350.0,True,853064.0,,243293.0,1035.0,1045.0,288682.0,6391065.0
MORDAUNT KRISTINA M,325000.0,,,,,,35018.0,,,,...,,1411.0,False,208510.0,,267093.0,,,628522.0,208510.0
MEYER ROCKFORD G,,1848227.0,,,,493489.0,,28.0,0.0,0.0,...,,,False,462384.0,,,22.0,232.0,1848227.0,955873.0
MCMAHON JEFFREY,2600000.0,,,,,1104054.0,137108.0,48.0,58.0,26.0,...,694862.0,297353.0,False,558801.0,,370448.0,2228.0,2355.0,4099771.0,1662855.0
HORTON STANLEY C,,3131860.0,,,,5210569.0,,1073.0,44.0,15.0,...,,,False,2046079.0,,,1074.0,2350.0,3131860.0,7256648.0
PIPER GREGORY F,400000.0,1130036.0,-33333.0,,,880290.0,43057.0,222.0,61.0,48.0,...,,778.0,False,409554.0,-409554.0,197091.0,742.0,1238.0,1737629.0,880290.0


In [61]:
# from_msg_ratio is ratio messages received from poi to total messages received
df_new['to_msg_ratio'] = df_new.from_this_person_to_poi.divide(df_new.to_messages, axis = 'index')
# create to_msg_ratio by dividing from_this_person_to_poi from to_messages
df_new['from_msg_ratio'] = df_new.from_poi_to_this_person.divide(df_new.from_messages, axis = 'index')
# create a new feature by adding expenses and bonus together
df_new['bon_plus_expenses'] = df_new['bonus'].add(df_new['expenses'], axis = 'index')

In [62]:
# Fill NaN with 0 where operations created NaN in some rows
df_new.fillna(0, inplace = True)

In [63]:
#df_new['bon_plus_expenses']

In [64]:
# create a dictionary from the dataframe
df_dict = df_new.to_dict('index')

In [65]:
# after you create features, the column names will be your new features
# create a list of column names:
new_features_list = df_new.columns.values
new_features_list

array(['bonus', 'deferral_payments', 'deferred_income', 'director_fees',
       'email_address', 'exercised_stock_options', 'expenses',
       'from_messages', 'from_poi_to_this_person',
       'from_this_person_to_poi', 'loan_advances', 'long_term_incentive',
       'other', 'poi', 'restricted_stock', 'restricted_stock_deferred',
       'salary', 'shared_receipt_with_poi', 'to_messages',
       'total_payments', 'total_stock_value', 'to_msg_ratio',
       'from_msg_ratio', 'bon_plus_expenses'], dtype=object)

In [66]:
### Task 2: Remove outliers

# I want to try out classifiers first before removing outliers to see 
# if it would perform any better.

In [67]:
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = df_dict

In [70]:
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [69]:
### Extract features and labels from dataset for local testing
# Use list finances_features
data = featureFormat(my_dataset, finance_features, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [53]:
### Extract features and labels from dataset for local testing
# Same as before, but uses list email_features
data = featureFormat(my_dataset, email_features, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [71]:
### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.



[ 0.21170286  0.25173006  0.15687717  0.14905007  0.23063984]


In [96]:
# Trying PCA to combine some features

from sklearn.decomposition import PCA
pca = PCA()
pca.fit(features)


#print pca.explained_variance_ratio_
print pca.components_[0]
#print pca.explained_variance_

[  6.87843393e-01   3.63301401e-02   7.24949369e-01  -2.89403891e-11
  -2.96706207e-09]


In [85]:
features_train = pca.transform(features)

array([[  3.54399236e+06,   1.27069791e+05,   1.84464629e+05,
         -1.22337315e+00,  -6.91944168e-03],
       [ -2.36484820e+06,  -5.08933860e+04,   7.60234956e+03,
         -8.51999612e-01,  -1.44362307e-02],
       [ -2.36292942e+06,  -5.98231960e+04,  -4.44168874e+04,
          6.22911577e-01,  -8.25087749e-03],
       [ -6.61097200e+05,  -5.40364252e+03,   5.48057994e+04,
         -9.56882696e-01,  -1.84096729e-02],
       [ -1.70154458e+06,  -1.41055861e+05,  -8.00317038e+04,
         -6.47166301e-01,  -6.92871800e-03],
       [ -1.34772163e+06,  -4.99403065e+04,   1.07550581e+04,
         -8.23689302e-01,   3.81420339e-02],
       [  5.06539775e+06,   1.72811483e+05,   2.29190509e+05,
         -8.72781700e-01,  -1.89774260e-02],
       [ -2.36454281e+06,  -5.23146486e+04,  -6.76997329e+02,
         -8.31295406e-01,  -1.37351843e-02],
       [ -1.96672794e+06,  -8.88668859e+04,  -2.77348597e+04,
         -7.01569453e-01,  -1.13134585e-02],
       [ -2.36497484e+06,  -5.0303983

In [None]:
# First one tried is RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf = clf.fit(features, labels)

print clf.feature_importances_

In [72]:
# Next, Adaboost
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier()
clf = clf.fit(features, labels)

print clf.feature_importances_

[ 0.12  0.44  0.14  0.14  0.16]


In [18]:
# Check accuracy scores
print clf.score(features)

TypeError: score() takes at least 3 arguments (2 given)

In [None]:
print clf.feature_importances_[4]
print email_features[4]

In [19]:
print finance_features[1:]

# bonus and exercised stock option are two more important ones
# also expenses

['salary', 'bonus', 'long_term_incentive', 'deferred_income', 'expenses', 'total_payments', 'exercised_stock_options', 'restricted_stock', 'other']


In [None]:
print email_features[1:]

# to_messages, shared_receipt_with_poi are both best
# the two msg ratios aren't that great, so work with shared receipt

In [None]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)
