In [38]:
#!/usr/bin/python

import sys
import pickle
import pandas as pd
import numpy
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

In [39]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
# features_list is a list of my selected features
# finance_features has finance features I'm interested in
# email_features is a list of email features
features_list = ['poi', 'to_msg_ratio', 'from_msg_ratio', \
                'from_poi_to_this_person', 'from_this_person_to_poi']
finance_features = ['poi', 'salary', 'bonus', 'long_term_incentive', \
                 'deferred_income', 'expenses', 'total_payments', \
                 'exercised_stock_options', 'restricted_stock', 'other'] 
email_features = ['poi', 'to_messages', 'email_address', 
                 'from_poi_to_this_person', 'from_messages', \
                 'from_this_person_to_poi', 'shared_receipt_with_poi', \
                 'to_msg_ratio', 'from_msg_ratio']

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

df = pd.DataFrame.from_records(list(data_dict.values()))
employees = pd.Series(list(data_dict.keys()))

# set the index of df to be the employees series:
df.set_index(employees, inplace=True)

In [40]:
# Find how many missing values are in my selected features
for feature in features_list:
    try:
        print "Number of missing values in " + str(feature) + ": " + str(df[feature].value_counts(dropna=False)[0])
    except:
        print "Created feature: ", str(feature)

Number of missing values in poi: 128
Created feature:  to_msg_ratio
Created feature:  from_msg_ratio
Number of missing values in from_poi_to_this_person: 12
Number of missing values in from_this_person_to_poi: 20


In [41]:
# Create another working dataframe to make new features 

df_new = df.apply(lambda x: pd.to_numeric(x, errors='coerce')).copy()

df_new.fillna(0, inplace = True)

#df_new

In [42]:
# from_msg_ratio is ratio messages received from poi to total messages received
df_new['to_msg_ratio'] = df_new.from_this_person_to_poi.divide(df_new.to_messages, axis = 'index')
# create to_msg_ratio by dividing from_this_person_to_poi from to_messages
df_new['from_msg_ratio'] = df_new.from_poi_to_this_person.divide(df_new.from_messages, axis = 'index')
# create a new feature by adding expenses and bonus together
df_new['bon_plus_expenses'] = df_new['bonus'].add(df_new['expenses'], axis = 'index')

In [43]:
len(df_new['bonus'])

146

In [44]:
len(df_new['expenses'])

146

In [45]:
#df_new['bon_plus_expenses']

METTS MARK               694299.0
BAXTER JOHN C           1211200.0
ELLIOTT STEVEN           428552.0
CORDES WILLIAM R              0.0
HANNON KEVIN P          1534039.0
MORDAUNT KRISTINA M      360018.0
MEYER ROCKFORD G              0.0
MCMAHON JEFFREY         2737108.0
HORTON STANLEY C              0.0
PIPER GREGORY F          443057.0
HUMPHREY GENE E            4994.0
UMANOFF ADAM S           841872.0
BLACHMAN JEREMY M        934208.0
SUNDE MARTIN             700000.0
GIBBS DANA R                  0.0
LOWRY CHARLES P               0.0
COLWELL WESLEY          1216514.0
MULLER MARK S           1100000.0
JACKSON CHARLENE R       260181.0
WESTFAHL RICHARD K        51870.0
WALTERS GARETH W          33785.0
WALLS JR ROBERT H        900936.0
KITCHEN LOUISE          3105774.0
CHAN RONNIE                   0.0
BELFER ROBERT                 0.0
SHANKMAN JEFFREY A      2178979.0
WODRASKA JOHN                 0.0
BERGSIEKER RICHARD P     309175.0
URQUHART JOHN A          228656.0
BIBI PHILIPPE 

In [46]:
df_new.fillna(0, inplace = True)

In [47]:
data_dict["GRAMM WENDY L"]

{'bonus': 'NaN',
 'deferral_payments': 'NaN',
 'deferred_income': 'NaN',
 'director_fees': 119292,
 'email_address': 'NaN',
 'exercised_stock_options': 'NaN',
 'expenses': 'NaN',
 'from_messages': 'NaN',
 'from_poi_to_this_person': 'NaN',
 'from_this_person_to_poi': 'NaN',
 'loan_advances': 'NaN',
 'long_term_incentive': 'NaN',
 'other': 'NaN',
 'poi': False,
 'restricted_stock': 'NaN',
 'restricted_stock_deferred': 'NaN',
 'salary': 'NaN',
 'shared_receipt_with_poi': 'NaN',
 'to_messages': 'NaN',
 'total_payments': 119292,
 'total_stock_value': 'NaN'}

In [48]:
# create a dictionary from the dataframe
df_dict = df_new.to_dict('index')

In [49]:
# after you create features, the column names will be your new features
# create a list of column names:
new_features_list = df_new.columns.values
new_features_list

array(['bonus', 'deferral_payments', 'deferred_income', 'director_fees',
       'email_address', 'exercised_stock_options', 'expenses',
       'from_messages', 'from_poi_to_this_person',
       'from_this_person_to_poi', 'loan_advances', 'long_term_incentive',
       'other', 'poi', 'restricted_stock', 'restricted_stock_deferred',
       'salary', 'shared_receipt_with_poi', 'to_messages',
       'total_payments', 'total_stock_value', 'to_msg_ratio',
       'from_msg_ratio', 'bon_plus_expenses'], dtype=object)

In [50]:
### Task 2: Remove outliers

# I want to try out classifiers first before removing outliers to see 
# if it would perform any better.

In [51]:
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = df_dict

In [52]:
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [53]:
### Extract features and labels from dataset for local testing
# Same as before, but uses list finances_features
data = featureFormat(my_dataset, finance_features, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [54]:
### Extract features and labels from dataset for local testing
# Same as before, but uses list email_features
data = featureFormat(my_dataset, email_features, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [55]:
### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf = clf.fit(features, labels)

print clf.feature_importances_

[ 0.12350831  0.          0.0954751   0.0855793   0.0743838   0.14198237
  0.13652674  0.34254438]


In [56]:
print clf.feature_importances_[4]
print email_features[4]

0.0743837968384
from_messages


In [57]:
print finance_features[1:]

# bonus and exercised stock option are two more important ones
# also expenses

['salary', 'bonus', 'long_term_incentive', 'deferred_income', 'expenses', 'total_payments', 'exercised_stock_options', 'restricted_stock', 'other']


In [58]:
print email_features[1:]

# to_messages, shared_receipt_with_poi are both best
# the two msg ratios aren't that great, so work with shared receipt

['to_messages', 'email_address', 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi', 'shared_receipt_with_poi', 'to_msg_ratio', 'from_msg_ratio']


In [60]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)
