In [4]:
#!/usr/bin/python

import sys
import pickle
import pandas
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

In [5]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
# features_list has finance features
# email_features is a list of email features
features_list = ['poi', 'salary', 'bonus', 'long_term_incentive', \
                 'deferred_income', 'expenses', 'total_payments', \
                 'exercised_stock_options', 'restricted_stock', 'other'] 
email_features = ['poi', 'to_messages', 'email_address', 
                 'from_poi_to_this_person', 'from_messages', \
                 'from_this_person_to_poi', 'shared_receipt_with_poi']
# You will need to use more features


### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

df = pandas.DataFrame.from_records(list(data_dict.values()))
employees = pandas.Series(list(data_dict.keys()))

# set the index of df to be the employees series:
df.set_index(employees, inplace=True)

# You will have code here to add columns, i.e. new features,
# to the df, or remove rows, i.e. employees, from the df
df_finance = df[features_list]
df_email = df[email_features]

# after you create features, the column names will be your new features
# create a list of column names:
new_features_list = df.columns.values


# create a dictionary from the dataframe
df_dict = df.to_dict('index')

# compare the original dictionary 
# with the dictionary reconstructed from the dataframe:  
print df_dict == data_dict

True


In [7]:
# Find how many missing values are in each email feature
for feature in email_features:
    print "Number of missing values in " + feature + ": " + \
    str(df_email[feature].value_counts(dropna=False)[0])

Number of missing values in poi: 128
Number of missing values in to_messages: 60
Number of missing values in email_address: 35
Number of missing values in from_poi_to_this_person: 12
Number of missing values in from_messages: 60
Number of missing values in from_this_person_to_poi: 20
Number of missing values in shared_receipt_with_poi: 60


In [8]:
print df_email.dtypes

poi                          bool
to_messages                object
email_address              object
from_poi_to_this_person    object
from_messages              object
from_this_person_to_poi    object
shared_receipt_with_poi    object
dtype: object
METTS MARK              False
BAXTER JOHN C           False
ELLIOTT STEVEN          False
CORDES WILLIAM R        False
HANNON KEVIN P           True
MORDAUNT KRISTINA M     False
MEYER ROCKFORD G        False
MCMAHON JEFFREY         False
HORTON STANLEY C        False
PIPER GREGORY F         False
HUMPHREY GENE E         False
UMANOFF ADAM S          False
BLACHMAN JEREMY M       False
SUNDE MARTIN            False
GIBBS DANA R            False
LOWRY CHARLES P         False
COLWELL WESLEY           True
MULLER MARK S           False
JACKSON CHARLENE R      False
WESTFAHL RICHARD K      False
WALTERS GARETH W        False
WALLS JR ROBERT H       False
KITCHEN LOUISE          False
CHAN RONNIE             False
BELFER ROBERT           Fals

In [9]:
### Task 2: Remove outliers
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = data_dict

In [10]:
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [11]:
### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

In [12]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)
