In [1]:
#!/usr/bin/python

import sys
import pickle
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

In [10]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi',
                 'salary', "to_messages",
                 "total_payments",
                 "exercised_stock_options",
                 "bonus",
                 "restricted_stock",
                 "shared_receipt_with_poi",
                 "restricted_stock_deferred",
                 "total_stock_value",
                 "expenses",
                 "loan_advances",
                 "from_messages",
                 "from_this_person_to_poi",
                 "deferred_income",
                 "long_term_incentive",
                 "from_poi_to_this_person"]# You will need to use more features

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

In [11]:
### Task 2: Remove outliers

data_dict.pop('TOTAL', 0) # Contains column total data
data_dict.pop('THE TRAVEL AGENCY IN THE PARK', 0) # not an individual
data_dict.pop('LOCKHART EUGENE E', 0) # record contains no information
data_dict.pop('HUMPHREY GENE E', 0) # 'to_poi_rate' outlier
data_dict.pop('LAVORATO JOHN J', 0) # 'from_poi_to_this_person' / 'total_payments' outlier
data_dict.pop('FREVERT MARK A', 0) # 'total_payments' outlier

{'salary': 1060932,
 'to_messages': 3275,
 'deferral_payments': 6426990,
 'total_payments': 17252530,
 'loan_advances': 2000000,
 'bonus': 2000000,
 'email_address': 'mark.frevert@enron.com',
 'restricted_stock_deferred': 'NaN',
 'deferred_income': -3367011,
 'total_stock_value': 14622185,
 'expenses': 86987,
 'from_poi_to_this_person': 242,
 'exercised_stock_options': 10433518,
 'from_messages': 21,
 'other': 7427621,
 'from_this_person_to_poi': 6,
 'poi': False,
 'long_term_incentive': 1617011,
 'shared_receipt_with_poi': 2979,
 'restricted_stock': 4188667,
 'director_fees': 'NaN'}

In [14]:
### Task 3: Create new feature(s)

my_dataset = {}
for key in data_dict:
    my_dataset[key] = data_dict[key]
    try:
        from_poi_rate = 1. * data_dict[key]['from_poi_to_this_person'] / \
        data_dict[key]['to_messages']
    except:
        from_poi_rate = "NaN"
    try:
        to_poi_rate = 1. * data_dict[key]['from_this_person_to_poi'] / \
        data_dict[key]['from_messages']
    except:
        to_poi_rate = "NaN"
    my_dataset[key]['from_poi_rate'] = from_poi_rate
    my_dataset[key]['to_poi_rate'] = to_poi_rate
    
### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [15]:
my_dataset

{'METTS MARK': {'salary': 365788,
  'to_messages': 807,
  'deferral_payments': 'NaN',
  'total_payments': 1061827,
  'loan_advances': 'NaN',
  'bonus': 600000,
  'email_address': 'mark.metts@enron.com',
  'restricted_stock_deferred': 'NaN',
  'deferred_income': 'NaN',
  'total_stock_value': 585062,
  'expenses': 94299,
  'from_poi_to_this_person': 38,
  'exercised_stock_options': 'NaN',
  'from_messages': 29,
  'other': 1740,
  'from_this_person_to_poi': 1,
  'poi': False,
  'long_term_incentive': 'NaN',
  'shared_receipt_with_poi': 702,
  'restricted_stock': 585062,
  'director_fees': 'NaN',
  'from_poi_rate': 0.04708798017348203,
  'to_poi_rate': 0.034482758620689655},
 'BAXTER JOHN C': {'salary': 267102,
  'to_messages': 'NaN',
  'deferral_payments': 1295738,
  'total_payments': 5634343,
  'loan_advances': 'NaN',
  'bonus': 1200000,
  'email_address': 'NaN',
  'restricted_stock_deferred': 'NaN',
  'deferred_income': -1386055,
  'total_stock_value': 10623258,
  'expenses': 11200,
  '

In [20]:
### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

from time import time

t0 = time()

clf = GaussianNB() 
clf.fit(features, labels)    
pred = clf.predict(features)

acc = accuracy_score(pred, labels)

print("training time:", round(time()-t0, 3), "s")

print("Accuracy: ", acc)

training time: 0.003 s
Accuracy:  0.8857142857142857


In [None]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

In [None]:
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)

## Questions

1. Summarize for us the goal of this project and how machine learning is useful in trying to accomplish it. As part of your answer, give some background on the dataset and how it can be used to answer the project question. Were there any outliers in the data when you got it, and how did you handle those?  [relevant rubric items: “data exploration”, “outlier investigation”]

> Answer

2. What features did you end up using in your POI identifier, and what selection process did you use to pick them? Did you have to do any scaling? Why or why not? As part of the assignment, you should attempt to engineer your own feature that does not come ready-made in the dataset -- explain what feature you tried to make, and the rationale behind it. (You do not necessarily have to use it in the final analysis, only engineer and test it.) In your feature selection step, if you used an algorithm like a decision tree, please also give the feature importances of the features that you use, and if you used an automated feature selection function like SelectKBest, please report the feature scores and reasons for your choice of parameter values.  [relevant rubric items: “create new features”, “intelligently select features”, “properly scale features”]

3. What algorithm did you end up using? What other one(s) did you try? How did model performance differ between algorithms?  [relevant rubric item: “pick an algorithm”]

4. What does it mean to tune the parameters of an algorithm, and what can happen if you don’t do this well?  How did you tune the parameters of your particular algorithm? What parameters did you tune? (Some algorithms do not have parameters that you need to tune -- if this is the case for the one you picked, identify and briefly explain how you would have done it for the model that was not your final choice or a different model that does utilize parameter tuning, e.g. a decision tree classifier).  [relevant rubric items: “discuss parameter tuning”, “tune the algorithm”]

5. What is validation, and what’s a classic mistake you can make if you do it wrong? How did you validate your analysis?  [relevant rubric items: “discuss validation”, “validation strategy”]

6. Give at least 2 evaluation metrics and your average performance for each of them.  Explain an interpretation of your metrics that says something human-understandable about your algorithm’s performance. [relevant rubric item: “usage of evaluation metrics”]