In [10]:
import numpy as np
import sys
import pickle
from collections import defaultdict
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data, test_classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.preprocessing import RobustScaler

import my_tools

### Base feature names, grouped by kind.
### NOTE(review): none of these three constants is referenced in the cells
### shown here (features_list further down is hard-coded); presumably they
### were used during feature exploration -- confirm before removing.

### Financial (payment and stock) feature names.
FINANCIAL_FEATURES = [
    "bonus",
    "deferral_payments",
    "deferred_income",
    "director_fees",
    "exercised_stock_options",
    "expenses",
    "loan_advances",
    "long_term_incentive",
    "other",
    "restricted_stock",
    "restricted_stock_deferred",
    "salary",
    "total_payments",
    "total_stock_value",
]

### Overall e-mail volume feature names.
EMAIL_FEATURES = [
    "from_messages",
    "to_messages",
]

### E-mail feature names that count interaction with POIs.
EMAIL_POI_FEATURES = [
    "from_poi_to_this_person",
    "from_this_person_to_poi",
    "shared_receipt_with_poi",
]


### Load the dictionary containing the dataset from the working directory.
### NOTE(review): pickle.load can execute arbitrary code while unpickling;
### only run this against a dataset file you trust.
### NOTE(review): the file is opened in text mode ("r"). Binary mode ("rb")
### is the usual choice for pickles and is required on Python 3; confirm the
### pickle protocol / line endings of the file before switching.
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

### Task 2: Remove outliers
### Two keys in the dataset are not individuals; drop both.
### `del` raises KeyError if a key is missing, so an unexpected dataset
### is noticed immediately instead of being silently accepted.
for non_person_key in ("TOTAL", "THE TRAVEL AGENCY IN THE PARK"):
    del data_dict[non_person_key]

### Store to my_dataset for easy export below.
### This is an alias of data_dict, not a copy.
my_dataset = data_dict

### Custom imputation for salary.
### This step led to the best classifier, although "salary" itself ended up
### dropping out of the list of features that classifier uses.
### NOTE(review): the return value is discarded, so my_imputer presumably
### updates my_dataset in place, using the median over the records accepted
### by `test` -- confirm in my_tools.
my_tools.my_imputer(
    my_dataset,
    "salary",
    strategy='median',
    test=my_tools.is_not_director
)

### Task 3: Create new feature(s)
### Two ratios of POI-related e-mails were tried. This one worked better,
### but it is not among the features used by the best classifier.
### NOTE(review): presumably adds "shared_receipt_ratio" as
### shared_receipt_with_poi / to_messages per record -- confirm in my_tools.
my_tools.create_ratio_feature(
    my_dataset,
    "shared_receipt_ratio",
    "shared_receipt_with_poi",
    "to_messages"
)


### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
### Neither "salary" nor "shared_receipt_ratio" (both prepared earlier in
### this cell) is part of the final selection.
features_list = ['poi', 'total_stock_value', 'bonus', 'exercised_stock_options']

### Final classifier: a two-step pipeline that first rescales the features
### with RobustScaler and then classifies with a distance-weighted k-nearest
### neighbours model using 4 neighbours.
rs = RobustScaler()
knn = KNeighborsClassifier(n_neighbors=4, weights='distance')
clf = Pipeline([('rs', rs), ('knn', knn)])

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)

### Parenthesised form prints the same text on Python 2 and also parses on
### Python 3 (the bare `print "done"` statement is a SyntaxError there).
print("done")

done


In [2]:
from tester import load_classifier_and_data, test_classifier
### Round-trip check: load the classifier, dataset, and feature_list back
### through tester.load_classifier_and_data().
### NOTE(review): presumably this reads the files written by
### dump_classifier_and_data in the cell above -- confirm in tester.py.
### NOTE(review): test_classifier is already imported in the first cell, and
### this cell's execution count (In [2]) is lower than the dump cell's
### (In [10]); re-run top to bottom so the output below matches the current dump.
clf, dataset, feature_list = load_classifier_and_data()
### Run the testing script; per the recorded output it prints the classifier
### followed by accuracy, precision, recall, F1, F2 and the prediction counts.
test_classifier(clf, dataset, feature_list)

Pipeline(steps=[('rs', RobustScaler(copy=True, with_centering=True, with_scaling=True)), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=4, p=2,
           weights='distance'))])
	Accuracy: 0.87531	Precision: 0.63802	Recall: 0.43800	F1: 0.51942	F2: 0.46730
	Total predictions: 13000	True positives:  876	False positives:  497	False negatives: 1124	True negatives: 10503

