In [2]:
### Try the K-nearest neighbors classifier
import numpy as np
import sys
import pickle
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.decomposition import PCA
import my_tools 


### Base feature names, grouped by kind; the feature lists used below are
### assembled from these constants.
FINANCIAL_FEATURES = [
    "bonus",
    "deferral_payments",
    "deferred_income",
    "director_fees",
    "exercised_stock_options",
    "expenses",
    "loan_advances",
    "long_term_incentive",
    "other",
    "restricted_stock",
    "restricted_stock_deferred",
    "salary",
    "total_payments",
    "total_stock_value",
]

### Raw message counts per person.
EMAIL_FEATURES = [
    "from_messages",
    "to_messages",
]

### Message counts that involve a POI.
EMAIL_POI_FEATURES = [
    "from_poi_to_this_person",
    "from_this_person_to_poi",
    "shared_receipt_with_poi",
]

### Load the dictionary containing the dataset.
### NOTE(review): pickle.load can execute arbitrary code -- only load a trusted file.
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

### Task 2: Remove outliers
### Two entries are not individuals: one labeled "TOTAL" and the other
### "THE TRAVEL AGENCY IN THE PARK". `del` raises KeyError if either is missing.
for non_person_key in ("TOTAL", "THE TRAVEL AGENCY IN THE PARK"):
    del data_dict[non_person_key]

### Store to my_dataset for easy export below (same dict object, not a copy).
my_dataset = data_dict

### Create custom features in my_dataset.
### NOTE(review): create_ratio_feature lives in my_tools; presumably it stores
### numerator / denominator under the given new name -- confirm there.
my_tools.create_ratio_feature(my_dataset,
                              "shared_receipt_ratio",
                              "shared_receipt_with_poi",
                              "to_messages")
my_tools.create_ratio_feature(my_dataset,
                              "poi_email_ratio",
                              EMAIL_POI_FEATURES,
                              EMAIL_FEATURES)

### 'poi' goes first, followed by every financial feature and the
### shared-receipt ratio ("poi_email_ratio" is not included here).
features_list = ['poi']
features_list.extend(FINANCIAL_FEATURES)
features_list.append("shared_receipt_ratio")

data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)


rs = RobustScaler()

skb = SelectKBest()
knn = KNeighborsClassifier()
cv = StratifiedShuffleSplit(labels, n_iter = 100, test_size = 0.1, random_state = 42)
pl = Pipeline([('skb', skb), ('rs', rs), ('knn', knn)])
param_grid = {'skb__k':[3,4,5,6,7,8],
              'knn__n_neighbors':[2,3,4,5,6],
             'knn__weights':['distance','uniform']}
gs = GridSearchCV(pl, param_grid, scoring='f1', cv = cv)
gs.fit(features, labels)
print gs.best_estimator_
print gs.best_score_

Pipeline(steps=[('skb', SelectKBest(k=4, score_func=<function f_classif at 0x00000000073CE748>)), ('rs', RobustScaler(copy=True, with_centering=True, with_scaling=True)), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform'))])
0.448666666667


In [3]:
### run this estimator through the full tester
rs = RobustScaler()
skb = SelectKBest(k=4)
knn = KNeighborsClassifier(n_neighbors = 3, weights = 'distance')
pl = Pipeline([('skb',skb), ('rs',rs), ('knn',knn)])
print "Testing pipline of SKB(k = 4) -> RobustScaler -> KNeighbors (n_neighbors = 3, weights = 'distance')"
print "accuracy \t precision \t recall \t f1 \t\t f2" 
accuracy, precision, recall, f1, f2 = my_tools.my_test_classifier(pl, my_dataset, features_list, folds = 1000)
print "%1.3f \t\t %1.3f \t\t %1.3f \t\t %1.3f \t\t %1.3f" %(accuracy, precision, recall, f1, f2)

Testing pipline of SKB(k = 4) -> RobustScaler -> KNeighbors (n_neighbors = 3, weights = 'distance')
accuracy 	 precision 	 recall 	 f1 		 f2
0.870 		 0.521 		 0.346 		 0.416 		 0.371


In [4]:
### get arrays of train/test indices from StratifiedShuffleSplit
cv = StratifiedShuffleSplit(labels, n_iter = 1000, random_state = 42)
### save the feature counts in this dictionary
features_use_count = np.zeros(len(features_list)-1, dtype = int)
### get the indices from each iteration and create the training matrices
for train_indices, test_indices in cv:
    labels_train = []
    features_train = []
    for train_ix in train_indices:
        labels_train.append(labels[train_ix])
        features_train.append(features[train_ix])
    skb = SelectKBest(score_func = f_classif, k = 5)
    skb.fit(features_train, labels_train)
    for ix in skb.get_support(indices = True):
        features_use_count[ix] += 1

feature_scores = dict(zip(features_list[1:],features_use_count))
for key in sorted(feature_scores, key=feature_scores.get, reverse = True):
    print key,":\t", feature_scores[key]


total_stock_value :	999
bonus :	995
exercised_stock_options :	995
salary :	983
deferred_income :	489
long_term_incentive :	185
shared_receipt_ratio :	134
restricted_stock :	130
total_payments :	79
expenses :	11
deferral_payments :	0
loan_advances :	0
restricted_stock_deferred :	0
other :	0
director_fees :	0


In [5]:
### First estimator is not using the shared_receipt_ratio.  Omit the SKB step and use the top 4 financial
### features plus email ratio which were best in GaussianNB:
test_features_list = ['poi'] + ['bonus', 'deferred_income', 'exercised_stock_options', 
                           'salary', 'total_stock_value'] + ["shared_receipt_ratio"]
data = featureFormat(my_dataset, test_features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)
rs = RobustScaler()
knn = KNeighborsClassifier()
cv = StratifiedShuffleSplit(labels, n_iter = 100, test_size = 0.1, random_state = 42)
pl = Pipeline([('rs', rs), ('knn',knn)])
param_grid = {'knn__n_neighbors':[2,3,4,5,6]}
gs = GridSearchCV(pl, param_grid, scoring = 'f1', cv = cv)
gs.fit(features,labels)
print gs.best_estimator_
print gs.best_score_

Pipeline(steps=[('rs', RobustScaler(copy=True, with_centering=True, with_scaling=True)), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform'))])
0.321333333333


So for this classifier, adding the email ratio doesn't seem to help (best F1 of 0.321 here versus 0.449 from the first grid search). Next, try running the full feature list through the pipeline, including the SelectKBest step.

In [6]:
features_list = ['poi'] + FINANCIAL_FEATURES + EMAIL_FEATURES + EMAIL_POI_FEATURES

data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)


rs = RobustScaler()

skb = SelectKBest()
knn = KNeighborsClassifier()
cv = StratifiedShuffleSplit(labels, n_iter = 100, test_size = 0.1, random_state = 42)
pl = Pipeline([('skb', skb), ('rs', rs), ('knn', knn)])
param_grid = {'skb__k':[3,4,5,6,7,8],
              'knn__n_neighbors':[2,3,4,5,6],
             'knn__weights':['distance','uniform']}
gs = GridSearchCV(pl, param_grid, scoring='f1', cv = cv)
gs.fit(features, labels)
print gs.best_estimator_
print gs.best_score_

Pipeline(steps=[('skb', SelectKBest(k=4, score_func=<function f_classif at 0x00000000073CE748>)), ('rs', RobustScaler(copy=True, with_centering=True, with_scaling=True)), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform'))])
0.456666666667


Similar results: the best estimator again uses only 4 features (k=4), i.e. the top 4 financial features.

In [7]:
### set up KNN with top 4 financial features only and run through full tester
test_features_list = ['poi','total_stock_value','bonus', 'exercised_stock_options','salary']
rs = RobustScaler()
knn = KNeighborsClassifier(n_neighbors = 3, weights = 'uniform')
pl = Pipeline([('rs',rs), ('knn',knn)])
print "Testing pipeline of RobustScaler -> KNeighbors (n_neighbors = 3, weights = 'uniform')"
print "Feature list:", test_features_list
print "accuracy \t precision \t recall \t f1 \t\t f2" 
accuracy, precision, recall, f1, f2 = my_tools.my_test_classifier(pl, my_dataset, test_features_list, folds = 1000)
print "%1.3f \t\t %1.3f \t\t %1.3f \t\t %1.3f \t\t %1.3f" %(accuracy, precision, recall, f1, f2)

Testing pipeline of RobustScaler -> KNeighbors (n_neighbors = 3, weights = 'uniform')
Feature list: ['poi', 'total_stock_value', 'bonus', 'exercised_stock_options', 'salary']
accuracy 	 precision 	 recall 	 f1 		 f2
0.872 		 0.645 		 0.379 		 0.477 		 0.413


In [8]:
### As a final step, try running my custom imputation method for salary on the dataset first,
### then tune and test the KNN classifier. This needs a fresh copy of the dataset, so reload
### it and set up the features again.
### NOTE(review): pickle.load can execute arbitrary code -- only load a trusted file.
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

### Task 2: Remove outliers
### Two entries are not individuals: one labeled "TOTAL" and the other
### "THE TRAVEL AGENCY IN THE PARK". `del` raises KeyError if either is missing.
for non_person_key in ("TOTAL", "THE TRAVEL AGENCY IN THE PARK"):
    del data_dict[non_person_key]

### Unclear whether other outliers should be removed

### Store to my_dataset for easy export below (same dict object, not a copy).
my_dataset = data_dict

### Add custom imputation here.
### NOTE(review): my_imputer and is_not_director are defined in my_tools; presumably
### this fills in missing "salary" values for entries passing the test -- confirm there.
my_tools.my_imputer(my_dataset, "salary", test = my_tools.is_not_director)


### Create custom features in my_dataset.
my_tools.create_ratio_feature(my_dataset,
                              "shared_receipt_ratio",
                              "shared_receipt_with_poi",
                              "to_messages")
my_tools.create_ratio_feature(my_dataset,
                              "poi_email_ratio",
                              EMAIL_POI_FEATURES,
                              EMAIL_FEATURES)

### 'poi' goes first, followed by every financial feature and the
### shared-receipt ratio ("poi_email_ratio" is not included here).
features_list = ['poi']
features_list.extend(FINANCIAL_FEATURES)
features_list.append("shared_receipt_ratio")

data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)
cv = StratifiedShuffleSplit(labels, n_iter = 100, test_size = 0.1, random_state = 42)

pl = Pipeline([('skb', skb), ('rs', rs), ('knn', knn)])
param_grid = {'skb__k':[3,4,5,6,7,8],
              'knn__n_neighbors':[3,4,5,6],
             'knn__weights':['distance','uniform']}
gs = GridSearchCV(pl, param_grid, scoring='f1', cv = cv)
gs.fit(features, labels)
print gs.best_estimator_
print gs.best_score_


Pipeline(steps=[('skb', SelectKBest(k=3, score_func=<function f_classif at 0x00000000073CE748>)), ('rs', RobustScaler(copy=True, with_centering=True, with_scaling=True)), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=4, p=2,
           weights='distance'))])
0.403333333333


In [10]:
### get arrays of train/test indices from StratifiedShuffleSplit
cv = StratifiedShuffleSplit(labels, n_iter = 1000, random_state = 42)
### save the feature counts in this dictionary
features_use_count = np.zeros(len(features_list)-1, dtype = int)
### get the indices from each iteration and create the training matrices
for train_indices, test_indices in cv:
    labels_train = []
    features_train = []
    for train_ix in train_indices:
        labels_train.append(labels[train_ix])
        features_train.append(features[train_ix])
    skb = SelectKBest(score_func = f_classif, k = 5)
    skb.fit(features_train, labels_train)
    for ix in skb.get_support(indices = True):
        features_use_count[ix] += 1

feature_scores = dict(zip(features_list[1:],features_use_count))
for key in sorted(feature_scores, key=feature_scores.get, reverse = True):
    print key,":\t", feature_scores[key]


total_stock_value :	1000
bonus :	999
exercised_stock_options :	994
salary :	796
deferred_income :	533
shared_receipt_ratio :	205
long_term_incentive :	180
restricted_stock :	156
total_payments :	119
expenses :	14
loan_advances :	4
deferral_payments :	0
restricted_stock_deferred :	0
other :	0
director_fees :	0


In [12]:
### The best estimator now found in the GridSearchCV has only 3 features, and uses n_neighbors=4, weights = 'distance  
### Set this up and run it through the full tester
test_features_list = ['poi','total_stock_value','bonus', 'exercised_stock_options']
rs = RobustScaler()
knn = KNeighborsClassifier(n_neighbors = 4, weights = 'distance')
pl = Pipeline([('rs',rs), ('knn',knn)])
print "Testing pipleline of RobustScaler -> KNeighbors (n_neighbors = 4, weights = 'distance')"
print "Feature list:", test_features_list
print "accuracy \t precision \t recall \t f1 \t\t f2" 
accuracy, precision, recall, f1, f2 = my_tools.my_test_classifier(pl, my_dataset, test_features_list, folds = 1000)
print "%1.3f \t\t %1.3f \t\t %1.3f \t\t %1.3f \t\t %1.3f" %(accuracy, precision, recall, f1, f2)

Testing pipleline of RobustScaler -> KNeighbors (n_neighbors = 4, weights = 'distance')
Feature list: ['poi', 'total_stock_value', 'bonus', 'exercised_stock_options']
accuracy 	 precision 	 recall 	 f1 		 f2
0.875 		 0.638 		 0.438 		 0.519 		 0.467


Interestingly, this is a very simple classifier that uses only 3 features, yet it gives the best results
of all the ones I've tried so far.