# POI detection using supervised classification

## Basic functions for data processing

In [1]:

import numpy as np

def featureFormat( dictionary, features, remove_NaN=True, remove_all_zeroes=True, remove_any_zeroes=False, sort_keys = False):
    """Convert a dictionary of per-key feature dictionaries into a 2-D numpy array.

    Each kept key becomes one row; the columns follow the order of ``features``.

    Args:
        dictionary: mapping of key -> {feature name: value}.
        features: list of feature names to extract. If the first name is
            'poi', that column is treated as the label and is ignored when
            deciding whether a row is empty.
        remove_NaN: when True, the string "NaN" is converted to 0; when
            False it becomes a float NaN.
        remove_all_zeroes: when True, drop rows whose (non-'poi') features
            are all zero or NaN.
        remove_any_zeroes: when True, drop rows in which any (non-'poi')
            feature is zero or NaN.
        sort_keys: a str is taken as the path of a pickle file holding the
            key order to use; any other truthy value sorts the keys; a falsy
            value keeps the dictionary's own order.

    Returns:
        A numpy array with one row per kept key, or None (after printing an
        error message) if a requested key/feature lookup fails.
    """
    # Key order - first branch is for Python 3 compatibility on mini-projects,
    # second branch is for compatibility on final project.
    if isinstance(sort_keys, str):
        import pickle
        # NOTE: pickle.load can execute arbitrary code; only pass trusted files.
        # The context manager closes the handle the original left open.
        with open(sort_keys, "rb") as key_file:
            keys = pickle.load(key_file)
    elif sort_keys:
        keys = sorted(dictionary.keys())
    else:
        keys = dictionary.keys()

    def _is_empty(item):
        # Values are floats by the time they are tested, so a missing value
        # shows up as float NaN, never as the string "NaN".
        return item == 0 or np.isnan(item)

    return_list = []
    for key in keys:
        tmp_list = []
        for feature in features:
            try:
                value = dictionary[key][feature]
            except KeyError:
                print ("error: key ", feature, " not present")
                return
            if value=="NaN" and remove_NaN:
                value = 0
            tmp_list.append( float(value) )

        # Exclude the 'poi' class label from the emptiness criteria.
        test_list = tmp_list[1:] if features[0] == 'poi' else tmp_list

        # Drop data points whose features are all zero/NaN, if requested.
        if remove_all_zeroes and all(_is_empty(item) for item in test_list):
            continue
        # Drop data points with any zero/NaN feature, if requested.
        if remove_any_zeroes and any(_is_empty(item) for item in test_list):
            continue

        return_list.append( np.array(tmp_list) )

    return np.array(return_list)


def targetFeatureSplit( data ):
    """Split rows into a list of targets and a list of feature vectors.

    The first element of every row in ``data`` is taken as the target; the
    remaining elements of that row (``row[1:]``) are its features.

    Returns:
        A ``(target, features)`` pair of lists, in the same order as ``data``.
    """
    # Walk the input once so single-pass iterables behave as before.
    pairs = [(row[0], row[1:]) for row in data]
    target = [label for label, _ in pairs]
    features = [rest for _, rest in pairs]
    return target, features


## Basic imports

In [2]:
import pickle
import sys
# Add ../tools/ to the module search path.
# NOTE(review): none of the visible cells import from that directory -
# presumably left over from the project scaffold; confirm before removing.
sys.path.append("../tools/")

## Import data

In [3]:

# Load the dataset inside a context manager so the file handle is closed.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with open("final_project_dataset.pkl", "rb") as dataset_file:
    data_dict = pickle.load(dataset_file)

### The first element is our label; any added elements are predictor
### features. Keep this the same for the mini-project, but you'll
### have a different feature list when you do the final project.
features_list = ["poi", "salary"]

# Build the numeric array using the key order stored in the given pickle
# file, then separate the label column from the feature columns.
data = featureFormat(data_dict, features_list, sort_keys="python2_lesson14_keys.pkl")
labels, features = targetFeatureSplit(data)

## First, an overfit model (trained and evaluated on the same data)

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Fit and score on the same data, so the reported accuracy only shows how
# well the tree fits the samples it was trained on.
dt = DecisionTreeClassifier()
dt.fit(features,labels)
labels_predict = dt.predict(features)
# scikit-learn metrics take (y_true, y_pred) in that order; accuracy is
# symmetric, so the printed value is unchanged.
print("accuracy: " + str(accuracy_score(labels, labels_predict)))

accuracy: 0.989473684211


## Apply a train/test split (hold-out validation)

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# Hold out 30% of the samples for testing; random_state fixes the split.
x_train,x_test,y_train,y_test = train_test_split(features,labels,test_size = 0.3,random_state=  42)
dt = DecisionTreeClassifier()
dt.fit(x_train,y_train)
y_predict = dt.predict(x_test)
# scikit-learn metrics take (y_true, y_pred) in that order; accuracy is
# symmetric, so the printed value is unchanged.
print("accuracy: " + str(accuracy_score(y_test, y_predict)))

accuracy: 0.724137931034


## Basic questions

In [16]:
# number of POIs predicted in the test set
print(np.sum(y_predict))

# number of people in test set
print(len(y_predict))

# accuracy of a baseline that predicts "not POI" (0) for everyone; sized
# from the test set so it stays valid if the split changes
y_predict_fake = np.zeros(len(y_test))
print("accuracy: " + str(accuracy_score(y_test, y_predict_fake)))

4.0
29
accuracy: 0.862068965517


## Confusion matrix

In [19]:
from sklearn.metrics import confusion_matrix
# Print the confusion matrix of the true test labels against the predictions.
print(confusion_matrix(y_true=y_test, y_pred=y_predict))

[[21  4]
 [ 4  0]]


## Precision and recall

In [20]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Score the test-set predictions against the true test labels.
print("recall score: " + str(recall_score(y_true=y_test, y_pred=y_predict)))
print("precision score: " + str(precision_score(y_true=y_test, y_pred=y_predict)))

recall score: 0.0
precision score: 0.0
