In [1]:
# https://stackoverflow.com/questions/9031783/hide-all-warnings-in-ipython
from IPython.display import HTML
# Render a small HTML/JavaScript snippet as this cell's output.
# The script defines code_toggle_err(), which hides or shows every
# div.output_stderr element (through the page's `$` selector) and then flips
# the code_show_err flag. The function is registered to run once when the
# document is ready; because the flag starts as true, that first call hides
# stderr output. The rendered link calls the same function to toggle it.
HTML('''<script>
code_show_err=true;
function code_toggle_err() {
  if (code_show_err) {
  $('div.output_stderr').hide();
  } else {
  $('div.output_stderr').show();
  }
  code_show_err = !code_show_err
}
$( document ).ready(code_toggle_err);
</script>
To toggle on/off output_stderr, click <a href="javascript:code_toggle_err()">here</a>.''')

### Project setup
This project uses scikit-learn version 0.19 and Python 2.7.  There is an Anaconda setup file called condaEnv_udMLproject.txt in the root directory of the git repository.

## Objective

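**TODO:** describe the objective of this project. (The analysis below scores the available features and compares several classifiers for predicting the `poi` label.)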

## Data Exploration

#### Number of employees in the initial data set
146



#### Feature Metadata

In [2]:
import pickle
import pandas as pd
from toolsDataExploration import getMetadata
from toolsFeatureSelection import normalizeEmailMessages
from toolsFeatureSelection import calcTotalMonetaryValues
from toolsFeatureSelection import significantPoiEmailActivity

# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.from_dict.html
# https://kaijento.github.io/2017/04/22/pandas-create-new-column-sum/
# https://stackoverflow.com/questions/13575090/construct-pandas-dataframe-from-items-in-nested-dictionary
# get dictionary from dataset

### Load the dictionary containing the dataset
# Pickle data is a byte stream, so the file is opened in binary mode ("rb").
# Text mode only happens to work for ASCII (protocol 0) pickles on Python 2
# and fails outright on Python 3.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

# Get rid of the 'TOTAL' line that came from the spreadsheet. The None default
# makes this a no-op when the key is absent (e.g. when the cell is re-run).
data_dict.pop('TOTAL', None)

### Task 3: Create new feature(s)
# like the email example in L12Q4
### Store to my_dataset for easy export below.
my_dataset = normalizeEmailMessages(data_dict)
# create another new feature which is significant_poi_email_activity (>2% from, >17% to)
# However, I've simply summed up the activity, so the %to+%from poi's > 0.2 is significant
my_dataset = significantPoiEmailActivity(my_dataset)

# Per-feature metadata: one row per feature, one column per statistic.
dictMetadata, totalLen = getMetadata(my_dataset)
df = pd.DataFrame.from_dict({name: dictMetadata[name] for name in dictMetadata},
                            orient='index')
# Share of records that have a usable value for the feature
# (totalLen is presumably the total number of records -- confirm in getMetadata).
df['percent valid values'] = df['valid count']/totalLen
# Share of the usable values that belong to POIs.
df['percent poi to total'] = df['poiCount']/df['valid count']
df

poi count :  18


Unnamed: 0,max,poiCount,valid count,min,percent valid values,percent poi to total
bonus,8000000,16,81,0,0.558621,0.197531
deferral_payments,6426990,5,38,-102500,0.262069,0.131579
deferred_income,0,11,48,-3504386,0.331034,0.229167
director_fees,137864,0,16,0,0.110345,0.0
exercised_stock_options,34348384,12,101,0,0.696552,0.118812
expenses,228763,18,94,0,0.648276,0.191489
fraction_from_poi,0.217341,18,145,0,1.0,0.124138
fraction_to_poi,1,18,145,0,1.0,0.124138
from_messages,14368,14,86,0,0.593103,0.162791
from_poi_to_this_person,528,14,86,0,0.593103,0.162791


### And if more data is always better than a clever algorithm...
then I am seriously considering removing features that have very little usable data, such as loan_advances and director_fees: with so few valid values, a handful of POIs in either feature could lead the model to overfit. It's hard to say at this point, though, since they may also be genuine indicators.

But first, let's examine the features to determine which ones are the most useful by scoring them with a univariate SelectKBest algorithm.

In [3]:
from toolsFeatureSelection import calcKMeans
from feature_format2 import featureFormat
from sklearn.preprocessing import MinMaxScaler

# https://towardsdatascience.com/feature-selection-techniques-in-machine-learning-with-python-f24e7da3f36e

# Candidate features, grouped by where they come from. 'poi' stays first:
# a later cell takes column 0 of the formatted array as the label.
financial_features = [
    'salary', 'deferral_payments', 'total_payments', 'loan_advances',
    'bonus', 'restricted_stock_deferred', 'deferred_income',
    'total_stock_value', 'expenses', 'exercised_stock_options', 'other',
    'long_term_incentive', 'restricted_stock', 'director_fees',
]
email_features = [
    'to_messages', 'from_poi_to_this_person', 'from_messages',
    'from_this_person_to_poi', 'shared_receipt_with_poi',
]
features_list = ['poi'] + financial_features + email_features

# Format the raw dictionary, rescale every column to the 0-1000 range, then
# score the features and list all scores from highest to lowest.
cleanedData = featureFormat(data_dict, features_list, removePOI=False)
mmScaler = MinMaxScaler(feature_range=(0, 1000))
resultsScaled = mmScaler.fit_transform(cleanedData)
fit, Xnew, featureScores = calcKMeans(resultsScaled, features_list, 10)
featureScores.nlargest(len(featureScores), 'Score')

Unnamed: 0,Feature,Score
9,exercised_stock_options,6927.477105
3,loan_advances,6742.747612
7,total_stock_value,5544.839651
4,bonus,5193.34926
0,salary,3116.643855
2,total_payments,2811.509128
11,long_term_incentive,2579.689857
18,shared_receipt_with_poi,2482.307124
10,other,1708.556798
8,expenses,1525.656574


#### We need to split the dataset before we select features
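Selecting features on the full data set lets information from the test rows leak into the model: correlations identified before the split are simply re-identified afterwards, which makes the test scores look better than they should. See: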
https://www.wildcardconsulting.dk/useful-information/never-do-this-mistake-when-using-feature-selection/


### Add features
Since many of the email features did not have good scores, let's combine some of them to try to find some more meaning, since perhaps the whole is greater than the parts.

First we'll create 2 new features :
- 'fraction_from_poi'
- 'fraction_to_poi'

These represent the fraction of each person's emails that were received from, or sent to, a person of interest.

In [4]:
# Build my_dataset from data_dict with the email-fraction features
# ('fraction_from_poi' / 'fraction_to_poi') described in the text above.
# NOTE(review): an earlier cell already ran normalizeEmailMessages and
# significantPoiEmailActivity. This call rebinds my_dataset from data_dict, so
# 'significant_poi_email_activity' survives only if those helpers modify the
# dictionaries in place -- confirm against toolsFeatureSelection.
from toolsFeatureSelection import normalizeEmailMessages
my_dataset = normalizeEmailMessages(data_dict)

Next we'll do something similar for each individual's income and expenses.

totalExpenses consists of :
- deferral_payments
- director_fees
- expenses
- loan_advances
- total_payments

totalIncome consists of :
- bonus
- deferred_income
- exercised_stock_options
- loan_advances
- long_term_incentive
- restricted_stock
- restricted_stock_deferred
- salary
- total_stock_value

In [5]:
# Add the aggregate monetary features to my_dataset (presumably 'totalIncome'
# and 'totalExpenses', the names used in the next feature list -- confirm
# against toolsFeatureSelection).
from toolsFeatureSelection import calcTotalMonetaryValues
my_dataset = calcTotalMonetaryValues(my_dataset)

Now re-examine the highest scoring features.

In [6]:
# Engineered features created in the cells above.
newFeatureList = ['fraction_from_poi', 'fraction_to_poi', 'totalIncome', 'totalExpenses','significant_poi_email_activity']
# Add only the names that are not already present: a plain extend() would
# append the same five features again every time this cell is re-run,
# duplicating columns in everything computed from features_list afterwards.
features_list.extend([name for name in newFeatureList if name not in features_list])

# Re-format with the extended list, rescale to 0-1000 and re-score, listing
# all feature scores from highest to lowest.
cleanedData = featureFormat(my_dataset, features_list)
mmScaler = MinMaxScaler(feature_range=(0,1000))
resultsScaled = mmScaler.fit_transform(cleanedData)
fit, Xnew, featureScores = calcKMeans(resultsScaled, features_list, 10)
featureScores.nlargest(len(featureScores), 'Score')

Unnamed: 0,Feature,Score
23,significant_poi_email_activity,18514.285714
9,exercised_stock_options,6927.477105
3,loan_advances,6742.747612
7,total_stock_value,5544.839651
4,bonus,5193.34926
20,fraction_to_poi,4717.950582
21,totalIncome,4424.415509
22,totalExpenses,3246.454444
0,salary,3116.643855
2,total_payments,2811.509128


## Initial Classifier Testing
Use 25% of the data for testing. The data was cleaned of NaNs previously. All classifier calls support cross-validation via GridSearchCV, so we can keep calling the same functions with or without tuning.

We'll look at :
- Support Vector Classifier (SVC)
- Random Forest
- AdaBoost
- Gradient Tree Boosting
- Gaussian Naive Bayes

### Split the data

In [7]:
from sklearn.model_selection import train_test_split
from feature_format2 import featureFormat

# Rebuild the feature matrix from my_dataset using the current features_list,
# rescale every column to the 0-1000 range, and run calcKMeans again (called
# with 10 -- presumably the number of features to keep in Xnew; confirm).
# NOTE(review): both the scaler and calcKMeans see every row before the split
# below, so the held-out rows influence scaling and feature selection. This
# looks like the leakage that the "split the dataset before we select
# features" section of this notebook warns about -- consider splitting first.
cleanedData = featureFormat(my_dataset, features_list, removePOI=False)
mmScaler = MinMaxScaler(feature_range=(0,1000))
resultsScaled = mmScaler.fit_transform(cleanedData)
fit, Xnew, featureScores = calcKMeans(resultsScaled, features_list, 10)
# Labels are column 0 of the unscaled array (presumably 'poi', the first entry
# of features_list). 25% of the rows are held out; the fixed random_state
# makes the shuffled split repeatable.
Xtrain, Xtest, yTrain, yTest = train_test_split(Xnew, cleanedData[:,0], 
                                                test_size=0.25, random_state=42, shuffle=True)

#### SVC

In [8]:
from toolsClassifiers import runClassifier
from sklearn.svm import SVC

# Single-point grid: only C=1 with the RBF kernel is evaluated.
param_grid = dict(C=[1], kernel=['rbf'])
clf = SVC()
# runClassifier hands back a classifier (rebinding clf) and a results mapping,
# which is shown as a table.
clf, results = runClassifier(clf, param_grid, Xtrain, yTrain, Xtest, yTest)
pd.DataFrame(results)

  'precision', 'predicted', average, warn_for)


Unnamed: 0,accuracyTest,accuracyTrain,best_params,precision,recall
C,0.805556,1.0,1,0.0,0.0
kernel,0.805556,1.0,rbf,0.0,0.0


#### Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Single-point grid: 10 trees with the 'auto' max_features setting.
param_grid = dict(n_estimators=[10], max_features=['auto'])
clf = RandomForestClassifier()
# runClassifier hands back a classifier (rebinding clf) and a results mapping,
# which is shown as a table.
clf, results = runClassifier(clf, param_grid, Xtrain, yTrain, Xtest, yTest)
pd.DataFrame(results)

Unnamed: 0,accuracyTest,accuracyTrain,best_params,precision,recall
max_features,0.805556,0.990741,auto,0.5,0.142857
n_estimators,0.805556,0.990741,10,0.5,0.142857


#### AdaBoost

In [10]:
from sklearn.ensemble import AdaBoostClassifier

# Single-point grid: 50 estimators with a learning rate of 1.
param_grid = dict(n_estimators=[50], learning_rate=[1])
clf = AdaBoostClassifier()
# runClassifier hands back a classifier (rebinding clf) and a results mapping,
# which is shown as a table.
clf, results = runClassifier(clf, param_grid, Xtrain, yTrain, Xtest, yTest)
pd.DataFrame(results)

Unnamed: 0,accuracyTest,accuracyTrain,best_params,precision,recall
learning_rate,0.888889,1.0,1,0.8,0.571429
n_estimators,0.888889,1.0,50,0.8,0.571429


#### GradientTreeBoost

In [11]:
from sklearn.ensemble import GradientBoostingClassifier

# Single-point grid: 100 estimators with a learning rate of 0.1.
param_grid = dict(n_estimators=[100], learning_rate=[0.1])
clf = GradientBoostingClassifier()
# runClassifier hands back a classifier (rebinding clf) and a results mapping,
# which is shown as a table.
clf, results = runClassifier(clf, param_grid, Xtrain, yTrain, Xtest, yTest)
pd.DataFrame(results)

Unnamed: 0,accuracyTest,accuracyTrain,best_params,precision,recall
learning_rate,0.861111,1.0,0.1,0.666667,0.571429
n_estimators,0.861111,1.0,100.0,0.666667,0.571429


#### GaussianNaiveBayes

In [12]:
from sklearn.naive_bayes import GaussianNB

# No hyper-parameters are searched for this classifier: the grid is empty.
param_grid = dict()
clf = GaussianNB()
# runClassifier hands back a classifier (rebinding clf) and a results mapping,
# which is shown as a table.
clf, results = runClassifier(clf, param_grid, Xtrain, yTrain, Xtest, yTest)
pd.DataFrame(results)

RESULTS :  {'best_params': ['default'], 'recall': 0.5714285714285714, 'accuracyTest': 0.5, 'precision': 0.21052631578947367, 'accuracyTrain': 0.59259259259259256}


Unnamed: 0,accuracyTest,accuracyTrain,best_params,precision,recall
0,0.5,0.592593,default,0.210526,0.571429
