In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, Normalizer, scale
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve
from sklearn.metrics import make_scorer, precision_score, fbeta_score, f1_score, classification_report
from sklearn.model_selection import cross_val_score, train_test_split, KFold, StratifiedShuffleSplit, GridSearchCV
from sklearn.linear_model import LogisticRegression

%matplotlib inline
plt.rcParams['figure.figsize'] = (16, 9)

seed = 999

creditcard = pd.read_csv('./data/creditcard.csv')
creditcard.columns = [x.lower() for x in creditcard.columns]
creditcard.rename(columns = {'class': 'fraud'}, inplace = True)

In [2]:
# 1. Split Test Data Out
creditcard.drop(columns = 'time', inplace = True)

# Normalize the 'amount' column
scaler = StandardScaler()
creditcard['amount'] = scaler.fit_transform(creditcard['amount'].values.reshape(-1, 1))
# creditcard.drop(columns = 'amount', inplace = True)

X = creditcard.iloc[:, :-1]
y = creditcard.iloc[:, -1]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size = .33, stratify = y, random_state = seed)

## RandomForestClassifier

Usually for imbalanced data, we can try:

1. Collect more data (which not work here since the data is given)
2. Down-Sampling or Over-Sampling to get balanced samples
3. Change the Thresholds to adjust the prediction
4. Assign class weights for the low rate class

## 1. Use the Imbalanced Data Directly in RandomForestClassifier

In [3]:
X = creditcard.iloc[:, :-1]
y = creditcard.iloc[:, -1]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size = .33, stratify = y, random_state = seed)

estimator = RandomForestClassifier(random_state=0, warm_start = True)

In [4]:
# rf_tuned_parameters = {"max_depth": [10, 20, 50, 100], 'n_estimators': [50, 100, 200, 500], 'min_samples_leaf': [10, 20, 50]}

In [5]:
rf_tuned_parameters = {"max_depth": [10,20], 'n_estimators': [50], 'min_samples_leaf': [10]}

In [6]:
cv_grid = GridSearchCV(estimator, param_grid = rf_tuned_parameters, scoring = 'roc_auc', verbose = 5) # 'recall', my_score
cv_grid.fit(Xtrain, ytrain)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END max_depth=10, min_samples_leaf=10, n_estimators=50; total time=  43.8s
[CV 2/5] END max_depth=10, min_samples_leaf=10, n_estimators=50; total time=  46.6s
[CV 3/5] END max_depth=10, min_samples_leaf=10, n_estimators=50; total time=  49.4s
[CV 4/5] END max_depth=10, min_samples_leaf=10, n_estimators=50; total time=  45.7s
[CV 5/5] END max_depth=10, min_samples_leaf=10, n_estimators=50; total time=  49.9s
[CV 1/5] END max_depth=20, min_samples_leaf=10, n_estimators=50; total time= 1.3min
[CV 2/5] END max_depth=20, min_samples_leaf=10, n_estimators=50; total time= 1.3min
[CV 3/5] END max_depth=20, min_samples_leaf=10, n_estimators=50; total time= 1.1min
[CV 4/5] END max_depth=20, min_samples_leaf=10, n_estimators=50; total time= 1.2min
[CV 5/5] END max_depth=20, min_samples_leaf=10, n_estimators=50; total time= 1.1min


GridSearchCV(estimator=RandomForestClassifier(random_state=0, warm_start=True),
             param_grid={'max_depth': [10, 20], 'min_samples_leaf': [10],
                         'n_estimators': [50]},
             scoring='roc_auc', verbose=5)

In [8]:
print (cv_grid.cv_results_)

{'mean_fit_time': array([46.96056376, 70.96481733]), 'std_fit_time': array([2.26770363, 4.52632999]), 'mean_score_time': array([0.2196723 , 0.25400629]), 'std_score_time': array([0.02367693, 0.02137304]), 'param_max_depth': masked_array(data=[10, 20],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'param_min_samples_leaf': masked_array(data=[10, 10],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'param_n_estimators': masked_array(data=[50, 50],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'params': [{'max_depth': 10, 'min_samples_leaf': 10, 'n_estimators': 50}, {'max_depth': 20, 'min_samples_leaf': 10, 'n_estimators': 50}], 'split0_test_score': array([0.97704246, 0.94268887]), 'split1_test_score': array([0.98400556, 0.98041415]), 'split2_test_score': array([0.95973025, 0.97012668]), 'split3_test_score': array([0.97796691, 0.95879725]), 'split4_test_score': array([0.96511648

In [9]:
cv_grid.cv_results_

{'mean_fit_time': array([46.96056376, 70.96481733]),
 'std_fit_time': array([2.26770363, 4.52632999]),
 'mean_score_time': array([0.2196723 , 0.25400629]),
 'std_score_time': array([0.02367693, 0.02137304]),
 'param_max_depth': masked_array(data=[10, 20],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_leaf': masked_array(data=[10, 10],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[50, 50],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 10, 'min_samples_leaf': 10, 'n_estimators': 50},
  {'max_depth': 20, 'min_samples_leaf': 10, 'n_estimators': 50}],
 'split0_test_score': array([0.97704246, 0.94268887]),
 'split1_test_score': array([0.98400556, 0.98041415]),
 'split2_test_score': array([0.95973025, 0.97012668]),
 'split3_test_score': array([0.97796691, 0.95879725]),
 'split4_test_sc

In [10]:
best_parameters = cv_grid.best_estimator_.get_params()

In [11]:
best_parameters

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 10,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 50,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': True}

In [12]:
for param_name in sorted(rf_tuned_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

	max_depth: 10
	min_samples_leaf: 10
	n_estimators: 50


In [13]:
pred_test = cv_grid.predict(Xtest)
print(recall_score(ytest, pred_test))     # 0.65
print(precision_score(ytest, pred_test))  # 0.85
print(roc_auc_score(ytest, pred_test))    # 0.83
print("confustion matrix on validation data: \n" + str(confusion_matrix(ytest, pred_test)))

0.7530864197530864
0.9172932330827067
0.8764845901056932
confustion matrix on validation data: 
[[93814    11]
 [   40   122]]


## 2. Create Over-sampling data and Fit the model

## 3. RandomForestClassifier with class_weight

In [18]:
X = creditcard.iloc[:, :-1]
y = creditcard.iloc[:, -1]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size = .33, stratify = y, random_state = seed)

In [19]:
positive_weight = sum(ytrain == 0) / sum(ytrain == 1)  # size to repeat y == 1

estimator = RandomForestClassifier(random_state=0, class_weight = {0 : 1, 1 : positive_weight}, warm_start = True)

In [20]:
# rf_tuned_parameters = {"max_depth": [10, 20, 50, 100], 'n_estimators': [50, 100, 200, 500], 'min_samples_leaf': [10, 20, 50]}

In [21]:
rf_tuned_parameters = {"max_depth": [10,20], 'n_estimators': [50], 'min_samples_leaf': [10]}

In [22]:
cv_grid = GridSearchCV(estimator, param_grid = rf_tuned_parameters, scoring = 'roc_auc', verbose = 5) # 'recall', my_score
cv_grid.fit(Xtrain, ytrain)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END max_depth=10, min_samples_leaf=10, n_estimators=50; total time=  38.8s
[CV 2/5] END max_depth=10, min_samples_leaf=10, n_estimators=50; total time=  33.6s
[CV 3/5] END max_depth=10, min_samples_leaf=10, n_estimators=50; total time=  30.2s
[CV 4/5] END max_depth=10, min_samples_leaf=10, n_estimators=50; total time=  32.1s
[CV 5/5] END max_depth=10, min_samples_leaf=10, n_estimators=50; total time=  31.7s
[CV 1/5] END max_depth=20, min_samples_leaf=10, n_estimators=50; total time=  30.7s
[CV 2/5] END max_depth=20, min_samples_leaf=10, n_estimators=50; total time=  31.9s
[CV 3/5] END max_depth=20, min_samples_leaf=10, n_estimators=50; total time=  29.9s
[CV 4/5] END max_depth=20, min_samples_leaf=10, n_estimators=50; total time=  32.6s
[CV 5/5] END max_depth=20, min_samples_leaf=10, n_estimators=50; total time=  31.5s


GridSearchCV(estimator=RandomForestClassifier(class_weight={0: 1,
                                                            1: 577.2424242424242},
                                              random_state=0, warm_start=True),
             param_grid={'max_depth': [10, 20], 'min_samples_leaf': [10],
                         'n_estimators': [50]},
             scoring='roc_auc', verbose=5)

In [23]:
cv_grid.cv_results_

{'mean_fit_time': array([33.16161108, 31.24362402]),
 'std_fit_time': array([2.92718634, 0.95469386]),
 'mean_score_time': array([0.20468321, 0.18049707]),
 'std_score_time': array([0.03230071, 0.00195841]),
 'param_max_depth': masked_array(data=[10, 20],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_leaf': masked_array(data=[10, 10],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[50, 50],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 10, 'min_samples_leaf': 10, 'n_estimators': 50},
  {'max_depth': 20, 'min_samples_leaf': 10, 'n_estimators': 50}],
 'split0_test_score': array([0.99003626, 0.96549151]),
 'split1_test_score': array([0.97211756, 0.96656967]),
 'split2_test_score': array([0.95961512, 0.94179007]),
 'split3_test_score': array([0.98113637, 0.98189816]),
 'split4_test_sc

In [24]:
best_parameters = cv_grid.best_estimator_.get_params()
best_parameters

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': {0: 1, 1: 577.2424242424242},
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 10,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 50,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': True}

In [25]:
for param_name in sorted(rf_tuned_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

	max_depth: 10
	min_samples_leaf: 10
	n_estimators: 50


In [26]:
pred_test = cv_grid.predict(Xtest)
print(recall_score(ytest, pred_test))     #  0.85
print(precision_score(ytest, pred_test))  #  0.81
print(roc_auc_score(ytest, pred_test))    #  0.92
print("\n confustion matrix on validation data: \n" + str(confusion_matrix(ytest, pred_test)))

0.8333333333333334
0.8490566037735849
0.9165387689848122

 confustion matrix on validation data: 
[[93801    24]
 [   27   135]]


## 4. Self-defined Score and GridSearchCV of hyperparameter

In [30]:
def scoring(ground_truth, predictions):
    '''
    based on results above about the average loss from false positive and false negative predictions.
    '''
    cmatrix = confusion_matrix(ground_truth, predictions)
    fp = cmatrix[0, 1]
    fn = cmatrix[1, 0]
    return  fn * 122 + fp * 1.76

In [31]:
wt_loss_score = make_scorer(scoring, greater_is_better = False)

In [32]:
X = creditcard.iloc[:, :-1]
y = creditcard.iloc[:, -1]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size = .33, stratify = y, random_state = seed)

estimator = RandomForestClassifier(random_state=0, warm_start = True)

In [33]:
estimator = RandomForestClassifier(random_state=0, warm_start = True)

In [34]:
# rf_tuned_parameters = {"max_depth": [10, 20, 50, 100], 'n_estimators': [50, 100, 200, 500], 'min_samples_leaf': [10, 20, 50]}

In [35]:
rf_tuned_parameters = {"max_depth": [10,20], 'n_estimators': [50], 'min_samples_leaf': [10]}

In [37]:
cv_grid = GridSearchCV(estimator, param_grid = rf_tuned_parameters, scoring = wt_loss_score, verbose = 5)
cv_grid.fit(Xtrain, ytrain)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END max_depth=10, min_samples_leaf=10, n_estimators=50; total time=  49.9s
[CV 2/5] END max_depth=10, min_samples_leaf=10, n_estimators=50; total time=  44.4s
[CV 3/5] END max_depth=10, min_samples_leaf=10, n_estimators=50; total time=  44.8s
[CV 4/5] END max_depth=10, min_samples_leaf=10, n_estimators=50; total time=  48.2s
[CV 5/5] END max_depth=10, min_samples_leaf=10, n_estimators=50; total time=  46.3s
[CV 1/5] END max_depth=20, min_samples_leaf=10, n_estimators=50; total time= 1.0min
[CV 2/5] END max_depth=20, min_samples_leaf=10, n_estimators=50; total time= 1.1min
[CV 3/5] END max_depth=20, min_samples_leaf=10, n_estimators=50; total time= 1.0min
[CV 4/5] END max_depth=20, min_samples_leaf=10, n_estimators=50; total time= 1.0min
[CV 5/5] END max_depth=20, min_samples_leaf=10, n_estimators=50; total time= 1.0min


GridSearchCV(estimator=RandomForestClassifier(random_state=0, warm_start=True),
             param_grid={'max_depth': [10, 20], 'min_samples_leaf': [10],
                         'n_estimators': [50]},
             scoring=make_scorer(scoring, greater_is_better=False), verbose=5)

In [38]:
cv_grid.best_params_

{'max_depth': 20, 'min_samples_leaf': 10, 'n_estimators': 50}

In [39]:
best_parameters = cv_grid.best_estimator_.get_params()
best_parameters

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 20,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 10,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 50,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': True}

In [40]:
for param_name in sorted(rf_tuned_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

	max_depth: 20
	min_samples_leaf: 10
	n_estimators: 50


In [41]:
pred_test = cv_grid.predict(Xtest)
print(recall_score(ytest, pred_test))     # 0.84
print(precision_score(ytest, pred_test))  # 0.84
print(roc_auc_score(ytest, pred_test))    # 0.92
print("\n confustion matrix on validation data: \n" + str(confusion_matrix(ytest, pred_test)))

0.7592592592592593
0.917910447761194
0.8795710098587797

 confustion matrix on validation data: 
[[93814    11]
 [   39   123]]
