In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, Normalizer, scale
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve
from sklearn.metrics import make_scorer, precision_score, fbeta_score, f1_score, classification_report
from sklearn.model_selection import cross_val_score, train_test_split, KFold, StratifiedShuffleSplit, GridSearchCV
from sklearn.linear_model import LogisticRegression

%matplotlib inline
plt.rcParams['figure.figsize'] = (16, 9)

seed = 999

creditcard = pd.read_csv('./data/creditcard.csv')
creditcard.columns = [x.lower() for x in creditcard.columns]
creditcard.rename(columns = {'class': 'fraud'}, inplace = True)

In [2]:
# 1. Split Test Data Out
creditcard.drop(columns = 'time', inplace = True)

# Normalize the 'amount' column
scaler = StandardScaler()
creditcard['amount'] = scaler.fit_transform(creditcard['amount'].values.reshape(-1, 1))
# creditcard.drop(columns = 'amount', inplace = True)

X = creditcard.iloc[:, :-1]
y = creditcard.iloc[:, -1]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size = .33, stratify = y, random_state = seed)

## RandomForestClassifier

Usually for imbalanced data, we can try:

1. Collect more data (which not work here since the data is given)
2. Down-Sampling or Over-Sampling to get balanced samples
3. Change the Thresholds to adjust the prediction
4. Assign class weights for the low rate class

## 1. Use the Imbalanced Data Directly in RandomForestClassifier

In [3]:
X = creditcard.iloc[:, :-1]
y = creditcard.iloc[:, -1]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size = .33, stratify = y, random_state = seed)

estimator = RandomForestClassifier(random_state=0, warm_start = True)

In [4]:
rf_tuned_parameters = {"max_depth": [10, 20, 50, 100], 'n_estimators': [50, 100, 200, 500], 'min_samples_leaf': [10, 20, 50]}

In [None]:
cv_grid = GridSearchCV(estimator, param_grid = rf_tuned_parameters, scoring = 'roc_auc', verbose = 5, n_jobs = 70) # 'recall', my_score
cv_grid.fit(Xtrain, ytrain)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


Exception in thread Thread-6:
Traceback (most recent call last):
  File "C:\Users\Owner\Anaconda3\lib\threading.py", line 926, in _bootstrap_inner
    self.run()
  File "C:\Users\Owner\Anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 555, in run
    result_item, is_broken, bpe = self.wait_result_broken_or_wakeup()
  File "C:\Users\Owner\Anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 609, in wait_result_broken_or_wakeup
    ready = wait(readers + worker_sentinels)
  File "C:\Users\Owner\Anaconda3\lib\multiprocessing\connection.py", line 869, in wait
    ready_handles = _exhaustive_wait(waithandle_to_obj.keys(), timeout)
  File "C:\Users\Owner\Anaconda3\lib\multiprocessing\connection.py", line 801, in _exhaustive_wait
    res = _winapi.WaitForMultipleObjects(L, False, timeout)
ValueError: need at most 63 handles, got a sequence of length 71



In [None]:
# print cv_grid.cv_results_

In [None]:
best_parameters = cv_grid.best_estimator_.get_params()

In [None]:
# for param_name in sorted(rf_tuned_parameters.keys()):
#     print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
pred_test = cv_grid.predict(Xtest)
print(recall_score(ytest, pred_test))     # 0.65
print(precision_score(ytest, pred_test))  # 0.85
print(roc_auc_score(ytest, pred_test))    # 0.83
print("confustion matrix on validation data: \n" + str(confusion_matrix(ytest, pred_test)))

## 2. Create Over-sampling data and Fit the model

In [None]:
oversample_ratio = sum(ytrain == 0) / sum(ytrain == 1)  # size to repeat y == 1
# repeat the positive data for X and y
ytrain_pos_oversample = pd.concat([ytrain[ytrain==1]] * oversample_ratio, axis = 0)
Xtrain_pos_oversample = pd.concat([Xtrain.loc[ytrain==1, :]] * oversample_ratio, axis = 0)
# concat the repeated data with the original data together
ytrain_oversample = pd.concat([ytrain, ytrain_pos_oversample], axis = 0).reset_index(drop = True)
Xtrain_oversample = pd.concat([Xtrain, Xtrain_pos_oversample], axis = 0).reset_index(drop = True)

In [None]:
ytrain_oversample.value_counts(dropna = False, normalize = True)   # 50:50

estimator = RandomForestClassifier(random_state=0, warm_start = True)

In [None]:
rf_tuned_parameters = {"max_depth": [10, 20, 50, 100], 'n_estimators': [50, 100, 200, 500], 'min_samples_leaf': [10, 20, 50]}

In [None]:
cv_grid = GridSearchCV(estimator, param_grid = rf_tuned_parameters, scoring = 'roc_auc', verbose = 5, n_jobs = 70) # 'recall', my_score
cv_grid.fit(Xtrain_oversample, ytrain_oversample)

In [None]:
# print cv_grid.best_params_
# print cv_grid.cv_results_

In [None]:
best_parameters = cv_grid.best_estimator_.get_params()

In [None]:
# for param_name in sorted(rf_tuned_parameters.keys()):
#     print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
pred_test = cv_grid.predict(Xtest)
print(recall_score(ytest, pred_test))     # 0.83
print(precision_score(ytest, pred_test))  # 0.83
print(roc_auc_score(ytest, pred_test))    # 0.92
print("\n confustion matrix on validation data: \n" + str(confusion_matrix(ytest, pred_test)))

## 3. RandomForestClassifier with class_weight

In [None]:
X = creditcard.iloc[:, :-1]
y = creditcard.iloc[:, -1]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size = .33, stratify = y, random_state = seed)

In [None]:
positive_weight = sum(ytrain == 0) / sum(ytrain == 1)  # size to repeat y == 1

estimator = RandomForestClassifier(random_state=0, class_weight = {0 : 1, 1 : positive_weight}, warm_start = True)

rf_tuned_parameters = {"max_depth": [10, 20, 50, 100], 'n_estimators': [50, 100, 200, 500], 'min_samples_leaf': [10, 20, 50]}

cv_grid = GridSearchCV(estimator, param_grid = rf_tuned_parameters, scoring = 'roc_auc', verbose = 5, n_jobs = 70) # 'recall', my_score
cv_grid.fit(Xtrain, ytrain)

In [None]:
# print cv_grid.cv_results_

In [None]:
best_parameters = cv_grid.best_estimator_.get_params()

In [None]:
# for param_name in sorted(rf_tuned_parameters.keys()):
#     print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
pred_test = cv_grid.predict(Xtest)
print(recall_score(ytest, pred_test))     #  0.85
print(precision_score(ytest, pred_test))  #  0.81
print(roc_auc_score(ytest, pred_test))    #  0.92
print("\n confustion matrix on validation data: \n" + str(confusion_matrix(ytest, pred_test)))

## 4. Self-defined Score and GridSearchCV of hyperparameter

In [None]:
def scoring(ground_truth, predictions):
    '''
    based on results above about the average loss from false positive and false negative predictions.
    '''
    cmatrix = confusion_matrix(ground_truth, predictions)
    fp = cmatrix[0, 1]
    fn = cmatrix[1, 0]
    return  fn * 122 + fp * 1.76

In [None]:
wt_loss_score = make_scorer(scoring, greater_is_better = False)

In [None]:
oversample_ratio = sum(ytrain == 0) / sum(ytrain == 1)  # size to repeat y == 1
# repeat the positive data for X and y
ytrain_pos_oversample = pd.concat([ytrain[ytrain==1]] * oversample_ratio, axis = 0)
Xtrain_pos_oversample = pd.concat([Xtrain.loc[ytrain==1, :]] * oversample_ratio, axis = 0)
# concat the repeated data with the original data together
ytrain_oversample = pd.concat([ytrain, ytrain_pos_oversample], axis = 0).reset_index(drop = True)
Xtrain_oversample = pd.concat([Xtrain, Xtrain_pos_oversample], axis = 0).reset_index(drop = True)

In [None]:
ytrain_oversample.value_counts(dropna = False, normalize = True)   # 50:50

In [None]:
estimator = RandomForestClassifier(random_state=0, warm_start = True)

In [None]:
rf_tuned_parameters = {"max_depth": [10, 20, 50, 100], 'n_estimators': [50, 100, 200, 500], 
                       'min_samples_leaf': [10, 20, 50]}

In [None]:
cv_grid = GridSearchCV(estimator, param_grid = rf_tuned_parameters, scoring = wt_loss_score, verbose = 5, n_jobs = 70)
cv_grid.fit(Xtrain_oversample, ytrain_oversample)

In [None]:
# print cv_grid.best_params_

In [None]:
pred_test = cv_grid.predict(Xtest)
print(recall_score(ytest, pred_test))     # 0.84
print(precision_score(ytest, pred_test))  # 0.84
print(roc_auc_score(ytest, pred_test))    # 0.92
print("\n confustion matrix on validation data: \n" + str(confusion_matrix(ytest, pred_test)))