In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
%matplotlib inline

from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score
from xgboost import XGBClassifier

In [None]:
# def make_confusion_matrix(model, threshold=0.5):
#     y_predict = (model.predict_proba(X_test)[:, 1] >= threshold)
#     fraud_confusion = confusion_matrix(y_test, y_predict)
#     plt.figure(dpi=80)
#     sns.heatmap(fraud_confusion, cmap=plt.cm.Blues, annot=True, square=True, fmt='d',
#            xticklabels=['legit', 'fraud'],
#            yticklabels=['legit', 'fraud']);
#     plt.xlabel('prediction')
#     plt.ylabel('actual')

In [None]:
# import matplotlib.cm as cm

# def visualize(X, y, bdry='diag'):
#     c = cm.rainbow(np.linspace(0, 1, 2))
#     plt.scatter([i[0] for i in X], [i[1] for i in X], color=[c[i] for i in y], alpha=.5)
    
#     #Plot the true decision boundary
#     if bdry == 'diag':
#         plt.plot([0, 1], [0, 1], 'k--')
#     elif bdry == 'quadrant':
#         plt.plot([0, 1], [0.5, 0.5], 'k--')
#         plt.plot([0.5, 0.5], [0, 1], 'k--')
        
#     plt.grid(True)

In [None]:
# def plot_features(df, sample_size=500):
    
#     sample = (df.drop(['product_id','user_id','latest_cart'],axis=1)
#                 .sample(1000, random_state=42)) 
#     sns.pairplot(sample,hue='in_cart', plot_kws=dict(alpha=.3, edgecolor='none'))

# plot_features(df_X1)

In [None]:
# def get_user_split_data(df, test_size=.2, seed=42):

#     rs = np.random.RandomState(seed)
    
#     total_users = df['user_id'].unique() 
#     test_users = rs.choice(total_users, 
#                            size=int(total_users.shape[0] * test_size), 
#                            replace=False)

#     df_tr = df[~df['user_id'].isin(test_users)]
#     df_te = df[df['user_id'].isin(test_users)] 

#     y_tr, y_te = df_tr['in_cart'], df_te['in_cart']
#     X_tr = df_tr.drop(['product_id','user_id','latest_cart','in_cart'],axis=1) 
#     X_te = df_te.drop(['product_id','user_id','latest_cart','in_cart'],axis=1)

#     return X_tr, X_te, y_tr, y_te

In [None]:
# Load the card-transaction dataset; the path is relative, so it resolves
# against the directory the notebook kernel was started in.
default_df = pd.read_csv('../01_data_collection/cc_fraud/card_transdata.csv')

In [None]:
# (rows, columns) of the raw dataset
default_df.shape

In [None]:
# Column dtypes and non-null counts — quick check for missing values
default_df.info()

In [None]:
# Preview the first five rows
default_df.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
default_df.describe()

In [None]:
# Pairwise feature relationships on the first 5,000 rows, coloured by the
# fraud label (the subset is presumably there to keep the pairplot fast).
# NOTE(review): this is a head slice, not a random sample.
sample_df = default_df.iloc[:5000]
sns.pairplot(data=sample_df, hue='fraud');

In [None]:
# Number of rows per target class — shows how imbalanced the labels are
default_df['fraud'].value_counts()

In [None]:
# Bar chart of the target class balance.
# The column is passed by keyword (`x=` / `data=`): seaborn 0.12+ takes `data`
# as the first positional parameter, so a bare positional Series is no longer
# counted per class.
# Tick labels assume the target is coded 0 = legit, 1 = fraud — TODO confirm.
labels = ['legit', 'fraud']
sns.countplot(x='fraud', data=default_df)
plt.title('Transaction Class Distribution', fontsize=16)
plt.xticks(range(2), labels)
plt.xlabel('transaction type', fontsize=14)
plt.ylabel('frequency', fontsize=14);

In [None]:
# Separate the target column from the predictor columns
y = default_df['fraud']
X = default_df.drop('fraud', axis=1)

In [None]:
# Feature matrix dimensions: (rows, predictor columns)
X.shape

In [None]:
# Target vector length — should match the row count of X
y.shape

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Training split dimensions (70% of the rows, given test_size=0.3 above)
X_train.shape

In [None]:
# Standardized scaling: the scaler is fitted on the training split only and
# the same fitted transform is then applied to the test split, so no test-set
# statistics are used during fitting.
std = StandardScaler()
X_train_scaled = std.fit_transform(X_train.values)
X_test_scaled = std.transform(X_test.values)

In [None]:
# Instantiate KNeighborsClassifier with library defaults and fit it on the
# scaled training features (all columns).
knn = KNeighborsClassifier()
knn.fit(X_train_scaled, y_train)

In [None]:
print('kNN Scores:')

# kNN accuracy on both splits
print('Accuracy (train): ', knn.score(X_train_scaled, y_train))
print('Accuracy (test): ', knn.score(X_test_scaled, y_test))

# kNN precision, recall, f1 on the test split (classifier's default decision rule)
y_predict = knn.predict(X_test_scaled)
for metric_label, metric_fn in (('Precision: ', precision_score),
                                ('Recall: ', recall_score),
                                ('F1: ', f1_score)):
    print(metric_label, metric_fn(y_test, y_predict))

# kNN log loss, from the predicted probability of the class at index 1
# (presumably fraud = 1)
y_prob_pred_train = knn.predict_proba(X_train_scaled)[:, 1]
y_prob_pred_test = knn.predict_proba(X_test_scaled)[:, 1]
print('Log Loss (train): ', log_loss(y_train, y_prob_pred_train))
print('Log Loss (test): ', log_loss(y_test, y_prob_pred_test))

In [None]:
# kNN confusion matrix on the test split (rows = actual, columns = predicted)
knn_confusion = confusion_matrix(y_test, knn.predict(X_test_scaled))

class_names = ['legit', 'fraud']
plt.figure(dpi=80)
sns.heatmap(
    knn_confusion,
    annot=True, fmt='d',   # write the integer count inside each cell
    cmap=plt.cm.Blues,
    square=True,
    xticklabels=class_names,
    yticklabels=class_names,
)

plt.title('kNN Confusion Matrix: Credit Card Transactions')
plt.xlabel('Predicted')
plt.ylabel('Actual');

In [None]:
# knn precision-recall curve
precision_curve, recall_curve, threshold_curve = precision_recall_curve(
    y_test, knn.predict_proba(X_test_scaled)[:, 1]
)

# precision_recall_curve returns one more precision/recall value than it
# returns thresholds: entry i pairs with threshold i, and the extra final entry
# (precision 1, recall 0) has no threshold. Drop that last entry with [:-1];
# slicing with [1:] would pair every value with the wrong threshold.
plt.figure(dpi=80)
plt.plot(threshold_curve, precision_curve[:-1], label='precision')
plt.plot(threshold_curve, recall_curve[:-1], label='recall')
plt.legend(loc='lower left')
plt.xlabel('Threshold (above this probability, label as fraud)')
plt.title('Precision and Recall Curves');

In [None]:
# knn ROC curve
# Score the test split once and reuse the probabilities for both the curve and
# the AUC, instead of calling predict_proba a second time.
knn_test_proba = knn.predict_proba(X_test_scaled)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, knn_test_proba)

plt.plot(fpr, tpr, lw=2)
plt.plot([0, 1], [0, 1], c='violet', ls='--')  # chance-level diagonal for reference
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])

plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve for fraud problem');
print("ROC AUC score = ", roc_auc_score(y_test, knn_test_proba))

In [None]:
# Instantiate LogisticRegression with library defaults and fit it on the
# scaled training features (all columns).
logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)

In [None]:
print('Logistic Regression Scores:')

# logistic regression accuracy on both splits
print('Accuracy (train): ', logreg.score(X_train_scaled, y_train))
print('Accuracy (test): ', logreg.score(X_test_scaled, y_test))

# logistic regression precision, recall, f1 on the test split (default threshold)
y_predict = logreg.predict(X_test_scaled)
for metric_label, metric_fn in (('Precision: ', precision_score),
                                ('Recall: ', recall_score),
                                ('F1: ', f1_score)):
    print(metric_label, metric_fn(y_test, y_predict))

# logistic regression log loss, from the predicted probability of the class at
# index 1 (presumably fraud = 1)
y_prob_pred_train = logreg.predict_proba(X_train_scaled)[:, 1]
y_prob_pred_test = logreg.predict_proba(X_test_scaled)[:, 1]
print('Log Loss (train): ', log_loss(y_train, y_prob_pred_train))
print('Log Loss (test): ', log_loss(y_test, y_prob_pred_test))

In [None]:
# logistic regression confusion matrix on the test split
# (rows = actual, columns = predicted)
logreg_confusion = confusion_matrix(y_test, logreg.predict(X_test_scaled))

class_names = ['legit', 'fraud']
plt.figure(dpi=80)
sns.heatmap(
    logreg_confusion,
    annot=True, fmt='d',   # write the integer count inside each cell
    cmap=plt.cm.Blues,
    square=True,
    xticklabels=class_names,
    yticklabels=class_names,
)

plt.title('Logistic Regression Confusion Matrix: Credit Card Transactions')
plt.xlabel('Predicted')
plt.ylabel('Actual');

In [None]:
# logistic regression precision-recall curve
precision_curve, recall_curve, threshold_curve = precision_recall_curve(
    y_test, logreg.predict_proba(X_test_scaled)[:, 1]
)

# precision/recall arrays hold one more entry than the thresholds array: entry i
# pairs with threshold i and the final entry (precision 1, recall 0) has no
# threshold, so the last entry is dropped with [:-1] to keep the curves aligned.
plt.figure(dpi=80)
plt.plot(threshold_curve, precision_curve[:-1], label='precision')
plt.plot(threshold_curve, recall_curve[:-1], label='recall')
plt.legend(loc='lower left')
plt.xlabel('Threshold (above this probability, label as fraud)')
plt.title('Precision and Recall Curves');

In [None]:
# logistic regression ROC curve
# Score the test split once and reuse the probabilities for both the curve and
# the AUC, instead of calling predict_proba a second time.
logreg_test_proba = logreg.predict_proba(X_test_scaled)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, logreg_test_proba)

plt.plot(fpr, tpr, lw=2)
plt.plot([0, 1], [0, 1], c='violet', ls='--')  # chance-level diagonal for reference
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])

plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve for fraud problem');
print("ROC AUC score = ", roc_auc_score(y_test, logreg_test_proba))

In [None]:
# instantiate DecisionTreeClassifier and fit (all features)
# random_state is fixed because the tree permutes features at each split and
# breaks ties between equally good splits at random; without a seed the fitted
# tree (and every score derived from it) can change between runs. 42 matches
# the seed used for the train/test split.
decisiontree = DecisionTreeClassifier(random_state=42)
decisiontree.fit(X_train_scaled, y_train)

In [None]:
print('Decision Tree Scores:')

# decision tree accuracy on both splits
print('Accuracy (train): ', decisiontree.score(X_train_scaled, y_train))
print('Accuracy (test): ', decisiontree.score(X_test_scaled, y_test))

# decision tree precision, recall, f1 on the test split (default threshold)
y_predict = decisiontree.predict(X_test_scaled)
for metric_label, metric_fn in (('Precision: ', precision_score),
                                ('Recall: ', recall_score),
                                ('F1: ', f1_score)):
    print(metric_label, metric_fn(y_test, y_predict))

# decision tree log loss, from the predicted probability of the class at
# index 1 (presumably fraud = 1)
y_prob_pred_train = decisiontree.predict_proba(X_train_scaled)[:, 1]
y_prob_pred_test = decisiontree.predict_proba(X_test_scaled)[:, 1]
print('Log Loss (train): ', log_loss(y_train, y_prob_pred_train))
print('Log Loss (test): ', log_loss(y_test, y_prob_pred_test))

In [None]:
# decision tree confusion matrix on the test split
# (rows = actual, columns = predicted)
decisiontree_confusion = confusion_matrix(y_test, decisiontree.predict(X_test_scaled))

class_names = ['legit', 'fraud']
plt.figure(dpi=80)
sns.heatmap(
    decisiontree_confusion,
    annot=True, fmt='d',   # write the integer count inside each cell
    cmap=plt.cm.Blues,
    square=True,
    xticklabels=class_names,
    yticklabels=class_names,
)

plt.title('Decision Tree Confusion Matrix: Credit Card Transactions')
plt.xlabel('Predicted')
plt.ylabel('Actual');

In [None]:
# decision tree precision-recall curve
precision_curve, recall_curve, threshold_curve = precision_recall_curve(
    y_test, decisiontree.predict_proba(X_test_scaled)[:, 1]
)

# precision/recall arrays hold one more entry than the thresholds array: entry i
# pairs with threshold i and the final entry (precision 1, recall 0) has no
# threshold, so the last entry is dropped with [:-1] to keep the curves aligned.
plt.figure(dpi=80)
plt.plot(threshold_curve, precision_curve[:-1], label='precision')
plt.plot(threshold_curve, recall_curve[:-1], label='recall')
plt.legend(loc='lower left')
plt.xlabel('Threshold (above this probability, label as fraud)')
plt.title('Precision and Recall Curves');

In [None]:
# decision tree ROC curve
# Score the test split once and reuse the probabilities for both the curve and
# the AUC, instead of calling predict_proba a second time.
decisiontree_test_proba = decisiontree.predict_proba(X_test_scaled)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, decisiontree_test_proba)

plt.plot(fpr, tpr, lw=2)
plt.plot([0, 1], [0, 1], c='violet', ls='--')  # chance-level diagonal for reference
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])

plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve for fraud problem');
print("ROC AUC score = ", roc_auc_score(y_test, decisiontree_test_proba))

In [None]:
# instantiate RandomForestClassifier and fit (all features)
# random_state pins the bootstrap sampling and per-split feature sub-sampling,
# so the scores and plots below are reproducible across kernel restarts.
# 42 matches the seed used for the train/test split.
randomforest = RandomForestClassifier(random_state=42)
randomforest.fit(X_train_scaled, y_train)

In [None]:
print('Random Forest Scores:')

# random forest accuracy on both splits
print('Accuracy (train): ', randomforest.score(X_train_scaled, y_train))
print('Accuracy (test): ', randomforest.score(X_test_scaled, y_test))

# random forest precision, recall, f1 on the test split (default threshold)
y_predict = randomforest.predict(X_test_scaled)
for metric_label, metric_fn in (('Precision: ', precision_score),
                                ('Recall: ', recall_score),
                                ('F1: ', f1_score)):
    print(metric_label, metric_fn(y_test, y_predict))

# random forest log loss, from the predicted probability of the class at
# index 1 (presumably fraud = 1)
y_prob_pred_train = randomforest.predict_proba(X_train_scaled)[:, 1]
y_prob_pred_test = randomforest.predict_proba(X_test_scaled)[:, 1]
print('Log Loss (train): ', log_loss(y_train, y_prob_pred_train))
print('Log Loss (test): ', log_loss(y_test, y_prob_pred_test))

In [None]:
# random forest confusion matrix on the test split
# (rows = actual, columns = predicted)
randomforest_confusion = confusion_matrix(y_test, randomforest.predict(X_test_scaled))

class_names = ['legit', 'fraud']
plt.figure(dpi=80)
sns.heatmap(
    randomforest_confusion,
    annot=True, fmt='d',   # write the integer count inside each cell
    cmap=plt.cm.Blues,
    square=True,
    xticklabels=class_names,
    yticklabels=class_names,
)

plt.title('Random Forest Confusion Matrix: Credit Card Transactions')
plt.xlabel('Predicted')
plt.ylabel('Actual');

In [None]:
# random forest precision-recall curve
precision_curve, recall_curve, threshold_curve = precision_recall_curve(
    y_test, randomforest.predict_proba(X_test_scaled)[:, 1]
)

# precision/recall arrays hold one more entry than the thresholds array: entry i
# pairs with threshold i and the final entry (precision 1, recall 0) has no
# threshold, so the last entry is dropped with [:-1] to keep the curves aligned.
plt.figure(dpi=80)
plt.plot(threshold_curve, precision_curve[:-1], label='precision')
plt.plot(threshold_curve, recall_curve[:-1], label='recall')
plt.legend(loc='lower left')
plt.xlabel('Threshold (above this probability, label as fraud)')
plt.title('Precision and Recall Curves');

In [None]:
# random forest ROC curve
# Score the test split once and reuse the probabilities for both the curve and
# the AUC, instead of calling predict_proba a second time.
randomforest_test_proba = randomforest.predict_proba(X_test_scaled)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, randomforest_test_proba)

plt.plot(fpr, tpr, lw=2)
plt.plot([0, 1], [0, 1], c='violet', ls='--')  # chance-level diagonal for reference
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])

plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve for fraud problem');
print("ROC AUC score = ", roc_auc_score(y_test, randomforest_test_proba))

In [None]:
# Instantiate XGBClassifier (binary logistic objective, 100 boosting rounds,
# learning rate 0.5) and fit it on the scaled training features (all columns).
xgb = XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    learning_rate=0.5,
)
# Parameters not set here (presumably a to-do list of tuning candidates):
#   max_depth
#   nthread
#   colsample_bytree
#   min_child_weight

xgb.fit(X_train_scaled, y_train)

In [None]:
print('XGBoost Scores:')

# xgb accuracy on both splits
print('Accuracy (train): ', xgb.score(X_train_scaled, y_train))
print('Accuracy (test): ', xgb.score(X_test_scaled, y_test))

# xgb precision, recall, f1 on the test split (default threshold)
y_predict = xgb.predict(X_test_scaled)
for metric_label, metric_fn in (('Precision: ', precision_score),
                                ('Recall: ', recall_score),
                                ('F1: ', f1_score)):
    print(metric_label, metric_fn(y_test, y_predict))

# xgb log loss, from the predicted probability of the class at index 1
# (presumably fraud = 1). The probabilities are cast to float64 before scoring —
# presumably because XGBoost returns float32; TODO confirm the cast is still needed.
y_prob_pred_train = xgb.predict_proba(X_train_scaled)[:, 1].astype(np.float64)
y_prob_pred_test = xgb.predict_proba(X_test_scaled)[:, 1].astype(np.float64)
print('Log Loss (train): ', log_loss(y_train, y_prob_pred_train))
print('Log Loss (test): ', log_loss(y_test, y_prob_pred_test))

In [None]:
# XGBoost confusion matrix on the test split (rows = actual, columns = predicted)
xgb_confusion = confusion_matrix(y_test, xgb.predict(X_test_scaled))

class_names = ['legit', 'fraud']
plt.figure(dpi=80)
sns.heatmap(
    xgb_confusion,
    annot=True, fmt='d',   # write the integer count inside each cell
    cmap=plt.cm.Blues,
    square=True,
    xticklabels=class_names,
    yticklabels=class_names,
)

plt.title('XGBoost Confusion Matrix: Credit Card Transactions')
plt.xlabel('Predicted')
plt.ylabel('Actual');

In [None]:
# xgb precision-recall curve
precision_curve, recall_curve, threshold_curve = precision_recall_curve(
    y_test, xgb.predict_proba(X_test_scaled)[:, 1]
)

# precision/recall arrays hold one more entry than the thresholds array: entry i
# pairs with threshold i and the final entry (precision 1, recall 0) has no
# threshold, so the last entry is dropped with [:-1] to keep the curves aligned.
plt.figure(dpi=80)
plt.plot(threshold_curve, precision_curve[:-1], label='precision')
plt.plot(threshold_curve, recall_curve[:-1], label='recall')
plt.legend(loc='lower left')
plt.xlabel('Threshold (above this probability, label as fraud)')
plt.title('Precision and Recall Curves');

In [None]:
# xgb ROC curve
# Score the test split once and reuse the probabilities for both the curve and
# the AUC, instead of calling predict_proba a second time.
xgb_test_proba = xgb.predict_proba(X_test_scaled)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, xgb_test_proba)

plt.plot(fpr, tpr, lw=2)
plt.plot([0, 1], [0, 1], c='violet', ls='--')  # chance-level diagonal for reference
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])

plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve for fraud problem');
print("ROC AUC score = ", roc_auc_score(y_test, xgb_test_proba))

In [None]:
## and the winner is: DecisionTreeClassifier

In [None]:
# Decision tree feature importances before tuning/cross-validation:
# one value per input column, in the column order of the training matrix.
decisiontree.feature_importances_

In [None]:
def plot_feature_importance(importance, names, model_type):
    """Draw a horizontal bar chart of feature importances, largest first.

    Parameters
    ----------
    importance : array-like
        One importance value per feature.
    names : array-like
        Feature names, in the same order as ``importance``.
    model_type : str
        Text prepended to the chart title (e.g. ``'DECISION TREE'``).

    Returns
    -------
    None
        The chart is drawn on a new 10x8 matplotlib figure.
    """
    # Pair each name with its score, then rank by descending importance
    ranked = pd.DataFrame(
        {
            'feature_names': np.array(names),
            'feature_importance': np.array(importance),
        }
    ).sort_values(by=['feature_importance'], ascending=False)

    # Importance on the x-axis gives horizontal bars, one per feature
    plt.figure(figsize=(10, 8))
    sns.barplot(x=ranked['feature_importance'], y=ranked['feature_names'])

    plt.title(model_type + ' FEATURE IMPORTANCE')
    plt.xlabel('IMPORTANCE')
    plt.ylabel('FEATURE NAMES');

In [None]:
# Chart the fitted decision tree's importances, labelled with the original
# (pre-scaling) column names of the training frame.
plot_feature_importance(decisiontree.feature_importances_,X_train.columns,'DECISION TREE')


In [None]:
# Decision tree grid-search cross-validation: 5-fold shuffled CV over every
# (max_leaf_nodes, min_samples_split) pair, ranked by ROC AUC.
# n_jobs=-1 runs the fits on all available cores.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

search_space = {
    'max_leaf_nodes': list(range(2, 100)),
    'min_samples_split': [2, 3, 4],
}

decisiontree_gridsearch = GridSearchCV(
    estimator=decisiontree,
    param_grid=search_space,
    scoring='roc_auc',
    cv=kfold,
    n_jobs=-1,
    verbose=1,
)

decisiontree_gridsearch.fit(X_train_scaled, y_train)

In [None]:
# Report the best estimator found by the grid search and its mean
# cross-validated score (ROC AUC, per the `scoring` argument of the search).
print("Best estimator: ", decisiontree_gridsearch.best_estimator_)
print("Best score: ", decisiontree_gridsearch.best_score_)


In [None]:
# instantiate DecisionTreeClassifier and fit (best parameters)
# The winning hyper-parameters are read straight from the grid search instead
# of being copied by hand, so this cell cannot drift from the search result if
# the data, the grid or the seed changes (a previous run presumably selected
# max_leaf_nodes=46, min_samples_split=3).
# random_state makes the refit reproducible; 42 matches the seed used for the
# train/test split and the KFold splitter.
decisiontree = DecisionTreeClassifier(random_state=42, **decisiontree_gridsearch.best_params_)
decisiontree.fit(X_train_scaled, y_train)

In [None]:
print('Decision Tree Scores:')

# decision tree accuracy on both splits (after refitting with the tuned parameters)
print('Accuracy (train): ', decisiontree.score(X_train_scaled, y_train))
print('Accuracy (test): ', decisiontree.score(X_test_scaled, y_test))

# decision tree precision, recall, f1 on the test split (default threshold)
y_predict = decisiontree.predict(X_test_scaled)
for metric_label, metric_fn in (('Precision: ', precision_score),
                                ('Recall: ', recall_score),
                                ('F1: ', f1_score)):
    print(metric_label, metric_fn(y_test, y_predict))

# decision tree log loss, from the predicted probability of the class at
# index 1 (presumably fraud = 1)
y_prob_pred_train = decisiontree.predict_proba(X_train_scaled)[:, 1]
y_prob_pred_test = decisiontree.predict_proba(X_test_scaled)[:, 1]
print('Log Loss (train): ', log_loss(y_train, y_prob_pred_train))
print('Log Loss (test): ', log_loss(y_test, y_prob_pred_test))

In [None]:
# decision tree confusion matrix on the test split, for the refitted tree
# (rows = actual, columns = predicted)
decisiontree_confusion = confusion_matrix(y_test, decisiontree.predict(X_test_scaled))

class_names = ['legit', 'fraud']
plt.figure(dpi=80)
sns.heatmap(
    decisiontree_confusion,
    annot=True, fmt='d',   # write the integer count inside each cell
    cmap=plt.cm.Blues,
    square=True,
    xticklabels=class_names,
    yticklabels=class_names,
)

plt.title('Decision Tree Confusion Matrix: Credit Card Transactions')
plt.xlabel('Predicted')
plt.ylabel('Actual');