In [25]:
import pandas as pd
import numpy as np
from collections import Counter
from matplotlib import pyplot
from sklearn.dummy import DummyClassifier
from numpy import mean
from numpy import std
from pandas import read_csv
from matplotlib import pyplot
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [5]:
# load the dataset
def load_dataset(full_path):
    # load the dataset as a numpy array
    data = read_csv(full_path)
    # dependent and independent variables
    X = data.drop(['Class'], axis=1)
    y = data['Class']
    # Trian Test Split
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.1)
    return X_train, X_test, y_train, y_test

# calculate precision-recall area under curve
def pr_auc(y_true, probas_pred):
    # calculate precision-recall curve
    p, r, _ = precision_recall_curve(y_true, probas_pred)
    # calculate area under curve
    return auc(r, p)

# evaluate a model
def evaluate_model(X, y, model):
    # define evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    # define the model evaluation the metric
    metric = make_scorer(pr_auc, needs_proba=True)
    # evaluate model
    scores = cross_val_score(model, X, y, scoring=metric, cv=cv, n_jobs=-1)
    return scores

# Final evaluation and display
def final_eval(model,X_train, y_train,X_test,y_test, model_name):
    # Score model
    scores = evaluate_model(X_train, y_train,model)
    # summarize performance
    print(model_name)
    print('Mean PR AUC: %.3f (%.3f)' % (mean(scores), std(scores)))
    # fit model
    model.fit(X_train,y_train)
    # predict y_test
    y_pred = model.predict(X_test)
    # final score
    print('PR AUC: %.3f' % (pr_auc(y_test,y_pred)))
    print("Accuracy: ",accuracy_score(y_test, y_pred))

In [6]:
# define the location of the dataset
full_path = 'creditcard.csv'
# load the dataset
X_train, X_test, y_train, y_test = load_dataset(full_path)

In [27]:
# Define Model
DTC_plain = DecisionTreeClassifier()
# Call final_eval on the Model
final_eval(DTC_plain,X_train, y_train,X_test,y_test,'Decision Tree Plain')

Decision Tree Plain
Mean PR AUC: 0.778 (0.051)
PR AUC: 0.681
Accuracy:  0.9988413328183702


In [28]:
# Define Model
steps = [('s',StandardScaler()),('m',KNeighborsClassifier())]
kNN_plain = Pipeline(steps=steps)
# Call final_eval on the Model
final_eval(kNN_plain,X_train, y_train,X_test,y_test,'k Nearest Neighbors Plain')

k Nearest Neighbors Plain
Mean PR AUC: 0.874 (0.041)
PR AUC: 0.783
Accuracy:  0.9992275552122467


In [33]:
# Define Model
BAG_plain = BaggingClassifier(n_estimators=100)
# Call final_eval on the Model
final_eval(BAG_plain,X_train, y_train,X_test,y_test,'Bagging Classifier plain')

Bagging Classifier plain
Mean PR AUC: 0.852 (0.046)
PR AUC: 0.803
Accuracy:  0.9992977774656788


In [30]:
# Define Model
RFC_plain = RandomForestClassifier(n_estimators=100)
# Call final_eval on the Model
final_eval(RFC_plain,X_train, y_train,X_test,y_test,'Random Forest plain')

Random Forest plain
Mean PR AUC: 0.859 (0.047)
PR AUC: 0.787
Accuracy:  0.9992275552122467


In [31]:
# Define Model
ETC_plain = ExtraTreesClassifier(n_estimators=100)
# Call final_eval on the Model
final_eval(ETC_plain,X_train, y_train,X_test,y_test,'Extra Trees Classifier plain')

Extra Trees Classifier plain
Mean PR AUC: 0.864 (0.048)
PR AUC: 0.781
Accuracy:  0.9992275552122467
