In [21]:
import pandas as pd
import numpy as np
from collections import Counter
from matplotlib import pyplot
from sklearn.dummy import DummyClassifier
from numpy import mean
from numpy import std
from pandas import read_csv
from matplotlib import pyplot
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [16]:
# load the dataset
def load_dataset(full_path):
	# load the dataset as a numpy array
	data = read_csv(full_path)
	# Trian Test Split
	data, test= train_test_split(data, test_size=0.1)
	# retrieve numpy array
	data = data.values
	# split into input and output elements
	X, y = data[:, :-1], data[:, -1]
	return X, y,test

In [17]:
# define the location of the dataset
full_path = 'creditcard.csv'
# load the dataset
X, y, test = load_dataset(full_path)

In [18]:
# define the reference model
model = DecisionTreeClassifier()

In [19]:
# calculate precision-recall area under curve
def pr_auc(y_true, probas_pred):
	# calculate precision-recall curve
	p, r, _ = precision_recall_curve(y_true, probas_pred)
	# calculate area under curve
	return auc(r, p)

In [20]:
# evaluate a model
def evaluate_model(X, y, model):
	# define evaluation procedure
	cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
	# define the model evaluation the metric
	metric = make_scorer(pr_auc, needs_proba=True)
	# evaluate model
	scores = cross_val_score(model, X, y, scoring=metric, cv=cv, n_jobs=-1)
	return scores

In [7]:
# evaluate model
scores = evaluate_model(X, y, model)

In [8]:
# summarize performance
print('Mean PR AUC: %.3f (%.3f)' % (mean(scores), std(scores)))

Mean PR AUC: 0.762 (0.042)


- Decision Tree
- k-Nearest Neighbors
- Bagged Decision Trees
- Random Forest
- Extra Trees

# Decision Tree
- criterion = ['gini', 'entropy']
- max_depth = [3,5,7,10,15]
- min_samples_split = [3,5,7,10,15]
- min_samples_leaf = [3,6,9]

In [22]:
# define the reference model
DTC = DecisionTreeClassifier(criterion='gini',max_depth=2)
# evaluate model
DTCscores = evaluate_model(X, y, model)
# summarize performance
print('Decision Tree Classifier, gini,  max=2')
print('Mean PR AUC: %.3f (%.3f)' % (mean(scores), std(scores)))

Decision Tree Classifier, gini,  max=2
Mean PR AUC: 0.762 (0.042)


In [24]:
parameters = {'criterion':['gini', 'entropy']}

DTC_GS = DecisionTreeClassifier()

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

metric = make_scorer(pr_auc, needs_proba=True)

clf = GridSearchCV(DTC_GS, parameters,scoring=metric,cv=cv)

clf.fit(X, y)

clf.best_params_

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_criterion',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split10_test_score',
 'split11_test_score',
 'split12_test_score',
 'split13_test_score',
 'split14_test_score',
 'split15_test_score',
 'split16_test_score',
 'split17_test_score',
 'split18_test_score',
 'split19_test_score',
 'split1_test_score',
 'split20_test_score',
 'split21_test_score',
 'split22_test_score',
 'split23_test_score',
 'split24_test_score',
 'split25_test_score',
 'split26_test_score',
 'split27_test_score',
 'split28_test_score',
 'split29_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'split5_test_score',
 'split6_test_score',
 'split7_test_score',
 'split8_test_score',
 'split9_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [26]:
clf.best_params_

{'criterion': 'gini'}