In [5]:
import numpy as np
import pandas as pd
from matplotlib import pyplot
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import make_scorer
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [7]:
def pr_auc(y_true, prob_pred):
  """Area under the precision-recall curve.

  Feeds `y_true` and `prob_pred` to `precision_recall_curve` and integrates
  the resulting precision values over recall with `auc`.
  """
  precision, recall, _thresholds = precision_recall_curve(y_true, prob_pred)
  area = auc(recall, precision)
  return area

def evaluate_model(X, y, model):
  """Cross-validate `model` on (X, y) and return its PR AUC scores.

  The data is split with repeated stratified k-fold CV (13 splits, 2 repeats,
  seeded with 25) and every split is scored with `pr_auc` on the model's
  predicted probabilities.

  Returns:
    The array of scores produced by `cross_val_score`.
  """
  import inspect

  cv = RepeatedStratifiedKFold(n_splits = 13, n_repeats = 2, random_state = 25)
  # `needs_proba` was deprecated in scikit-learn 1.4 and removed in 1.6 in
  # favour of `response_method`. make_scorer forwards unknown keywords to the
  # score function, so passing the wrong one only fails later, at scoring time.
  # Pick the keyword from the signature instead of relying on an exception.
  if 'response_method' in inspect.signature(make_scorer).parameters:
    metric = make_scorer(pr_auc, response_method = 'predict_proba')
  else:
    metric = make_scorer(pr_auc, needs_proba = True)
  scores = cross_val_score(model, X, y, scoring = metric, cv = cv, n_jobs = -1)
  return scores

def get_models(random_state = None):
  """Build the candidate classifiers to compare, together with display names.

  Args:
    random_state: optional seed forwarded to every estimator that takes one
      (CART, BAG, RF, ET) so a comparison can be reproduced. The default
      None leaves the estimators unseeded, which is the previous behaviour.

  Returns:
    (models, names): two parallel lists, the estimators and their short labels.
  """
  candidates = [
    ('CART', DecisionTreeClassifier(random_state = random_state)),
    # KNN is wrapped in a pipeline so the features are standardised first.
    ('KNN', Pipeline(steps = [('s', StandardScaler()), ('m', KNeighborsClassifier())])),
    ('BAG', BaggingClassifier(n_estimators = 100, random_state = random_state)),
    ('RF', RandomForestClassifier(n_estimators = 100, random_state = random_state)),
    ('ET', ExtraTreesClassifier(n_estimators = 100, random_state = random_state)),
  ]
  models = [estimator for _, estimator in candidates]
  names = [label for label, _ in candidates]
  return models, names

def get_results(X, y, models, names):
  """Evaluate each model in turn and collect its cross-validation scores.

  For every model the scores returned by `evaluate_model` are stored and a
  one-line summary (name, mean, standard deviation) is printed.

  Returns:
    A list with one score collection per model, in the order of `models`.
  """
  all_scores = []
  for idx, estimator in enumerate(models):
    fold_scores = evaluate_model(X, y, estimator)
    all_scores.append(fold_scores)
    summary = (names[idx], np.mean(fold_scores), np.std(fold_scores))
    print('>%s %.3f (%.3f)' % summary)
  return all_scores

def show_plot(results, names):
  """Draw one box-and-whisker plot per entry of `results` and display the figure.

  Args:
    results: sequence of score collections, one per box.
    names: tick label for each box, in the same order as `results`.
  """
  # The `labels` keyword of boxplot is deprecated since Matplotlib 3.9 (renamed
  # to `tick_labels`). Setting the tick labels on the axes avoids both spellings
  # and therefore works on old and new Matplotlib alike.
  _, ax = pyplot.subplots()
  ax.boxplot(results, showmeans=True)
  ax.set_xticklabels(names)
  pyplot.show()

In [34]:
# Load the CSV data into a DataFrame; the path is relative to the current working directory.
data = pd.read_csv('./creditcard.csv')

In [9]:
# First 5 rows of the dataset
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [10]:
# Last 5 rows of the dataset
data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.01448,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.05508,2.03503,-0.738589,0.868229,1.058415,0.02433,0.294869,0.5848,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.24964,-0.557828,2.630515,3.03126,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.24044,0.530483,0.70251,0.689799,-0.377961,0.623708,-0.68618,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.0,0
284806,172792.0,-0.533413,-0.189733,0.703337,-0.506271,-0.012546,-0.649617,1.577006,-0.41465,0.48618,...,0.261057,0.643078,0.376777,0.008797,-0.473649,-0.818267,-0.002415,0.013649,217.0,0


In [11]:
# Summary of the dataset: column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [12]:
# Number of missing values per column
data.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [13]:
# Number of transactions per class label (0 = legit, 1 = fraud)
data['Class'].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

In [14]:
# Target is the 'Class' column; the features are every other column.
y = data['Class']
X = data.drop(columns='Class')

In [15]:
# NOTE(review): this cell repeats the X / y split done in the preceding cell.
# Re-running it is harmless, but one of the two cells can be removed.
X = data.drop(columns = 'Class', axis = 1)
y = data['Class']

In [16]:
# Build the candidate classifiers and their display names.
models, names = get_models()

In [17]:
# Cross-validate every candidate. get_results takes four arguments; `names`
# supplies the label printed next to each model's scores.
results = get_results(X, y, models, names)




>CART 0.770 (0.040)




>KNN 0.868 (0.036)




KeyboardInterrupt: 

In [18]:
# Box plot of the per-model scores. `results` is only defined once the
# get_results cell has run to completion.
show_plot(results, names)

NameError: name 'results' is not defined

In [19]:
# KNN classifier placed behind a StandardScaler in a single pipeline.
model = KNeighborsClassifier()
pipeline = Pipeline([('s', StandardScaler()), ('m', model)])

In [20]:
# Fit the pipeline on the full X / y (no hold-out split).
# NOTE(review): the spot-check cells further down sample their rows from this
# same data, so those predictions are not an out-of-sample test. Confirm this is intended.
pipeline.fit(X, y)

In [45]:
# Separate the two classes, then draw 3 feature rows (label column removed)
# from each for a spot check of the fitted pipeline.
legit_data = data[data['Class'] == 0]
fraud_data = data[data['Class'] == 1]

# Seeded so the same rows are drawn on every run.
sample_legit_data = legit_data.drop(columns = 'Class').sample(n = 3, random_state = 25)
sample_fraud_data = fraud_data.drop(columns = 'Class').sample(n = 3, random_state = 25)

In [46]:
# Score all sampled legit rows in one call. Passing the DataFrame (instead of
# bare NumPy rows) keeps the column names the pipeline was fitted with.
# Column 1 of predict_proba is the probability of class 1.
for prob in pipeline.predict_proba(sample_legit_data)[:, 1]:
    print('>Predicted=%.3f (expected 0)' % prob)

>Predicted=0.000 (expected 0)
>Predicted=0.000 (expected 0)
>Predicted=0.000 (expected 0)




In [47]:
# Score all sampled fraud rows in one call. Passing the DataFrame (instead of
# bare NumPy rows) keeps the column names the pipeline was fitted with.
# Column 1 of predict_proba is the probability of class 1.
for prob in pipeline.predict_proba(sample_fraud_data)[:, 1]:
    print('>Predicted=%.3f (expected 1)' % prob)

>Predicted=1.000 (expected 1)
>Predicted=1.000 (expected 1)
>Predicted=1.000 (expected 1)


