In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_recall_curve, auc, recall_score, make_scorer
from imblearn.over_sampling import SMOTE

In [2]:
creditcard_df = pd.read_csv("creditcard.csv")
data = creditcard_df
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
print(data.columns)

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')


In [4]:
# Preprocessing
# Scale 'Amount'
scaler = StandardScaler()
data['Amount'] = scaler.fit_transform(data['Amount'].values.reshape(-1, 1))
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.244964,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,-0.342475,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,1.160686,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.140534,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,-0.073403,0


In [5]:
# Split the dataset
X = data.drop('Class', axis=1)
y = data['Class']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [7]:
from sklearn.neighbors import KNeighborsClassifier

param_grid = {
    'n_neighbors': [1, 3, 5, 7],
    'weights': ['uniform', 'distance'],
    'leaf_size': [5, 10, 15]
}

# Create a custom scoring function based on precision
scorer = make_scorer(recall_score)

# Create a GridSearchCV object with recall scoring
grid_clf = GridSearchCV(KNeighborsClassifier(), param_grid, verbose=3)

In [8]:
grid_clf.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END leaf_size=5, n_neighbors=1, weights=uniform;, score=0.999 total time=  10.8s
[CV 2/5] END leaf_size=5, n_neighbors=1, weights=uniform;, score=0.999 total time=  10.7s
[CV 3/5] END leaf_size=5, n_neighbors=1, weights=uniform;, score=0.999 total time=  11.8s
[CV 4/5] END leaf_size=5, n_neighbors=1, weights=uniform;, score=0.999 total time=  14.7s
[CV 5/5] END leaf_size=5, n_neighbors=1, weights=uniform;, score=0.998 total time=  13.9s
[CV 1/5] END leaf_size=5, n_neighbors=1, weights=distance;, score=0.999 total time=  12.7s
[CV 2/5] END leaf_size=5, n_neighbors=1, weights=distance;, score=0.999 total time=  13.9s
[CV 3/5] END leaf_size=5, n_neighbors=1, weights=distance;, score=0.999 total time=  13.7s
[CV 4/5] END leaf_size=5, n_neighbors=1, weights=distance;, score=0.999 total time=  14.9s
[CV 5/5] END leaf_size=5, n_neighbors=1, weights=distance;, score=0.998 total time=  14.3s
[CV 1/5] END leaf_size=5, n_neigh

[CV 1/5] END leaf_size=15, n_neighbors=3, weights=uniform;, score=0.998 total time=  21.3s
[CV 2/5] END leaf_size=15, n_neighbors=3, weights=uniform;, score=0.998 total time=  21.4s
[CV 3/5] END leaf_size=15, n_neighbors=3, weights=uniform;, score=0.999 total time=  17.9s
[CV 4/5] END leaf_size=15, n_neighbors=3, weights=uniform;, score=0.999 total time=  17.6s
[CV 5/5] END leaf_size=15, n_neighbors=3, weights=uniform;, score=0.998 total time=  17.7s
[CV 1/5] END leaf_size=15, n_neighbors=3, weights=distance;, score=0.999 total time=  18.8s
[CV 2/5] END leaf_size=15, n_neighbors=3, weights=distance;, score=0.999 total time=  18.2s
[CV 3/5] END leaf_size=15, n_neighbors=3, weights=distance;, score=0.999 total time=  18.9s
[CV 4/5] END leaf_size=15, n_neighbors=3, weights=distance;, score=0.999 total time=  18.7s
[CV 5/5] END leaf_size=15, n_neighbors=3, weights=distance;, score=0.999 total time=  17.4s
[CV 1/5] END leaf_size=15, n_neighbors=5, weights=uniform;, score=0.998 total time=  

In [9]:
# List the best parameters for this dataset
print(grid_clf.best_params_)

# List the best score
print(grid_clf.best_score_)

{'leaf_size': 5, 'n_neighbors': 3, 'weights': 'distance'}
0.9986256290991152


In [11]:
target_names = ['legitimate', 'fraudulent']
GSCVpredictions = grid_clf.predict(X_test)
print(classification_report(y_test, GSCVpredictions, target_names=target_names))

              precision    recall  f1-score   support

  legitimate       1.00      1.00      1.00     85295
  fraudulent       1.00      0.18      0.31       148

    accuracy                           1.00     85443
   macro avg       1.00      0.59      0.65     85443
weighted avg       1.00      1.00      1.00     85443

