# New Section

In [19]:
# Initial imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_recall_curve, auc, recall_score, make_scorer
from imblearn.over_sampling import SMOTE

In [20]:
# Loading data
data = pd.read_csv('creditcard.csv')
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [21]:
data.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')

In [22]:
# Creating StandardScaler instance
scaler = StandardScaler()

# Creating StandardScaler instance
data['Amount'] = scaler.fit_transform(data['Amount'].values.reshape(-1, 1))
# scaler.fit_transform(data)
data

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.244964,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,-0.342475,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,1.160686,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.140534,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,-0.073403,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,-0.350151,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,-0.254117,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,-0.081839,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,-0.313249,0


In [23]:
data.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')

In [29]:
# Split the dataset
X = data.drop('Class', axis=1)
y = data['Class']

In [31]:
# Use sklearn to split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


In [39]:
# Create the grid search estimator along with a parameter object containing the values to adjust.
# Try adjusting n_neighbors with values of 1 through 19. Adjust leaf_size by using 10, 50, 100, and 500.
# Include both uniform and distance options for weights.
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

param_grid = {
    'n_neighbors': [1, 2, 5],
    'weights': ['distance'],
    'leaf_size': [1, 5, 10]
}

# Create a custom scoring function based on precision
scorer = make_scorer(recall_score)

# Create a GridSearchCV object with recall scoring
grid_clf = GridSearchCV(KNeighborsClassifier(), param_grid, verbose=3)

In [40]:
# Fit the model by using the grid search estimator.
# This will take the KNN model and try each combination of parameters.
grid_clf.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END leaf_size=1, n_neighbors=1, weights=distance;, score=0.999 total time=   9.5s
[CV 2/5] END leaf_size=1, n_neighbors=1, weights=distance;, score=0.999 total time=  11.1s
[CV 3/5] END leaf_size=1, n_neighbors=1, weights=distance;, score=0.999 total time=  14.1s
[CV 4/5] END leaf_size=1, n_neighbors=1, weights=distance;, score=0.999 total time=  12.0s
[CV 5/5] END leaf_size=1, n_neighbors=1, weights=distance;, score=0.998 total time=  17.2s
[CV 1/5] END leaf_size=1, n_neighbors=2, weights=distance;, score=0.999 total time=  13.9s
[CV 2/5] END leaf_size=1, n_neighbors=2, weights=distance;, score=0.999 total time=  11.6s
[CV 3/5] END leaf_size=1, n_neighbors=2, weights=distance;, score=0.999 total time=  11.8s
[CV 4/5] END leaf_size=1, n_neighbors=2, weights=distance;, score=0.999 total time=  14.0s
[CV 5/5] END leaf_size=1, n_neighbors=2, weights=distance;, score=0.998 total time=  12.2s
[CV 1/5] END leaf_size=1, n_ne

In [41]:
# from sklearn.model_selection import GridSearchCV

# List the best parameters for this dataset
print(grid_clf.best_params_)

# List the best score
print(grid_clf.best_score_)

{'leaf_size': 1, 'n_neighbors': 1, 'weights': 'distance'}
0.9985955332941318


In [42]:
target_names = ["legitimate", "fraudulent"]

# Calculate the classification reports
GSCVpredictions = grid_clf.predict(X_test)
print(classification_report(y_test, GSCVpredictions, target_names=target_names))


              precision    recall  f1-score   support

    negative       1.00      1.00      1.00     85295
    positive       0.76      0.24      0.36       148

    accuracy                           1.00     85443
   macro avg       0.88      0.62      0.68     85443
weighted avg       1.00      1.00      1.00     85443



In [43]:
# untunned KNeighbors Model

model = KNeighborsClassifier()
model = model.fit(X_train, y_train)
predictions = model.predict(X_test)
print(classification_report(y_test, predictions, target_names=target_names))

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00     85295
    positive       1.00      0.09      0.16       148

    accuracy                           1.00     85443
   macro avg       1.00      0.54      0.58     85443
weighted avg       1.00      1.00      1.00     85443

