<a href="https://colab.research.google.com/github/ShimilSBabu/Training/blob/main/credit_card_fraud_detection_using_k_means_and_knn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import time

In [None]:
# Wall-clock start timestamp (seconds since the epoch). The final cell takes a
# second time.time() reading and subtracts this one to report total run time.
tic = time.time()

## Reading the dataset

In [None]:
# Location of the credit-card transactions CSV. The default looks like a
# Google Drive mount inside Colab; edit DATA_PATH when running elsewhere.
DATA_PATH = "/content/drive/MyDrive/IDRBT/creditcard.csv"

full_data = pd.read_csv(DATA_PATH)
# Shuffle every row of the dataset. random_state pins the permutation so that
# Restart & Run All yields the same row order (and the same downstream
# results) on every run.
full_data = full_data.sample(frac=1, random_state=0)
full_data.head()  # Displaying the 1st 5 records

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
149179,90754.0,-0.562588,1.305985,-2.096885,0.258754,3.593192,3.485403,0.974274,0.670845,0.608623,...,-0.177544,0.074177,-0.241896,0.533475,0.111364,-0.386573,0.621618,0.289811,91.04,0
9358,13642.0,1.22017,-0.036126,0.762629,0.121446,-0.600395,-0.365912,-0.490836,-0.0191,1.480623,...,-0.310389,-0.73973,0.106822,-0.021256,-0.015298,0.752247,-0.097937,-0.003457,18.11,0
113647,73150.0,1.232073,0.126395,0.158571,0.26139,0.164484,0.27473,-0.138243,0.134701,-0.157259,...,-0.232711,-0.672352,0.051517,-0.814473,0.223814,0.158195,-0.017703,-0.002987,1.98,0
95380,65277.0,-0.305362,1.065752,0.832635,-0.32708,0.669941,0.011406,0.613012,0.129249,-0.620189,...,-0.28401,-0.76647,-0.170682,-0.893991,-0.0245,0.126348,0.241579,0.07694,5.38,0
43756,41641.0,-0.606198,0.290522,0.08774,-1.709809,-0.264325,-0.83992,0.029103,0.358654,-0.951743,...,0.243686,0.576829,-0.159068,-0.458484,-0.144509,-0.273961,0.25937,0.138844,24.0,0


## Removing the Time and Class features from the dataset

In [None]:
# Feature table: every column of full_data except "Time" and "Class".
# drop() returns a new frame, so full_data itself is left untouched.
full_features = full_data.drop(columns=["Time", "Class"])
full_features.tail()  # Displaying the last 5 records

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
4715,0.93502,-0.233224,2.247853,3.113497,-1.076881,1.569162,-1.326054,0.483972,2.08104,-0.007361,...,-0.051181,-0.019305,0.568895,-0.207388,0.034504,0.470851,0.206519,0.059179,0.030042,52.48
174258,1.400961,-1.865317,-0.60229,-0.425359,-1.244164,0.266471,-0.693451,0.270763,2.042763,-0.531569,...,0.326011,-0.05008,-0.591765,0.162853,0.714303,-0.722326,0.328833,-0.085488,-0.0033,296.0
191466,-3.565426,3.477905,-2.831556,-0.886672,-0.707109,-0.698496,-1.001844,2.103479,0.483688,0.178686,...,0.369806,-0.100713,-0.193712,0.173717,0.018206,0.511757,0.505273,0.282892,0.21375,5.41
171730,-4.350429,-1.764845,-1.297726,-0.576341,2.506038,0.298357,1.32519,-3.899371,1.763802,0.78905,...,-2.964242,1.127662,-0.473116,1.602153,-0.281242,-0.12917,-0.327356,-1.124097,0.619151,76.61
42500,-0.195276,-1.912162,-0.626016,1.891629,-0.598437,-0.104347,1.264147,-0.342911,-0.191136,-0.414731,...,1.335563,0.441209,-0.120862,-0.765055,0.167999,0.536855,-0.311863,-0.103827,0.138209,724.44


In [None]:
# Target column kept as a one-column DataFrame (list-style selection already
# returns a DataFrame, so no extra constructor call is needed).
full_labels = full_data.loc[:, ["Class"]]
full_labels.head()

Unnamed: 0,Class
149179,0
9358,0
113647,0
95380,0
43756,0


## Taking the values and converting them to numpy array

In [None]:
# Unwrap the pandas objects into plain NumPy arrays for scikit-learn.
# full_labels is a one-column DataFrame, so its array is 2-D (one column).
full_features_array = full_features.to_numpy()
full_labels_array = full_labels.to_numpy()

# Show the type and an abbreviated view of each array.
print(f"full_features_array\nType : {type(full_features_array)}\n\n{full_features_array}\n\n\n")
print(f"full_labels_array\nType : {type(full_labels_array)}\n\n{full_labels_array}")

full_features_array
Type : <class 'numpy.ndarray'>

[[-5.62587649e-01  1.30598522e+00 -2.09688469e+00 ...  6.21617857e-01
   2.89811102e-01  9.10400000e+01]
 [ 1.22017014e+00 -3.61261619e-02  7.62628961e-01 ... -9.79369832e-02
  -3.45746697e-03  1.81100000e+01]
 [ 1.23207341e+00  1.26394916e-01  1.58571115e-01 ... -1.77033472e-02
  -2.98694601e-03  1.98000000e+00]
 ...
 [-3.56542639e+00  3.47790467e+00 -2.83155615e+00 ...  2.82892261e-01
   2.13750141e-01  5.41000000e+00]
 [-4.35042891e+00 -1.76484481e+00 -1.29772586e+00 ... -1.12409663e+00
   6.19150523e-01  7.66100000e+01]
 [-1.95275947e-01 -1.91216162e+00 -6.26015818e-01 ... -1.03826677e-01
   1.38209234e-01  7.24440000e+02]]



full_labels_array
Type : <class 'numpy.ndarray'>

[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]


## Splitting the data into train and test sets

In [None]:
# Hold out 10% of the rows as the test set.
#  * stratify keeps the positive/negative class ratio identical in both
#    splits; the positive class is rare here, so an unstratified split can
#    leave the test set with very few (or no) positives.
#  * random_state makes the split repeatable across kernel restarts.
train_features, test_features, train_labels, test_labels = train_test_split(
    full_features_array,
    full_labels_array,
    train_size=0.90,
    random_state=0,
    stratify=full_labels_array,
)

## Normalizing the values

In [None]:
# With its default arguments, sklearn.preprocessing.normalize rescales each
# ROW (sample) to unit L2 norm. It learns nothing from the data, so the train
# and test sets are transformed independently.
# NOTE(review): this is per-sample scaling, not per-feature scaling. The
# printed rows show the last column (Amount) close to 1 for many samples,
# i.e. it dominates the other features -- confirm this is the intended
# preprocessing.
train_features = normalize(train_features)
test_features = normalize(test_features)

# Print each array exactly once (the train array used to be printed twice).
print(f"train_features\n\n{train_features}\n\n\n")
print(f"test_features\n\n{test_features}")

train_features

[[-3.87501471e-01  4.89495424e-02 -4.12275791e-02 ...  4.46715432e-02
  -7.80233510e-02  5.51251797e-01]
 [ 1.82925386e-02 -5.29105424e-03 -4.10424461e-03 ...  3.51407115e-04
  -1.86479503e-04  9.99016472e-01]
 [ 2.28645921e-02 -7.11638764e-03 -9.28642750e-03 ...  7.34120042e-05
  -3.32097742e-04  9.99269130e-01]
 ...
 [ 3.58099547e-02  7.14560914e-03  1.76479037e-02 ...  1.39497619e-03
   8.22567772e-04  9.94979494e-01]
 [-3.07537893e-01  1.78658364e-01 -2.38292329e-01 ... -4.41539279e-02
  -3.85505358e-02  1.29355957e-01]
 [ 1.41781921e-03 -1.70205232e-03 -2.76229618e-03 ... -2.77364354e-04
   1.12915760e-04  9.99935406e-01]]



train_features

[[-3.87501471e-01  4.89495424e-02 -4.12275791e-02 ...  4.46715432e-02
  -7.80233510e-02  5.51251797e-01]
 [ 1.82925386e-02 -5.29105424e-03 -4.10424461e-03 ...  3.51407115e-04
  -1.86479503e-04  9.99016472e-01]
 [ 2.28645921e-02 -7.11638764e-03 -9.28642750e-03 ...  7.34120042e-05
  -3.32097742e-04  9.99269130e-01]
 ...
 [ 3.5809

## k_means_classification

In [None]:
# Fit a two-cluster K-Means model on the training features (the labels are
# not used here), then assign each training row to one of the two clusters.
kmeans = KMeans(
    n_clusters=2,
    random_state=0,
    algorithm="elkan",
    max_iter=10000,
).fit(train_features)
kmeans_predicted_train_labels = kmeans.predict(train_features)

## Confusion matrix

In [None]:
# Legend for the confusion-matrix abbreviations.
print(
    "tn --> true negatives",
    "fp --> false positives",
    "fn --> false negatives",
    "tp --> true positives",
    sep="\n",
)

# Cluster ids 0/1 are arbitrary. Compare them with the true training labels:
# if more rows disagree than agree, the clustering is the opposite of the
# original classification and the predicted ids must be flipped.
tn, fp, fn, tp = confusion_matrix(train_labels, kmeans_predicted_train_labels).ravel()
reassignflag = bool((tn + tp) < (fn + fp))

# Predict on the held-out rows, applying the same flip when required.
kmeans_predicted_test_labels = kmeans.predict(test_features)
if reassignflag:
    kmeans_predicted_test_labels = 1 - kmeans_predicted_test_labels

# Confusion matrix for the K-Means test predictions (overwrites the
# training-set counts computed above).
tn, fp, fn, tp = confusion_matrix(test_labels, kmeans_predicted_test_labels).ravel()


tn --> true negatives
fp --> false positives
fn --> false negatives
tp --> true positives


## Scoring kmeans

Accuracy : ratio of the number of correct predictions to the total number of predictions.
> * (TP + TN) / (TP + FP + TN + FN)
* Not suitable for class-imbalanced data, because always predicting the majority class already gives a high accuracy.

Precision : proportion of positive predictions that were actually correct.
> * TP / (TP + FP)
> * Maximize precision to minimize the FP errors.




Recall : proportion of actual positives that were identified correctly.
> * TP / (TP + FN)
* Maximize recall to minimize the FN error.

F1-Score : Uses both precision and recall, so it should be used if both of them are important for evaluation. It weights the two equally; if one (precision or recall) is more important to consider than the other, a weighted F-beta score is the better choice.
> * (2 \* Precision \* Recall) / (Precision + Recall)
* Harmonic mean of Precision and Recall.

> * F1 score	Interpretation

>>> * \> 0.9	=> Very good

>>> * 0.8 - 0.9	=> Good

>>> * 0.5 - 0.8 =>	OK

>>> * < 0.5	=> Not good

In [None]:
# Score the K-Means test predictions against the true test labels with each
# metric in turn: accuracy, precision, recall, F1.
(
    kmeans_accuracy_score,
    kmeans_precison_score,
    kmeans_recall_score,
    kmeans_f1_score,
) = (
    metric(test_labels, kmeans_predicted_test_labels)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

## Printing

In [None]:
# Report for K-Means: a leading blank line, the title, the test-set confusion
# matrix laid out as two rows, then the four scores.
print("", "K-Means", "Confusion Matrix", sep="\n")
print("tn =", tn, "fp =", fp)
print("fn =", fn, "tp =", tp)
print("Scores")
print("Accuracy -->", kmeans_accuracy_score)
print("Precison -->", kmeans_precison_score)
print("Recall -->", kmeans_recall_score)
print("F1 -->", kmeans_f1_score)


K-Means
Confusion Matrix
tn = 22513 fp = 5921
fn = 22 tp = 25
Scores
Accuracy --> 0.7913345739264773
Precison --> 0.004204507231752439
Recall --> 0.5319148936170213
F1 --> 0.00834306691139663


## k_nearest_neighbours_classification

In [None]:
# Supervised baseline: a 5-nearest-neighbours classifier using a KD-tree
# index, with n_jobs=-1 for the neighbour searches.
# train_labels is flattened to 1-D with ravel() before fitting.
knn = KNeighborsClassifier(
    n_neighbors=5,
    algorithm="kd_tree",
    n_jobs=-1,
).fit(train_features, train_labels.ravel())
knn_predicted_test_labels = knn.predict(test_features)

## Calculating confusion matrix for knn

In [None]:
# Confusion-matrix counts for the KNN test predictions. These reuse (and
# overwrite) the tn/fp/fn/tp names previously holding the K-Means counts.
tn, fp, fn, tp = confusion_matrix(
    y_true=test_labels,
    y_pred=knn_predicted_test_labels,
).ravel()

## Scoring knn

In [None]:
# Score the KNN test predictions against the true test labels with each
# metric in turn: accuracy, precision, recall, F1.
(
    knn_accuracy_score,
    knn_precison_score,
    knn_recall_score,
    knn_f1_score,
) = (
    metric(test_labels, knn_predicted_test_labels)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

## Printing

In [None]:
# Report for KNN: the title, the test-set confusion matrix laid out as two
# rows, then the four scores. (No leading blank line is printed here.)
print("K-Nearest Neighbours", "Confusion Matrix", sep="\n")
print("tn =", tn, "fp =", fp)
print("fn =", fn, "tp =", tp)
print("Scores")
print("Accuracy -->", knn_accuracy_score)
print("Precison -->", knn_precison_score)
print("Recall -->", knn_recall_score)
print("F1 -->", knn_f1_score)


K-Nearest Neighbours
Confusion Matrix
tn = 28429 fp = 5
fn = 9 tp = 38
Scores
Accuracy --> 0.9995084442259752
Precison --> 0.8837209302325582
Recall --> 0.8085106382978723
F1 --> 0.8444444444444444


## Time elapsed

In [None]:
# Second wall-clock reading; the difference from `tic` (taken right after the
# imports) is the run time of everything in between, in seconds.
toc = time.time()
elapsedtime = toc - tic

print("")
# sep="" joins the pieces with no separator, matching plain concatenation.
print("Time Taken : ", elapsedtime, "seconds", sep="")


Time Taken : 261.6710875034332seconds
