In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn import set_config
set_config(transform_output='pandas')

In [2]:
df = pd.read_csv("Data/credit_card.csv")
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99


## About 0.4% of card transations are fraudulent. We need to detect these anomalies

# KMeans

In [3]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# We need to scale our data for KMeans
scaler = StandardScaler()
# Fit & transform data.
scaled_df = scaler.fit_transform(df)

In [4]:
# Apply K-Means algorithm
kmeans = KMeans(n_clusters=3, n_init=10, random_state = 42)
kmeans.fit(scaled_df)

In [7]:
# Making a dataframe copy of X for saving clusters 
km = scaled_df.copy()
km['cluster'] = kmeans.predict(scaled_df)
km.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,cluster
0,-0.726092,-0.270865,1.38998,0.762227,-0.236899,0.264052,0.285346,0.131083,-0.384443,0.28612,...,0.043066,0.689788,-0.163478,0.083851,0.099198,-0.532722,0.312929,-0.090009,0.459472,0
1,0.94981,-0.01302,-0.647336,0.11071,0.102482,-0.155239,-0.009963,0.120253,-0.915428,0.051569,...,-0.18551,-0.760673,0.268858,-0.603364,0.188884,0.033761,-0.039088,0.0435,-0.333835,0
2,-0.725138,-1.235035,0.733929,0.06281,-0.377373,1.293991,0.797185,0.249752,-1.995239,0.392444,...,0.336466,1.471338,1.918734,-1.193707,-0.959876,-0.442774,-0.153601,-0.234422,1.69611,0
3,-0.467623,-0.356408,0.750937,-0.80803,0.042565,0.868121,0.283501,0.353112,-1.885794,0.153501,...,-0.056083,0.258431,-0.326493,-2.015264,1.303733,-0.591732,0.137995,0.217896,0.318445,0
4,-0.593701,0.452248,0.54094,0.079101,-0.295578,-0.018016,0.613062,-0.163028,0.004829,0.888749,...,0.052846,1.513434,-0.218569,0.209441,-0.677493,0.71064,0.524976,0.791442,0.029532,0


In [8]:
# Saving the cluster centers as a dataframe for visibility
cluster_centers = pd.DataFrame(kmeans.cluster_centers_, columns=scaled_df.columns)
cluster_centers

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,-0.061196,0.053226,-0.05062,-0.094205,-0.118248,-0.244715,0.185121,-0.035698,-0.680309,0.108181,...,0.041215,0.084629,0.142682,-0.045672,-0.095136,-0.013498,-0.189528,0.098632,0.013024,0.007409
1,0.032499,-0.008109,0.190288,0.151972,-0.140992,-0.145277,-0.054969,-0.042637,0.463768,-0.088042,...,-0.056608,-0.044251,-0.009889,0.028587,-0.12567,-0.042935,0.163101,-0.06739,-0.007712,-0.00405
2,0.06216,-0.223727,-1.324018,-0.7624,1.835461,2.565302,-0.552852,0.554719,-0.171603,0.14761,...,0.25039,-0.091785,-0.699499,0.009924,1.579702,0.435446,-0.333151,0.026162,-0.006537,-0.006551


In [9]:
from scipy.spatial.distance import cdist
# Calculate distance to each cluster center
distances = cdist(scaled_df, kmeans.cluster_centers_, 'euclidean')

In [10]:
# Saving distances as a dataframe for convenience
cluter_cols = [f"Distance (Cluster {c})" for  c in range(len(kmeans.cluster_centers_))]
distance_df = pd.DataFrame(distances, columns = cluter_cols)
distance_df.head(3)

Unnamed: 0,Distance (Cluster 0),Distance (Cluster 1),Distance (Cluster 2)
0,2.804682,4.393264,5.857215
1,2.42643,3.686087,5.075081
2,6.542622,7.337378,8.656002


In [11]:
# Get the minimum distance to any cluster for each point
min_distances = np.min(distances, axis=1)
# Set a threshold based on our known parameter
threshold = np.percentile(min_distances, 99.6)
threshold

20.909887868001256

In [12]:
# Identify anomalies where the distance to closest cluster center is above the threshold
filter_anomalies = min_distances > threshold
# how many were found?
filter_anomalies.sum()

40

### There are forty fraudulent cases in our dataset

In [15]:
# Getting the row indices of the anomalies
idx_anomalies = scaled_df[filter_anomalies].index
# Slicing the anomalies from X (for demonstration)
anoms_km = scaled_df.iloc[idx_anomalies]
anoms_km

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
159,-3.83496,-9.431468,-5.686284,2.104654,-1.153193,3.584223,4.52282,-0.129306,-1.147362,-1.366035,...,12.960326,2.518824,-3.332963,-7.192555,2.023902,-1.141962,-1.367622,-1.773317,2.499616,20.320071
1376,-2.418627,-3.007869,-2.040269,1.407855,-8.255103,4.877629,10.065097,-1.668099,-0.900585,-0.710429,...,-3.75428,-1.015485,1.76492,5.0476,0.394163,0.49937,1.889565,3.994384,-6.126389,12.89069
1619,-7.15007,-7.528445,-11.441444,4.009433,-27.290908,16.374319,31.878466,-5.938228,-2.347745,-2.19564,...,-19.791952,-5.125945,2.41208,-5.911458,1.395896,1.53332,1.248504,9.496001,15.504724,41.292747
2156,-3.674689,-9.158175,-3.656015,2.370064,-3.748184,0.415459,4.262012,-0.732546,-0.492485,-1.483717,...,12.238043,2.443958,-2.902722,-6.553469,1.000686,-2.487275,-0.03134,-1.778584,2.425582,19.461379
2212,-3.215208,3.026652,-1.687193,-2.215652,-0.775564,1.476574,-4.085387,-11.810305,0.773002,0.285954,...,-5.487786,16.278911,-6.764131,4.300451,-0.071619,-0.10071,-0.275131,1.512629,1.061451,-0.326762
2439,-7.824911,-12.184466,-1.114101,2.453688,9.132442,-5.838093,-6.342982,1.306016,-0.64652,-0.180378,...,7.457408,1.892358,-1.190891,0.083439,-0.193509,-0.96516,0.213631,0.481461,-8.850109,0.299547
2594,-1.524177,-2.246208,-3.43837,1.228373,-9.67575,5.977763,11.396173,-1.403798,-1.962029,-1.585241,...,1.340186,-0.064103,0.668031,4.593911,-1.331986,0.138431,-0.625159,2.89828,-2.338725,15.363209
2654,-4.789278,2.105598,-1.241513,0.500971,0.085842,2.026108,-5.749739,-15.222733,0.203976,0.678419,...,-6.412602,10.683389,-4.582514,5.469079,0.69604,0.498896,-0.683721,3.195176,-2.40501,-0.186922
2756,-6.913533,-7.777855,0.673923,1.964941,6.806898,-3.19586,-8.2336,-4.009462,1.273875,-0.009641,...,-8.166138,-4.784251,2.834458,-1.390803,1.062015,-1.474357,1.074814,8.311982,-4.891511,0.110575
2911,-2.686228,3.031804,-2.148883,-0.01232,-1.157837,1.912165,-10.291944,-18.772044,-2.282244,-4.544346,...,9.779945,-12.357685,6.045469,-0.347143,1.526074,1.239905,-0.878097,-0.345087,2.003279,-0.34296


### Above we see the forty anomalies that are mostly likely to be fraudulent

# Isolation Forest

In [16]:
# Instantiate the model with a contaimination of 0.05 (we will identify 5% as anamolous)
iso = IsolationForest(contamination=0.0004, random_state = 42)
# fit the model using .values to avoid a warning
iso.fit(df.values)

In [17]:
# Obtain results from the model
predictions = iso.predict(df.values)
predictions[:100]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [18]:
# Change the labels to match our columns from the kmeans dataframe
# Not anamolies
predictions[predictions ==1] = 0
# Anomalies
predictions[predictions ==-1] = 1
# Preview new labels
predictions[:100]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [39]:
# Add isolation forest predictions to the dataframe
scaled_df['iso_anomalies'] = predictions
# Reloading our km anomalies set with updated df
anoms_km = scaled_df.iloc[idx_anomalies]
# We just want to see agreements
agrees = anoms_km[['iso_anomalies']].loc[anoms_km['iso_anomalies']==1]
agrees

Unnamed: 0,iso_anomalies
1619,1
8163,1
8999,1
9071,1


In [42]:
# Make a list of anomolies identified in both methods
both = agrees.index
both

Int64Index([1619, 8163, 8999, 9071], dtype='int64')

### In the end only four anomalies occurred across both models

### With 10000 cases in this df and 0.04% being fraudulent, about 40 anomalies should be present. If so, there's only a 10% overlap between KMeans and Isolation Forest