In [1]:
# Data handling, plotting and numerics
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn import set_config
# Have scikit-learn transformers return pandas DataFrames so column names are
# kept after scaling (later cells read scaled_df.columns).
set_config(transform_output='pandas')

In [2]:
# Read the credit card transactions and preview the first rows
csv_path = "Data/credit_card.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99


## About 0.4% of card transactions are fraudulent. We need to detect these anomalies

# KMeans

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# K-Means is distance based, so standardize the features before clustering.
scaler = StandardScaler()
# Learn the scaling parameters from df, then apply them to the same data.
scaler.fit(df)
scaled_df = scaler.transform(df)

In [4]:
# Cluster the scaled transactions into three groups (seeded for reproducibility)
kmeans_params = dict(n_clusters=3, n_init=10, random_state=42)
kmeans = KMeans(**kmeans_params)
kmeans.fit(scaled_df)

In [5]:
# New frame: the scaled features plus each row's assigned cluster label
# (assign returns a copy, so scaled_df itself is left untouched)
km = scaled_df.assign(cluster=kmeans.predict(scaled_df))
km.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,cluster
0,-0.726092,-0.270865,1.38998,0.762227,-0.236899,0.264052,0.285346,0.131083,-0.384443,0.28612,...,0.043066,0.689788,-0.163478,0.083851,0.099198,-0.532722,0.312929,-0.090009,0.459472,0
1,0.94981,-0.01302,-0.647336,0.11071,0.102482,-0.155239,-0.009963,0.120253,-0.915428,0.051569,...,-0.18551,-0.760673,0.268858,-0.603364,0.188884,0.033761,-0.039088,0.0435,-0.333835,0
2,-0.725138,-1.235035,0.733929,0.06281,-0.377373,1.293991,0.797185,0.249752,-1.995239,0.392444,...,0.336466,1.471338,1.918734,-1.193707,-0.959876,-0.442774,-0.153601,-0.234422,1.69611,0
3,-0.467623,-0.356408,0.750937,-0.80803,0.042565,0.868121,0.283501,0.353112,-1.885794,0.153501,...,-0.056083,0.258431,-0.326493,-2.015264,1.303733,-0.591732,0.137995,0.217896,0.318445,0
4,-0.593701,0.452248,0.54094,0.079101,-0.295578,-0.018016,0.613062,-0.163028,0.004829,0.888749,...,0.052846,1.513434,-0.218569,0.209441,-0.677493,0.71064,0.524976,0.791442,0.029532,0


In [6]:
# Wrap the fitted centroids in a DataFrame labelled with the feature names
feature_names = scaled_df.columns
cluster_centers = pd.DataFrame(data=kmeans.cluster_centers_, columns=feature_names)
cluster_centers

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,-0.061196,0.053226,-0.05062,-0.094205,-0.118248,-0.244715,0.185121,-0.035698,-0.680309,0.108181,...,0.041215,0.084629,0.142682,-0.045672,-0.095136,-0.013498,-0.189528,0.098632,0.013024,0.007409
1,0.032499,-0.008109,0.190288,0.151972,-0.140992,-0.145277,-0.054969,-0.042637,0.463768,-0.088042,...,-0.056608,-0.044251,-0.009889,0.028587,-0.12567,-0.042935,0.163101,-0.06739,-0.007712,-0.00405
2,0.06216,-0.223727,-1.324018,-0.7624,1.835461,2.565302,-0.552852,0.554719,-0.171603,0.14761,...,0.25039,-0.091785,-0.699499,0.009924,1.579702,0.435446,-0.333151,0.026162,-0.006537,-0.006551


In [7]:
from scipy.spatial.distance import cdist

# Euclidean distance from every scaled row to every cluster center
# (one row per transaction, one column per cluster)
centers = kmeans.cluster_centers_
distances = cdist(scaled_df, centers, metric='euclidean')

In [8]:
# Label one distance column per cluster and wrap the distance matrix in a DataFrame
n_centers = kmeans.cluster_centers_.shape[0]
cluter_cols = ["Distance (Cluster {})".format(c) for c in range(n_centers)]
distance_df = pd.DataFrame(data=distances, columns=cluter_cols)
distance_df.head(3)

Unnamed: 0,Distance (Cluster 0),Distance (Cluster 1),Distance (Cluster 2)
0,2.804682,4.393264,5.857215
1,2.42643,3.686087,5.075081
2,6.542622,7.337378,8.656002


In [9]:
# Distance from each point to its nearest cluster center
min_distances = distances.min(axis=1)
# Cut-off at the 99.6th percentile, matching the expected 0.4% anomaly rate
threshold = np.percentile(min_distances, 99.6)
threshold

20.909887868001256

In [10]:
# Flag points whose nearest cluster center is farther away than the threshold
filter_anomalies = np.greater(min_distances, threshold)
# Count of flagged points
filter_anomalies.sum()

40

In [20]:
# Inspect the boolean mask built from the distance threshold (True = flagged row)
filter_anomalies

array([False, False, False, ..., False, False, False])

### Forty transactions (the 0.4% farthest from their nearest cluster center) are flagged as potentially fraudulent

In [12]:
# Select the flagged rows directly with the boolean mask. The previous version
# fed index *labels* into df.iloc (a positional indexer), which only gives the
# right rows while df has the default 0..n-1 index.
anoms_km = df[filter_anomalies]
# Row labels of the anomalies, reused later to compare against Isolation Forest
idx_anomalies = anoms_km.index
anoms_km

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
159,-6.093248,-12.114213,-5.694973,3.294389,-1.413792,4.776,4.808426,-0.228197,-0.525896,-1.724899,...,7.744222,2.228823,-2.264037,-3.553381,1.215279,-0.406073,-0.653384,-0.711222,0.6729,3828.04
1376,-3.936794,-3.670519,-1.45382,2.29975,-9.74944,6.45641,10.784088,-2.160016,-0.238116,-1.004398,...,-2.203676,-0.979104,0.957186,2.44196,0.250607,0.300908,1.157867,1.624284,-1.638647,2452.03
1619,-11.140706,-9.612726,-12.389545,6.013346,-32.092129,21.393069,34.303177,-7.520784,-1.925732,-2.636622,...,-11.748689,-4.709977,1.36611,-2.925888,0.843551,0.746267,0.801387,3.852046,4.157934,7712.43
2156,-5.849226,-11.754975,-3.333303,3.673246,-4.459583,0.659098,4.527224,-0.985512,0.237794,-1.85423,...,7.314347,2.16087,-1.992179,-3.240351,0.609619,-0.985548,0.089693,-0.713355,0.65306,3669.0
2212,-5.149638,4.26174,-1.043112,-2.872586,-0.970562,2.037714,-4.472909,-14.892657,1.71355,0.090612,...,-3.235393,14.718212,-4.432106,2.075999,-0.025098,0.042432,-0.045874,0.619351,0.287509,4.0
2439,-12.168192,-15.732974,-0.376474,3.792613,10.658654,-7.465603,-6.907038,1.573722,0.058164,-0.421881,...,4.469095,1.660209,-0.910516,0.010468,-0.097246,-0.329918,0.225916,0.201802,-2.368534,120.0
2594,-2.57494,-2.669331,-3.080133,2.043551,-11.416881,7.885721,12.219249,-1.828209,-1.475925,-1.965803,...,0.828357,-0.11558,0.26409,2.219739,-0.771131,0.145438,-0.240517,1.180441,-0.623651,2909.97
2654,-7.546256,3.051033,-0.524684,1.005228,0.040486,2.751676,-6.267406,-19.176657,1.049976,0.521925,...,-3.785808,9.639417,-3.053597,2.648404,0.429294,0.300704,-0.273082,1.300662,-0.641413,29.9
2756,-10.780565,-9.940571,1.703409,3.094958,7.929119,-4.032777,-8.945496,-5.099389,2.297646,-0.234243,...,-4.829446,-4.399837,1.633,-0.711629,0.645921,-0.549247,0.704801,3.372603,-1.307732,85.0
2911,-4.344234,4.268511,-1.580163,0.272536,-1.419244,2.603639,-11.164794,-23.632502,-1.849347,-5.217814,...,5.851381,-11.27389,3.661957,-0.200435,0.920605,0.619883,-0.38117,-0.132891,0.539894,1.0


In [16]:
# Row labels of the K-Means anomalies (compared with the Isolation Forest results below)
idx_anomalies

Int64Index([ 159, 1376, 1619, 2156, 2212, 2439, 2594, 2654, 2756, 2911, 2914,
            2917, 2923, 3443, 4779, 5303, 5412, 5413, 5529, 5674, 5704, 5764,
            6489, 6643, 6672, 7322, 7338, 7470, 7596, 7597, 8124, 8163, 8437,
            8442, 8856, 8939, 8999, 9071, 9304, 9326],
           dtype='int64')

### Above we see the forty anomalies that are most likely to be fraudulent

# Isolation Forest

In [13]:
# Build the model with a contamination of 0.004 (0.4% of rows will be flagged as anomalous)
iso = IsolationForest(random_state=42, contamination=0.004)
# Fit on the underlying NumPy array rather than the DataFrame to avoid a warning
iso.fit(df.to_numpy())

In [14]:
# Score every row with the fitted model and preview the first 100 labels
predictions = iso.predict(df.to_numpy())
predictions[:100]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [15]:
# Recode the labels in one pass to match the convention used with the kmeans dataframe:
#   -1 (anomaly)     -> 1
#    1 (not anomaly) -> 0
predictions = np.where(predictions == -1, 1, 0)
# Preview new labels
predictions[:100]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [17]:
# Add isolation forest predictions to the dataframe
# NOTE(review): this adds a column to df in place. The earlier cells that pass
# df.values to the model would include this column if they are re-run
# afterwards without reloading df.
df['iso_anomalies'] = predictions

In [19]:
# Boolean mask: True where Isolation Forest flagged the row as an anomaly
filter_iso = df['iso_anomalies'].eq(1)
filter_iso

0       False
1       False
2       False
3       False
4       False
        ...  
9995    False
9996    False
9997    False
9998    False
9999    False
Name: iso_anomalies, Length: 10000, dtype: bool

In [22]:
# Row labels of the transactions flagged by Isolation Forest
iso_anomalies = df.loc[filter_iso].index
iso_anomalies

Int64Index([ 159, 1619, 2156, 2756, 2858, 2914, 2917, 2923, 5303, 5412, 5413,
            5704, 6311, 6489, 6581, 6595, 6634, 6643, 6672, 6757, 6761, 6798,
            6829, 7338, 7470, 7596, 7597, 8124, 8163, 8437, 8442, 8627, 8645,
            8667, 8670, 8856, 8999, 9071, 9304, 9326],
           dtype='int64')

In [23]:
# List the anomalies identified by both methods, kept in Isolation Forest order
in_kmeans = iso_anomalies.isin(idx_anomalies)
both = iso_anomalies[in_kmeans].tolist()
both

[159,
 1619,
 2156,
 2756,
 2914,
 2917,
 2923,
 5303,
 5412,
 5413,
 5704,
 6489,
 6643,
 6672,
 7338,
 7470,
 7596,
 7597,
 8124,
 8163,
 8437,
 8442,
 8856,
 8999,
 9071,
 9304,
 9326]

In [26]:
# How many anomalies overlap? (length of the list of rows flagged by both methods)
len(both)

27

### Twenty-seven anomalies were flagged by both models

In [29]:
# Percentage of the K-Means anomalies that Isolation Forest also flagged.
# Computed from the actual results instead of hard-coded counts, so the figure
# stays correct if the data, seed, or contamination level changes.
p = (len(both) / len(idx_anomalies)) * 100
p

67.5

### With 10,000 cases in this df and 0.4% being fraudulent, about 40 anomalies should be present. Each method flags 40, and 27 of them are shared, giving a 67.5% overlap between KMeans and Isolation Forest