In [1]:
from sklearn.metrics import confusion_matrix, silhouette_score, calinski_harabasz_score, homogeneity_score, completeness_score, v_measure_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.layers import Input, Dense, Lambda
from sklearn.ensemble import IsolationForest
from keras.models import Sequential, Model
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.svm import OneClassSVM
from keras.optimizers import Adam
import matplotlib.pyplot as plt
from keras import backend as K
from pandas import DataFrame
import seaborn as sb
import pandas as pd
import numpy as np
import sklearn
import random
import time

import warnings
warnings.filterwarnings("ignore")

In [2]:
# loading the dataset
data = pd.read_csv('Fraud-detection (Data-world).csv')
data.head(5)

Unnamed: 0.1,Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,1,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,2,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,3,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,4,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,5,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
data.shape

(284807, 32)

In [4]:
# Checking for missing data
missing_values = data.isnull().sum()
print(missing_values)

Unnamed: 0    0
Time          0
V1            0
V2            0
V3            0
V4            0
V5            0
V6            0
V7            0
V8            0
V9            0
V10           0
V11           0
V12           0
V13           0
V14           0
V15           0
V16           0
V17           0
V18           0
V19           0
V20           0
V21           0
V22           0
V23           0
V24           0
V25           0
V26           0
V27           0
V28           0
Amount        0
Class         0
dtype: int64


In [5]:
columns_to_drop = ['Unnamed: 0', 'Time']
data.drop(columns=columns_to_drop, inplace=True)

In [6]:
# Splitting Data
X = data.drop(['Class'], axis = 1)
y = data['Class']

In [7]:
unique_values, counts = np.unique(y, return_counts=True)

# Print unique values and their counts
for value, count in zip(unique_values, counts):
    print(f"Value: {value}, Count: {count}")

Value: 0, Count: 284315
Value: 1, Count: 492


# FEATURE SELECTION using Contrastive Learning

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler

X = StandardScaler().fit_transform(X)
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)


class SiameseNetwork(nn.Module):
    def __init__(self, input_size, output_size):
        super(SiameseNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, output_size)
        )

    def forward(self, x1, x2):
        out1 = self.fc(x1)
        out2 = self.fc(x2)
        return out1, out2

class ContrastiveLoss(nn.Module):
    def __init__(self, margin):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        euclidean_distance = nn.functional.pairwise_distance(output1, output2)
        loss_contrastive = torch.mean((1 - label) * torch.pow(euclidean_distance, 2) +
                                      (label) * torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2))
        return loss_contrastive

siamese_network = SiameseNetwork(input_size=29, output_size=24)
contrastive_loss = ContrastiveLoss(margin=1.0)
optimizer = optim.Adam(siamese_network.parameters(), lr=0.001)

class SiameseDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __getitem__(self, index):
        x1 = self.X[index]
        label = self.y[index]
        x2 = self.X[torch.randint(len(self.X), (1,)).item()]
        return x1, x2, torch.tensor(label, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

batch_size = 64
train_dataset = SiameseDataset(X_tensor, y_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

num_epochs = 3
for epoch in range(num_epochs):
    for x1, x2, label in train_loader:
        optimizer.zero_grad()
        output1, output2 = siamese_network(x1, x2)
        loss = contrastive_loss(output1, output2, label)
        loss.backward()
        optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')

siamese_network.eval()
with torch.no_grad():
    num_samples = len(X_tensor)
    indices = torch.randint(num_samples, (num_samples,))
    x1_eval = X_tensor
    x2_eval = X_tensor[indices]

    all_outputs, _ = siamese_network(x1_eval, x2_eval)

columns = [f'CL{i+1}' for i in range(24)]
df_CL = pd.DataFrame(data=all_outputs, columns=columns)

Epoch [1/3], Loss: 2.399999643465378e-11
Epoch [2/3], Loss: 2.399999643465378e-11
Epoch [3/3], Loss: 2.399999643465378e-11


# SSML

In [9]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
X = df_CL
X_labeled, X_unlabeled, y_labeled, _ = train_test_split(X, y, test_size=0.9, random_state=42)
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X_unlabeled)
pseudo_labels = kmeans.fit_predict(X_unlabeled)
X_combined = np.vstack([X_labeled, X_unlabeled])
y_combined = np.concatenate([y_labeled, pseudo_labels])


classifier = LogisticRegression(random_state=42)
classifier.fit(X_combined, y_combined)
y_pred = classifier.predict(X_combined)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y, y_pred))

# Print evaluation metrics
calinski_harabasz_index = calinski_harabasz_score(df_CL, y_pred)
homogeneity = homogeneity_score(y, y_pred)
completeness = completeness_score(y, y_pred)
v_measure = v_measure_score(y, y_pred)


print(f"Calinski-Harabasz Index: {calinski_harabasz_index}")
print(f"Homogeneity: {homogeneity}")
print(f"Completeness: {completeness}")
print(f"V-Measure: {v_measure}")



Confusion Matrix:
[[283960    355]
 [   490      2]]
Calinski-Harabasz Index: 1.0091509820048155
Homogeneity: 0.0002690666009601716
Completeness: 0.00035531897605999187
V-Measure: 0.0003062353541264558
