In [None]:
import torch
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from scipy.spatial import distance_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides any deprecation or convergence
# warnings raised by the libraries used below -- consider narrowing it.
warnings.simplefilter("ignore")

# Read the raw dataset from the working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='creditcard.csv')

# Preprocess data
def preprocess_data(df):
    """Return a new frame holding every column of ``df`` except ``Time``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a ``Time`` column.

    Returns
    -------
    pandas.DataFrame
        New frame without ``Time``; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If ``df`` has no ``Time`` column.
    """
    without_time = df.drop(labels=['Time'], axis='columns')
    return without_time

# Drop the Time column before any further processing.
data = preprocess_data(data)

# Normalize features
# The feature columns are selected by name instead of by position so the
# 'Class' label is never scaled even if it is not the last column, and so the
# scaled values replace the columns rather than being written into columns
# that may have a non-float dtype.
# NOTE(review): the scaler is fitted on every row before any train/test split
# is made, so statistics of the future test rows leak into the features. Fit
# on training rows only if a strict evaluation is required.
feature_columns = data.columns.drop('Class')
scaler = StandardScaler()
data[feature_columns] = scaler.fit_transform(data[feature_columns])

# Handle class imbalance
def balance_data(df, ratio=10, random_state=42):
    """Undersample the majority class and shuffle the result.

    All rows with ``Class == 1`` are kept. Rows with ``Class == 0`` are
    sampled without replacement so that there are at most ``ratio`` of them
    per minority row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Class`` column holding 0 (majority) and 1 (minority).
    ratio : int, default 10
        Maximum number of majority rows kept per minority row.
    random_state : int, default 42
        Seed used both for sampling the majority rows and for the final
        shuffle, so the result is reproducible.

    Returns
    -------
    pandas.DataFrame
        Shuffled subset of ``df``; the original index labels are preserved.
    """
    minority_class = df[df['Class'] == 1]
    majority_pool = df[df['Class'] == 0]

    # Sampling without replacement cannot return more rows than exist, so the
    # requested size is capped at the number of available majority rows
    # instead of letting ``sample`` raise ValueError.
    n_majority = min(len(majority_pool), len(minority_class) * int(ratio))
    majority_class = majority_pool.sample(n=n_majority, random_state=random_state)

    # frac=1 returns every row in a random order, i.e. a shuffle.
    balanced_df = pd.concat([minority_class, majority_class]).sample(frac=1, random_state=random_state)
    return balanced_df

# Rebind `data` to the undersampled, shuffled subset returned by balance_data;
# the graph built below uses this subset, not the full dataset.
data = balance_data(data)

# Create initial graph data using k-NN approach
def create_graph_data(df, k=5):
    """Build a symmetric k-nearest-neighbour graph over the rows of ``df``.

    Every row becomes one node. Its feature vector is made of all columns
    except ``Class`` and its label is the ``Class`` value. Each node is linked
    to its ``k`` nearest *other* rows (distance as computed by
    ``scipy.spatial.distance_matrix`` with its default settings), and every
    link is stored in both directions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric feature columns and a ``Class`` column.
    k : int, default 5
        Number of neighbours per node. Values larger than ``len(df) - 1`` are
        reduced to ``len(df) - 1``.

    Returns
    -------
    torch_geometric.data.Data
        ``x`` is float32 with one row per node, ``y`` is int64 with one label
        per node, and ``edge_index`` is int64 of shape ``(2, num_edges)``,
        sorted by source then target, with each directed edge listed once.
    """
    features = df.drop(columns=['Class']).values
    labels = df['Class'].values
    num_nodes = len(df)

    # A node has at most num_nodes - 1 distinct neighbours.
    k = max(0, min(int(k), num_nodes - 1))

    if k > 0:
        dist_matrix = distance_matrix(features, features)
        # Exclude each node from its own neighbour list explicitly. Skipping
        # "the first sorted entry" is not reliable: when duplicate rows exist,
        # another node at distance 0 may sort before the node itself.
        np.fill_diagonal(dist_matrix, np.inf)

        neighbors = np.empty((num_nodes, k), dtype=np.int64)
        for i in range(num_nodes):
            # A stable sort makes tie-breaking between equidistant rows deterministic.
            neighbors[i] = np.argsort(dist_matrix[i], kind='stable')[:k]

        sources = np.repeat(np.arange(num_nodes, dtype=np.int64), k)
        targets = neighbors.ravel()
        forward = np.stack([sources, targets], axis=1)
        backward = np.stack([targets, sources], axis=1)  # Make it bidirectional
        # Mutual neighbours would otherwise contribute the same directed edge
        # twice; keep each (source, target) pair once.
        edges = np.unique(np.concatenate([forward, backward]), axis=0)
    else:
        # Keep the (num_edges, 2) layout so the transpose below is (2, 0).
        edges = np.empty((0, 2), dtype=np.int64)

    edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
    x = torch.tensor(features, dtype=torch.float32)
    y = torch.tensor(labels, dtype=torch.long)
    return Data(x=x, edge_index=edge_index, y=y)

# Build the k-NN graph from the balanced frame, using the function's default k.
graph_data = create_graph_data(data)

def split_data(data, test_size=0.2):
    """Attach stratified boolean train/test node masks to ``data``.

    Node indices are split with ``train_test_split`` (fixed seed 42),
    stratified on ``data.y``. The masks are stored on the object that was
    passed in, which is also returned.

    Parameters
    ----------
    data : graph object exposing ``x`` and ``y`` tensors
        ``x.shape[0]`` gives the number of nodes; ``y`` holds the labels.
    test_size : float, default 0.2
        Forwarded unchanged to ``train_test_split``.

    Returns
    -------
    The same ``data`` object, now carrying ``train_mask`` and ``test_mask``.
    """
    node_count = data.x.shape[0]
    node_labels = data.y.numpy()
    train_nodes, test_nodes = train_test_split(
        np.arange(node_count),
        test_size=test_size,
        random_state=42,
        stratify=node_labels,
    )

    def _mask_for(selected):
        # Boolean vector over all nodes, True only at the selected indices.
        mask = torch.zeros(node_count, dtype=torch.bool)
        mask[selected] = True
        return mask

    data.train_mask = _mask_for(train_nodes)
    data.test_mask = _mask_for(test_nodes)
    return data

# split_data stores train_mask/test_mask on the object it receives and returns
# that same object, so this rebinding keeps pointing at the same graph.
graph_data = split_data(graph_data)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split

# Pull the node features (x) and labels (y) out of the graph as NumPy arrays
features, labels = (tensor.numpy() for tensor in (graph_data.x, graph_data.y))

# Wrap both arrays in DataFrames for easier inspection (optional)
features_df = pd.DataFrame(data=features)
labels_df = pd.DataFrame(data=labels, columns=["Class"])

# Show the first rows of the features, then of the labels, in one print call (optional)
previews = [frame.head() for frame in (features_df, labels_df)]
print(*previews)


         0         1         2         3         4         5         6   \
0 -2.603909  2.200345 -2.535144  0.129397 -0.857817  1.202565 -2.429855   
1 -0.438957  0.625134  1.293982  0.761969  0.392903 -0.289577  0.380345   
2 -0.452042  0.834484  1.138837  0.539390 -0.197061 -0.175326  0.293780   
3  1.156146 -0.378593 -1.698718 -0.718181  0.142121 -1.086934  0.233704   
4  0.550839 -0.258247 -0.470772 -0.680758 -0.163034 -0.869149  0.407502   

         7         8         9   ...        19         20        21        22  \
0 -7.238274  1.170055 -3.414142  ... -3.640181  11.273224 -3.854414  1.746642   
1 -0.119400  0.539669 -0.535535  ... -0.067689  -0.610731 -1.549316 -0.323972   
2  0.309625 -0.877996 -0.643262  ...  0.100321  -0.121969 -0.563403 -0.188127   
3 -0.381931 -0.932955  1.065416  ... -0.191513   0.665382  1.884584 -0.441949   
4 -0.225893  0.579333 -0.671154  ...  0.128638   0.151801  0.133996 -0.544871   

         23        24        25        26        27        28 

In [3]:
# Stratified 80/20 split of the feature/label arrays with a fixed seed.
split_parts = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)
X_train, X_test, y_train, y_test = split_parts

# Optional: report the shapes of both feature matrices to confirm the split
for caption, matrix in (
    ("Training features shape:", X_train),
    ("Testing features shape:", X_test),
):
    print(caption, matrix.shape)


Training features shape: (4329, 29)
Testing features shape: (1083, 29)


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

# Build the Logistic Regression model and fit it on the training split
# (fit returns the estimator itself, so construction and training are chained).
lr_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Hard predictions for the threshold metrics; positive-class probabilities for ROC-AUC
y_pred = lr_model.predict(X_test)
y_prob = lr_model.predict_proba(X_test)[:, 1]

# Score the test-set predictions
accuracy, recall, precision, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, recall_score, precision_score, f1_score)
)
roc_auc = roc_auc_score(y_test, y_prob)

# Report each metric on its own line
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Recall", recall),
    ("Precision", precision),
    ("F1 Score", f1),
    ("ROC-AUC", roc_auc),
):
    print(f"{metric_name}: {metric_value}")


Accuracy: 0.987072945521699
Recall: 0.8775510204081632
Precision: 0.9772727272727273
F1 Score: 0.9247311827956989
ROC-AUC: 0.9929555578576608


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

# Build a 100-tree Random Forest with a fixed seed and fit it on the training split
# (fit returns the estimator itself, so construction and training are chained).
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hard predictions for the threshold metrics; positive-class probabilities for ROC-AUC
y_pred_rf = rf_model.predict(X_test)
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]

# Score the test-set predictions
accuracy_rf, recall_rf, precision_rf, f1_rf = (
    metric(y_test, y_pred_rf)
    for metric in (accuracy_score, recall_score, precision_score, f1_score)
)
roc_auc_rf = roc_auc_score(y_test, y_prob_rf)

# Report a header followed by one metric per line
print("Random Forest Metrics:")
for metric_name, metric_value in (
    ("Accuracy", accuracy_rf),
    ("Recall", recall_rf),
    ("Precision", precision_rf),
    ("F1 Score", f1_rf),
    ("ROC-AUC", roc_auc_rf),
):
    print(f"{metric_name}: {metric_value}")


Random Forest Metrics:
Accuracy: 0.9879963065558633
Recall: 0.8673469387755102
Precision: 1.0
F1 Score: 0.9289617486338798
ROC-AUC: 0.9953744949756553


In [7]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

# Initialize the XGBoost model.
# `use_label_encoder` is deliberately not passed: the option is deprecated and
# current XGBoost releases report it as an unused parameter. The labels here
# are already the integers 0 and 1, so no label encoding is needed.
xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)

# Train the model
xgb_model.fit(X_train, y_train)

# Predict the labels for the test set
y_pred_xgb = xgb_model.predict(X_test)
y_prob_xgb = xgb_model.predict_proba(X_test)[:, 1]  # Positive-class probabilities for ROC-AUC

# Evaluate the model's performance
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)
roc_auc_xgb = roc_auc_score(y_test, y_prob_xgb)  # Ranking metric, so it uses the probabilities

# Print evaluation metrics
print("XGBoost Metrics:")
print(f"Accuracy: {accuracy_xgb}")
print(f"Recall: {recall_xgb}")
print(f"Precision: {precision_xgb}")
print(f"F1 Score: {f1_xgb}")
print(f"ROC-AUC: {roc_auc_xgb}")


XGBoost Metrics:
Accuracy: 0.9889196675900277
Recall: 0.8877551020408163
Precision: 0.9886363636363636
F1 Score: 0.9354838709677419
ROC-AUC: 0.9976691184087849
