In [1]:
# Question 7: DBSCAN on a Real-World Dataset for Anomaly Detection
# Description: Perform DBSCAN on a credit card transaction dataset to detect anomalies.

In [2]:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# Description: Perform DBSCAN on a credit card transaction dataset to detect anomalies.

# Step 1: Build the transaction dataset.
# No real file is read here: a synthetic dataset stands in for one and mimics
# some characteristics of a real credit card transaction dataset.
# In a real scenario, replace this section with pd.read_csv('creditcard.csv').
# With a real dataset the 'Time' and 'Amount' features may need careful
# handling, and 'V1' through 'V28' are usually anonymized principal components.

print("Generating synthetic credit card transaction data...")
np.random.seed(42)  # fixed seed so every run draws the same numbers

num_normal = 1000
num_fraud = 20

# The four draws below share one global random stream, so they must stay in
# this order; reordering them would change the generated values.

# Normal transactions: 28 tightly spread features, amounts between 10 and 110.
data_normal = 0.5 * np.random.randn(num_normal, 28)
amount_normal = 10 + 100 * np.random.rand(num_normal)

# Anomalous transactions (simulated fraud): shifted mean, higher variance,
# and larger amounts (between 500 and 1500).
data_fraud = 5 + 3 * np.random.randn(num_fraud, 28)
amount_fraud = 500 + 1000 * np.random.rand(num_fraud)

# Stack the normal rows first and the fraud rows last.
X = np.concatenate([data_normal, data_fraud], axis=0)
amounts = np.concatenate([amount_normal, amount_fraud])
y_true = np.repeat([0.0, 1.0], [num_normal, num_fraud])  # 0 for normal, 1 for fraud

# Assemble the DataFrame: V1..V28, then Amount, then the ground-truth Class.
# 'Class' would not be available during actual anomaly detection.
v_columns = [f'V{i}' for i in range(1, 29)]
df = pd.DataFrame(X, columns=v_columns).assign(Amount=amounts, Class=y_true)

print(f"Synthetic dataset created with {len(df)} transactions ({num_normal} normal, {num_fraud} fraud).")
print(df.head())
print(df.tail())
print("\n")

# Step 2: Preprocess the data.
# DBSCAN decides neighbourhoods by distance, so every feature is standardized
# first to put them on a comparable scale. The 'Class' column is the
# ground-truth label and is kept out of the clustering input.
features = df.columns.drop('Class')
feature_frame = df[features]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(feature_frame)

print("Data scaled using StandardScaler.")
print(f"Shape of scaled data: {X_scaled.shape}\n")

# Step 3: Apply DBSCAN.
# Key parameters for DBSCAN:
# - eps: the maximum distance between two samples for one to be considered
#   as in the neighborhood of the other.
# - min_samples: the number of samples in a neighborhood for a point to be
#   considered a core point. This includes the point itself.
# For anomaly detection, points labeled as -1 (noise) are considered anomalies.
#
# eps is derived from the data instead of being hard-coded. With this many
# standardized features, distances between ordinary points can sit well above
# a small fixed radius such as 2.0; when that happens almost no point reaches
# min_samples neighbours and DBSCAN labels nearly every row as noise, which
# makes the anomaly flag meaningless.

# A common rule of thumb is min_samples = 2 * number_of_features; a smaller
# value is used here so that small dense groups can still form a cluster.
# NOTE(review): tune this for a real dataset.
min_samples_val = 5

# Percentile of the k-distance distribution used as eps. Points whose
# k-distance is <= eps become core points, so roughly this share of the data
# ends up as core points. This assumes anomalies are a small minority of the
# rows -- TODO confirm against a k-distance plot when using real data.
EPS_PERCENTILE = 95

# k-distance: distance from each point to its min_samples-th nearest
# neighbour. The query is the training data itself, so each point is returned
# as its own nearest neighbour (distance 0), which matches DBSCAN counting the
# point itself towards min_samples.
neighbor_index = NearestNeighbors(n_neighbors=min_samples_val).fit(X_scaled)
neighbor_distances = neighbor_index.kneighbors(X_scaled)[0]
k_distances = neighbor_distances[:, -1]

# Rounded only to keep the printed parameter readable.
eps_val = round(float(np.percentile(k_distances, EPS_PERCENTILE)), 3)

print(f"Applying DBSCAN with eps={eps_val} and min_samples={min_samples_val}...")
dbscan = DBSCAN(eps=eps_val, min_samples=min_samples_val)
clusters = dbscan.fit_predict(X_scaled)

# Step 4: Identify anomalies.
# DBSCAN assigns the label -1 to points it treats as noise / outliers; those
# rows are the ones flagged as anomalies.
NOISE_LABEL = -1
df['cluster'] = clusters
df['is_anomaly'] = df['cluster'].eq(NOISE_LABEL)

# Distinct cluster labels once the noise label is removed.
n_clusters = np.unique(clusters[clusters != NOISE_LABEL]).size
n_noise = df['is_anomaly'].sum()

print("\nDBSCAN clustering complete.")
print(f"Number of clusters found (excluding noise): {n_clusters}")
print(f"Number of anomalies detected (noise points): {n_noise}")

# Step 5: Evaluate (if ground truth is available).
# The generated data carries a 'Class' label, so DBSCAN's noise flag can be
# compared with the true fraud labels to see how well it performed.
print("\n--- Anomaly Detection Results ---")

# Boolean row masks reused by every comparison below.
is_fraud = df['Class'] == 1
is_normal = df['Class'] == 0
flagged = df['is_anomaly']

true_anomalies = df[is_fraud]
detected_anomalies = df[flagged]

print(f"True number of fraudulent transactions: {len(true_anomalies)}")
print(f"Number of transactions DBSCAN labeled as anomalies: {len(detected_anomalies)}")

# True positives: fraud rows that were flagged.
correctly_detected_fraud = df[is_fraud & flagged]
print(f"Fraudulent transactions correctly identified as anomalies: {len(correctly_detected_fraud)}")

# False positives: normal rows that were flagged.
false_positives = df[is_normal & flagged]
print(f"Normal transactions incorrectly labeled as anomalies (False Positives): {len(false_positives)}")

# Show the first few flagged rows.
print("\nSample of Detected Anomalies:")
print(detected_anomalies.head())

# False negatives: fraud rows that were not flagged (may be empty).
print("\nSample of Missed True Anomalies (False Negatives - if any):")
missed_fraud = df[is_fraud & ~flagged]
print(missed_fraud.head())

Generating synthetic credit card transaction data...
Synthetic dataset created with 1020 transactions (1000 normal, 20 fraud).
         V1        V2        V3        V4        V5        V6        V7  \
0  0.248357 -0.069132  0.323844  0.761515 -0.117077 -0.117068  0.789606   
1 -0.300319 -0.145847 -0.300853  0.926139 -0.006749 -0.528855  0.411272   
2 -0.419609 -0.154606  0.165632  0.487773 -0.239587 -0.092829 -0.553167   
3 -0.404247 -0.250879  0.457701  0.164376 -0.264880  0.256634  0.048539   
4  0.030115  1.231621 -0.096180  0.150774 -0.017356 -0.584339  0.571411   

         V8        V9       V10  ...       V21       V22       V23       V24  \
0  0.383717 -0.234737  0.271280  ...  0.732824 -0.112888  0.033764 -0.712374   
1 -0.610422  0.104432 -0.979835  ...  0.171809 -0.881520  0.162042 -0.192541   
2 -0.598103  0.406263  0.678120  ...  0.043524 -0.149504  0.045880 -0.993784   
3  0.484322 -0.351027 -0.163831  ... -0.080643  0.202025  0.943093  0.087289   
4  0.375967  0.395516 