# OPTICS Clustering for Ethereum Fraud Detection

This notebook demonstrates unsupervised learning using the OPTICS (Ordering Points To Identify the Clustering Structure) algorithm to detect potential fraudulent activities in Ethereum transactions.

## What is OPTICS?
OPTICS is a density-based clustering algorithm that works by ordering points to identify the clustering structure. Unlike k-means, OPTICS:
- Does not require specifying the number of clusters beforehand
- Can find clusters of varying shapes and sizes
- Identifies noise points that don't belong to any cluster (potential anomalies)
- Handles varying densities better than DBSCAN

These characteristics make it particularly suitable for fraud detection, where we don't know in advance how many fraud patterns exist, and fraudulent transactions are often outliers.

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import OPTICS
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import networkx as nx
import warnings
# Silence every warning category for the rest of the session
warnings.filterwarnings('ignore')

# Plot appearance: ggplot theme, viridis palette, (12, 8) default figure size
plt.style.use('ggplot')
sns.set_palette('viridis')
plt.rc('figure', figsize=(12, 8))

# Never truncate the column list when a DataFrame is displayed
pd.options.display.max_columns = None

## 2. Loading Ethereum Transaction Data

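Now let's apply OPTICS to Ethereum transaction data. We'll load a dataset containing Ethereum transactions and prepare it for clustering analysis. If no real dataset is available, the cell below generates a synthetic one (normal transactions plus a small group of injected fraudulent ones) so the rest of the notebook can still run.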

In [None]:
# Load sample Ethereum transaction data (replace with your actual data path)
# If you don't have data, set SAMPLE_DATA to True to generate synthetic data
#
# Result of this cell: a DataFrame named `transactions_df`. In the synthetic
# branch it has the columns hash, from, to, value, gas, gasPrice, timestamp,
# blockNumber and ground_truth; the real-data branch takes whatever columns
# the CSV provides (and therefore has no ground_truth column).
SAMPLE_DATA = True

if not SAMPLE_DATA:
    # Load real data - adjust path as needed
    try:
        transactions_df = pd.read_csv('../sa/eth_transactions.csv')

        print(f"Loaded {len(transactions_df)} real transactions")
    except FileNotFoundError:
        # Fall back to synthetic data: flipping the flag makes the branch
        # below run in this same execution of the cell.
        print("Transaction data file not found. Generating synthetic data instead.")
        SAMPLE_DATA = True
        
if SAMPLE_DATA:
    # Generate synthetic data for demonstration
    # NOTE: every value below is drawn from the global NumPy RNG seeded here,
    # so the generated data depends on the order of the np.random calls.
    np.random.seed(42)  # For reproducibility
    n_samples = 1000
    n_frauds = 50  # 5% fraudulent transactions
    
    # Generate random addresses
    def random_address():
        # '0x' followed by 40 random lowercase hex characters
        return '0x' + ''.join([np.random.choice(list('0123456789abcdef')) for _ in range(40)])
    
    # Pool of 100 addresses shared by all normal senders and recipients
    addresses = [random_address() for _ in range(100)]
    
    # Generate normal transactions
    # (n_samples - n_frauds rows; hashes are '0x' + 64 random hex characters)
    normal_txs = {
        'hash': ['0x' + ''.join([np.random.choice(list('0123456789abcdef')) for _ in range(64)]) for _ in range(n_samples - n_frauds)],
        'from': [np.random.choice(addresses) for _ in range(n_samples - n_frauds)],
        'to': [np.random.choice(addresses) for _ in range(n_samples - n_frauds)],
        'value': np.random.pareto(1, n_samples - n_frauds) * 1e17,  # ETH value (in wei)
        'gas': np.random.randint(21000, 100000, n_samples - n_frauds),
        'gasPrice': np.random.randint(1, 50, n_samples - n_frauds) * 1e9,  # Gas price in wei
        'timestamp': np.sort(np.random.randint(1600000000, 1630000000, n_samples - n_frauds)),  # Unix timestamps
        'blockNumber': np.random.randint(10000000, 15000000, n_samples - n_frauds)
    }
    
    # Generate fraudulent transactions with anomalous patterns
    # Pattern: Very high values, unusual gas prices, concentrated timing
    fraud_addresses = [random_address() for _ in range(5)]  # Small group of fraud addresses
    # The first two fraud addresses act only as senders, the remaining three
    # only as recipients; none of them appear in the normal address pool list.
    fraud_txs = {
        'hash': ['0x' + ''.join([np.random.choice(list('0123456789abcdef')) for _ in range(64)]) for _ in range(n_frauds)],
        'from': [np.random.choice(fraud_addresses[:2]) for _ in range(n_frauds)],  # Limited senders
        'to': [np.random.choice(fraud_addresses[2:]) for _ in range(n_frauds)],    # Limited recipients
        'value': np.random.pareto(0.7, n_frauds) * 1e18,  # Much higher values
        'gas': np.random.randint(250000, 800000, n_frauds),  # Higher gas values
        'gasPrice': np.random.randint(80, 200, n_frauds) * 1e9,  # Unusual gas prices
        'timestamp': np.sort(np.random.randint(1615000000, 1615001000, n_frauds)),  # Concentrated in time
        'blockNumber': sorted(np.random.randint(12000000, 12001000, n_frauds))
    }
    
    # Combine and create DataFrame
    # Normal rows come first, fraud rows last; the ground-truth labels below
    # rely on that ordering, so they must be attached before shuffling.
    transactions_df = pd.DataFrame({
        k: np.concatenate([normal_txs[k], fraud_txs[k]]) for k in normal_txs.keys()
    })
    
    # Add a 'ground_truth' column for evaluation (1=fraud, 0=normal)
    # Note: In real-world unsupervised learning, we wouldn't have this
    transactions_df['ground_truth'] = [0] * (n_samples - n_frauds) + [1] * n_frauds
    
    # Shuffle the DataFrame
    # NOTE(review): no random_state is passed, so the shuffle presumably draws
    # from the global NumPy RNG seeded above - confirm if exact reproducibility
    # of the row order matters.
    transactions_df = transactions_df.sample(frac=1).reset_index(drop=True)
    
    print(f"Generated {len(transactions_df)} synthetic transactions with {n_frauds} fraudulent examples")

# Display the first few rows of the dataset
transactions_df.head()

SyntaxError: unterminated string literal (detected at line 3) (3295031372.py, line 3)

## 3. Feature Extraction for Fraud Detection

Now we'll extract relevant features from the transaction data. For fraud detection, we need to consider various aspects like:
- Transaction values
- Gas usage patterns
- Temporal patterns
- Network structure features

In [None]:
def extract_features(df):
    """Extract features from transaction data for clustering.

    Args:
        df: Transaction DataFrame with the columns 'value' and 'gasPrice'
            (both in wei), 'gas', 'timestamp' (Unix seconds), 'from' and
            'to'. As a side effect a 'datetime' column, parsed from
            'timestamp', is added to this frame.

    Returns:
        A DataFrame with the same index as ``df`` and one column per feature:
        value_eth, gas_limit, gas_price_gwei, total_gas_cost,
        value_gas_ratio, hour_of_day, day_of_week, time_since_last_tx,
        sender_tx_count, recipient_tx_count, total_tx_count, tx_frequency.
        float64/int64 columns are capped at their own 99th percentile.
    """
    features = pd.DataFrame(index=df.index)

    # Transaction value features (vectorised wei -> ETH conversion)
    features['value_eth'] = df['value'].astype(float) / 1e18

    # Gas-related features
    features['gas_limit'] = df['gas'].astype(float)
    features['gas_price_gwei'] = df['gasPrice'].astype(float) / 1e9  # wei -> gwei
    features['total_gas_cost'] = features['gas_limit'] * features['gas_price_gwei'] / 1e9  # In ETH

    # Value/gas ratio (high values might indicate unusual transactions);
    # the small epsilon avoids division by zero.
    features['value_gas_ratio'] = features['value_eth'] / (features['total_gas_cost'] + 1e-10)

    # Time-based features. The parsed column is written onto the caller's
    # frame on purpose: code outside this function may read df['datetime'].
    df['datetime'] = pd.to_datetime(df['timestamp'], unit='s')
    features['hour_of_day'] = df['datetime'].dt.hour
    features['day_of_week'] = df['datetime'].dt.dayofweek

    # Seconds since the same sender's previous transaction (0 for a sender's
    # first transaction). Computed on a positionally indexed copy so that
    # duplicate labels in df.index cannot overwrite one another, which a
    # label -> value dict would silently allow.
    sender_times = df[['from', 'timestamp']].reset_index(drop=True)
    gaps = (
        sender_times.sort_values(['from', 'timestamp'])
        .groupby('from')['timestamp']
        .diff()
    )
    features['time_since_last_tx'] = (
        gaps.reindex(sender_times.index).fillna(0).to_numpy()
    )

    # Network features: how many transactions each address sent / received.
    # value_counts gives all totals in one pass instead of re-filtering the
    # whole frame once per address.
    sent_counts = df['from'].value_counts()
    received_counts = df['to'].value_counts()
    features['sender_tx_count'] = (
        df['from'].map(sent_counts).fillna(0).astype('int64')
    )
    features['recipient_tx_count'] = (
        df['to'].map(received_counts).fillna(0).astype('int64')
    )
    features['total_tx_count'] = features['sender_tx_count'] + features['recipient_tx_count']

    # Transaction frequency - to detect sudden bursts (+1 avoids div by zero)
    features['tx_frequency'] = features['sender_tx_count'] / (features['time_since_last_tx'] + 1)

    # Handle extreme values: cap float64/int64 columns at their 99th
    # percentile so a few outliers do not dominate later scaling.
    for col in features.columns:
        if features[col].dtype in [np.float64, np.int64]:
            upper_limit = features[col].quantile(0.99)
            features[col] = features[col].clip(upper=upper_limit)

    return features

# Build the feature table for the loaded transactions
transaction_features = extract_features(transactions_df)
print(f"Extracted {len(transaction_features.columns)} features from the transaction data")

# Summary statistics for every feature column
transaction_features.describe()

In [None]:
# Rescale each feature with MinMaxScaler so that no single feature
# dominates the distance computation used for clustering
scaler = MinMaxScaler()
scaled_features = scaler.fit(transaction_features).transform(transaction_features)

# Wrap the scaled array in a DataFrame (same column names) for inspection
scaled_df = pd.DataFrame(data=scaled_features, columns=transaction_features.columns)
scaled_df.describe()

## 4. Running OPTICS Clustering

Now we'll apply the OPTICS algorithm to our feature set. OPTICS has several important parameters:

- `min_samples`: Number of samples in a neighborhood for a point to be considered a core point
- `xi`: Determines the minimum steepness on the reachability plot that constitutes a cluster boundary
- `min_cluster_size`: Minimum number of samples in a cluster
- `max_eps`: Maximum distance between samples for them to be considered as in the same neighborhood (not set in the cell below, so scikit-learn's default of infinity is used)

In [None]:
# OPTICS hyperparameters
min_samples = 10  # neighborhood size required for a core point
xi = 0.05         # steepness threshold that marks a cluster boundary
# Clusters must hold at least 1% of the data, and never fewer than 5 points
min_cluster_size = max(5, int(0.01 * len(scaled_features)))

# Configure the model: euclidean distances, xi-based cluster extraction
optics_model = OPTICS(
    min_samples=min_samples,
    xi=xi,
    min_cluster_size=min_cluster_size,
    metric='euclidean',
    cluster_method='xi',
)

# Fit and label every transaction; -1 marks noise
cluster_labels = optics_model.fit_predict(scaled_features)
transactions_df['cluster'] = cluster_labels

# Headline numbers: real clusters (noise label excluded) and noise points
n_noise = int(np.count_nonzero(cluster_labels == -1))
n_clusters = len({lab for lab in cluster_labels if lab != -1})

print("OPTICS clustering results:")
print(f"  - Number of clusters: {n_clusters}")
print(f"  - Number of noise points (potential anomalies): {n_noise} ({n_noise/len(cluster_labels)*100:.2f}%)")

# Per-label sizes, listed in ascending label order (noise first when present)
cluster_counts = pd.Series(cluster_labels).value_counts().sort_index()
print("\nCluster sizes:")
for cluster, count in cluster_counts.items():
    if cluster == -1:
        label = "Noise (anomalies)"
    else:
        label = f"Cluster {cluster}"
    print(f"  - {label}: {count} transactions ({count/len(cluster_labels)*100:.2f}%)")

## 5. Visualizing the Clustering Results

Let's create visualizations to help understand the clustering results and potential fraud patterns.

In [None]:
# Visualization 1: OPTICS Reachability Plot
def plot_optics_reachability(optics):
    """Draw the reachability plot of a fitted OPTICS model as a bar chart.

    Bars follow the model's cluster ordering and are coloured by cluster
    label; points labelled -1 (noise) are drawn in black. Non-finite
    reachability values are shown at 1.1x the largest finite value.

    Args:
        optics: Fitted model exposing ``labels_``, ``reachability_`` and
            ``ordering_``.
    """
    order = optics.ordering_
    ordered_labels = optics.labels_[order]
    positions = np.arange(len(optics.labels_))

    # Bar heights: reachability in cluster order, with non-finite entries
    # swapped for a value just above the largest finite one
    heights = optics.reachability_[order].copy()
    finite_mask = np.isfinite(heights)
    if finite_mask.any():
        heights[~finite_mask] = heights[finite_mask].max() * 1.1

    # One colour per distinct label; noise is forced to black
    distinct_labels = sorted(set(ordered_labels))
    palette = plt.cm.nipy_spectral(np.linspace(0, 1, len(distinct_labels)))
    color_map = dict(zip(distinct_labels, palette))
    bar_colors = [
        'black' if lab == -1 else color_map[lab]
        for lab in ordered_labels
    ]

    plt.figure(figsize=(12, 6))
    plt.bar(positions, heights, color=bar_colors, width=1.0)
    plt.ylabel('Reachability Distance')
    plt.xlabel('Points (ordered by cluster)')
    plt.title('OPTICS Reachability Plot')

    # Legend: the noise entry first, then clusters in ascending label order
    legend_elements = [
        plt.Line2D([0], [0], color='black', lw=4, label='Noise/Anomalies')
    ]
    legend_elements.extend(
        plt.Line2D([0], [0], color=color_map[lab], lw=4, label=f'Cluster {lab}')
        for lab in distinct_labels
        if lab != -1
    )

    plt.legend(handles=legend_elements)
    plt.tight_layout()
    plt.show()

# Visualization 2: 2D Projection of Clusters (using t-SNE)
def plot_clusters_2d(features, labels, method='tsne'):
    # Dimension reduction
    if method.lower() == 'tsne':
        reducer = TSNE(n_components=2, random_state=42)
        title_prefix = 't-SNE'
    else:  # default to PCA
        reducer = PCA(n_components=2)
        title_prefix = 'PCA'
    
    # Transform data to 2D
    features_2d = reducer.fit_transform(features)
    
    # Create scatter plot
    plt.figure(figsize=(12, 10))
    
    # Plot regular clusters
    unique_labels = sorted(set(labels))
    colors = plt.cm.nipy_spectral(np.linspace(0, 1, len(unique_labels)))
    
    for i, label in enumerate(unique_labels):
        if label == -1:
            # Plot noise points with black X markers
            mask = labels == -1
            plt.scatter(features_2d[mask, 0], features_2d[mask, 1], 
                        marker='x', s=60, color='black', alpha=0.8, label='Noise/Anomalies')
