# Ethereum Fraud Detection - Data Preprocessing

Author: Victor Oketch Sabare  
Date: January 2025

This notebook focuses on cleaning and preprocessing the Ethereum transaction data to prepare it for feature engineering and model development.

## 1. Setup and Import Libraries

In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Data preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Custom utilities
import sys
sys.path.append('../')
from src.utils.helpers import load_config
from src.data.preprocessing import clean_addresses, normalize_values

%matplotlib inline

## 2. Load Raw Data

In [None]:
# Read the raw Ethereum transaction export into a DataFrame
raw_data = pd.read_csv(filepath_or_buffer='../data/raw/ethereum_transactions.csv')
print(f"Loaded {len(raw_data)} transactions")
# Leave a five-row preview as the cell's displayed output
raw_data.head(5)

## 3. Data Cleaning

In [None]:
def clean_data(df):
    """Return a cleaned copy of the transaction frame; the input is not modified.

    Steps, in order:
      1. drop exact duplicate rows;
      2. parse ``timestamp`` as seconds since the Unix epoch;
      3. add ``value_eth`` (``value`` / 1e18, i.e. Wei -> ETH);
      4. add ``gas_price_gwei`` (``gas_price`` / 1e9);
      5. pass the result through ``handle_missing_values``.
    """
    # drop_duplicates() already hands back a new frame, so `df` stays untouched
    deduplicated = df.drop_duplicates()

    # Column expressions are evaluated against the de-duplicated frame
    with_derived_columns = deduplicated.assign(
        timestamp=lambda frame: pd.to_datetime(frame['timestamp'], unit='s'),
        value_eth=lambda frame: frame['value'] / 1e18,
        gas_price_gwei=lambda frame: frame['gas_price'] / 1e9,
    )

    return with_derived_columns.pipe(handle_missing_values)

def handle_missing_values(df):
    """Fill gaps in numeric columns with each column's median.

    The frame is updated in place and also returned, so callers may use
    either the return value or the object they passed in.

    Behaviour at the edges:
      * a frame without numeric columns is returned unchanged;
      * a numeric column that is entirely missing has no median and is left
        as-is rather than raising;
      * non-numeric columns (strings, datetimes, ...) are never touched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns should be imputed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with numeric gaps filled.
    """
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    if len(numeric_columns) == 0:
        return df

    # median() skips NaN; an all-NaN column yields NaN, which dropna() removes
    # so that column is simply not filled.
    medians = df[numeric_columns].median().dropna()
    if medians.empty:
        return df

    # fillna with a Series fills each column using the value at its own label
    df[numeric_columns] = df[numeric_columns].fillna(medians)

    return df

# Run the cleaning pipeline on the raw frame
cleaned_data = raw_data.pipe(clean_data)

# Report how many rows went in, how many came out, and the difference
print(
    "Cleaning Summary:",
    f"Original rows: {len(raw_data)}",
    f"Cleaned rows: {len(cleaned_data)}",
    f"Removed rows: {len(raw_data) - len(cleaned_data)}",
    sep="\n",
)

## 4. Data Validation

In [None]:
def validate_data(df):
    """Check a cleaned frame against basic sanity rules.

    Returns a pandas Series indexed by check name, each entry True when the
    check passes:
      * ``no_missing_values`` - no null cell anywhere in the frame;
      * ``valid_timestamps``  - earliest ``timestamp`` is after 2015-07-30;
      * ``valid_values``      - every ``value_eth`` is >= 0;
      * ``valid_gas``         - every ``gas_price_gwei`` is > 0.
    """
    ethereum_launch = pd.Timestamp('2015-07-30')  # Ethereum launch date

    missing_cells = df.isna().sum().sum()
    earliest_timestamp = df['timestamp'].min()

    checks = [
        ('no_missing_values', missing_cells == 0),
        ('valid_timestamps', earliest_timestamp > ethereum_launch),
        ('valid_values', df['value_eth'].ge(0).all()),
        ('valid_gas', df['gas_price_gwei'].gt(0).all()),
    ]

    return pd.Series(dict(checks))

# Evaluate every check on the cleaned frame and print the pass/fail table
validation_results = cleaned_data.pipe(validate_data)
print("Validation Results:", validation_results, sep="\n")

## 5. Feature Scaling

In [None]:
def scale_features(df):
    """Add standardised and min-max-normalised versions of the numeric features.

    For each of ``value_eth``, ``gas_price_gwei`` and ``gas_used`` two columns
    are appended to a copy of ``df``:

      * ``<feature>_scaled``     - output of ``StandardScaler``;
      * ``<feature>_normalized`` - output of ``MinMaxScaler``.

    The original columns are kept and the input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three feature columns listed above.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with six additional columns.

    Raises
    ------
    KeyError
        If any of the feature columns is missing from ``df``.
    """
    # Select numeric columns for scaling
    numeric_features = ['value_eth', 'gas_price_gwei', 'gas_used']
    feature_values = df[numeric_features]

    # NOTE(review): both scalers are fit on every row of `df`. If this runs
    # before the train/validation/test split, statistics from the held-out
    # rows leak into the scaled training features - consider fitting on the
    # training partition only.
    standardized = StandardScaler().fit_transform(feature_values)
    normalized = MinMaxScaler().fit_transform(feature_values)

    df_scaled = df.copy()
    # fit_transform returns one array column per input feature, in the same
    # order as `numeric_features`; write each one to its own named column.
    for position, feature in enumerate(numeric_features):
        df_scaled[f'{feature}_scaled'] = standardized[:, position]
        df_scaled[f'{feature}_normalized'] = normalized[:, position]

    return df_scaled

# Append the scaled feature columns to the cleaned frame
scaled_data = cleaned_data.pipe(scale_features)

## 6. Address Normalization

In [None]:
def normalize_addresses(df):
    """Lower-case the address columns and keep only rows with valid addresses.

    An address is considered valid when it is a string of exactly 42
    characters: the prefix ``0x`` followed by 40 hexadecimal digits. Rows
    where either ``from_address`` or ``to_address`` fails that test -
    including rows where the address is missing - are dropped. The number of
    invalid entries per column is printed.

    The input frame is not modified; a filtered copy with the original index
    labels is returned.

    NOTE(review): a missing ``to_address`` is treated as invalid and the row
    is removed. If the dataset contains contract-creation transactions
    (which presumably have no recipient), confirm that dropping them is
    intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``from_address`` and ``to_address`` columns.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` whose two addresses are both valid, addresses
        lower-cased.
    """
    hex_digits = frozenset('0123456789abcdef')

    def to_lowercase(addr):
        # Leave non-string entries (NaN/None) alone instead of raising
        return addr.lower() if isinstance(addr, str) else addr

    # Validate address format
    def is_valid_address(addr):
        return (
            isinstance(addr, str)
            and len(addr) == 42
            and addr.startswith('0x')
            and hex_digits.issuperset(addr[2:])
        )

    # Work on a copy so the caller's frame keeps its original casing
    normalized = df.copy()

    # Convert addresses to lowercase
    normalized['from_address'] = normalized['from_address'].map(to_lowercase)
    normalized['to_address'] = normalized['to_address'].map(to_lowercase)

    # Filter valid addresses; astype(bool) keeps the mask boolean even when
    # the frame is empty
    valid_from = normalized['from_address'].map(is_valid_address).astype(bool)
    valid_to = normalized['to_address'].map(is_valid_address).astype(bool)

    print(f"Invalid 'from' addresses: {(~valid_from).sum()}")
    print(f"Invalid 'to' addresses: {(~valid_to).sum()}")

    return normalized[valid_from & valid_to]

# Lower-case the addresses and drop rows whose addresses fail validation
normalized_data = scaled_data.pipe(normalize_addresses)

## 7. Data Partitioning

In [None]:
def partition_data(df):
    """Split the frame chronologically into training/validation/testing.

    Rows are ordered by ``timestamp``; the earliest 70% go to ``training``,
    the next 15% to ``validation`` and the remainder to ``testing``. The
    result is a dict mapping those three names to DataFrame slices.
    """
    chronological = df.sort_values('timestamp')
    row_count = len(chronological)

    # Cut points are truncated towards zero, so any rounding remainder ends
    # up in the last partition.
    training_end = int(row_count * 0.7)
    validation_end = int(row_count * 0.85)

    windows = {
        'training': slice(None, training_end),
        'validation': slice(training_end, validation_end),
        'testing': slice(validation_end, None),
    }

    return {label: chronological.iloc[window] for label, window in windows.items()}

# Split the normalised frame into chronological partitions
data_partitions = normalized_data.pipe(partition_data)

# Report how many records landed in each partition
for name in data_partitions:
    partition = data_partitions[name]
    print(f"{name}: {len(partition)} records")

## 8. Save Processed Data

In [None]:
# Save processed datasets
import json
import os

# The processed-data directory may not exist yet (e.g. on a fresh checkout);
# create it first so the writes below do not fail with an OSError.
os.makedirs('../data/processed', exist_ok=True)

# One CSV per partition, named after the partition key
for name, partition in data_partitions.items():
    partition.to_csv(f'../data/processed/{name}_data.csv', index=False)
    print(f"Saved {name} dataset")

# Save preprocessing metadata: row counts before/after, the final column
# list, and the size of every partition
preprocessing_metadata = {
    'original_rows': len(raw_data),
    'processed_rows': len(normalized_data),
    'features': list(normalized_data.columns),
    'partition_sizes': {name: len(partition) for name, partition in data_partitions.items()}
}

with open('../data/processed/preprocessing_metadata.json', 'w') as f:
    json.dump(preprocessing_metadata, f, indent=2)

## 9. Preprocessing Summary

In [None]:
# Print row counts at each stage, followed by descriptive statistics of the
# final frame
print(
    "Preprocessing Summary:",
    f"Original records: {len(raw_data)}",
    f"After cleaning: {len(cleaned_data)}",
    f"After normalization: {len(normalized_data)}",
    "\nFeature Statistics:",
    normalized_data.describe(),
    sep="\n",
)