# Ethereum Fraud Detection - Data Exploration

Author: Victor Oketch Sabare  
Date: January 2025

This notebook explores the Ethereum transaction data to understand patterns and characteristics that might indicate fraudulent activity.

## 1. Setup and Import Libraries

In [1]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Ethereum interaction
from web3 import Web3
from etherscan import Etherscan

# Custom utilities
import sys
sys.path.append('../')
from src.utils.helpers import load_config

# Set plotting style
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*' and
# later releases dropped the bare 'seaborn' name, so pick whichever one this
# installation actually ships instead of hard-coding the legacy name.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)
%matplotlib inline

ModuleNotFoundError: No module named 'web3'

## 2. Data Loading

In [None]:
# Load transaction data
def load_transaction_data(filepath):
    """Read a CSV of Ethereum transactions into a DataFrame.

    Args:
        filepath: Location of the CSV file, forwarded to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when reading fails for any reason
        (the error is printed rather than raised).
    """
    try:
        df = pd.read_csv(filepath)
        print(f"Loaded {len(df)} transactions")
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error loading data: {e}")
        df = None
    return df

# Load the data
RAW_TRANSACTIONS_PATH = '../data/raw/Eth_Txs.csv'
transactions_df = load_transaction_data(RAW_TRANSACTIONS_PATH)

# load_transaction_data signals failure by printing the error and returning
# None. Stop here in that case so the problem surfaces at the load step instead
# of as an AttributeError on None in a later cell.
if transactions_df is None:
    raise RuntimeError(f"Transaction data could not be loaded from {RAW_TRANSACTIONS_PATH!r}")

## 3. Initial Data Overview

In [None]:
# Basic information about the dataset
print("Dataset Information:")
transactions_df.info()

# Only the last expression of a cell is rendered automatically, so the sample
# has to be passed to display() (available without import inside IPython
# kernels) or its output is silently discarded.
print("\nSample of the data:")
display(transactions_df.head())

print("\nBasic statistics:")
transactions_df.describe()

## 4. Transaction Value Analysis

In [None]:
# Distribution of transaction values
# Draw on an explicit Axes rather than relying on pyplot's implicit current figure.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=transactions_df, x='value', bins=50, ax=ax)
ax.set_title('Distribution of Transaction Values')
ax.set_xlabel('Value (ETH)')
ax.set_ylabel('Count')
plt.show()

## 5. Temporal Analysis

In [None]:
# Convert timestamp to datetime
# The 'timestamp' column is interpreted as seconds since the Unix epoch.
epoch_seconds = transactions_df['timestamp']
transactions_df['datetime'] = pd.to_datetime(epoch_seconds, unit='s')

# Transaction frequency over time
# One bucket per calendar day; size() counts the rows falling into each bucket.
daily_transactions = transactions_df.resample('D', on='datetime').size()

fig, ax = plt.subplots(figsize=(15, 6))
daily_transactions.plot(ax=ax)
ax.set_title('Daily Transaction Volume')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Transactions')
plt.show()

## 6. Account Analysis

In [None]:
# Analyze account activity
# The two value_counts() Series are aligned on the union of their indexes, so an
# address that appears only as sender (or only as receiver) gets NaN in the
# other column. Treat those as zero and keep the counts as integers.
account_stats = pd.DataFrame({
    'sent_count': transactions_df['from_address'].value_counts(),
    'received_count': transactions_df['to_address'].value_counts()
}).fillna(0).astype(int)

# The aligned frame is not ordered by activity, so rank the addresses by their
# combined sent + received count before taking the top rows.
activity_order = account_stats.sum(axis=1).sort_values(ascending=False).index
account_stats = account_stats.loc[activity_order]

print("Top 10 most active accounts:")
account_stats.head(10)

## 7. Gas Price Analysis

In [None]:
# Gas price distribution
# Single box plot of the 'gas_price' column drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=transactions_df, y='gas_price', ax=ax)
ax.set_title('Gas Price Distribution')
ax.set_ylabel('Gas Price (Wei)')
plt.show()

## 8. Network Analysis

In [None]:
# Basic network metrics
# Stack the sender and receiver columns so an address is counted once no matter
# which side of a transaction it appears on.
address_columns = [transactions_df[col] for col in ('from_address', 'to_address')]
unique_addresses = pd.concat(address_columns).nunique()

print(f"Number of unique addresses: {unique_addresses}")
print(f"Number of transactions: {len(transactions_df)}")
print(f"Average transactions per address: {len(transactions_df)/unique_addresses:.2f}")

## 9. Identifying Potential Fraud Patterns

In [None]:
# Example: Look for suspicious patterns
def identify_suspicious_patterns(df, frequency_threshold=100, quantile=0.99):
    """Flag sender addresses that stand out on simple heuristics.

    Args:
        df: Transactions with 'from_address', 'value' and 'gas_price' columns.
        frequency_threshold: A sender is 'high_frequency' when it appears in
            strictly more than this many rows.
        quantile: Quantile (between 0 and 1) of 'value' and of 'gas_price'
            used as the cut-off; rows strictly above it are flagged.

    Returns:
        dict with three entries, each a collection of sender addresses:
        'high_frequency' (a pandas Index), 'high_value' and 'unusual_gas'
        (arrays of unique senders of the flagged rows).
    """
    # Count rows per sender once and reuse the result for the filter.
    sent_counts = df['from_address'].value_counts()
    value_cutoff = df['value'].quantile(quantile)
    gas_cutoff = df['gas_price'].quantile(quantile)

    suspicious = {
        'high_frequency': sent_counts[sent_counts > frequency_threshold].index,
        'high_value': df.loc[df['value'] > value_cutoff, 'from_address'].unique(),
        'unusual_gas': df.loc[df['gas_price'] > gas_cutoff, 'from_address'].unique()
    }
    return suspicious

# Run the heuristics on the full dataset and report how many sender addresses
# each one flagged.
suspicious_patterns = identify_suspicious_patterns(transactions_df)
for pattern in suspicious_patterns:
    addresses = suspicious_patterns[pattern]
    print(f"\n{pattern}: Found {len(addresses)} suspicious addresses")