In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
import warnings
# Silences every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# pandas / scikit-learn in the cells below - consider narrowing the filter.
warnings.filterwarnings('ignore')

### Load and Preprocess Data (Simplified from EDA)

In [None]:

def load_and_preprocess_data(filepath):
    """Load the transaction CSV and apply basic cleaning and type conversion.

    Only the columns named in ``relevant_columns`` that are actually present in
    the file are kept, so every cleaning step is guarded by a column check.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The cleaned ``pandas.DataFrame``. When ``filepath`` does not exist an
        error line is printed and an empty frame is returned instead.
    """
    try:
        df = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"Error: {filepath} not found.")
        return pd.DataFrame()

    relevant_columns = ['from', 'to', 'value', 'gas', 'gasPrice', 'gasUsed',
                        'timeStamp', 'isError', 'txreceipt_status', 'functionName',
                        'wallet_address', 'protocol_version', 'methodId', 'blockNumber']
    existing_columns = [col for col in relevant_columns if col in df.columns]
    df_clean = df[existing_columns].copy()

    if 'timeStamp' in df_clean.columns:
        # Values are interpreted as seconds since the Unix epoch.
        df_clean['timeStamp'] = pd.to_datetime(df_clean['timeStamp'], unit='s')

    # Fill categorical NaNs
    for col in ['functionName', 'protocol_version']:
        if col in df_clean.columns:
            df_clean[col] = df_clean[col].fillna('unknown')
    if 'methodId' in df_clean.columns:
        df_clean['methodId'] = df_clean['methodId'].fillna('0x00000000')

    # Numeric columns: anything that cannot be parsed becomes NaN.
    for col in ['value', 'gas', 'gasPrice', 'gasUsed', 'blockNumber']:
        if col in df_clean.columns:
            df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

    # Rows without a usable value are dropped. The guard mirrors the checks
    # above: dropna(subset=['value']) raises KeyError when the column is absent.
    if 'value' in df_clean.columns:
        df_clean = df_clean.dropna(subset=['value'])
    return df_clean

# Load the transactions once; the cells below check `df_processed.empty`
# before doing any work.
print("Loading and preprocessing data...")
df_processed = load_and_preprocess_data('data/compound_v2_v3_transactions.csv')
load_status = (
    "Stopping execution due to missing data file."
    if df_processed.empty
    else f"Data loaded. Shape: {df_processed.shape}"
)
print(load_status)

Loading and preprocessing data...
Data loaded. Shape: (277, 14)


### Advanced Feature Engineering

In [None]:

def create_advanced_wallet_features(df):
    """Aggregate transaction rows into one feature row per wallet.

    Wallets are identified by ``wallet_address`` when that column exists and
    holds at least one non-null value, otherwise by ``from``.

    Args:
        df: Transaction-level frame. ``value`` is required, and ``timeStamp``
            (datetime) is required for any wallet with more than one
            transaction. ``from``, ``to``, ``gasUsed``, ``gasPrice``,
            ``isError`` and ``functionName`` are optional; features that need
            a missing column are set to 0.

    Returns:
        DataFrame with a ``wallet_id`` column followed by the volume, gas,
        temporal, counterparty and function features, one row per wallet.
        An empty frame is returned when ``df`` is empty.

    Raises:
        KeyError: if neither ``wallet_address`` nor ``from`` is available, or
            if a required column is missing.
    """
    if df.empty:
        return pd.DataFrame()

    wallet_features_list = []
    wallet_col = 'wallet_address' if 'wallet_address' in df.columns and df['wallet_address'].notna().any() else 'from'
    all_wallets = df[wallet_col].dropna().unique()

    # Column availability does not change per wallet, so check it once.
    has_from = 'from' in df.columns
    has_to = 'to' in df.columns
    has_gas = 'gasUsed' in df.columns and 'gasPrice' in df.columns
    has_error_flag = 'isError' in df.columns
    has_function = 'functionName' in df.columns
    # Zero-row slice that keeps the column layout, so lookups such as
    # ['value'] still work when a direction column is unavailable.
    no_rows = df.iloc[0:0]

    print(f"Processing features for {len(all_wallets)} wallets using '{wallet_col}' as identifier...")
    for i, wallet in enumerate(all_wallets):
        if i > 0 and i % 1000 == 0:
            print(f"Processed {i} wallets...")

        wallet_txns = df[df[wallet_col] == wallet]
        sent_txns = wallet_txns[wallet_txns['from'] == wallet] if has_from else no_rows
        # Received transactions are looked up across the whole frame.
        received_txns = df[df['to'] == wallet] if has_to else no_rows

        features = {'wallet_id': wallet}
        # BASIC & VALUE FEATURES
        features['total_transactions'] = len(wallet_txns)
        features['sent_transactions'] = len(sent_txns)
        features['received_transactions'] = len(received_txns)
        features['send_receive_ratio'] = len(sent_txns) / max(len(received_txns), 1)
        features['total_value_sent'] = sent_txns['value'].sum()
        features['total_value_received'] = received_txns['value'].sum()
        features['avg_transaction_value'] = wallet_txns['value'].mean()
        features['max_transaction_value'] = wallet_txns['value'].max()
        features['value_std'] = wallet_txns['value'].std()
        features['zero_value_ratio'] = (wallet_txns['value'] == 0).mean()

        # GAS FEATURES
        if has_gas:
            features['avg_gas_used'] = wallet_txns['gasUsed'].mean()
            features['total_gas_cost'] = (wallet_txns['gasUsed'] * wallet_txns['gasPrice']).sum()
        else:
            features['avg_gas_used'] = features['total_gas_cost'] = 0
        # The error rate only needs isError, so it is independent of the gas columns.
        features['error_rate'] = wallet_txns['isError'].mean() if has_error_flag else 0

        # TEMPORAL FEATURES
        if len(wallet_txns) > 1:
            time_sorted = wallet_txns.sort_values('timeStamp')
            time_diffs = time_sorted['timeStamp'].diff().dt.total_seconds().dropna()
            features['avg_time_between_txns_hr'] = time_diffs.mean() / 3600
            activity_span_days = (time_sorted['timeStamp'].max() - time_sorted['timeStamp'].min()).days
            # A span shorter than one day counts as one day to avoid dividing by zero.
            features['activity_span_days'] = max(activity_span_days, 1)
            features['transaction_frequency'] = len(wallet_txns) / features['activity_span_days']
        else:
            features['avg_time_between_txns_hr'] = 0
            features['activity_span_days'] = 1
            features['transaction_frequency'] = 1

        # COUNTERPARTY & FUNCTION ANALYSIS
        features['unique_recipients'] = sent_txns['to'].nunique() if has_to and len(sent_txns) > 0 else 0
        features['unique_senders'] = received_txns['from'].nunique() if has_from and len(received_txns) > 0 else 0
        features['recipient_concentration'] = len(sent_txns) / max(features['unique_recipients'], 1)
        if has_function:
            function_shares = wallet_txns['functionName'].value_counts(normalize=True)
            features['unique_functions'] = wallet_txns['functionName'].nunique()
            # value_counts ignores nulls, so it is empty when every name is
            # missing; that case is scored as zero complexity.
            dominant_func_ratio = function_shares.iloc[0] if len(function_shares) > 0 else 1.0
            features['contract_complexity'] = 1 - dominant_func_ratio
        else:
            features['unique_functions'] = 0
            features['contract_complexity'] = 0

        wallet_features_list.append(features)

    return pd.DataFrame(wallet_features_list)

if not df_processed.empty:
    # Status line that the recorded output of this cell starts with.
    print("Creating advanced wallet-level features...")
    # Undefined statistics (e.g. the std of a single transaction) become 0.
    wallet_df = create_advanced_wallet_features(df_processed).fillna(0)

    initial_count = len(wallet_df)
    single_txn_wallets = (wallet_df['total_transactions'] == 1).sum()

    print(f"Total wallets: {initial_count}")
    print(f"Single-transaction wallets: {single_txn_wallets}")
    print(f"Multi-transaction wallets: {initial_count - single_txn_wallets}")
    print("Keeping all wallets for analysis")

    print("\nKey feature statistics:")
    key_features = ['total_transactions', 'avg_transaction_value', 'error_rate',
                    'transaction_frequency', 'unique_recipients', 'contract_complexity']
    # Only summarise the columns that were actually produced.
    available_key_features = [f for f in key_features if f in wallet_df.columns]
    print(wallet_df[available_key_features].describe())


Creating advanced wallet-level features...
Processing features for 80 wallets using 'wallet_address' as identifier...
Total wallets: 80
Single-transaction wallets: 55
Multi-transaction wallets: 25
Keeping all wallets for analysis

Key feature statistics:
       total_transactions  avg_transaction_value  error_rate  \
count            80.00000                   80.0   80.000000   
mean              3.46250                    0.0    0.005696   
std               7.24428                    0.0    0.034331   
min               1.00000                    0.0    0.000000   
25%               1.00000                    0.0    0.000000   
50%               1.00000                    0.0    0.000000   
75%               2.00000                    0.0    0.000000   
max              42.00000                    0.0    0.285714   

       transaction_frequency  unique_recipients  contract_complexity  
count              80.000000          80.000000            80.000000  
mean                1.0051

### Advanced Risk Scoring

In [17]:

def calculate_advanced_risk_score(df):
    """Calculate a weighted base risk score per wallet.

    Each referenced feature is min-max scaled over the wallets in ``df``. The
    scaled features are averaged inside five risk components, each component
    is multiplied by its weight, and the weighted components are summed.
    Escalation multipliers are then applied and the result is expressed on a
    scale capped at 1000.

    Only the columns named by a component or an escalation rule are scaled.
    Other columns (``wallet_id``, labels or scores added by later cells) are
    ignored, so the function can be re-run on an already-scored frame.

    Args:
        df: Wallet-level feature frame, one row per wallet.

    Returns:
        ``pandas.Series`` of scores indexed like ``df``. All zeros when ``df``
        has no rows or none of the referenced columns.
    """
    risk_components = {
        'volume_risk': {'features': ['total_transactions', 'total_value_sent', 'max_transaction_value'], 'weight': 0.20},
        'behavioral_risk': {'features': ['send_receive_ratio', 'recipient_concentration', 'transaction_frequency'], 'weight': 0.25},
        'technical_risk': {'features': ['avg_gas_used', 'total_gas_cost', 'error_rate'], 'weight': 0.20},
        'temporal_risk': {'features': ['avg_time_between_txns_hr'], 'weight': 0.15},
        'diversity_risk': {'features': ['unique_recipients', 'unique_senders', 'unique_functions', 'contract_complexity'], 'weight': 0.20}
    }
    # (feature, threshold on the min-max scaled value, score multiplier)
    # NOTE(review): thresholds are compared against scaled values, i.e. they
    # are relative to the observed range, not raw ratios - confirm intent.
    escalation_rules = [
        ('error_rate', 0.1, 1.3),
        ('zero_value_ratio', 0.5, 1.2),
        ('transaction_frequency', 0.95, 1.4),
    ]

    referenced = {name for config in risk_components.values() for name in config['features']}
    referenced.update(name for name, _, _ in escalation_rules)
    feature_cols = [col for col in df.columns if col in referenced]
    if df.empty or not feature_cols:
        return pd.Series(0.0, index=df.index)

    # Infinite values are replaced with 0 before scaling.
    features = df[feature_cols].replace([np.inf, -np.inf], 0)
    scaler = MinMaxScaler()
    features_scaled = pd.DataFrame(scaler.fit_transform(features), columns=feature_cols, index=features.index)

    risk_scores = pd.DataFrame(index=df.index)
    for component, config in risk_components.items():
        available_features = [f for f in config['features'] if f in features_scaled.columns]
        if available_features:
            risk_scores[component] = features_scaled[available_features].mean(axis=1) * config['weight']
        else:
            # A component without any of its features contributes nothing.
            risk_scores[component] = 0

    base_score = risk_scores.sum(axis=1)

    for name, threshold, multiplier in escalation_rules:
        if name in features_scaled:
            base_score.loc[features_scaled[name] > threshold] *= multiplier

    return np.minimum(base_score * 1000, 1000)

# Score the wallets only when transactions were actually loaded.
has_transactions = not df_processed.empty
if has_transactions:
    print("Calculating advanced risk scores...")
    base_scores = calculate_advanced_risk_score(wallet_df)
    wallet_df['base_risk_score'] = base_scores
    print("Base risk score calculation complete.")

Calculating advanced risk scores...
Base risk score calculation complete.


### Anomaly Detection and Clustering for Score Refinement

In [None]:

if not df_processed.empty:
    print("Applying anomaly detection and clustering...")
    # Identifier plus every column written by the scoring cells. Excluding all
    # of them keeps this cell idempotent: on a re-run the previous labels and
    # scores (including the string risk_category) are not fed to the models.
    derived_cols = {'wallet_id', 'base_risk_score', 'is_anomaly', 'cluster',
                    'cluster_risk_adjustment', 'final_risk_score', 'risk_category'}
    feature_cols_anomaly = [col for col in wallet_df.columns if col not in derived_cols]
    X = wallet_df[feature_cols_anomaly].fillna(0).replace([np.inf, -np.inf], 0)
    X_scaled = StandardScaler().fit_transform(X)

    # Isolation Forest for anomaly detection (fit_predict returns -1 for outliers)
    iso_forest = IsolationForest(contamination=0.05, random_state=42, n_estimators=100)
    wallet_df['is_anomaly'] = (iso_forest.fit_predict(X_scaled) == -1)

    # K-means clustering for behavioral grouping; KMeans cannot fit more
    # clusters than there are wallets.
    n_clusters = min(5, len(wallet_df))
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    wallet_df['cluster'] = kmeans.fit_predict(X_scaled)

    # Adjustment = mean base score of the wallet's cluster relative to the
    # overall mean. With a zero overall mean the ratio is undefined (0/0), so
    # the adjustment is left neutral.
    overall_mean_score = wallet_df['base_risk_score'].mean()
    if overall_mean_score > 0:
        cluster_risk_adj = wallet_df.groupby('cluster')['base_risk_score'].mean() / overall_mean_score
        wallet_df['cluster_risk_adjustment'] = wallet_df['cluster'].map(cluster_risk_adj)
    else:
        wallet_df['cluster_risk_adjustment'] = 1.0
    print(f"Identified {wallet_df['is_anomaly'].sum()} anomalous wallets.")
    print(f"Clustered wallets into {n_clusters} groups.")

Applying anomaly detection and clustering...
Identified 4 anomalous wallets.
Clustered wallets into 5 groups.


### Final Risk Score Calculation and Output

In [None]:

def calculate_final_risk_score(df):
    """Combine the base score with the anomaly and cluster adjustments.

    Wallets flagged in ``is_anomaly`` get their base score multiplied by 1.5.
    Every score is then multiplied by ``cluster_risk_adjustment``, limited to
    the range 0-1000 and rounded to a whole number.

    Args:
        df: Frame with ``base_risk_score``, ``is_anomaly`` and
            ``cluster_risk_adjustment`` columns.

    Returns:
        Integer ``pandas.Series`` of final scores, indexed like ``df``.
    """
    # Work on a float copy: the caller's column is left untouched and an
    # integer-typed score column can still take the fractional boost.
    final_scores = df['base_risk_score'].astype(float)
    final_scores.loc[df['is_anomaly'].astype(bool)] *= 1.5  # Anomaly boost

    # A missing adjustment would turn the score into NaN and make the integer
    # conversion below fail, so it is treated as neutral.
    adjustment = df['cluster_risk_adjustment'].fillna(1.0)
    final_scores = final_scores * adjustment
    return np.minimum(np.maximum(final_scores, 0), 1000).round(0).astype(int)

if not df_processed.empty:
    wallet_df['final_risk_score'] = calculate_final_risk_score(wallet_df)

    def categorize_risk(score):
        """Return the label of the first risk band whose upper bound exceeds score."""
        bands = ((200, 'Very Low'), (400, 'Low'), (600, 'Medium'), (800, 'High'))
        for upper_bound, label in bands:
            if score < upper_bound:
                return label
        return 'Very High'

    risk_labels = wallet_df['final_risk_score'].apply(categorize_risk)
    wallet_df['risk_category'] = risk_labels

    # Two-column export (wallet id + score), highest score first.
    output_df = pd.DataFrame({
        'wallet_id': wallet_df['wallet_id'],
        'score': wallet_df['final_risk_score'],
    }).sort_values('score', ascending=False)
    output_df.to_csv('wallet_risk_scores.csv', index=False)

    analyzed_count = len(output_df)
    mean_score = output_df['score'].mean()
    print("\n=== FINAL RESULTS ===")
    print(f"Total wallets analyzed: {analyzed_count}")
    print(f"Average risk score: {mean_score:.0f}")
    print("\nTop 5 highest risk wallets:")
    print(output_df.head(5))
    print("\nRisk category distribution:")
    print(wallet_df['risk_category'].value_counts())
    print("\n✓ wallet_risk_scores.csv saved successfully.")



=== FINAL RESULTS ===
Total wallets analyzed: 80
Average risk score: 128

Top 5 highest risk wallets:
                                     wallet_id  score
0   0x0039f22efb07a647557c7c5d17854cfd6d489ef3   1000
39  0x70d8e4ab175dfe0eab4e9a7f33e0a2d19f44001e   1000
23  0x4814be124d7fe3b240eb46061f7ddfab468fe122   1000
22  0x427f2ac5fdf4245e027d767e7c3ac272a1f40a65   1000
57  0xa7f3c74f0255796fd5d3ddcf88db769f7a6bf46a    793

Risk category distribution:
risk_category
Very Low     71
Very High     4
High          4
Medium        1
Name: count, dtype: int64

✓ wallet_risk_scores.csv saved successfully.
