# ML-Driven On-Chain Metrics Analysis

## Package Requirements
This notebook requires the following packages. Run the installation cell below if needed.

In [None]:
# Package installation (run once)
# Uncomment and run the lines below if packages are missing.
# `%pip` is used instead of `!pip` so packages are installed into the
# environment of the running kernel.

# Core ML packages
# %pip install scikit-learn xgboost lightgbm

# Technical analysis and financial data
# %pip install ta yfinance

# Statistical analysis
# %pip install statsmodels arch

# Data processing and visualization (should already be installed)
# %pip install pandas numpy plotly matplotlib seaborn pyarrow

# Data access and sentiment scoring (imported by later cells)
# %pip install dune-client python-dotenv requests vaderSentiment

print("✅ Required packages:")
print("- pandas, numpy (data manipulation)")
print("- scikit-learn (machine learning)")
print("- plotly (visualization)")
print("- matplotlib, seaborn (plot styling)")
print("- xgboost, lightgbm (gradient boosting)")
print("- ta (technical analysis)")
print("- statsmodels (statistical modeling)")
print("- dune-client (Dune API)")
print("- python-dotenv (environment variables)")
print("- vaderSentiment (sentiment scoring)")
print("- pyarrow (parquet engine for DataFrame.to_parquet)")

# Verify key imports; an ImportError is reported instead of aborting the cell
try:
    import pandas as pd
    import numpy as np
    import sklearn
    print("\n✅ Core packages imported successfully")
except ImportError as e:
    print(f"\n❌ Missing package: {e}")
    print("Run the pip install commands above")

# Fetching On-Chain Data from the Dune API #

In [None]:
import requests
import os
from dotenv import load_dotenv
from dune_client.client import DuneClient
from dune_client.types import QueryParameter
from dune_client.client import DuneClient
from dune_client.query import QueryBase


# Load environment variables from the .env file in the project root.
# override=True already makes values from .env take precedence over any
# DUNE_API_KEY exported in the shell, so the variable is no longer deleted
# from os.environ first: an exported key stays available as a fallback when
# .env is missing or does not define it.
load_dotenv('../.env', override=True)

# Never print the key itself; notebook outputs are saved and shared with the file.
API_KEY = os.getenv('DUNE_API_KEY')      # Make sure your .env file has DUNE_API_KEY=your_actual_api_key

if API_KEY:
    dune = DuneClient(API_KEY)
    # Fetch the latest stored result of the query as a DataFrame
    df = dune.get_latest_result_dataframe(5745512)  # From https://dune.com/troutmax/onchain-metrics dashboard
    print("Query executed successfully!")
    print("===============================================================================")
    print(df.head())
else:
    # NOTE: `df` is not defined on this path; later cells that read `df` will fail
    print("API key not found - please check your .env file")

# Importing Dune API Data Using the New Provider System #

In [None]:
# Load bot-volume data through the project's provider layer and save a raw
# snapshot under the project-root data directory (not notebooks/data)
import sys
import os

# Add project root to Python path. Guarded so that re-running this cell does
# not stack duplicate entries on sys.path.
project_root = os.path.abspath('..')  # Go up one level from notebooks/
if project_root not in sys.path:
    sys.path.insert(0, project_root)

print(f"Added to Python path: {project_root}")
print(f"Current working directory: {os.getcwd()}")

# Imported after the sys.path change because `src` lives under project_root
from src.data_providers import setup_providers
from datetime import datetime

# Setup providers
manager = setup_providers()

# Test connections
print("Connection Status:")
for provider, status in manager.test_all_connections().items():
    print(f"  {provider}: {'✅ Connected' if status else '❌ Failed'}")

# Get Dune data using the new system
dune = manager.get_provider('dune')
if dune:
    # Get data (overwrites any `df` left by the previous cell)
    df = dune.get_bot_volume_data()
    
    # Save raw data to PROJECT ROOT data directory (not notebooks/data)
    data_dir = os.path.join(project_root, 'data', 'raw', 'dune')
    os.makedirs(data_dir, exist_ok=True)
    
    # A timestamped file name means each run writes a new snapshot
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    raw_path = os.path.join(data_dir, f'bot_volume_{timestamp}.parquet')
    
    # Save the data
    df.to_parquet(raw_path)
    print(f"✅ Raw data saved: {raw_path}")
    
    # Verify file was created and show some info
    if os.path.exists(raw_path):
        file_size_mb = os.path.getsize(raw_path) / 1024 / 1024
        print(f"📊 File size: {file_size_mb:.2f} MB")
        print(f"📁 Full path: {raw_path}")
    
    # Continue with your existing analysis
    print(f"\nData shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    
    # Show data preview
    print(f"\nData preview:")
    print(df.head(3))
    
else:
    # `df` keeps whatever value an earlier cell assigned (if any)
    print("❌ Dune provider not available")

In [None]:
# Print column dtypes and non-null counts for the loaded frame
df.info()
print("=====================================================================================")
# Last expression of the cell: Jupyter renders the first rows as a table
df.head()

# Data Directory Structure Setup #

In [None]:
def create_optimal_structure(base_data_dir='../data'):
    """Create the data directory tree and a README describing it.

    Args:
        base_data_dir: Root of the data tree. Defaults to '../data', i.e. the
            project-root data directory when run from notebooks/.

    Side effects:
        Creates every directory of the layout (existing directories are left
        as they are), prints one line per directory, and writes ``README.md``
        into ``base_data_dir`` unless a file of that name already exists.
    """
    
    optimal_structure = {
        f'{base_data_dir}/raw/dune/': 'Raw Dune API responses',
        f'{base_data_dir}/raw/hyperliquid/': 'Raw Hyperliquid data',
        f'{base_data_dir}/raw/backup/': 'Backup copies of critical datasets',
        
        f'{base_data_dir}/processed/daily/': 'Daily aggregated features',
        f'{base_data_dir}/processed/hourly/': 'Hourly features for real-time signals',
        f'{base_data_dir}/processed/features/': 'Engineered features ready for ML',
        
        f'{base_data_dir}/cache/': 'Temporary processing files',
        f'{base_data_dir}/cache/api_responses/': 'Cached API responses',
        
        f'{base_data_dir}/models/': 'Trained ML models',
        f'{base_data_dir}/models/checkpoints/': 'Model training checkpoints',
        
        f'{base_data_dir}/exports/': 'Data exports for sharing/presentation',
        f'{base_data_dir}/metadata/': 'Data quality reports and schemas'
    }
    
    
    print("🏗️  CREATING OPTIMAL DIRECTORY STRUCTURE\n")
    
    for dir_path, description in optimal_structure.items():
        # exist_ok=True makes the function safe to call repeatedly
        os.makedirs(dir_path, exist_ok=True)
        print(f"✅ Created: {dir_path:<30} | {description}")
    
    # Create a README for data organization
    readme_content = """# Data Directory Structure

## Raw Data
- `raw/dune/`: Raw blockchain data from Dune Analytics
- `raw/hyperliquid/`: Raw DEX data from Hyperliquid
- `raw/backup/`: Critical dataset backups

## Processed Data  
- `processed/daily/`: Daily aggregated metrics
- `processed/hourly/`: Hourly features for real-time analysis
- `processed/features/`: ML-ready feature datasets

## Cache & Temporary
- `cache/`: Temporary processing files
- `cache/api_responses/`: Cached API calls (1-hour TTL)

## Models & Outputs
- `models/`: Trained ML models and scalers
- `exports/`: Clean datasets for sharing
- `metadata/`: Data schemas and quality reports

## File Naming Convention
- Raw: `{source}_{dataset}_{YYYYMMDD_HHMMSS}.parquet`
- Processed: `{feature_type}_{timeframe}_{YYYYMMDD}.parquet`
- Models: `{model_type}_{version}_{YYYYMMDD}.pkl`
"""

    # The README text used to be built and then discarded; write it to disk.
    # An existing README is kept so manual edits are not overwritten.
    readme_path = os.path.join(base_data_dir, 'README.md')
    if not os.path.exists(readme_path):
        with open(readme_path, 'w', encoding='utf-8') as readme_file:
            readme_file.write(readme_content)
        print(f"📝 Wrote README: {readme_path}")
    
    

# Create the optimal structure (directories that already exist are left untouched,
# so this cell can be re-run safely)
create_optimal_structure()

# Plotting Bot Volume Data across chains # 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Matplotlib / seaborn defaults for static plots
plt.style.use('default')
sns.set_palette("husl")

# Parse block_date into datetimes (per the original note, in case it is not
# one already), then keep a date-ordered view for the plotting cells below
df['block_date'] = pd.to_datetime(df['block_date'])
df_sorted = df.sort_values(by='block_date')

# Quick overview of what was loaded
print(f"Data columns: {df.columns.tolist()}")
print(f"\nUnique blockchains: {df['blockchain'].unique()}")
print("\nDate range: {} to {}".format(df['block_date'].min(), df['block_date'].max()))

In [None]:
# Create interactive plots with Plotly: one panel per metric, one trace per chain.
# Each entry is (column name, panel title, subplot row, subplot col); the entries
# are listed in row-major order, which is the order subplot_titles expects.
panels = [
    ('volumeUSD', 'Volume USD by Chain', 1, 1),
    ('numberOfUsers', 'Number of Users by Chain', 1, 2),
    ('botRevenueUSD', 'Bot Revenue USD by Chain', 2, 1),
    ('numberOfTrades', 'Number of Trades by Chain', 2, 2),
]

fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=[panel[1] for panel in panels],
    specs=[[{"secondary_y": False}, {"secondary_y": False}],
           [{"secondary_y": False}, {"secondary_y": False}]]
)

# Get unique blockchains and generate enough colors
blockchains = df['blockchain'].unique()
# Cycle through the combined palettes so every chain gets a color even when
# there are more chains than palette entries (plain slicing would leave
# colors[i] out of range in that case)
palette = px.colors.qualitative.Set1 + px.colors.qualitative.Set2 + px.colors.qualitative.Set3
colors = [palette[i % len(palette)] for i in range(len(blockchains))]

print(f"Number of blockchains: {len(blockchains)}")
print(f"Blockchains: {list(blockchains)}")

# Traces are added metric by metric, chain by chain. Every metric is coerced
# with pd.to_numeric so that values stored as strings plot on a numeric axis.
for metric_col, panel_title, panel_row, panel_col in panels:
    for i, blockchain in enumerate(blockchains):
        chain_data = df_sorted[df_sorted['blockchain'] == blockchain]
        fig.add_trace(
            go.Scatter(x=chain_data['block_date'],
                      y=pd.to_numeric(chain_data[metric_col], errors='coerce'),
                      name=f'{blockchain}', line=dict(color=colors[i]),
                      legendgroup=blockchain,
                      # one legend entry per chain: only the first panel shows it
                      showlegend=(panel_row == 1 and panel_col == 1)),
            row=panel_row, col=panel_col
        )

fig.update_layout(height=800, title_text="Trading Bot Activity by Blockchain Over Time")
fig.show()

In [None]:
# Stacked area chart for all blockchains volume
fig = go.Figure()

# First, convert all data to numeric and calculate total volumes per blockchain
df_numeric = df.copy()
df_numeric['volumeUSD'] = pd.to_numeric(df_numeric['volumeUSD'], errors='coerce')

# Calculate total volume per blockchain to order them
blockchain_totals = df_numeric.groupby('blockchain')['volumeUSD'].sum().sort_values(ascending=False)
blockchains_ordered = blockchain_totals.index.tolist()  # Highest to lowest

# Define specific colors for key blockchains
blockchain_colors = {
    'Solana': 'rgba(138, 43, 226, 0.8)',      # Purple for Solana
    'Ethereum': 'rgba(54, 162, 235, 0.8)',    # Blue for Ethereum  
    'BSC': 'rgba(255, 205, 86, 0.8)',         # Yellow for BSC
    'Base': 'rgba(75, 192, 192, 0.8)',        # Teal for Base
    'TON': 'rgba(255, 99, 132, 0.8)',         # Red for TON
    'Avalanche': 'rgba(153, 102, 255, 0.8)',  # Light Purple
    'Arbitrum': 'rgba(255, 159, 64, 0.8)',    # Orange
    'sonic': 'rgba(199, 199, 199, 0.8)',      # Gray
    'Blast': 'rgba(83, 102, 255, 0.8)',       # Light Blue
    'Fantom': 'rgba(255, 99, 255, 0.8)',      # Pink
    'Polygon': 'rgba(99, 255, 132, 0.8)',     # Light Green
    'Optimism': 'rgba(255, 132, 99, 0.8)',    # Light Red
    'Linea': 'rgba(132, 255, 99, 0.8)',       # Green
    'Scroll': 'rgba(255, 165, 0, 0.8)'        # Dark Orange
}

print(f"Found {len(blockchains_ordered)} blockchains ordered by total volume:")
for i, blockchain in enumerate(blockchains_ordered):
    total_vol = blockchain_totals[blockchain]
    print(f"{i+1}. {blockchain}: ${total_vol:,.0f}")

# Add each blockchain in order from largest to smallest (largest becomes bottom layer)
for i, blockchain in enumerate(blockchains_ordered):
    # df_numeric already holds volumeUSD as numbers, so slice it directly
    # instead of re-coercing a per-chain copy; rows without a usable volume
    # are dropped and the rest ordered by date
    blockchain_data = (
        df_numeric[df_numeric['blockchain'] == blockchain]
        .dropna(subset=['volumeUSD'])
        .sort_values('block_date')
        .reset_index(drop=True)
    )
    
    if len(blockchain_data) > 0:
        # Fallback color for chains missing from blockchain_colors. Each channel
        # is wrapped into 0-255; the unwrapped formula exceeds 255 from i = 6
        # (blue), i = 7 (red) and i = 8 (green) onward.
        fallback_color = f'rgba({(50 + i*30) % 256}, {(100 + i*20) % 256}, {(150 + i*10) % 256}, 0.7)'
        color = blockchain_colors.get(blockchain, fallback_color)
        
        fig.add_trace(
            go.Scatter(
                x=blockchain_data['block_date'], 
                y=blockchain_data['volumeUSD'],
                name=blockchain,
                fill='tonexty' if i > 0 else 'tozeroy',  # First one fills to zero, others stack on top
                mode='none',  # No lines, just filled area
                fillcolor=color,
                hovertemplate=f'<b>{blockchain}</b><br>Date: %{{x}}<br>Volume: $%{{y:,.0f}}<extra></extra>',
                stackgroup='one'  # Enable stacking
            )
        )

# Clean formatting
# NOTE(review): the "765 Days" in the title is hard-coded; confirm it still
# matches the date range returned by the query.
fig.update_layout(
    title='Bot Volume Last 765 Days', 
    title_x=0.5,
    xaxis_title='Date', 
    yaxis_title='Volume USD', 
    height=600,
    width=1200,
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(family="Arial", size=12),
    showlegend=True,
    legend=dict(
        orientation="v",
        yanchor="top",
        y=1,
        xanchor="left",
        x=1.02
    ),
    yaxis=dict(
        tickformat='$,.0f',
        showgrid=True,
        gridcolor='lightgray',
        gridwidth=1
    ),
    xaxis=dict(
        showgrid=True,
        gridcolor='lightgray',
        gridwidth=1
    )
)

fig.show()

In [None]:
# Summary statistics and recent trends for the bot-activity data
print("=== TRADING BOT ACTIVITY SUMMARY ===\n")

# Build a numeric copy of the frame: every listed metric column that is present
# is coerced with pd.to_numeric (unparseable values become NaN)
numeric_columns = ['volumeUSD', 'botRevenueUSD', 'numberOfUsers', 'numberOfTrades', 'numberOfNewUsers', 'averageVolumePerUserUSD', 'averageVolumePerTradeUSD']
df_numeric = df.assign(**{
    col: pd.to_numeric(df[col], errors='coerce')
    for col in numeric_columns
    if col in df.columns
})

# Overall statistics
total_volume = df_numeric['volumeUSD'].sum()
total_revenue = df_numeric['botRevenueUSD'].sum()
total_trades = df_numeric['numberOfTrades'].sum()
avg_users_per_day = df_numeric['numberOfUsers'].mean()

print(f"Total Volume: ${total_volume:,.2f}")
print(f"Total Bot Revenue: ${total_revenue:,.2f}")
print(f"Total Trades: {total_trades:,}")
print(f"Average Users per Day: {avg_users_per_day:.0f}")
print(f"Revenue as % of Volume: {(total_revenue/total_volume)*100:.2f}%\n")

# Per-blockchain totals and means, using named aggregation so the output
# columns get their final names directly
print("=== BY BLOCKCHAIN ===")
blockchain_summary = (
    df_numeric.groupby('blockchain')
    .agg(
        Total_Volume=('volumeUSD', 'sum'),
        Avg_Daily_Volume=('volumeUSD', 'mean'),
        Total_Revenue=('botRevenueUSD', 'sum'),
        Avg_Daily_Revenue=('botRevenueUSD', 'mean'),
        Total_Users=('numberOfUsers', 'sum'),
        Avg_Daily_Users=('numberOfUsers', 'mean'),
        Total_Trades=('numberOfTrades', 'sum'),
        Avg_Daily_Trades=('numberOfTrades', 'mean'),
        Total_New_Users=('numberOfNewUsers', 'sum'),
    )
    .round(2)
)

print(blockchain_summary)

# Recent trends: rows dated within 30 days of the latest block_date
is_recent = df_numeric['block_date'] >= df_numeric['block_date'].max() - pd.Timedelta(days=30)
recent_data = df_numeric[is_recent]
print("\n=== RECENT TRENDS (Last 30 Days) ===")
print(f"Recent Volume: ${recent_data['volumeUSD'].sum():,.2f}")
print(f"Recent Revenue: ${recent_data['botRevenueUSD'].sum():,.2f}")
print(f"Most Active Chain: {recent_data.groupby('blockchain')['volumeUSD'].sum().idxmax()}")

# Simple Sentiment Analysis of Tweets Using VADER #

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Single analyzer instance, created once and used by analyze_sentiment below
analyzer = SentimentIntensityAnalyzer()

def analyze_sentiment(text):
    """Return the compound polarity score for ``text``.

    Passes ``text`` to the module-level ``analyzer.polarity_scores`` and
    returns the value stored under the ``'compound'`` key of its result.
    """
    return analyzer.polarity_scores(text)['compound']

# Smoke test: score one hard-coded example and print its compound score
sample_tweet = "Bitcoin is going to the moon!"
print(f"Sentiment score: {analyze_sentiment(sample_tweet)}")


# Training a Random Forest Classifier on Features #

In [None]:
# Feature Engineering and ML Example
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

# Create features from our existing data
def create_features(df):
    """Create ML features and a binary target from trading bot data.

    Args:
        df: DataFrame with at least the columns 'blockchain', 'block_date',
            'volumeUSD', 'botRevenueUSD', 'numberOfUsers' and 'numberOfTrades'.
            The four metric columns are coerced with ``pd.to_numeric``;
            values that cannot be parsed become NaN. ``df`` is not modified.

    Returns:
        A copy of ``df`` sorted by blockchain and date with these added columns:
        'volume_ma_7d', 'volume_std_7d', 'volume_pct_change',
        'volume_change_7d', 'users_per_trade', 'volume_per_user',
        'volume_rank' and the target 'high_volume_day'. No rows are dropped;
        NaNs (e.g. the first rows of each chain) are left for the caller.

    Note:
        'volume_rank' ranks each day against its blockchain's whole history
        in ``df``, so the target for a given day also depends on later days.
    """
    df_features = df.copy()
    
    # Convert to numeric
    numeric_columns = ['volumeUSD', 'botRevenueUSD', 'numberOfUsers', 'numberOfTrades']
    for col in numeric_columns:
        df_features[col] = pd.to_numeric(df_features[col], errors='coerce')
    
    # Sort by blockchain and date for proper rolling calculations
    df_features = df_features.sort_values(['blockchain', 'block_date'])
    
    # Rolling features (7-day windows); reset_index drops the blockchain level
    # added by groupby().rolling() so the result aligns with df_features' index
    df_features['volume_ma_7d'] = df_features.groupby('blockchain')['volumeUSD'].rolling(7, min_periods=1).mean().reset_index(0, drop=True)
    df_features['volume_std_7d'] = df_features.groupby('blockchain')['volumeUSD'].rolling(7, min_periods=1).std().reset_index(0, drop=True)
    
    # Volume change features
    df_features['volume_pct_change'] = df_features.groupby('blockchain')['volumeUSD'].pct_change()
    df_features['volume_change_7d'] = df_features.groupby('blockchain')['volumeUSD'].pct_change(7)
    
    # A reference value of 0 makes pct_change return +/-inf. Mark those as
    # missing so a later dropna() removes them; infinite values would otherwise
    # pass through dropna() and fail in the scaler.
    change_cols = ['volume_pct_change', 'volume_change_7d']
    df_features[change_cols] = df_features[change_cols].replace([np.inf, -np.inf], np.nan)
    
    # User activity features (+1 in the denominator avoids division by zero)
    df_features['users_per_trade'] = df_features['numberOfUsers'] / (df_features['numberOfTrades'] + 1)
    df_features['volume_per_user'] = df_features['volumeUSD'] / (df_features['numberOfUsers'] + 1)
    
    # Create target: High volume day (top 25% of volume for each blockchain)
    df_features['volume_rank'] = df_features.groupby('blockchain')['volumeUSD'].rank(pct=True)
    df_features['high_volume_day'] = (df_features['volume_rank'] > 0.75).astype(int)
    
    return df_features

# Create features
print("Creating features...")
df_ml = create_features(df_numeric)

# pct_change produces +/-inf after a zero-volume day. Treat those as missing so
# they are removed together with the NaN rows instead of failing in StandardScaler.
numeric_cols = df_ml.select_dtypes(include='number').columns
df_ml[numeric_cols] = df_ml[numeric_cols].replace([np.inf, -np.inf], np.nan)

# Remove rows with NaN values
df_ml = df_ml.dropna()

# create_features returns rows ordered by blockchain first, then date.
# TimeSeriesSplit splits purely by row position, so order the rows
# chronologically first; otherwise the folds separate chains, not time periods.
# A stable sort keeps the per-chain order for rows sharing a date.
df_ml = df_ml.sort_values('block_date', kind='mergesort')

if len(df_ml) > 50:  # Only proceed if we have enough data
    # Select features for ML
    feature_cols = [
        'volume_ma_7d', 'volume_std_7d', 'volume_pct_change', 'volume_change_7d',
        'users_per_trade', 'volume_per_user', 'numberOfUsers', 'numberOfTrades'
    ]
    
    X = df_ml[feature_cols]
    y = df_ml['high_volume_day']
    
    print(f"Dataset shape: {X.shape}")
    print(f"Target distribution: {y.value_counts()}")
    
    # Time series split (respecting temporal order)
    tscv = TimeSeriesSplit(n_splits=3)
    
    # Scale features
    scaler = StandardScaler()
    
    accuracies = []
    
    for fold, (train_idx, test_idx) in enumerate(tscv.split(X)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        # Scale features: fit on the training fold only, then apply to the test fold
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        # Train model
        model = RandomForestClassifier(
            n_estimators=100, 
            max_depth=5, 
            random_state=42,
            class_weight='balanced'  # Handle class imbalance
        )
        model.fit(X_train_scaled, y_train)
        
        # Predictions
        y_pred = model.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        print(f"\n=== Fold {fold + 1} ===")
        print(f"Accuracy: {accuracy:.3f}")
        print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")
    
    print(f"\n=== Overall Results ===")
    print(f"Mean CV Accuracy: {np.mean(accuracies):.3f} (+/- {np.std(accuracies)*2:.3f})")
    
    # Feature importance from last model (the one trained in the final fold)
    feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print(f"\n=== Feature Importance ===")
    print(feature_importance)
    
else:
    print("Not enough data for ML training. Need more historical data.")