# === SECTION 1: SETUP & CONFIGURATION ===

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Core libraries
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime, timedelta
import time

print("‚úì Core libraries imported")

In [None]:
# Configure Polars display settings (applies to every table printed below)
pl.Config.set_tbl_rows(25)           # print up to 25 rows per table
pl.Config.set_tbl_cols(-1)           # presumably a negative value means "show all columns" — confirm in Polars docs
pl.Config.set_tbl_width_chars(1000)  # allow up to 1000 characters of table width

print("‚úì Polars display configured")

In [None]:
# Configure matplotlib: select the seaborn dark-grid style sheet, then use the
# IPython magic below (notebook-only syntax) to render figures inline.
plt.style.use('seaborn-v0_8-darkgrid')
%matplotlib inline

print("‚úì Matplotlib configured")

In [None]:
# Enable autoreload for development (IPython magics, notebook-only syntax).
# Mode 2 is intended to re-import edited project modules before each cell runs.
%load_ext autoreload
%autoreload 2

print("‚úì Autoreload enabled")

In [None]:
# Closing banner for the setup section: a 60-character rule above and below.
setup_rule = "=" * 60
print(setup_rule, "‚úÖ SETUP COMPLETE - Ready to load data", setup_rule, sep="\n")

# === SECTION 2: DATA LOADING & VALIDATION ===

In [None]:
# GCS Authentication (for Google Colab)
# `os` is imported unconditionally: cells further down (e.g. download_from_gcs)
# use it, and it must exist even when the Colab-only import below fails.
import os

try:
    from google.colab import auth
    auth.authenticate_user()

    # Project used by the Google Cloud client libraries in this session
    os.environ['GCLOUD_PROJECT'] = 'alpine-charge-404612'

    print("‚úì GCP authenticated")
except ImportError:
    # google.colab is only importable inside Colab; elsewhere auth is skipped
    print("‚ÑπÔ∏è  Not running in Colab - skipping GCP auth")

In [None]:
# GCS Configuration
import tarfile
import zipfile
import shutil
import traceback
from google.cloud import storage
from google.api_core import retry

# Config: bucket, object prefix and archive name in GCS, plus the local data dir
BUCKET_NAME = "spread-eth-oxford"
GCS_PREFIX = "polydata/"
GCS_RAW_ARCHIVE = "archive.tar.xz"
WORK_DIR = "data"

# Initialize the GCS client and bucket handle. Any failure leaves both set to
# None, which is what the helper functions test for before touching GCS.
storage_client = None
bucket = None
try:
    storage_client = storage.Client()
    bucket = storage_client.bucket(BUCKET_NAME)
except Exception as gcs_error:
    storage_client = None
    bucket = None
    print(f"‚ö†Ô∏è  GCS connection failed: {gcs_error}")
else:
    print(f"‚úì Connected to GCS bucket: {BUCKET_NAME}")

In [None]:
def check_disk_space(required_gb=100):
    """Print the free space of the current directory and compare it to a minimum.

    Args:
        required_gb: Free space needed, in units of 1024**3 bytes.

    Returns:
        True when the free space is at least ``required_gb``, otherwise False.
    """
    bytes_per_gb = 1024**3
    free_gb = shutil.disk_usage('.').free / bytes_per_gb

    print(f"üíæ Disk space check:")
    print(f"   Free: {free_gb:.1f} GB")
    print(f"   Required: ~{required_gb} GB")

    has_room = not (free_gb < required_gb)
    if has_room:
        print(f"   ‚úì Sufficient space available\n")
    else:
        print(f"\n‚ùå ERROR: Not enough disk space!")
        print(f"   Need {required_gb}GB, have {free_gb:.1f}GB")
    return has_room

def check_gcs_data():
    """Check if processed data exists in GCS.

    Returns:
        True if at least one blob under ``GCS_PREFIX`` has a name ending in
        ``.zip``. False when the bucket handle is not set, nothing matches,
        or listing the bucket raises (a warning is printed in that case).
    """
    if not bucket:
        return False
    try:
        # any() stops at the first matching blob instead of building the
        # complete listing just to test whether it is non-empty.
        return any(
            blob.name.endswith('.zip')
            for blob in bucket.list_blobs(prefix=GCS_PREFIX)
        )
    except Exception as e:
        print(f"‚ö†Ô∏è  Warning: Could not check GCS: {e}")
        return False

print("‚úì GCS helper functions defined")

In [None]:
def download_from_gcs():
    """Download and extract zip archives from GCS with retry.

    Every blob under ``GCS_PREFIX`` whose name ends in ``.zip`` is downloaded
    into the current directory, extracted into ``WORK_DIR`` and then deleted.
    An archive is skipped when ``WORK_DIR`` already holds a ``.parquet`` file
    whose name starts with the archive's stem. Each download is attempted up
    to three times, pausing five seconds between attempts.

    Returns:
        True when every archive was handled; False when the bucket handle is
        not set or any step raised (the error and traceback are printed).
    """
    # Imported locally so the function does not depend on the Colab-only auth
    # cell having bound `os` at module level.
    import glob
    import os

    if not bucket:
        print("‚ùå GCS not connected")
        return False

    max_retries = 3

    try:
        os.makedirs(WORK_DIR, exist_ok=True)

        blobs = [b for b in bucket.list_blobs(prefix=GCS_PREFIX) if b.name.endswith('.zip')]
        print(f"üì• Downloading {len(blobs)} zip archive(s) from GCS...\n")

        for i, blob in enumerate(blobs, 1):
            filename = blob.name.split('/')[-1]
            zip_path = filename

            # Skip if already extracted. The stem is escaped so characters such
            # as "[" or "*" in an archive name are matched literally.
            stem = filename[:-len('.zip')]
            if any(Path(WORK_DIR).glob(f"{glob.escape(stem)}*.parquet")):
                print(f"[{i}/{len(blobs)}] {filename} - Already extracted, skipping")
                continue

            # Reload blob to get metadata
            blob.reload()
            blob_size_mb = (blob.size or 0) / (1024**2)
            print(f"[{i}/{len(blobs)}] {filename} ({blob_size_mb:.1f} MB)...", end=" ")

            try:
                # Download with retry
                for attempt in range(max_retries):
                    try:
                        start = time.time()
                        blob.download_to_filename(zip_path)
                        elapsed = time.time() - start
                        file_size_mb = os.path.getsize(zip_path) / (1024**2)
                        speed = file_size_mb / elapsed if elapsed > 0 else 0
                        print(f"‚úì ({speed:.1f} MB/s)")
                        break
                    except Exception as e:
                        if attempt < max_retries - 1:
                            print(f"\n   Retry {attempt+1}/{max_retries}...", end=" ")
                            time.sleep(5)
                        else:
                            # Keep the underlying error attached as the cause
                            raise Exception(
                                f"Download failed after {max_retries} attempts: {e}"
                            ) from e

                # Extract
                print(f"    Extracting...", end=" ")
                try:
                    with zipfile.ZipFile(zip_path, 'r') as zipf:
                        zipf.extractall(WORK_DIR)
                    print(f"‚úì")
                except Exception as e:
                    print(f"\n‚ùå ERROR extracting {filename}: {e}")
                    raise
            finally:
                # Remove zip, including a partial or corrupt one left by a
                # failed download or extraction.
                if os.path.exists(zip_path):
                    os.remove(zip_path)
            print()

        print(f"‚úÖ Downloaded and extracted to {WORK_DIR}/\n")
        return True

    except Exception as e:
        print(f"\n‚ùå ERROR in download_from_gcs: {e}")
        traceback.print_exc()
        return False

print("‚úì download_from_gcs() defined")

In [None]:
# Main GCS data loading execution: download the processed archives when GCS
# has them, then list what ended up in WORK_DIR.
loader_rule = "=" * 60
print(loader_rule)
print("üöÄ DATA LOADER")
print(loader_rule)
print()

success = False

if not check_gcs_data():
    print("‚ÑπÔ∏è  No processed data in GCS")
    print("   Place data files in data/ directory or run GCS upload process")
else:
    print("‚úÖ Processed data found in GCS! Downloading...\n")
    success = download_from_gcs()

if success:
    print(loader_rule)
    print("‚úÖ DATA READY!")
    print(loader_rule)

    # Show files with their individual sizes and keep a running total in MB
    print(f"\nFiles in {WORK_DIR}/:")
    parquet_files = sorted(Path(WORK_DIR).glob('*.parquet'))
    total_size = 0
    for f in parquet_files:
        size_mb = f.stat().st_size / (1024**2)
        print(f"  üìÑ {f.name} - {size_mb:.1f} MB")
        total_size += size_mb

    if len(parquet_files) > 0:
        print(f"\nTotal: {len(parquet_files)} files, {total_size/1024:.2f} GB")

    disk = shutil.disk_usage('.')
    print(f"üíæ Disk: {disk.free/(1024**3):.1f} GB free")

In [None]:
# Import poly_utils: market loading, missing-token updates (optional CSV-based market updates), and platform wallets
from poly_utils.utils import get_markets, update_missing_tokens, PLATFORM_WALLETS

In [None]:
# Load the markets table from the parquet file in WORK_DIR (downloaded from GCS)
print("üìä Loading markets...")

markets_parquet = Path(f"{WORK_DIR}/markets.parquet")

if not markets_parquet.exists():
    # Nothing on disk: later cells check for None before using the frame
    print(f"‚ùå No markets data found")
    print(f"   Expected: {markets_parquet}")
    print(f"   Run data update pipeline in Section 4 to generate data")
    markets_df = None
else:
    markets_df = pl.read_parquet(markets_parquet)
    print(f"‚úì Markets loaded from parquet: {len(markets_df):,} markets")
    print(f"   Volume range: ${markets_df['volume'].min():,.0f} - ${markets_df['volume'].max():,.0f}")

    # createdAt stored as text is parsed into a datetime column in place
    created_is_text = markets_df['createdAt'].dtype == pl.Utf8
    if created_is_text:
        parsed_created = pl.col("createdAt").str.to_datetime().alias("createdAt")
        markets_df = markets_df.with_columns(parsed_created)

In [None]:
# Load every trades*.parquet file found in WORK_DIR into one DataFrame
print("üìà Loading trades...")

trades_files = sorted(Path(WORK_DIR).glob("trades*.parquet"))

if not trades_files:
    # Nothing on disk: later cells check for None before using the frame
    print("‚ùå No trades data found")
    print(f"   Expected: {WORK_DIR}/trades*.parquet")
    print(f"   Run data update pipeline in Section 4 to generate data")
    trades_df = None
else:
    print(f"üìà Loading {len(trades_files)} trade file(s)...")

    if len(trades_files) != 1:
        # Several files: scan each lazily, concatenate the plans, then collect
        print("   Using memory-efficient lazy loading...")
        lazy_parts = [pl.scan_parquet(f) for f in trades_files]
        trades_df = pl.concat(lazy_parts).collect(streaming=True)
    else:
        trades_df = pl.read_parquet(trades_files[0])

    # timestamp stored as text is parsed into a datetime column in place
    timestamp_is_text = trades_df['timestamp'].dtype == pl.Utf8
    if timestamp_is_text:
        parsed_timestamp = pl.col("timestamp").str.to_datetime().alias("timestamp")
        trades_df = trades_df.with_columns(parsed_timestamp)

    print(f"‚úì Trades loaded: {len(trades_df):,} trades")
    print(f"   Total volume: ${trades_df['usd_amount'].sum()/1e9:.2f}B")
    print(f"   Date range: {trades_df['timestamp'].min()} to {trades_df['timestamp'].max()}")
    print(f"   Unique markets: {trades_df['market_id'].n_unique():,}")

In [None]:
# Validation: every market id seen in trades should exist in the markets table.
# missing_market_ids is always defined (empty when validation cannot run).
if markets_df is None or trades_df is None:
    print("‚ö†Ô∏è  Cannot validate - data not loaded")
    missing_market_ids = set()
else:
    print("üîç Validating data...")

    traded_ids = set(trades_df['market_id'].unique())
    known_ids = set(markets_df['id'].unique())
    missing_market_ids = traded_ids - known_ids

    if not missing_market_ids:
        print("‚úÖ All trade markets found in markets dataset")
    else:
        print(f"‚ö†Ô∏è  Warning: {len(missing_market_ids)} market IDs in trades not found in markets")
        print(f"   Sample missing IDs: {list(missing_market_ids)[:5]}")
        print(f"   Consider running update_missing_tokens() in Section 3")

In [None]:
if trades_df is not None:
    # Validation: Data freshness check
    last_trade_time = trades_df['timestamp'].max()

    if last_trade_time is None:
        # Empty or all-null timestamp column: there is nothing to measure
        print("‚ö†Ô∏è  Cannot check freshness - no trade timestamps available")
    else:
        # Build "now" with the same awareness as the data: subtracting a naive
        # datetime from a tz-aware one raises TypeError. tzinfo=None yields the
        # naive local time, which is what the comparison used before.
        # NOTE(review): if naive timestamps are actually UTC, the age is off
        # by the local UTC offset — confirm the timestamp convention.
        now = datetime.now(tz=last_trade_time.tzinfo)
        hours_since_last = (now - last_trade_time).total_seconds() / 3600

        print(f"\n‚è∞ Data freshness:")
        print(f"   Last trade: {last_trade_time}")
        print(f"   Hours ago: {hours_since_last:.1f}h")

        if hours_since_last > 24:
            print("   ‚ö†Ô∏è  Data is stale - consider running update pipeline in Section 4")
        else:
            print("   ‚úÖ Data is fresh")

    print("\n" + "="*60)
    print("‚úÖ DATA VALIDATION COMPLETE")
    print("="*60)
else:
    print("‚ö†Ô∏è  Cannot check freshness - trades data not loaded")

# === SECTION 3: POLYUTILS - MARKET MANAGEMENT ===

In [None]:
# Print a numbered list (starting at 1) of the tracked platform wallets
print("üíº Platform Wallets Tracked:")
for position, address in enumerate(PLATFORM_WALLETS, start=1):
    print(f"   {position}. {address}")

In [None]:
# Two filtering examples on the markets table: by volume and by keyword
if markets_df is None:
    print("‚ö†Ô∏è  Markets data not loaded")
else:
    print("üîç Market Filtering Examples:\n")

    # 1. Markets with volume of at least 1M, largest first
    volume_floor = pl.col('volume') >= 1_000_000
    high_volume = markets_df.filter(volume_floor).sort('volume', descending=True)
    print(f"1. High volume markets (>=1M): {len(high_volume):,}")
    print(high_volume.select(['question', 'volume']).head(5))

    print("\n" + "-"*60 + "\n")

    # 2. Markets whose question matches the keyword (treated as a pattern,
    #    not a literal), largest first
    keyword = "Trump"
    question_matches = pl.col('question').str.contains(keyword, literal=False)
    keyword_markets = markets_df.filter(question_matches).sort('volume', descending=True)
    print(f"2. Markets containing '{keyword}': {len(keyword_markets):,}")
    print(keyword_markets.select(['question', 'volume']).head(5))

In [None]:
# Example: Update missing tokens (instructions only - nothing is executed here)
missing_tokens_help = (
    "üìù Update Missing Tokens Example:",
    "   Use this when you have market IDs in trades but not in markets dataset",
    "   Uncomment and run the code below:\n",
    "   # missing_ids = ['market_id_1', 'market_id_2', ...]",
    "   # update_missing_tokens(missing_ids, output_file='missing_markets.csv')",
    "   # Then reload markets with get_markets()",
)
for help_line in missing_tokens_help:
    print(help_line)

# Point at the ids found by the validation cell, when there are any
if len(missing_market_ids) > 0:
    print(f"\n   üí° You currently have {len(missing_market_ids)} missing markets")
    print(f"   Sample IDs to update: {list(missing_market_ids)[:3]}")

In [None]:
# Display market schema (column count, column names and the first row).
# Guarded like the other cells: markets_df is None when no parquet was found.
if markets_df is not None:
    print("üìã Market Data Schema:")
    print(f"   Total columns: {len(markets_df.columns)}")
    print(f"\n   Columns: {', '.join(markets_df.columns)}")
    print(f"\n   Sample row:")
    print(markets_df.head(1))
else:
    print("‚ö†Ô∏è  Markets data not loaded")

# === SECTION 4: UPDATE UTILS - DATA PIPELINE ===

In [None]:
# Import update_utils modules
from update_utils.update_markets import update_markets
from update_utils.update_goldsky import update_goldsky
from update_utils.process_live import process_live

print("‚úì update_utils modules imported")

In [None]:
# Define wrapper function for complete data update
def update_all_data():
    """
    Run complete data pipeline update:
    1. Update markets from Polymarket API
    2. Scrape order events from Goldsky subgraph
    3. Process raw orders into structured trades

    The three steps call update_markets(), update_goldsky() and
    process_live() in that order; the first failure stops the pipeline.

    Returns:
        True when all three steps finished, False when any of them raised
        (the error message and full traceback are printed).
    """
    # Local import so the function does not rely on another cell's imports
    import traceback

    print("üîÑ Starting data update pipeline...\n")

    try:
        print("1Ô∏è‚É£ Updating markets from Polymarket API...")
        update_markets()
        print("   ‚úì Markets updated\n")

        print("2Ô∏è‚É£ Scraping order events from Goldsky...")
        update_goldsky()
        print("   ‚úì Orders scraped\n")

        print("3Ô∏è‚É£ Processing trades...")
        process_live()
        print("   ‚úì Trades processed\n")

        print("="*60)
        print("‚úÖ DATA PIPELINE COMPLETE")
        print("="*60)
        print("\nüí° Reload data in Section 2 to see updates")
        return True

    except Exception as e:
        print(f"\n‚ùå Pipeline failed: {e}")
        # Show where it failed, matching download_from_gcs(); the message
        # alone does not identify the failing step's call site.
        traceback.print_exc()
        return False

print("‚úì update_all_data() function defined")

In [None]:
# Show the saved Goldsky cursor state (JSON file) when one exists
import json

cursor_state_file = Path("goldsky/cursor_state.json")

print("üìç Goldsky Cursor State:")
if not cursor_state_file.exists():
    print(f"   File: {cursor_state_file}")
    print("   ‚ö†Ô∏è  No cursor state found - first run will start from beginning")
else:
    # Parse first, then report, so a malformed file fails before any output
    cursor_state = json.loads(cursor_state_file.read_text())
    print(f"   File: {cursor_state_file}")
    print(f"   State: {cursor_state}")
    print("   ‚úì Cursor state exists - scraper will resume from last position")

In [None]:
# Display pipeline file status: size and age (from mtime) of each expected file
import os

print("üìÇ Data Pipeline Files:")

files_to_check = [
    ("data/markets.parquet", "Markets (main)"),
    ("data/missing_markets.parquet", "Markets (missing)"),
    ("goldsky/orderFilled.parquet", "Raw orders"),
    ("processed/trades.parquet", "Processed trades")
]

for filepath, description in files_to_check:
    path = Path(filepath)
    if not path.exists():
        print(f"   ‚ùå {description}")
        print(f"      {filepath} (not found)")
        continue

    file_info = path.stat()
    mod_time = datetime.fromtimestamp(file_info.st_mtime)
    age_hours = (datetime.now() - mod_time).total_seconds() / 3600
    size_mb = file_info.st_size / (1024**2)
    print(f"   ‚úì {description}")
    print(f"      {filepath} ({size_mb:.1f} MB, {age_hours:.1f}h old)")

In [None]:
# Manual data refresh (instructions only - the pipeline is not run here)
refresh_help = (
    "üîÑ Manual Data Refresh:",
    "   Uncomment and run to update all data:\n",
    "   # update_all_data()",
    "\n   ‚ö†Ô∏è  This may take several minutes depending on data volume",
    "   ‚ö†Ô∏è  Ensure you have internet connectivity",
)
for help_line in refresh_help:
    print(help_line)

# === SECTION 5: MARKET ANALYSIS & EXPLORATION ===

In [None]:
# Top 20 markets by volume.
# Guarded like the other cells: markets_df is None when no parquet was found;
# top_markets is then None as well.
if markets_df is not None:
    print("üèÜ Top 20 Markets by Volume:\n")

    top_markets = (
        markets_df
        .sort('volume', descending=True)
        .head(20)
        .select(['question', 'volume', 'createdAt'])
    )

    print(top_markets)
else:
    print("‚ö†Ô∏è  Markets data not loaded")
    top_markets = None

In [None]:
# Market search configuration
ANALYSIS_CONFIG = {
    'market_keyword': 'Trump',  # Search keyword (matched as a pattern, not a literal)
    'min_volume': 1_000_000,    # Minimum volume filter
}

print(f"üîç Market Search Configuration:")
print(f"   Keyword: '{ANALYSIS_CONFIG['market_keyword']}'")
print(f"   Min Volume: ${ANALYSIS_CONFIG['min_volume']:,}\n")

# Find matching markets, largest volume first.
# Guarded like the other cells: markets_df is None when no parquet was found;
# candidate_markets is then None as well.
if markets_df is not None:
    candidate_markets = (
        markets_df
        .filter(pl.col('question').str.contains(ANALYSIS_CONFIG['market_keyword'], literal=False))
        .filter(pl.col('volume') >= ANALYSIS_CONFIG['min_volume'])
        .sort('volume', descending=True)
    )

    print(f"Found {len(candidate_markets)} markets matching criteria:")
    print(candidate_markets.select(['id', 'question', 'volume']))
else:
    print("‚ö†Ô∏è  Markets data not loaded")
    candidate_markets = None

In [None]:
# Select target market (top result of the candidate search).
# Both names are always bound so the cells below can test TARGET_MARKET_ID
# without hitting a NameError when no market could be selected.
TARGET_MARKET_ID = None
TARGET_MARKET = None

if markets_df is None:
    print("‚ö†Ô∏è  Markets data not loaded")
elif len(candidate_markets) == 0:
    print("\n‚ö†Ô∏è  No markets found - adjust search criteria")
else:
    TARGET_MARKET_ID = candidate_markets.row(0, named=True)['id']
    # Look the id up in the full table and keep the row as a dict
    TARGET_MARKET = markets_df.filter(pl.col('id') == TARGET_MARKET_ID).row(0, named=True)

    print(f"\nüéØ Selected Target Market:")
    print(f"   ID: {TARGET_MARKET['id']}")
    print(f"   Question: {TARGET_MARKET['question']}")
    print(f"   Volume: ${TARGET_MARKET['volume']:,.0f}")
    print(f"   Created: {TARGET_MARKET['createdAt']}")

In [None]:
# Extract trades for target market.
# market_trades is reset first so a stale frame from an earlier run is never
# reused; it stays None when no market is selected or trades are not loaded.
market_trades = None

if TARGET_MARKET_ID is None:
    print("\n‚ö†Ô∏è  Skipping - no target market selected")
elif trades_df is None:
    print("\n‚ö†Ô∏è  Skipping - trades data not loaded")
else:
    market_trades = (
        trades_df
        .filter(pl.col('market_id') == TARGET_MARKET_ID)
        .with_columns([
            # Standardize price to token1 perspective: rows whose
            # nonusdc_side is 'token2' are flipped to 1 - price
            pl.when(pl.col('nonusdc_side') == 'token2')
            .then(1 - pl.col('price'))
            .otherwise(pl.col('price'))
            .alias('price_standardized')
        ])
    )

    print(f"\nüìä Market Trade Statistics:")
    print(f"   Total trades: {len(market_trades):,}")
    if len(market_trades) == 0:
        # min/max/mean are None on an empty frame and cannot be formatted
        print("   No trades found for this market")
    else:
        print(f"   Price range: {market_trades['price_standardized'].min():.3f} - {market_trades['price_standardized'].max():.3f}")
        print(f"   Avg trade size: ${market_trades['usd_amount'].mean():.2f}")
        print(f"   Total volume: ${market_trades['usd_amount'].sum():,.0f}")

In [None]:
# Maker-taker analysis: distinct makers and summed USD volume per transaction
if not TARGET_MARKET_ID:
    print("\n‚ö†Ô∏è  Skipping - no target market selected")
else:
    print("\nüîÑ Maker-Taker Dynamics:")

    # One row per transaction hash
    per_transaction = [
        pl.n_unique("maker").alias("num_makers"),
        pl.col("usd_amount").sum().alias("total_volume"),
    ]
    taker_stats = market_trades.group_by("transactionHash").agg(per_transaction)

    print(f"   Avg makers per taker order: {taker_stats['num_makers'].mean():.2f}")
    print(f"   Median makers per taker order: {taker_stats['num_makers'].median():.0f}")
    print(f"   Max makers in single order: {taker_stats['num_makers'].max()}")

In [None]:
# Price and volume visualization: trade prices as a scatter (top panel) and
# hourly USD volume as bars (bottom panel) on a shared time axis.
if TARGET_MARKET_ID:
    # Convert to pandas for plotting; the time index is needed for resample()
    market_trades_pd = market_trades.to_pandas()
    market_trades_pd = market_trades_pd.set_index('timestamp').sort_index()

    # Only add an ellipsis when the question is actually cut at 80 characters
    question = TARGET_MARKET['question']
    title_question = question if len(question) <= 80 else f"{question[:80]}..."

    # Create 2-panel visualization
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 8), sharex=True)

    # Price scatter plot
    ax1.scatter(market_trades_pd.index, market_trades_pd['price_standardized'],
                alpha=0.3, s=10, c='steelblue')
    ax1.set_ylabel('Price', fontsize=12)
    ax1.set_title(f"Price & Volume History: {title_question}", fontsize=14, fontweight='bold')
    ax1.grid(True, alpha=0.3)
    ax1.set_ylim(0, 1)

    # Volume bar chart (hourly). Lowercase 'h' is the current hourly alias;
    # uppercase 'H' is deprecated in recent pandas releases.
    volume_hourly = market_trades_pd['usd_amount'].resample('1h').sum()
    ax2.bar(volume_hourly.index, volume_hourly.values, color='green', alpha=0.6, width=0.04)
    ax2.set_ylabel('Volume (USD)', fontsize=12)
    ax2.set_xlabel('Time', fontsize=12)
    ax2.set_title('Hourly Trading Volume', fontsize=12, fontweight='bold')
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    print("\n‚úÖ Visualization complete")
else:
    print("\n‚ö†Ô∏è  Skipping - no target market selected")

# === SECTION 6: BACKTRADER PLOTTING SETUP ===

In [None]:
# Import backtrader plotting modules
# Treated as optional: when any of them is missing, the ImportError is reported
# together with an install hint instead of stopping the notebook.
try:
    from backtrader_plotting import Bokeh
    from backtrader_plotting.schemes import Blackly, Tradimo
    import backtrader as bt
    
    print("‚úì backtrader and backtrader_plotting imported")
except ImportError as e:
    print(f"‚ö†Ô∏è  Import error: {e}")
    print("   Install with: pip install backtrader backtrader_plotting")

In [None]:
# Display available plotting schemes as (name, description) pairs
print("üé® Available Plotting Schemes:\n")

schemes_info = [
    ("Blackly", "Dark background theme with high contrast"),
    ("Tradimo", "Professional light theme"),
]

for scheme_entry in schemes_info:
    scheme_label, scheme_summary = scheme_entry
    print(f"   ‚Ä¢ {scheme_label}: {scheme_summary}")

# Usage hint (printed only, not executed)
usage_lines = (
    "\nüí° Usage example:",
    "   scheme = Blackly()",
    "   b = Bokeh(style='bar', plot_mode='single', scheme=scheme)",
)
for usage_line in usage_lines:
    print(usage_line)

In [None]:
# Create plotting configuration
PLOT_CONFIG = dict(
    scheme='Blackly',         # Theme: 'Blackly' or 'Tradimo'
    style='bar',              # Style: 'bar', 'line', or 'candle'
    plot_mode='single',       # Mode: 'single' or 'tabs'
    output_dir='processed/',  # Directory for saved plots
)

# Echo each setting in insertion order
print("‚öôÔ∏è  Plotting Configuration:")
for option_name, option_value in PLOT_CONFIG.items():
    print(f"   {option_name}: {option_value}")

print("\n‚úì Configuration ready for strategy backtesting")

In [None]:
# List available analyzer tables as (name, description) pairs
print("üìä Available Backtrader Analyzers:\n")

analyzers = [
    ("SharpeRatio", "Risk-adjusted return metric"),
    ("DrawDown", "Maximum drawdown analysis"),
    ("Returns", "Total and annualized returns"),
    ("TradeAnalyzer", "Win/loss statistics"),
    ("Calmar", "Calmar ratio (return/max drawdown)"),
    ("VWR", "Variability Weighted Return"),
    ("SQN", "System Quality Number"),
    ("TimeReturn", "Period-based returns"),
]

# Name on a bullet line, description indented on the line below it
for analyzer_entry in analyzers:
    print(f"   ‚Ä¢ {analyzer_entry[0]}")
    print(f"     {analyzer_entry[1]}")

# Usage hint (printed only, not executed)
print("\nüí° Add to backtest with:")
print("   cerebro.addanalyzer(bt.analyzers.SharpeRatio, _name='sharpe')")

In [None]:
# Closing banner for the plotting setup, followed by the planned next sections
closing_rule = "=" * 60
closing_lines = (
    closing_rule,
    "‚úÖ BACKTRADER PLOTTING SETUP COMPLETE",
    closing_rule,
    "\nüìù Next Steps:",
    "   ‚Ä¢ Section 7: Add strategy implementation",
    "   ‚Ä¢ Section 8: Add backtesting framework",
    "   ‚Ä¢ Section 9: Add live trading capabilities",
    "\nüí° All utilities are now integrated and ready to use!",
)
for closing_line in closing_lines:
    print(closing_line)