# ETH Candle Direction Prediction Analysis

This notebook provides interactive analysis for predicting 1-hour ETH candle direction based on the first 45 minutes of data.

## Overview
- Load and prepare OHLCV data at multiple timeframes
- Extract features from first 45 minutes using statistical methods
- Train machine learning models to predict candle direction  
- Make real-time predictions with confidence scores
- Interactive Plotly visualizations and dashboards


In [None]:
import pandas as pd
import numpy as np
import ccxt
import time
import warnings
from datetime import datetime, timezone, timedelta

# Plotly for all visualizations
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# ML libraries

# Import our custom predictor
from src.data_utils.data_loading_utils import load_eth_historical_data
from src.eth_candle_predictor import EthCandlePredictor

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)

print("📊 Libraries imported successfully!")
print("🎨 Using Plotly for all interactive visualizations")

📊 Libraries imported successfully!
🎨 Using Plotly for all interactive visualizations


## 📈 Data Loading and Preparation

Using your existing approach to fetch and prepare ETH data at multiple timeframes. You can either fetch new data or use existing dataframes.


In [14]:
def fetch_eth_data(days=14, since=None):
    """Fetch ETH/USDT data - similar to your existing approach"""
    exchange = ccxt.binance()
    symbol = "ETH/USDT"
    timeframe = "1m"
    if since is None:
        print(f"🔄 Fetching {days} days of ETH data...")
        since = exchange.parse8601(
            (datetime.now(timezone.utc) - timedelta(days=days)).isoformat()
        )
    else:
        since = exchange.parse8601(since)

    all_ohlcv = []
    limit = 1000

    while True:
        try:
            ohlcv = exchange.fetch_ohlcv(symbol, timeframe, since, limit)
            if len(ohlcv) == 0:
                break
            all_ohlcv.extend(ohlcv)
            since = ohlcv[-1][0] + 1
            time.sleep(exchange.rateLimit / 1000)
            if len(all_ohlcv) % 5000 == 0:
                print(f"  📥 Fetched {len(all_ohlcv)} candles...")
        except Exception as e:
            print(f"❌ Error: {e}")
            break

    df = pd.DataFrame(
        all_ohlcv, columns=["timestamp", "open", "high", "low", "close", "volume"]
    )
    df["timestamp"] = pd.to_datetime(df["timestamp"], unit="ms", utc=True)
    df = df.set_index("timestamp").sort_index()
    df = df[~df.index.duplicated(keep="last")]

    print(
        f"✅ Loaded {len(df)} 1-minute candles from {df.index[0]} to {df.index[-1]} using up-to-date data"
    )
    return df


def get_all_candles():
    df_hist = load_eth_historical_data()
    df_current = fetch_eth_data(since="2025-06-01T00:00:00Z")
    df = pd.concat([df_hist, df_current])
    df = df.sort_index()
    df = df[~df.index.duplicated(keep="last")]
    print(
        f"✅ Loaded {len(df)} 1-minute candles from {df.index[0]} to {df.index[-1]} using historical data"
    )
    return df


df = get_all_candles()

  📥 Fetched 5000 candles...
  📥 Fetched 10000 candles...
  📥 Fetched 15000 candles...
  📥 Fetched 20000 candles...
  📥 Fetched 25000 candles...
  📥 Fetched 30000 candles...
  📥 Fetched 35000 candles...
  📥 Fetched 40000 candles...
✅ Loaded 41565 1-minute candles from 2025-06-01 00:00:00+00:00 to 2025-06-29 20:44:00+00:00 using up-to-date data
✅ Loaded 259005 1-minute candles from 2025-01-01 00:00:00+00:00 to 2025-06-29 20:44:00+00:00 using historical data


In [3]:
# Create resampled dataframes (use your existing ones if available)
if "df" in locals() and df is not None:
    print("🔄 Creating resampled timeframes...")

    # Resample to different timeframes
    df_5min = (
        df.resample("5min")
        .agg(
            {
                "open": "first",
                "high": "max",
                "low": "min",
                "close": "last",
                "volume": "sum",
            }
        )
        .dropna()
    )

    df_15min = (
        df.resample("15min")
        .agg(
            {
                "open": "first",
                "high": "max",
                "low": "min",
                "close": "last",
                "volume": "sum",
            }
        )
        .dropna()
    )

    df_1h = (
        df.resample("1H")
        .agg(
            {
                "open": "first",
                "high": "max",
                "low": "min",
                "close": "last",
                "volume": "sum",
            }
        )
        .dropna()
    )

    print("✅ Resampled data summary:")
    print(f"   📊 1-min: {len(df):,} candles")
    print(f"   📊 5-min: {len(df_5min):,} candles")
    print(f"   📊 15-min: {len(df_15min):,} candles")
    print(f"   📊 1-hour: {len(df_1h):,} candles")

    print("\n📋 Recent 1-hour data:")
    display(df_1h.tail())

    # Show basic stats
    latest_price = df_1h.iloc[-1]["close"]
    price_change_24h = (
        (df_1h.iloc[-1]["close"] - df_1h.iloc[-25]["close"]) / df_1h.iloc[-25]["close"]
    ) * 100
    print(f"\n💰 Current ETH Price: ${latest_price:,.2f}")
    print(f"📈 24h Change: {price_change_24h:+.2f}%")

else:
    print(
        "⚠️  Please load 'df' first by running the fetch_eth_data function or use your existing dataframe"
    )
    print("    Example: df = fetch_eth_data(days=14)")

🔄 Creating resampled timeframes...
✅ Resampled data summary:
   📊 1-min: 258,994 candles
   📊 5-min: 51,799 candles
   📊 15-min: 17,267 candles
   📊 1-hour: 4,317 candles

📋 Recent 1-hour data:


Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-06-29 16:00:00+00:00,2436.69,2440.18,2429.58,2439.06,9579.2707
2025-06-29 17:00:00+00:00,2439.07,2440.44,2432.63,2439.14,3744.594
2025-06-29 18:00:00+00:00,2439.14,2442.55,2434.16,2437.84,4724.2172
2025-06-29 19:00:00+00:00,2437.85,2439.59,2433.58,2437.96,2987.5391
2025-06-29 20:00:00+00:00,2437.97,2439.0,2433.2,2435.62,1728.3008



💰 Current ETH Price: $2,435.62
📈 24h Change: +0.25%


## 🔄 Historical Analysis: Candle Flipping Behavior

Analyze how often candles change direction between the 45-minute mark and close. This gives us insights into market behavior and the predictability of hourly candle outcomes.


In [4]:
def analyze_candle_flipping(df_1h, df_15min):
    """Analyze how often candles flip direction between 45min and close"""
    results = []

    print("🔍 Analyzing candle flipping behavior...")

    for i, ts in enumerate(df_1h.index):
        if (len(df_1h) <= 1000 and i % 100 == 0) or (
            len(df_1h) > 1000 and i % 1000 == 0
        ):
            print(f"   Processing candle {i + 1}/{len(df_1h)}")

        open_price = df_1h.loc[ts, "open"]
        close_price = df_1h.loc[ts, "close"]

        # Find 45-min mark price - this should be the CLOSE of the 3rd 15-min candle
        # 15-min candles in an hour: [0-15], [15-30], [30-45], [45-60]
        # We want the close price of the [30-45] candle, which ends at the 45-min mark
        ts_30 = ts + pd.Timedelta(minutes=30)  # Start of 3rd 15-min candle
        ts_45 = ts + pd.Timedelta(minutes=45)  # End of 3rd 15-min candle (45-min mark)

        # Find the 15-min candle that starts at the 30-minute mark
        # This candle closes at the 45-minute mark, giving us the price at 45 minutes
        matching_candles = df_15min.index[
            (df_15min.index >= ts_30) & (df_15min.index < ts_45)
        ]

        if len(matching_candles) > 0:
            # Use the close price of this candle (which represents the price at 45-min mark)
            price_45 = df_15min.loc[matching_candles[0], "close"]
        else:
            # Fallback: find all 15-min candles in the first 45 minutes and take the last one
            first_45_candles = df_15min.index[
                (df_15min.index >= ts) & (df_15min.index < ts_45)
            ]
            if len(first_45_candles) >= 3:
                # Take the close of the 3rd candle (index 2) which should end at 45 minutes
                price_45 = df_15min.loc[first_45_candles[2], "close"]
            else:
                # Skip this hour if we don't have enough data
                continue

        # Calculate changes
        delta_45 = price_45 - open_price
        delta_close = close_price - open_price

        direction_45 = "up" if delta_45 > 0 else "down" if delta_45 < 0 else "flat"
        direction_close = (
            "up" if delta_close > 0 else "down" if delta_close < 0 else "flat"
        )
        flipped = direction_45 != direction_close

        results.append(
            {
                "timestamp": ts,
                "open": open_price,
                "price_at_45min": price_45,
                "close": close_price,
                "delta_45_pct": (delta_45 / open_price) * 100,
                "delta_close_pct": (delta_close / open_price) * 100,
                "direction_45": direction_45,
                "direction_close": direction_close,
                "flipped": flipped,
                "green_candle": close_price > open_price,
                "hour_of_day": ts.hour,
                "day_of_week": ts.day_name(),
            }
        )

    return pd.DataFrame(results)


# Analyze historical behavior
if "df_1h" in locals():
    flip_df = analyze_candle_flipping(df_1h, df_15min)

    # Print comprehensive statistics
    total_candles = len(flip_df)
    green_candles = flip_df["green_candle"].sum()
    flip_rate = flip_df["flipped"].mean()

    print(f"\n📊 Historical Analysis ({total_candles:,} candles):")
    print(
        f"   🟢 Green candles: {green_candles:,} ({green_candles / total_candles:.1%})"
    )
    print(
        f"   🔴 Red candles: {total_candles - green_candles:,} ({(total_candles - green_candles) / total_candles:.1%})"
    )
    print(f"   🔄 Overall flip rate: {flip_rate:.1%}")

    # Analyze flip rates by different thresholds
    print("\n🎯 Flip Rate Analysis by Magnitude:")
    for threshold in [0.1, 0.25, 0.5, 1.0]:
        subset_up = flip_df[flip_df["delta_45_pct"] > threshold]
        subset_down = flip_df[flip_df["delta_45_pct"] < -threshold]

        if len(subset_up) > 0:
            flip_rate_up = subset_up["flipped"].mean()
            print(
                f"   📈 Up >{threshold:>4}% at 45min: {flip_rate_up:.1%} flip rate ({len(subset_up):,} samples)"
            )

        if len(subset_down) > 0:
            flip_rate_down = subset_down["flipped"].mean()
            print(
                f"   📉 Down <-{threshold:>3}% at 45min: {flip_rate_down:.1%} flip rate ({len(subset_down):,} samples)"
            )

    # Time-based analysis
    print("\n🕐 Flip Rate by Hour of Day (UTC):")
    hourly_flip = flip_df.groupby("hour_of_day")["flipped"].agg(["mean", "count"])
    for hour in range(24):
        if hour in hourly_flip.index:
            rate = hourly_flip.loc[hour, "mean"]
            count = hourly_flip.loc[hour, "count"]
            print(
                f"   {hour:2d}:00-{hour:2d}:59  {rate:.1%} flip rate ({count:,} candles)"
            )

    print("\n📋 Sample of flip analysis data:")
    display(flip_df.head(10))

    # Validation: Check that price_at_45min is different from close price
    same_price_count = (flip_df["price_at_45min"] == flip_df["close"]).sum()
    different_price_count = (flip_df["price_at_45min"] != flip_df["close"]).sum()

    print("\n✅ Validation Check:")
    print(
        f"   📊 Same price at 45min and close: {same_price_count:,} ({same_price_count / len(flip_df):.1%})"
    )
    print(
        f"   📊 Different price at 45min and close: {different_price_count:,} ({different_price_count / len(flip_df):.1%})"
    )

    if (
        same_price_count > len(flip_df) * 0.8
    ):  # If >80% have same price, something is still wrong
        print("⚠️  Warning: Most 45-min prices are same as close prices - check logic!")
    else:
        print(
            "✅ Logic appears correct - 45-min prices are properly different from close prices"
        )

else:
    print("⚠️  Please ensure df_1h is available first")

🔍 Analyzing candle flipping behavior...
   Processing candle 1/4317
   Processing candle 1001/4317
   Processing candle 2001/4317
   Processing candle 3001/4317
   Processing candle 4001/4317

📊 Historical Analysis (4,317 candles):
   🟢 Green candles: 2,206 (51.1%)
   🔴 Red candles: 2,111 (48.9%)
   🔄 Overall flip rate: 14.7%

🎯 Flip Rate Analysis by Magnitude:
   📈 Up > 0.1% at 45min: 9.7% flip rate (1,838 samples)
   📉 Down <-0.1% at 45min: 9.4% flip rate (1,726 samples)
   📈 Up >0.25% at 45min: 5.2% flip rate (1,332 samples)
   📉 Down <-0.25% at 45min: 5.2% flip rate (1,219 samples)
   📈 Up > 0.5% at 45min: 1.9% flip rate (726 samples)
   📉 Down <-0.5% at 45min: 1.8% flip rate (679 samples)
   📈 Up > 1.0% at 45min: 0.9% flip rate (214 samples)
   📉 Down <-1.0% at 45min: 1.4% flip rate (222 samples)

🕐 Flip Rate by Hour of Day (UTC):
    0:00- 0:59  12.2% flip rate (180 candles)
    1:00- 1:59  13.3% flip rate (180 candles)
    2:00- 2:59  17.8% flip rate (180 candles)
    3:00- 3:59

Unnamed: 0,timestamp,open,price_at_45min,close,delta_45_pct,delta_close_pct,direction_45,direction_close,flipped,green_candle,hour_of_day,day_of_week
0,2025-01-01 00:00:00+00:00,3337.78,3356.09,3363.7,0.548568,0.776564,up,up,False,True,0,Wednesday
1,2025-01-01 01:00:00+00:00,3363.69,3352.46,3346.54,-0.33386,-0.509857,down,down,False,False,1,Wednesday
2,2025-01-01 02:00:00+00:00,3346.54,3357.78,3362.61,0.335869,0.480197,up,up,False,True,2,Wednesday
3,2025-01-01 03:00:00+00:00,3362.61,3355.95,3355.2,-0.19806,-0.220365,down,down,False,False,3,Wednesday
4,2025-01-01 04:00:00+00:00,3355.2,3345.51,3341.14,-0.288805,-0.419051,down,down,False,False,4,Wednesday
5,2025-01-01 05:00:00+00:00,3341.14,3348.09,3345.41,0.208013,0.127801,up,up,False,True,5,Wednesday
6,2025-01-01 06:00:00+00:00,3345.41,3344.48,3346.6,-0.027799,0.035571,down,up,True,True,6,Wednesday
7,2025-01-01 07:00:00+00:00,3346.61,3344.02,3347.12,-0.077392,0.015239,down,up,True,True,7,Wednesday
8,2025-01-01 08:00:00+00:00,3347.11,3339.69,3337.01,-0.221684,-0.301753,down,down,False,False,8,Wednesday
9,2025-01-01 09:00:00+00:00,3337.0,3327.16,3334.64,-0.294876,-0.070722,down,down,False,False,9,Wednesday



✅ Validation Check:
   📊 Same price at 45min and close: 3 (0.1%)
   📊 Different price at 45min and close: 4,314 (99.9%)
✅ Logic appears correct - 45-min prices are properly different from close prices


In [5]:
# 📊 Interactive Plotly Visualizations for Flip Behavior
if "flip_df" in locals():
    # 1. Flip behavior histogram
    fig1 = px.histogram(
        flip_df,
        x="delta_45_pct",
        color="flipped",
        nbins=50,
        title="🔄 Candle Flip Behavior by 45-Minute Change",
        labels={"delta_45_pct": "45-Minute Change (%)", "count": "Number of Candles"},
        color_discrete_map={True: "#ff6b6b", False: "#4ecdc4"},
        height=500,
    )
    fig1.update_layout(
        xaxis_title="45-Minute Change (%)",
        yaxis_title="Number of Candles",
        legend_title="Flipped Direction",
    )
    fig1.show()

    # 2. Green vs Red distribution
    fig2 = px.histogram(
        flip_df,
        x="delta_45_pct",
        color="green_candle",
        nbins=50,
        title="🟢🔴 Green vs Red Candles by 45-Minute Change",
        labels={"delta_45_pct": "45-Minute Change (%)", "count": "Number of Candles"},
        color_discrete_map={True: "#00ff00", False: "#ff0000"},
        height=500,
    )
    fig2.update_layout(
        xaxis_title="45-Minute Change (%)",
        yaxis_title="Number of Candles",
        legend_title="Candle Color",
    )
    fig2.show()

    # 3. Flip rate by hour of day
    hourly_stats = (
        flip_df.groupby("hour_of_day")
        .agg({"flipped": ["mean", "count"], "green_candle": "mean"})
        .round(3)
    )
    hourly_stats.columns = ["flip_rate", "count", "green_rate"]
    hourly_stats = hourly_stats.reset_index()

    fig3 = px.bar(
        hourly_stats,
        x="hour_of_day",
        y="flip_rate",
        title="🕐 Flip Rate by Hour of Day",
        labels={"hour_of_day": "Hour of Day", "flip_rate": "Flip Rate"},
        color="flip_rate",
        color_continuous_scale="RdYlBu_r",
        height=400,
    )
    fig3.update_layout(
        xaxis_title="Hour of Day (UTC)",
        yaxis_title="Flip Rate",
        xaxis=dict(tickmode="linear"),
    )
    fig3.show()

    # 4. Scatter plot: 45-min change vs final change
    fig4 = px.scatter(
        flip_df.sample(min(2000, len(flip_df))),  # Sample for performance
        x="delta_45_pct",
        y="delta_close_pct",
        color="flipped",
        title="📈 45-Minute Change vs Final Change (Sampled)",
        labels={
            "delta_45_pct": "45-Minute Change (%)",
            "delta_close_pct": "Final Change (%)",
        },
        color_discrete_map={True: "#ff6b6b", False: "#4ecdc4"},
        opacity=0.6,
        height=500,
    )
    # Add diagonal line (y=x) to show where no change would occur
    fig4.add_shape(
        type="line",
        x0=-3,
        y0=-3,
        x1=3,
        y1=3,
        line=dict(color="gray", width=2, dash="dash"),
    )
    fig4.update_layout(
        xaxis_title="45-Minute Change (%)",
        yaxis_title="Final Change (%)",
        legend_title="Flipped Direction",
    )
    fig4.show()

    # 5. Day of week analysis
    daily_stats = (
        flip_df.groupby("day_of_week")
        .agg({"flipped": "mean", "green_candle": "mean"})
        .round(3)
    )
    daily_stats = daily_stats.reindex(
        ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    )
    daily_stats = daily_stats.reset_index()

    fig5 = px.bar(
        daily_stats,
        x="day_of_week",
        y="flipped",
        title="📅 Flip Rate by Day of Week",
        labels={"day_of_week": "Day of Week", "flipped": "Flip Rate"},
        color="flipped",
        color_continuous_scale="viridis",
        height=400,
    )
    fig5.update_layout(xaxis_title="Day of Week", yaxis_title="Flip Rate")
    fig5.show()

    print("✅ All flip behavior visualizations completed!")

else:
    print("⚠️  Please run the flip analysis first")

✅ All flip behavior visualizations completed!


In [7]:
# 🏋️ Train different models
if "df" in locals():
    models = {}
    performances = {}

    print("🤖 Training machine learning models...")
    print("⏳ This may take a few minutes depending on data size...")

    for model_type in ["random_forest", "gradient_boost", "logistic"]:
        print(f"\n🔄 Training {model_type} model...")

        try:
            predictor = EthCandlePredictor(model_type=model_type)
            performance = predictor.train(df, test_size=0.2)

            models[model_type] = predictor
            performances[model_type] = performance

            print(f"   ✅ Accuracy: {performance.accuracy:.3f}")
            print(f"   ✅ AUC: {performance.auc_score:.3f}")
            print(f"   ✅ Features used: {len(predictor.feature_names)}")

        except Exception as e:
            print(f"   ❌ Error training {model_type}: {e}")

    if models:
        print(f"\n🎉 Successfully trained {len(models)} models!")

        # Create performance comparison dataframe
        performance_df = pd.DataFrame(
            {
                "Model": list(performances.keys()),
                "Accuracy": [p.accuracy for p in performances.values()],
                "Precision": [p.precision for p in performances.values()],
                "Recall": [p.recall for p in performances.values()],
                "F1_Score": [p.f1_score for p in performances.values()],
                "AUC": [p.auc_score for p in performances.values()],
            }
        )

        print("\n📊 Model Performance Comparison:")
        display(performance_df.round(3))

        # Identify best model
        best_model_name = performance_df.loc[performance_df["AUC"].idxmax(), "Model"]
        best_auc = performance_df["AUC"].max()
        print(f"\n🏆 Best performing model: {best_model_name} (AUC: {best_auc:.3f})")

    else:
        print("❌ No models were successfully trained")

else:
    print("⚠️  Please load data first by running the data loading cells")

🤖 Training machine learning models...
⏳ This may take a few minutes depending on data size...

🔄 Training random_forest model...


No data close to 45-minute mark for 2025-06-29 20:00:00+00:00


   ✅ Accuracy: 0.844
   ✅ AUC: 0.927
   ✅ Features used: 39

🔄 Training gradient_boost model...


No data close to 45-minute mark for 2025-06-29 20:00:00+00:00


   ✅ Accuracy: 0.826
   ✅ AUC: 0.922
   ✅ Features used: 39

🔄 Training logistic model...


No data close to 45-minute mark for 2025-06-29 20:00:00+00:00


   ✅ Accuracy: 0.848
   ✅ AUC: 0.927
   ✅ Features used: 39

🎉 Successfully trained 3 models!

📊 Model Performance Comparison:


Unnamed: 0,Model,Accuracy,Precision,Recall,F1_Score,AUC
0,random_forest,0.844,0.851,0.845,0.848,0.927
1,gradient_boost,0.826,0.836,0.825,0.83,0.922
2,logistic,0.848,0.867,0.834,0.85,0.927



🏆 Best performing model: logistic (AUC: 0.927)


In [8]:
# 🔮 Make predictions on recent data
if "models" in locals() and "df_1h" in locals() and len(models) > 0:
    recent_hours = df_1h.index[-15:]  # Last 15 hours for better analysis
    predictions = []

    print("🔮 Making predictions on recent hours...")
    print("⏳ Using best performing model for predictions...\n")

    # Use best performing model
    best_model_name = performance_df.loc[performance_df["AUC"].idxmax(), "Model"]
    best_model = models[best_model_name]
    print(f"🏆 Using best model: {best_model_name}")
    print("=" * 70)

    successful_predictions = 0

    for i, hour_start in enumerate(recent_hours):
        try:
            # Make prediction
            pred = best_model.predict(df, hour_start)

            # Get actual result
            actual_close = df_1h.loc[hour_start, "close"]
            actual_open = df_1h.loc[hour_start, "open"]
            actual_direction = "green" if actual_close > actual_open else "red"
            actual_change = (actual_close - actual_open) / actual_open * 100

            correct = pred.predicted_direction == actual_direction

            predictions.append(
                {
                    "timestamp": hour_start,
                    "predicted": pred.predicted_direction,
                    "actual": actual_direction,
                    "confidence": pred.confidence,
                    "prob_green": pred.probability_green,
                    "prob_red": pred.probability_red,
                    "actual_change": actual_change,
                    "correct": correct,
                    "open_price": actual_open,
                    "close_price": actual_close,
                }
            )

            # Emoji indicators
            status = "✅" if correct else "❌"
            confidence_emoji = (
                "🔥"
                if pred.confidence > 0.7
                else "⚡"
                if pred.confidence > 0.6
                else "💡"
            )
            direction_emoji = "🟢" if pred.predicted_direction == "green" else "🔴"

            print(
                f"{hour_start.strftime('%m-%d %H:%M')} | "
                f"{direction_emoji} Pred: {pred.predicted_direction:>5} {confidence_emoji}({pred.confidence:.1%}) | "
                f"Actual: {actual_direction:>5} ({actual_change:+.2f}%) {status}"
            )

            successful_predictions += 1

        except Exception as e:
            print(f"❌ Failed to predict for {hour_start}: {e}")

    if predictions:
        accuracy = sum(p["correct"] for p in predictions) / len(predictions)
        avg_confidence = np.mean([p["confidence"] for p in predictions])

        print("=" * 70)
        print("📊 PREDICTION SUMMARY:")
        print(
            f"   🎯 Accuracy: {accuracy:.1%} ({sum(p['correct'] for p in predictions)}/{len(predictions)})"
        )
        print(f"   💪 Average Confidence: {avg_confidence:.1%}")
        print(
            f"   ✅ Successful Predictions: {successful_predictions}/{len(recent_hours)}"
        )

        # Additional stats
        correct_preds = [p for p in predictions if p["correct"]]
        if correct_preds:
            avg_correct_confidence = np.mean([p["confidence"] for p in correct_preds])
            print(f"   🔥 Avg Confidence (Correct): {avg_correct_confidence:.1%}")

        incorrect_preds = [p for p in predictions if not p["correct"]]
        if incorrect_preds:
            avg_incorrect_confidence = np.mean(
                [p["confidence"] for p in incorrect_preds]
            )
            print(f"   💔 Avg Confidence (Wrong): {avg_incorrect_confidence:.1%}")

        print("\n📋 Detailed predictions saved to 'predictions' variable")

else:
    if "models" not in locals() or len(models) == 0:
        print("⚠️  Please train models first")
    else:
        print("⚠️  Please ensure hourly data is available")

No data close to 45-minute mark for 2025-06-29 20:00:00+00:00


🔮 Making predictions on recent hours...
⏳ Using best performing model for predictions...

🏆 Using best model: logistic
06-29 06:00 | 🟢 Pred: green 💡(57.4%) | Actual: green (+0.13%) ✅
06-29 07:00 | 🟢 Pred: green 💡(56.1%) | Actual: green (+0.15%) ✅
06-29 08:00 | 🟢 Pred: green 🔥(89.5%) | Actual: green (+0.13%) ✅
06-29 09:00 | 🟢 Pred: green 🔥(90.5%) | Actual: green (+0.19%) ✅
06-29 10:00 | 🔴 Pred:   red 💡(53.7%) | Actual: green (+0.33%) ❌
06-29 11:00 | 🟢 Pred: green 🔥(89.6%) | Actual: green (+0.24%) ✅
06-29 12:00 | 🔴 Pred:   red 🔥(96.6%) | Actual:   red (-0.30%) ✅
06-29 13:00 | 🔴 Pred:   red 🔥(78.9%) | Actual:   red (-0.32%) ✅
06-29 14:00 | 🔴 Pred:   red 🔥(90.1%) | Actual:   red (-0.28%) ✅
06-29 15:00 | 🔴 Pred:   red ⚡(62.0%) | Actual:   red (-0.14%) ✅
06-29 16:00 | 🔴 Pred:   red 🔥(77.0%) | Actual: green (+0.10%) ❌
06-29 17:00 | 🔴 Pred:   red 🔥(80.7%) | Actual: green (+0.00%) ❌
06-29 18:00 | 🟢 Pred: green 🔥(70.2%) | Actual:   red (-0.05%) ❌
06-29 19:00 | 🔴 Pred:   red ⚡(66.5%) | Actual: gr

In [9]:
# 🔍 Feature Importance Analysis (Plotly Visualization)
if "models" in locals() and len(models) > 0:
    print("🔍 Analyzing feature importance...")

    # Use Random Forest for feature importance (it provides the most interpretable results)
    if "random_forest" in models:
        rf_model = models["random_forest"]

        if hasattr(rf_model.model, "feature_importances_"):
            feature_importance = pd.DataFrame(
                {
                    "feature": rf_model.feature_names,
                    "importance": rf_model.model.feature_importances_,
                }
            ).sort_values("importance", ascending=False)

            print(f"✅ Extracted {len(feature_importance)} feature importances")

            # Create interactive Plotly bar chart for top features
            top_features = feature_importance.head(20)  # Top 20 features

            fig_importance = px.bar(
                top_features,
                x="importance",
                y="feature",
                orientation="h",
                title="🎯 Top 20 Most Important Features (Random Forest)",
                labels={"importance": "Feature Importance", "feature": "Feature Name"},
                color="importance",
                color_continuous_scale="viridis",
                height=700,
            )

            # Customize layout
            fig_importance.update_layout(
                yaxis={"categoryorder": "total ascending"},
                xaxis_title="Feature Importance",
                yaxis_title="Feature Name",
                showlegend=False,
                margin=dict(l=200),  # More space for feature names
            )

            fig_importance.show()

            # Show top 10 in a table
            print("\n📊 Top 10 Most Important Features:")
            display(feature_importance.head(10).round(4))

            # Categorize features
            feature_categories = {
                "price": ["open", "high", "low", "close", "price_45", "price_change"],
                "volume": ["volume", "vwap", "volume_sma"],
                "technical": ["rsi", "macd", "bb_", "sma", "ema"],
                "volatility": ["atr", "volatility", "std"],
                "time": ["hour", "minute", "day_of_week"],
            }

            # Analyze feature categories
            category_importance = {}
            for category, keywords in feature_categories.items():
                category_features = feature_importance[
                    feature_importance["feature"].str.contains(
                        "|".join(keywords), case=False
                    )
                ]
                if len(category_features) > 0:
                    category_importance[category] = category_features[
                        "importance"
                    ].sum()

            if category_importance:
                category_df = pd.DataFrame(
                    list(category_importance.items()),
                    columns=["Category", "Total_Importance"],
                )
                category_df = category_df.sort_values(
                    "Total_Importance", ascending=False
                )

                # Create category importance chart
                fig_category = px.pie(
                    category_df,
                    values="Total_Importance",
                    names="Category",
                    title="📈 Feature Importance by Category",
                    color_discrete_sequence=px.colors.qualitative.Set3,
                    height=500,
                )

                fig_category.update_traces(
                    textposition="inside", textinfo="percent+label"
                )

                fig_category.show()

                print("\n🏷️  Feature Category Analysis:")
                for i, row in category_df.iterrows():
                    print(f"   {row['Category']:>12}: {row['Total_Importance']:.3f}")

        else:
            print(
                "❌ Model doesn't have feature importances (might be logistic regression)"
            )
    else:
        print("❌ Random Forest model not available for feature importance analysis")
else:
    print("⚠️  Please train models first")

🔍 Analyzing feature importance...
✅ Extracted 39 feature importances



📊 Top 10 Most Important Features:


Unnamed: 0,feature,importance
4,price_change_abs,0.2034
5,price_change_pct,0.2023
8,upward_momentum,0.1152
35,momentum_15min,0.1116
7,position_in_range,0.0447
9,upper_half,0.0391
22,macd_signal,0.0373
21,macd,0.0274
26,shooting_star_like,0.0205
25,hammer_like,0.0188



🏷️  Feature Category Analysis:
          price: 0.420
      technical: 0.079
         volume: 0.027
           time: 0.007
     volatility: 0.007


In [10]:
# 📊 Comprehensive Interactive Dashboard
if "predictions" in locals() and "performance_df" in locals() and len(predictions) > 0:
    print("🎨 Creating comprehensive Plotly dashboard...")

    # Create subplots dashboard
    fig = make_subplots(
        rows=3,
        cols=2,
        subplot_titles=(
            "📈 Recent ETH Price (1H Candlesticks)",
            "🏆 Model Performance Comparison",
            "🎯 Prediction Confidence Over Time",
            "💰 Prediction vs Actual Price Changes",
            "📊 Model Performance Metrics",
            "🕐 Prediction Accuracy by Hour",
        ),
        specs=[
            [{"type": "candlestick"}, {"type": "bar"}],
            [{"type": "scatter"}, {"type": "scatter"}],
            [{"type": "bar"}, {"type": "bar"}],
        ],
        vertical_spacing=0.08,
        horizontal_spacing=0.1,
    )

    # 1. Recent price action (candlestick)
    recent_data = df_1h.tail(48)  # Last 48 hours
    fig.add_trace(
        go.Candlestick(
            x=recent_data.index,
            open=recent_data["open"],
            high=recent_data["high"],
            low=recent_data["low"],
            close=recent_data["close"],
            name="ETH Price",
            showlegend=False,
        ),
        row=1,
        col=1,
    )

    # 2. Model performance comparison
    fig.add_trace(
        go.Bar(
            x=performance_df["Model"],
            y=performance_df["AUC"],
            name="AUC Score",
            marker_color=["#1f77b4", "#ff7f0e", "#2ca02c"][: len(performance_df)],
            showlegend=False,
            text=performance_df["AUC"].round(3),
            textposition="outside",
        ),
        row=1,
        col=2,
    )

    # 3. Prediction confidence over time
    pred_df = pd.DataFrame(predictions)
    colors = ["#00ff00" if correct else "#ff0000" for correct in pred_df["correct"]]
    fig.add_trace(
        go.Scatter(
            x=pred_df["timestamp"],
            y=pred_df["confidence"],
            mode="markers+lines",
            marker=dict(color=colors, size=10),
            line=dict(color="gray", width=1),
            name="Confidence",
            showlegend=False,
        ),
        row=2,
        col=1,
    )

    # 4. Predicted vs actual price changes
    symbols = ["circle" if correct else "x" for correct in pred_df["correct"]]
    pred_colors = [
        "#00ff00" if pred == "green" else "#ff0000" for pred in pred_df["predicted"]
    ]
    fig.add_trace(
        go.Scatter(
            x=pred_df["timestamp"],
            y=pred_df["actual_change"],
            mode="markers",
            marker=dict(
                color=pred_colors,
                size=12,
                symbol=symbols,
                line=dict(width=2, color="black"),
            ),
            name="Predictions",
            showlegend=False,
        ),
        row=2,
        col=2,
    )

    # 5. Model performance metrics comparison
    metrics = ["Accuracy", "Precision", "Recall", "F1_Score"]

    for i, metric in enumerate(metrics):
        fig.add_trace(
            go.Bar(
                x=performance_df["Model"],
                y=performance_df[metric],
                name=metric,
                offsetgroup=i,
                showlegend=True if i < 2 else False,  # Only show legend for first 2
                marker_color=px.colors.qualitative.Set1[i],
            ),
            row=3,
            col=1,
        )

    # 6. Prediction accuracy by hour (if we have enough data)
    if len(pred_df) > 5:
        pred_df["hour"] = pred_df["timestamp"].dt.hour
        hourly_accuracy = (
            pred_df.groupby("hour")["correct"].agg(["mean", "count"]).reset_index()
        )
        hourly_accuracy.columns = ["hour", "accuracy", "count"]

        fig.add_trace(
            go.Bar(
                x=hourly_accuracy["hour"],
                y=hourly_accuracy["accuracy"],
                name="Hourly Accuracy",
                marker_color="lightblue",
                showlegend=False,
                text=hourly_accuracy["count"].astype(str) + " samples",
                textposition="outside",
            ),
            row=3,
            col=2,
        )
        # Update layout
        fig.update_layout(
            height=1000,
            title_text="🎯 ETH Candle Prediction Analysis Dashboard",
            title_x=0.5,
            title_font_size=20,
            showlegend=True,
            legend=dict(
                orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1
            ),
        )
        # Update axis labels
        fig.update_xaxes(title_text="Time", row=1, col=1)
        fig.update_yaxes(title_text="Price ($)", row=1, col=1)

        fig.update_xaxes(title_text="Model", row=1, col=2)
        fig.update_yaxes(title_text="AUC Score", row=1, col=2)

        fig.update_xaxes(title_text="Time", row=2, col=1)
        fig.update_yaxes(title_text="Confidence", row=2, col=1)

        fig.update_xaxes(title_text="Time", row=2, col=2)
        fig.update_yaxes(title_text="Price Change (%)", row=2, col=2)

        fig.update_xaxes(title_text="Model", row=3, col=1)
        fig.update_yaxes(title_text="Score", row=3, col=1)

        fig.update_xaxes(title_text="Hour (UTC)", row=3, col=2)
        fig.update_yaxes(title_text="Accuracy", row=3, col=2)

        fig.show()
        print("✅ Interactive dashboard created successfully!")

    else:
        if "predictions" not in locals():
            print("⚠️  Please run predictions first")
        elif len(predictions) == 0:
            print("⚠️  No predictions available for dashboard")
        else:
            print("⚠️  Please ensure performance_df is available")

🎨 Creating comprehensive Plotly dashboard...


✅ Interactive dashboard created successfully!


In [15]:
# 🔮 Live Prediction Function
def make_live_prediction(models):
    """Make a prediction for the current hour based on available data"""
    if "models" not in locals() or len(models) == 0:
        print("❌ Please train models first")
        return None

    current_time = pd.Timestamp.now(tz="UTC")
    current_hour_start = current_time.floor("H")

    # Check if we have at least 45 minutes of data for current hour
    minutes_elapsed = (current_time - current_hour_start).total_seconds() / 60

    if minutes_elapsed >= 45:
        try:
            # Use best model
            best_model_name = performance_df.loc[
                performance_df["AUC"].idxmax(), "Model"
            ]
            best_model = models[best_model_name]

            # Make prediction
            pred = best_model.predict(df, current_hour_start)

            print(
                f"🔮 LIVE PREDICTION for {current_hour_start.strftime('%Y-%m-%d %H:%M')} hour:"
            )
            print(f"  📈 Predicted Direction: {pred.predicted_direction.upper()}")
            print(f"  🎯 Confidence: {pred.confidence:.1%}")
            print(f"  🟢 Probability Green: {pred.probability_green:.1%}")
            print(f"  🔴 Probability Red: {pred.probability_red:.1%}")
            print(f"  🤖 Model Used: {pred.model_name}")
            print(f"  ⏰ Time Remaining: {60 - minutes_elapsed:.0f} minutes")

            # Create a simple live prediction chart
            fig_live = go.Figure()

            fig_live.add_trace(
                go.Indicator(
                    mode="gauge+number",
                    value=pred.confidence * 100,
                    domain={"x": [0, 1], "y": [0, 1]},
                    title={
                        "text": f"Prediction Confidence<br>{pred.predicted_direction.title()} Candle"
                    },
                    gauge={
                        "axis": {"range": [None, 100]},
                        "bar": {
                            "color": "green"
                            if pred.predicted_direction == "green"
                            else "red"
                        },
                        "steps": [
                            {"range": [0, 50], "color": "lightgray"},
                            {"range": [50, 75], "color": "yellow"},
                            {"range": [75, 100], "color": "lightgreen"},
                        ],
                        "threshold": {
                            "line": {"color": "black", "width": 4},
                            "thickness": 0.75,
                            "value": 70,
                        },
                    },
                )
            )

            fig_live.update_layout(
                title=f"Live Prediction - {current_hour_start.strftime('%H:%M UTC')}",
                height=400,
            )
            fig_live.show()

            return pred

        except Exception as e:
            print(f"❌ Cannot make live prediction: {e}")
            return None
    else:
        print(f"⏳ Current hour has only {minutes_elapsed:.0f} minutes of data.")
        print("   Need at least 45 minutes for prediction.")
        return None


# Try to make a live prediction
print("🕐 Checking for live prediction opportunity...")
live_pred = make_live_prediction(models)

🕐 Checking for live prediction opportunity...
🔮 LIVE PREDICTION for 2025-06-29 20:00 hour:
  📈 Predicted Direction: RED
  🎯 Confidence: 92.2%
  🟢 Probability Green: 7.8%
  🔴 Probability Red: 92.2%
  🤖 Model Used: logistic
  ⏰ Time Remaining: 15 minutes
