# ETH Candle Direction Prediction Analysis

This notebook provides interactive analysis for predicting 1-hour ETH candle direction based on the first 45 minutes of data.

## Overview
- Load and prepare OHLCV data at multiple timeframes
- Extract features from first 45 minutes using statistical methods
- Train machine learning models to predict candle direction  
- Make real-time predictions with confidence scores
- Interactive Plotly visualizations and dashboards


In [4]:
import pandas as pd
import numpy as np
import ccxt
import time
import warnings
from datetime import datetime, timezone, timedelta
import logging
import os

# Plotly for all visualizations
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

# ML libraries
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Import our custom predictor
from src.eth_candle_predictor import EthCandlePredictor, PredictionResult

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)

print("📊 Libraries imported successfully!")
print("🎨 Using Plotly for all interactive visualizations")

📊 Libraries imported successfully!
🎨 Using Plotly for all interactive visualizations


## 📈 Data Loading and Preparation

Using your existing approach to fetch and prepare ETH data at multiple timeframes. You can either fetch new data or use existing dataframes.


In [8]:
def fetch_eth_data(days=14):
    """Fetch ETH/USDT data - similar to your existing approach"""
    exchange = ccxt.binance()
    symbol = "ETH/USDT"
    timeframe = "1m"
    since = exchange.parse8601(
        (datetime.now(timezone.utc) - timedelta(days=days)).isoformat()
    )

    all_ohlcv = []
    limit = 1000

    print(f"🔄 Fetching {days} days of ETH data...")
    while True:
        try:
            ohlcv = exchange.fetch_ohlcv(symbol, timeframe, since, limit)
            if len(ohlcv) == 0:
                break
            all_ohlcv.extend(ohlcv)
            since = ohlcv[-1][0] + 1
            time.sleep(exchange.rateLimit / 1000)
            if len(all_ohlcv) % 5000 == 0:
                print(f"  📥 Fetched {len(all_ohlcv)} candles...")
        except Exception as e:
            print(f"❌ Error: {e}")
            break

    df = pd.DataFrame(
        all_ohlcv, columns=["timestamp", "open", "high", "low", "close", "volume"]
    )
    df["timestamp"] = pd.to_datetime(df["timestamp"], unit="ms", utc=True)
    df = df.set_index("timestamp").sort_index()
    df = df[~df.index.duplicated(keep="last")]

    print(f"✅ Loaded {len(df)} 1-minute candles from {df.index[0]} to {df.index[-1]}")
    return df


# Uncomment the line below to fetch new data (or use your existing df)
df = fetch_eth_data(days=15)

🔄 Fetching 15 days of ETH data...
  📥 Fetched 5000 candles...
  📥 Fetched 10000 candles...
  📥 Fetched 15000 candles...
  📥 Fetched 20000 candles...
✅ Loaded 21600 1-minute candles from 2025-06-14 17:30:00+00:00 to 2025-06-29 17:29:00+00:00


In [9]:
# Create resampled dataframes (use your existing ones if available)
if "df" in locals() and df is not None:
    print("🔄 Creating resampled timeframes...")

    # Resample to different timeframes
    df_5min = (
        df.resample("5min")
        .agg(
            {
                "open": "first",
                "high": "max",
                "low": "min",
                "close": "last",
                "volume": "sum",
            }
        )
        .dropna()
    )

    df_15min = (
        df.resample("15min")
        .agg(
            {
                "open": "first",
                "high": "max",
                "low": "min",
                "close": "last",
                "volume": "sum",
            }
        )
        .dropna()
    )

    df_1h = (
        df.resample("1H")
        .agg(
            {
                "open": "first",
                "high": "max",
                "low": "min",
                "close": "last",
                "volume": "sum",
            }
        )
        .dropna()
    )

    print(f"✅ Resampled data summary:")
    print(f"   📊 1-min: {len(df):,} candles")
    print(f"   📊 5-min: {len(df_5min):,} candles")
    print(f"   📊 15-min: {len(df_15min):,} candles")
    print(f"   📊 1-hour: {len(df_1h):,} candles")

    print("\n📋 Recent 1-hour data:")
    display(df_1h.tail())

    # Show basic stats
    latest_price = df_1h.iloc[-1]["close"]
    price_change_24h = (
        (df_1h.iloc[-1]["close"] - df_1h.iloc[-25]["close"]) / df_1h.iloc[-25]["close"]
    ) * 100
    print(f"\n💰 Current ETH Price: ${latest_price:,.2f}")
    print(f"📈 24h Change: {price_change_24h:+.2f}%")

else:
    print(
        "⚠️  Please load 'df' first by running the fetch_eth_data function or use your existing dataframe"
    )
    print("    Example: df = fetch_eth_data(days=14)")

🔄 Creating resampled timeframes...
✅ Resampled data summary:
   📊 1-min: 21,600 candles
   📊 5-min: 4,320 candles
   📊 15-min: 1,440 candles
   📊 1-hour: 361 candles

📋 Recent 1-hour data:


Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-06-29 13:00:00+00:00,2454.83,2456.59,2446.27,2446.91,8605.6572
2025-06-29 14:00:00+00:00,2446.92,2451.7,2435.78,2440.18,10951.5297
2025-06-29 15:00:00+00:00,2440.18,2444.4,2432.88,2436.68,6967.2572
2025-06-29 16:00:00+00:00,2436.69,2440.18,2429.58,2439.06,9579.2707
2025-06-29 17:00:00+00:00,2439.07,2440.44,2435.94,2435.99,2001.126



💰 Current ETH Price: $2,435.99
📈 24h Change: -0.33%


## 🔄 Historical Analysis: Candle Flipping Behavior

Analyze how often candles change direction between the 45-minute mark and close. This gives us insights into market behavior and the predictability of hourly candle outcomes.


In [10]:
def analyze_candle_flipping(df_1min, df_1h, df_15min):
    """Analyze how often candles flip direction between 45min and close"""
    results = []

    print("🔍 Analyzing candle flipping behavior...")

    for i, ts in enumerate(df_1h.index):
        if i % 100 == 0:
            print(f"   Processing candle {i + 1}/{len(df_1h)}")

        open_price = df_1h.loc[ts, "open"]
        close_price = df_1h.loc[ts, "close"]

        # Find 45-min mark price (closest 15-min candle to 45-min mark)
        ts_45 = ts + pd.Timedelta(minutes=45)
        closest_idx = np.abs(df_15min.index - ts_45).argmin()

        if np.abs(df_15min.index[closest_idx] - ts_45) <= pd.Timedelta(minutes=15):
            price_45 = df_15min.iloc[closest_idx]["close"]
        else:
            continue

        # Calculate changes
        delta_45 = price_45 - open_price
        delta_close = close_price - open_price

        direction_45 = "up" if delta_45 > 0 else "down" if delta_45 < 0 else "flat"
        direction_close = (
            "up" if delta_close > 0 else "down" if delta_close < 0 else "flat"
        )
        flipped = direction_45 != direction_close

        results.append(
            {
                "timestamp": ts,
                "open": open_price,
                "price_at_45min": price_45,
                "close": close_price,
                "delta_45_pct": (delta_45 / open_price) * 100,
                "delta_close_pct": (delta_close / open_price) * 100,
                "direction_45": direction_45,
                "direction_close": direction_close,
                "flipped": flipped,
                "green_candle": close_price > open_price,
                "hour_of_day": ts.hour,
                "day_of_week": ts.day_name(),
            }
        )

    return pd.DataFrame(results)


# Analyze historical behavior
if "df_1h" in locals():
    flip_df = analyze_candle_flipping(df, df_1h, df_15min)

    # Print comprehensive statistics
    total_candles = len(flip_df)
    green_candles = flip_df["green_candle"].sum()
    flip_rate = flip_df["flipped"].mean()

    print(f"\n📊 Historical Analysis ({total_candles:,} candles):")
    print(
        f"   🟢 Green candles: {green_candles:,} ({green_candles / total_candles:.1%})"
    )
    print(
        f"   🔴 Red candles: {total_candles - green_candles:,} ({(total_candles - green_candles) / total_candles:.1%})"
    )
    print(f"   🔄 Overall flip rate: {flip_rate:.1%}")

    # Analyze flip rates by different thresholds
    print(f"\n🎯 Flip Rate Analysis by Magnitude:")
    for threshold in [0.1, 0.25, 0.5, 1.0]:
        subset_up = flip_df[flip_df["delta_45_pct"] > threshold]
        subset_down = flip_df[flip_df["delta_45_pct"] < -threshold]

        if len(subset_up) > 0:
            flip_rate_up = subset_up["flipped"].mean()
            print(
                f"   📈 Up >{threshold:>4}% at 45min: {flip_rate_up:.1%} flip rate ({len(subset_up):,} samples)"
            )

        if len(subset_down) > 0:
            flip_rate_down = subset_down["flipped"].mean()
            print(
                f"   📉 Down <-{threshold:>3}% at 45min: {flip_rate_down:.1%} flip rate ({len(subset_down):,} samples)"
            )

    # Time-based analysis
    print(f"\n🕐 Flip Rate by Hour of Day (UTC):")
    hourly_flip = flip_df.groupby("hour_of_day")["flipped"].agg(["mean", "count"])
    for hour in range(24):
        if hour in hourly_flip.index:
            rate = hourly_flip.loc[hour, "mean"]
            count = hourly_flip.loc[hour, "count"]
            print(
                f"   {hour:2d}:00-{hour:2d}:59  {rate:.1%} flip rate ({count:,} candles)"
            )

    print(f"\n📋 Sample of flip analysis data:")
    display(flip_df.head(10))

else:
    print("⚠️  Please ensure df_1h is available first")

🔍 Analyzing candle flipping behavior...
   Processing candle 1/361
   Processing candle 101/361
   Processing candle 201/361
   Processing candle 301/361

📊 Historical Analysis (360 candles):
   🟢 Green candles: 181 (50.3%)
   🔴 Red candles: 179 (49.7%)
   🔄 Overall flip rate: 0.0%

🎯 Flip Rate Analysis by Magnitude:
   📈 Up > 0.1% at 45min: 0.0% flip rate (150 samples)
   📉 Down <-0.1% at 45min: 0.0% flip rate (143 samples)
   📈 Up >0.25% at 45min: 0.0% flip rate (98 samples)
   📉 Down <-0.25% at 45min: 0.0% flip rate (97 samples)
   📈 Up > 0.5% at 45min: 0.0% flip rate (56 samples)
   📉 Down <-0.5% at 45min: 0.0% flip rate (48 samples)
   📈 Up > 1.0% at 45min: 0.0% flip rate (19 samples)
   📉 Down <-1.0% at 45min: 0.0% flip rate (21 samples)

🕐 Flip Rate by Hour of Day (UTC):
    0:00- 0:59  0.0% flip rate (15 candles)
    1:00- 1:59  0.0% flip rate (15 candles)
    2:00- 2:59  0.0% flip rate (15 candles)
    3:00- 3:59  0.0% flip rate (15 candles)
    4:00- 4:59  0.0% flip rate (15 

Unnamed: 0,timestamp,open,price_at_45min,close,delta_45_pct,delta_close_pct,direction_45,direction_close,flipped,green_candle,hour_of_day,day_of_week
0,2025-06-14 17:00:00+00:00,2515.08,2511.49,2511.49,-0.142739,-0.142739,down,down,False,False,17,Saturday
1,2025-06-14 18:00:00+00:00,2511.49,2508.48,2508.48,-0.119849,-0.119849,down,down,False,False,18,Saturday
2,2025-06-14 19:00:00+00:00,2508.49,2507.5,2507.5,-0.039466,-0.039466,down,down,False,False,19,Saturday
3,2025-06-14 20:00:00+00:00,2507.5,2509.37,2509.37,0.074576,0.074576,up,up,False,True,20,Saturday
4,2025-06-14 21:00:00+00:00,2509.37,2527.94,2527.94,0.740026,0.740026,up,up,False,True,21,Saturday
5,2025-06-14 22:00:00+00:00,2527.95,2526.49,2526.49,-0.057754,-0.057754,down,down,False,False,22,Saturday
6,2025-06-14 23:00:00+00:00,2526.48,2530.76,2530.76,0.169406,0.169406,up,up,False,True,23,Saturday
7,2025-06-15 00:00:00+00:00,2530.77,2537.6,2537.6,0.269878,0.269878,up,up,False,True,0,Sunday
8,2025-06-15 01:00:00+00:00,2537.61,2528.99,2528.99,-0.33969,-0.33969,down,down,False,False,1,Sunday
9,2025-06-15 02:00:00+00:00,2529.0,2529.19,2529.19,0.007513,0.007513,up,up,False,True,2,Sunday


In [11]:
# 📊 Interactive Plotly Visualizations for Flip Behavior
if "flip_df" in locals():
    # 1. Flip behavior histogram
    fig1 = px.histogram(
        flip_df,
        x="delta_45_pct",
        color="flipped",
        nbins=50,
        title="🔄 Candle Flip Behavior by 45-Minute Change",
        labels={"delta_45_pct": "45-Minute Change (%)", "count": "Number of Candles"},
        color_discrete_map={True: "#ff6b6b", False: "#4ecdc4"},
        height=500,
    )
    fig1.update_layout(
        xaxis_title="45-Minute Change (%)",
        yaxis_title="Number of Candles",
        legend_title="Flipped Direction",
    )
    fig1.show()

    # 2. Green vs Red distribution
    fig2 = px.histogram(
        flip_df,
        x="delta_45_pct",
        color="green_candle",
        nbins=50,
        title="🟢🔴 Green vs Red Candles by 45-Minute Change",
        labels={"delta_45_pct": "45-Minute Change (%)", "count": "Number of Candles"},
        color_discrete_map={True: "#00ff00", False: "#ff0000"},
        height=500,
    )
    fig2.update_layout(
        xaxis_title="45-Minute Change (%)",
        yaxis_title="Number of Candles",
        legend_title="Candle Color",
    )
    fig2.show()

    # 3. Flip rate by hour of day
    hourly_stats = (
        flip_df.groupby("hour_of_day")
        .agg({"flipped": ["mean", "count"], "green_candle": "mean"})
        .round(3)
    )
    hourly_stats.columns = ["flip_rate", "count", "green_rate"]
    hourly_stats = hourly_stats.reset_index()

    fig3 = px.bar(
        hourly_stats,
        x="hour_of_day",
        y="flip_rate",
        title="🕐 Flip Rate by Hour of Day",
        labels={"hour_of_day": "Hour of Day", "flip_rate": "Flip Rate"},
        color="flip_rate",
        color_continuous_scale="RdYlBu_r",
        height=400,
    )
    fig3.update_layout(
        xaxis_title="Hour of Day (UTC)",
        yaxis_title="Flip Rate",
        xaxis=dict(tickmode="linear"),
    )
    fig3.show()

    # 4. Scatter plot: 45-min change vs final change
    fig4 = px.scatter(
        flip_df.sample(min(2000, len(flip_df))),  # Sample for performance
        x="delta_45_pct",
        y="delta_close_pct",
        color="flipped",
        title="📈 45-Minute Change vs Final Change (Sampled)",
        labels={
            "delta_45_pct": "45-Minute Change (%)",
            "delta_close_pct": "Final Change (%)",
        },
        color_discrete_map={True: "#ff6b6b", False: "#4ecdc4"},
        opacity=0.6,
        height=500,
    )
    # Add diagonal line (y=x) to show where no change would occur
    fig4.add_shape(
        type="line",
        x0=-3,
        y0=-3,
        x1=3,
        y1=3,
        line=dict(color="gray", width=2, dash="dash"),
    )
    fig4.update_layout(
        xaxis_title="45-Minute Change (%)",
        yaxis_title="Final Change (%)",
        legend_title="Flipped Direction",
    )
    fig4.show()

    # 5. Day of week analysis
    daily_stats = (
        flip_df.groupby("day_of_week")
        .agg({"flipped": "mean", "green_candle": "mean"})
        .round(3)
    )
    daily_stats = daily_stats.reindex(
        ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    )
    daily_stats = daily_stats.reset_index()

    fig5 = px.bar(
        daily_stats,
        x="day_of_week",
        y="flipped",
        title="📅 Flip Rate by Day of Week",
        labels={"day_of_week": "Day of Week", "flipped": "Flip Rate"},
        color="flipped",
        color_continuous_scale="viridis",
        height=400,
    )
    fig5.update_layout(xaxis_title="Day of Week", yaxis_title="Flip Rate")
    fig5.show()

    print("✅ All flip behavior visualizations completed!")

else:
    print("⚠️  Please run the flip analysis first")

✅ All flip behavior visualizations completed!


In [12]:
# 🏋️ Train different models
if "df" in locals():
    models = {}
    performances = {}

    print("🤖 Training machine learning models...")
    print("⏳ This may take a few minutes depending on data size...")

    for model_type in ["random_forest", "gradient_boost", "logistic"]:
        print(f"\n🔄 Training {model_type} model...")

        try:
            predictor = EthCandlePredictor(model_type=model_type)
            performance = predictor.train(df, test_size=0.2)

            models[model_type] = predictor
            performances[model_type] = performance

            print(f"   ✅ Accuracy: {performance.accuracy:.3f}")
            print(f"   ✅ AUC: {performance.auc_score:.3f}")
            print(f"   ✅ Features used: {len(predictor.feature_names)}")

        except Exception as e:
            print(f"   ❌ Error training {model_type}: {e}")

    if models:
        print(f"\n🎉 Successfully trained {len(models)} models!")

        # Create performance comparison dataframe
        performance_df = pd.DataFrame(
            {
                "Model": list(performances.keys()),
                "Accuracy": [p.accuracy for p in performances.values()],
                "Precision": [p.precision for p in performances.values()],
                "Recall": [p.recall for p in performances.values()],
                "F1_Score": [p.f1_score for p in performances.values()],
                "AUC": [p.auc_score for p in performances.values()],
            }
        )

        print(f"\n📊 Model Performance Comparison:")
        display(performance_df.round(3))

        # Identify best model
        best_model_name = performance_df.loc[performance_df["AUC"].idxmax(), "Model"]
        best_auc = performance_df["AUC"].max()
        print(f"\n🏆 Best performing model: {best_model_name} (AUC: {best_auc:.3f})")

    else:
        print("❌ No models were successfully trained")

else:
    print("⚠️  Please load data first by running the data loading cells")

🤖 Training machine learning models...
⏳ This may take a few minutes depending on data size...

🔄 Training random_forest model...
   ✅ Accuracy: 0.931
   ✅ AUC: 0.953
   ✅ Features used: 39

🔄 Training gradient_boost model...
   ✅ Accuracy: 0.889
   ✅ AUC: 0.938
   ✅ Features used: 39

🔄 Training logistic model...
   ✅ Accuracy: 0.875
   ✅ AUC: 0.943
   ✅ Features used: 39

🎉 Successfully trained 3 models!

📊 Model Performance Comparison:


Unnamed: 0,Model,Accuracy,Precision,Recall,F1_Score,AUC
0,random_forest,0.931,0.974,0.902,0.937,0.953
1,gradient_boost,0.889,0.923,0.878,0.9,0.938
2,logistic,0.875,0.971,0.805,0.88,0.943



🏆 Best performing model: random_forest (AUC: 0.953)


In [18]:
# 🔮 Make predictions on recent data
if "models" in locals() and "df_1h" in locals() and len(models) > 0:
    recent_hours = df_1h.index[-15:]  # Last 15 hours for better analysis
    predictions = []

    print("🔮 Making predictions on recent hours...")
    print("⏳ Using best performing model for predictions...\n")

    # Use best performing model
    best_model_name = performance_df.loc[performance_df["AUC"].idxmax(), "Model"]
    best_model = models[best_model_name]
    print(f"🏆 Using best model: {best_model_name}")
    print("=" * 70)

    successful_predictions = 0

    for i, hour_start in enumerate(recent_hours):
        try:
            # Make prediction
            pred = best_model.predict(df, hour_start)

            # Get actual result
            actual_close = df_1h.loc[hour_start, "close"]
            actual_open = df_1h.loc[hour_start, "open"]
            actual_direction = "green" if actual_close > actual_open else "red"
            actual_change = (actual_close - actual_open) / actual_open * 100

            correct = pred.predicted_direction == actual_direction

            predictions.append(
                {
                    "timestamp": hour_start,
                    "predicted": pred.predicted_direction,
                    "actual": actual_direction,
                    "confidence": pred.confidence,
                    "prob_green": pred.probability_green,
                    "prob_red": pred.probability_red,
                    "actual_change": actual_change,
                    "correct": correct,
                    "open_price": actual_open,
                    "close_price": actual_close,
                }
            )

            # Emoji indicators
            status = "✅" if correct else "❌"
            confidence_emoji = (
                "🔥"
                if pred.confidence > 0.7
                else "⚡"
                if pred.confidence > 0.6
                else "💡"
            )
            direction_emoji = "🟢" if pred.predicted_direction == "green" else "🔴"

            print(
                f"{hour_start.strftime('%m-%d %H:%M')} | "
                f"{direction_emoji} Pred: {pred.predicted_direction:>5} {confidence_emoji}({pred.confidence:.1%}) | "
                f"Actual: {actual_direction:>5} ({actual_change:+.2f}%) {status}"
            )

            successful_predictions += 1

        except Exception as e:
            print(f"❌ Failed to predict for {hour_start}: {e}")

    if predictions:
        accuracy = sum(p["correct"] for p in predictions) / len(predictions)
        avg_confidence = np.mean([p["confidence"] for p in predictions])

        print("=" * 70)
        print(f"📊 PREDICTION SUMMARY:")
        print(
            f"   🎯 Accuracy: {accuracy:.1%} ({sum(p['correct'] for p in predictions)}/{len(predictions)})"
        )
        print(f"   💪 Average Confidence: {avg_confidence:.1%}")
        print(
            f"   ✅ Successful Predictions: {successful_predictions}/{len(recent_hours)}"
        )

        # Additional stats
        correct_preds = [p for p in predictions if p["correct"]]
        if correct_preds:
            avg_correct_confidence = np.mean([p["confidence"] for p in correct_preds])
            print(f"   🔥 Avg Confidence (Correct): {avg_correct_confidence:.1%}")

        incorrect_preds = [p for p in predictions if not p["correct"]]
        if incorrect_preds:
            avg_incorrect_confidence = np.mean(
                [p["confidence"] for p in incorrect_preds]
            )
            print(f"   💔 Avg Confidence (Wrong): {avg_incorrect_confidence:.1%}")

        print(f"\n📋 Detailed predictions saved to 'predictions' variable")

else:
    if "models" not in locals() or len(models) == 0:
        print("⚠️  Please train models first")
    else:
        print("⚠️  Please ensure hourly data is available")

🔮 Making predictions on recent hours...
⏳ Using best performing model for predictions...

🏆 Using best model: random_forest
06-29 03:00 | 🔴 Pred:   red 🔥(89.1%) | Actual:   red (-0.16%) ✅
06-29 04:00 | 🟢 Pred: green 💡(55.1%) | Actual: green (+0.08%) ✅
06-29 05:00 | 🟢 Pred: green ⚡(66.7%) | Actual: green (+0.16%) ✅
06-29 06:00 | 🟢 Pred: green 💡(57.7%) | Actual: green (+0.13%) ✅
06-29 07:00 | 🟢 Pred: green 💡(59.1%) | Actual: green (+0.15%) ✅
06-29 08:00 | 🟢 Pred: green 🔥(88.4%) | Actual: green (+0.13%) ✅
06-29 09:00 | 🟢 Pred: green 🔥(89.1%) | Actual: green (+0.19%) ✅
06-29 10:00 | 🟢 Pred: green 💡(57.3%) | Actual: green (+0.33%) ✅
06-29 11:00 | 🟢 Pred: green 🔥(70.0%) | Actual: green (+0.24%) ✅
06-29 12:00 | 🔴 Pred:   red 🔥(93.8%) | Actual:   red (-0.30%) ✅
06-29 13:00 | 🔴 Pred:   red 🔥(84.7%) | Actual:   red (-0.32%) ✅
06-29 14:00 | 🔴 Pred:   red 🔥(96.9%) | Actual:   red (-0.28%) ✅
06-29 15:00 | 🟢 Pred: green 🔥(76.7%) | Actual:   red (-0.14%) ❌
06-29 16:00 | 🔴 Pred:   red 🔥(90.9%) | Actua

In [14]:
# 🔍 Feature Importance Analysis (Plotly Visualization)
if "models" in locals() and len(models) > 0:
    print("🔍 Analyzing feature importance...")

    # Use Random Forest for feature importance (it provides the most interpretable results)
    if "random_forest" in models:
        rf_model = models["random_forest"]

        if hasattr(rf_model.model, "feature_importances_"):
            feature_importance = pd.DataFrame(
                {
                    "feature": rf_model.feature_names,
                    "importance": rf_model.model.feature_importances_,
                }
            ).sort_values("importance", ascending=False)

            print(f"✅ Extracted {len(feature_importance)} feature importances")

            # Create interactive Plotly bar chart for top features
            top_features = feature_importance.head(20)  # Top 20 features

            fig_importance = px.bar(
                top_features,
                x="importance",
                y="feature",
                orientation="h",
                title="🎯 Top 20 Most Important Features (Random Forest)",
                labels={"importance": "Feature Importance", "feature": "Feature Name"},
                color="importance",
                color_continuous_scale="viridis",
                height=700,
            )

            # Customize layout
            fig_importance.update_layout(
                yaxis={"categoryorder": "total ascending"},
                xaxis_title="Feature Importance",
                yaxis_title="Feature Name",
                showlegend=False,
                margin=dict(l=200),  # More space for feature names
            )

            fig_importance.show()

            # Show top 10 in a table
            print(f"\n📊 Top 10 Most Important Features:")
            display(feature_importance.head(10).round(4))

            # Categorize features
            feature_categories = {
                "price": ["open", "high", "low", "close", "price_45", "price_change"],
                "volume": ["volume", "vwap", "volume_sma"],
                "technical": ["rsi", "macd", "bb_", "sma", "ema"],
                "volatility": ["atr", "volatility", "std"],
                "time": ["hour", "minute", "day_of_week"],
            }

            # Analyze feature categories
            category_importance = {}
            for category, keywords in feature_categories.items():
                category_features = feature_importance[
                    feature_importance["feature"].str.contains(
                        "|".join(keywords), case=False
                    )
                ]
                if len(category_features) > 0:
                    category_importance[category] = category_features[
                        "importance"
                    ].sum()

            if category_importance:
                category_df = pd.DataFrame(
                    list(category_importance.items()),
                    columns=["Category", "Total_Importance"],
                )
                category_df = category_df.sort_values(
                    "Total_Importance", ascending=False
                )

                # Create category importance chart
                fig_category = px.pie(
                    category_df,
                    values="Total_Importance",
                    names="Category",
                    title="📈 Feature Importance by Category",
                    color_discrete_sequence=px.colors.qualitative.Set3,
                    height=500,
                )

                fig_category.update_traces(
                    textposition="inside", textinfo="percent+label"
                )

                fig_category.show()

                print(f"\n🏷️  Feature Category Analysis:")
                for i, row in category_df.iterrows():
                    print(f"   {row['Category']:>12}: {row['Total_Importance']:.3f}")

        else:
            print(
                "❌ Model doesn't have feature importances (might be logistic regression)"
            )
    else:
        print("❌ Random Forest model not available for feature importance analysis")
else:
    print("⚠️  Please train models first")

🔍 Analyzing feature importance...
✅ Extracted 39 feature importances



📊 Top 10 Most Important Features:


Unnamed: 0,feature,importance
4,price_change_abs,0.2101
5,price_change_pct,0.1674
35,momentum_15min,0.1314
8,upward_momentum,0.0889
7,position_in_range,0.0472
21,macd,0.0421
22,macd_signal,0.0404
9,upper_half,0.0289
25,hammer_like,0.0213
34,up_down_ratio,0.0212



🏷️  Feature Category Analysis:
          price: 0.403
      technical: 0.103
         volume: 0.026
           time: 0.010
     volatility: 0.009


In [15]:
# 📊 Comprehensive Interactive Dashboard
if "predictions" in locals() and "performance_df" in locals() and len(predictions) > 0:
    print("🎨 Creating comprehensive Plotly dashboard...")

    # Create subplots dashboard
    fig = make_subplots(
        rows=3,
        cols=2,
        subplot_titles=(
            "📈 Recent ETH Price (1H Candlesticks)",
            "🏆 Model Performance Comparison",
            "🎯 Prediction Confidence Over Time",
            "💰 Prediction vs Actual Price Changes",
            "📊 Model Performance Metrics",
            "🕐 Prediction Accuracy by Hour",
        ),
        specs=[
            [{"type": "candlestick"}, {"type": "bar"}],
            [{"type": "scatter"}, {"type": "scatter"}],
            [{"type": "bar"}, {"type": "bar"}],
        ],
        vertical_spacing=0.08,
        horizontal_spacing=0.1,
    )

    # 1. Recent price action (candlestick)
    recent_data = df_1h.tail(48)  # Last 48 hours
    fig.add_trace(
        go.Candlestick(
            x=recent_data.index,
            open=recent_data["open"],
            high=recent_data["high"],
            low=recent_data["low"],
            close=recent_data["close"],
            name="ETH Price",
            showlegend=False,
        ),
        row=1,
        col=1,
    )

    # 2. Model performance comparison
    fig.add_trace(
        go.Bar(
            x=performance_df["Model"],
            y=performance_df["AUC"],
            name="AUC Score",
            marker_color=["#1f77b4", "#ff7f0e", "#2ca02c"][: len(performance_df)],
            showlegend=False,
            text=performance_df["AUC"].round(3),
            textposition="outside",
        ),
        row=1,
        col=2,
    )

    # 3. Prediction confidence over time
    pred_df = pd.DataFrame(predictions)
    colors = ["#00ff00" if correct else "#ff0000" for correct in pred_df["correct"]]
    fig.add_trace(
        go.Scatter(
            x=pred_df["timestamp"],
            y=pred_df["confidence"],
            mode="markers+lines",
            marker=dict(color=colors, size=10),
            line=dict(color="gray", width=1),
            name="Confidence",
            showlegend=False,
        ),
        row=2,
        col=1,
    )

    # 4. Predicted vs actual price changes
    symbols = ["circle" if correct else "x" for correct in pred_df["correct"]]
    pred_colors = [
        "#00ff00" if pred == "green" else "#ff0000" for pred in pred_df["predicted"]
    ]
    fig.add_trace(
        go.Scatter(
            x=pred_df["timestamp"],
            y=pred_df["actual_change"],
            mode="markers",
            marker=dict(
                color=pred_colors,
                size=12,
                symbol=symbols,
                line=dict(width=2, color="black"),
            ),
            name="Predictions",
            showlegend=False,
        ),
        row=2,
        col=2,
    )

    # 5. Model performance metrics comparison
    metrics = ["Accuracy", "Precision", "Recall", "F1_Score"]

    for i, metric in enumerate(metrics):
        fig.add_trace(
            go.Bar(
                x=performance_df["Model"],
                y=performance_df[metric],
                name=metric,
                offsetgroup=i,
                showlegend=True if i < 2 else False,  # Only show legend for first 2
                marker_color=px.colors.qualitative.Set1[i],
            ),
            row=3,
            col=1,
        )

    # 6. Prediction accuracy by hour (if we have enough data)
    if len(pred_df) > 5:
        pred_df["hour"] = pred_df["timestamp"].dt.hour
        hourly_accuracy = (
            pred_df.groupby("hour")["correct"].agg(["mean", "count"]).reset_index()
        )
        hourly_accuracy.columns = ["hour", "accuracy", "count"]

        fig.add_trace(
            go.Bar(
                x=hourly_accuracy["hour"],
                y=hourly_accuracy["accuracy"],
                name="Hourly Accuracy",
                marker_color="lightblue",
                showlegend=False,
                text=hourly_accuracy["count"].astype(str) + " samples",
                textposition="outside",
            ),
            row=3,
            col=2,
        )
        # Update layout
        fig.update_layout(
            height=1000,
            title_text="🎯 ETH Candle Prediction Analysis Dashboard",
            title_x=0.5,
            title_font_size=20,
            showlegend=True,
            legend=dict(
                orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1
            ),
        )
        # Update axis labels
        fig.update_xaxes(title_text="Time", row=1, col=1)
        fig.update_yaxes(title_text="Price ($)", row=1, col=1)

        fig.update_xaxes(title_text="Model", row=1, col=2)
        fig.update_yaxes(title_text="AUC Score", row=1, col=2)

        fig.update_xaxes(title_text="Time", row=2, col=1)
        fig.update_yaxes(title_text="Confidence", row=2, col=1)

        fig.update_xaxes(title_text="Time", row=2, col=2)
        fig.update_yaxes(title_text="Price Change (%)", row=2, col=2)

        fig.update_xaxes(title_text="Model", row=3, col=1)
        fig.update_yaxes(title_text="Score", row=3, col=1)

        fig.update_xaxes(title_text="Hour (UTC)", row=3, col=2)
        fig.update_yaxes(title_text="Accuracy", row=3, col=2)

        fig.show()
        print("✅ Interactive dashboard created successfully!")

    else:
        if "predictions" not in locals():
            print("⚠️  Please run predictions first")
        elif len(predictions) == 0:
            print("⚠️  No predictions available for dashboard")
        else:
            print("⚠️  Please ensure performance_df is available")

🎨 Creating comprehensive Plotly dashboard...


✅ Interactive dashboard created successfully!


In [19]:
# 🔮 Live Prediction Function
def make_live_prediction():
    """Make a prediction for the current hour based on available data"""
    if "models" not in locals() or len(models) == 0:
        print("❌ Please train models first")
        return None

    current_time = pd.Timestamp.now(tz="UTC")
    current_hour_start = current_time.floor("H")

    # Check if we have at least 45 minutes of data for current hour
    minutes_elapsed = (current_time - current_hour_start).total_seconds() / 60

    if minutes_elapsed >= 45:
        try:
            # Use best model
            best_model_name = performance_df.loc[
                performance_df["AUC"].idxmax(), "Model"
            ]
            best_model = models[best_model_name]

            # Make prediction
            pred = best_model.predict(df, current_hour_start)

            print(
                f"🔮 LIVE PREDICTION for {current_hour_start.strftime('%Y-%m-%d %H:%M')} hour:"
            )
            print(f"  📈 Predicted Direction: {pred.predicted_direction.upper()}")
            print(f"  🎯 Confidence: {pred.confidence:.1%}")
            print(f"  🟢 Probability Green: {pred.probability_green:.1%}")
            print(f"  🔴 Probability Red: {pred.probability_red:.1%}")
            print(f"  🤖 Model Used: {pred.model_name}")
            print(f"  ⏰ Time Remaining: {60 - minutes_elapsed:.0f} minutes")

            # Create a simple live prediction chart
            fig_live = go.Figure()

            fig_live.add_trace(
                go.Indicator(
                    mode="gauge+number",
                    value=pred.confidence * 100,
                    domain={"x": [0, 1], "y": [0, 1]},
                    title={
                        "text": f"Prediction Confidence<br>{pred.predicted_direction.title()} Candle"
                    },
                    gauge={
                        "axis": {"range": [None, 100]},
                        "bar": {
                            "color": "green"
                            if pred.predicted_direction == "green"
                            else "red"
                        },
                        "steps": [
                            {"range": [0, 50], "color": "lightgray"},
                            {"range": [50, 75], "color": "yellow"},
                            {"range": [75, 100], "color": "lightgreen"},
                        ],
                        "threshold": {
                            "line": {"color": "black", "width": 4},
                            "thickness": 0.75,
                            "value": 70,
                        },
                    },
                )
            )

            fig_live.update_layout(
                title=f"Live Prediction - {current_hour_start.strftime('%H:%M UTC')}",
                height=400,
            )
            fig_live.show()

            return pred

        except Exception as e:
            print(f"❌ Cannot make live prediction: {e}")
            return None
    else:
        print(f"⏳ Current hour has only {minutes_elapsed:.0f} minutes of data.")
        print(f"   Need at least 45 minutes for prediction.")
        return None


# Try to make a live prediction
print("🕐 Checking for live prediction opportunity...")
live_pred = make_live_prediction()

🕐 Checking for live prediction opportunity...
❌ Please train models first
