In [None]:
# 📦 Initialize Data Source and Load Cached Data
# Set up CLOB data source and load pre-cached candle data for analysis
import warnings

# Silences every warning category for the rest of the kernel session. The call
# sits before the project import below, so warnings raised while that module is
# imported are hidden as well - keep this ordering if the cell is edited.
# NOTE(review): a blanket "ignore" also hides any warnings emitted by the
# statistical tests run in later cells - consider narrowing the filter.
warnings.filterwarnings("ignore")  # Suppress non-critical warnings for cleaner output

# Initialize CLOB data source and load cached candle data
from core.data_sources.clob import CLOBDataSource


# `clob` is read by later cells through `clob.candles_cache`.
clob = CLOBDataSource()
# NOTE(review): presumably fills `clob.candles_cache` from previously stored
# candle data; the storage location is not visible from this notebook - confirm.
clob.load_candles_cache()

print("✅ CLOB data source initialized and cache loaded")
print(f"📊 Available cached datasets: {len(clob.candles_cache)} candle sets")

# ⚙️ Analysis Configuration
Define the candle timeframe used for the clustering analysis.

In [None]:
# Candle interval used by every later cell: 15-minute bars for
# mid-frequency relationship analysis.
INTERVAL = "15m"

# Echo the run configuration so it is recorded next to the outputs.
print(
    "📊 Configuration:",
    f"  • Timeframe: {INTERVAL}",
    "  • Analysis Type: Cointegration-based hierarchical clustering",
    "  • Method: Statistical relationship identification",
    sep="\n",
)

In [None]:
# 🧮 Enhanced Cointegration Matrix Calculation with Normalized Prices
# Calculate pairwise cointegration relationships for all trading pairs
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import coint
from tqdm.notebook import tqdm
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Keep only the cached candle sets recorded at the configured interval
candles = [entry for entry in clob.candles_cache.values() if entry.interval == INTERVAL]
print(f"📊 Processing {len(candles)} trading pairs for cointegration analysis...")

# ============================================
# NORMALIZED PRICE SERIES PREPARATION
# ============================================
# Cointegration is tested on prices rebased to 1.0 (cumulative product of
# 1 + return), so assets trading at very different absolute levels are comparable.
# Plain returns are kept separately for the correlation analysis.
pair_normalized_prices = {}  # trading pair -> rebased price series
pair_returns = {}  # trading pair -> simple return series
min_data_points = 100  # Series shorter than this are left out of the analysis

for candle in candles:
    df = candle.data
    # Skip candle sets that are missing, empty, or too short to test
    if df is None or df.empty or len(df) < min_data_points:
        continue

    close_returns = df['close'].pct_change()

    # Compound the returns into a price path; the first return is undefined,
    # so the first point is pinned to the 1.0 baseline explicitly.
    rebased_prices = close_returns.add(1).cumprod()
    rebased_prices.iloc[0] = 1

    pair_name = candle.trading_pair
    pair_normalized_prices[pair_name] = rebased_prices.dropna()
    pair_returns[pair_name] = close_returns.dropna()

# The key order of the price dict fixes the row/column order used by every matrix below
pairs = list(pair_normalized_prices)
n_pairs = len(pairs)

print(f"✅ Prepared normalized price series for {n_pairs} trading pairs")
print("📊 All price series start at 1.0 for fair comparison")
print(f"📊 Minimum data points per series: {min_data_points}")

# ============================================
# COINTEGRATION & CORRELATION MATRICES
# ============================================
# Square result matrices labelled by trading pair on both axes.
# Cointegration starts at 1.0 everywhere ("no evidence of cointegration"), so
# pairs that are skipped or whose test fails keep that conservative p-value.
cointegration_matrix = pd.DataFrame(
    np.ones((n_pairs, n_pairs)),
    index=pairs,
    columns=pairs
)

# Correlation starts at 0.0 for every pair that is never evaluated.
correlation_matrix = pd.DataFrame(
    np.zeros((n_pairs, n_pairs)),
    index=pairs,
    columns=pairs
)

print(f"🔄 Calculating cointegration p-values and correlations...")

# Bookkeeping for the summary printed after the loop
analysis_results = {
    'successful_tests': 0,
    'failed_tests': [],           # (pair1, pair2, error message)
    'strong_cointegration': [],   # pairs with p-value < 0.05
    'strong_correlation': []      # pairs with |correlation| > 0.7
}

# Only the upper triangle (j >= i) is visited; every result is mirrored so both
# matrices stay symmetric.
for i in tqdm(range(n_pairs), desc="Processing pairs"):
    for j in range(i, n_pairs):
        pair1, pair2 = pairs[i], pairs[j]

        if i == j:
            # Self-relationship: fixed sentinel values on the diagonal
            cointegration_matrix.iloc[i, j] = 0.0  # Perfect cointegration with self
            correlation_matrix.iloc[i, j] = 1.0   # Perfect correlation with self
            continue

        # Restrict both series to their common index before comparing them
        norm_prices1_aligned, norm_prices2_aligned = pair_normalized_prices[pair1].align(
            pair_normalized_prices[pair2], join='inner'
        )
        returns1_aligned, returns2_aligned = pair_returns[pair1].align(
            pair_returns[pair2], join='inner'
        )

        # Too little overlap: leave the initial values (p=1.0, corr=0.0) in place
        if len(norm_prices1_aligned) < min_data_points:
            continue

        # Reset for every pair so a correlation computed for an earlier pair can
        # never be attached to this one.
        corr = np.nan

        try:
            # Correlation is measured on returns
            if len(returns1_aligned) > 0:
                corr = returns1_aligned.corr(returns2_aligned)
                correlation_matrix.iloc[i, j] = corr
                correlation_matrix.iloc[j, i] = corr

                if abs(corr) > 0.7:
                    analysis_results['strong_correlation'].append({
                        'pair1': pair1,
                        'pair2': pair2,
                        'correlation': corr
                    })

            # Engle-Granger cointegration test on the rebased price series.
            # NOTE(review): the test is run once per pair with pair1 as the first
            # argument and the p-value is mirrored; the result may depend on
            # argument order - confirm this is acceptable.
            test_stat, p_value, critical_values = coint(norm_prices1_aligned, norm_prices2_aligned)

            cointegration_matrix.iloc[i, j] = p_value
            cointegration_matrix.iloc[j, i] = p_value

            analysis_results['successful_tests'] += 1

            if p_value < 0.05:
                analysis_results['strong_cointegration'].append({
                    'pair1': pair1,
                    'pair2': pair2,
                    'p_value': p_value,
                    'correlation': corr
                })

        except Exception as e:
            # Best effort: record the failure and fall back to "not cointegrated"
            analysis_results['failed_tests'].append((pair1, pair2, str(e)))
            cointegration_matrix.iloc[i, j] = 1.0
            cointegration_matrix.iloc[j, i] = 1.0

print("✅ Cointegration matrix calculation complete\n")

# ============================================
# ANALYSIS SUMMARY
# ============================================
print("📊 Cointegration Analysis Results (using normalized prices):")
print("(Lower p-values indicate stronger cointegration relationships)")

# Count each unordered pair exactly once by reading the strict upper triangle.
# This excludes the diagonal structurally, instead of relying on sentinel values
# (p > 0, corr < 1, corr > -1) that would also drop genuine off-diagonal pairs
# which happen to hit those values.
upper_triangle = np.triu_indices(n_pairs, k=1)
pairwise_p_values = cointegration_matrix.values[upper_triangle]
pairwise_correlations = correlation_matrix.values[upper_triangle]

# Pairs that were skipped or failed keep the initial p-value of 1.0 and are
# therefore counted in the "Weak/No cointegration" bucket.
strong_coint = int((pairwise_p_values < 0.05).sum())
moderate_coint = int(((pairwise_p_values >= 0.05) & (pairwise_p_values < 0.1)).sum())
weak_coint = int((pairwise_p_values >= 0.1).sum())

print(f"  • Strong cointegration (p < 0.05): {strong_coint}")
print(f"  • Moderate cointegration (0.05 ≤ p < 0.1): {moderate_coint}")
print(f"  • Weak/No cointegration (p ≥ 0.1): {weak_coint}")
print(f"  • Successful tests: {analysis_results['successful_tests']}")
print(f"  • Failed tests: {len(analysis_results['failed_tests'])}")

# Correlation buckets use the signed correlation; NaN values match no bucket.
high_corr = int((pairwise_correlations > 0.7).sum())
moderate_corr = int(((pairwise_correlations > 0.3) & (pairwise_correlations <= 0.7)).sum())
low_corr = int((pairwise_correlations <= 0.3).sum())

print(f"\n📊 Correlation Analysis Results:")
print(f"  • High correlation (>0.7): {high_corr}")
print(f"  • Moderate correlation (0.3-0.7): {moderate_corr}")
print(f"  • Low correlation (≤0.3): {low_corr}")

# List the five pairs with the lowest cointegration p-values
if analysis_results['strong_cointegration']:
    print(f"\n🏆 Top 5 Cointegrated Pairs (best for pairs trading):")
    sorted_coint = sorted(analysis_results['strong_cointegration'], key=lambda x: x['p_value'])
    for rank, pair_info in enumerate(sorted_coint[:5], start=1):
        print(f"  {rank}. {pair_info['pair1']} ↔ {pair_info['pair2']}: p={pair_info['p_value']:.4f}, corr={pair_info['correlation']:.3f}")

# ============================================
# CORRELATION VS COINTEGRATION VISUALIZATION
# ============================================
# Two heatmaps side by side: return correlations on the left, cointegration
# p-values on the right.
cell_label_font = dict(size=8)

corr_heatmap = go.Heatmap(
    z=correlation_matrix.values,
    x=pairs,
    y=pairs,
    colorscale='RdBu',
    zmid=0,
    zmin=-1,
    zmax=1,
    colorbar={'title': "Corr", 'x': 0.45, 'len': 0.9},
    text=correlation_matrix.values.round(2),
    texttemplate="%{text}",
    textfont=cell_label_font,
    showscale=True,
)

# Colours are capped at 0.1 so the interesting low-p region is not washed out;
# the cell labels still show the uncapped p-values.
coint_viz = np.minimum(cointegration_matrix.values, 0.1)
coint_heatmap = go.Heatmap(
    z=coint_viz,
    x=pairs,
    y=pairs,
    colorscale='RdYlGn_r',
    zmin=0,
    zmax=0.1,
    colorbar={'title': "P-Val", 'x': 1.02, 'len': 0.9},
    text=cointegration_matrix.values.round(3),
    texttemplate="%{text}",
    textfont=cell_label_font,
    showscale=True,
)

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('Correlation Matrix', 'Cointegration P-Values (capped at 0.1)'),
    horizontal_spacing=0.12,
)
fig.add_trace(corr_heatmap, row=1, col=1)
fig.add_trace(coint_heatmap, row=1, col=2)

fig.update_layout(
    title='📊 Correlation vs Cointegration Analysis<br><sub>Using normalized prices (cumprod of returns) for fair comparison</sub>',
    width=1600,
    height=700,
    template="plotly_dark",
)
fig.update_xaxes(tickangle=-45, tickfont={'size': 9})
fig.update_yaxes(tickfont={'size': 9})
fig.show()

# The bare expression on the last line renders the full p-value table as the cell output
print("\n📊 Cointegration Matrix (p-values):")
cointegration_matrix

In [None]:
# 📊 Correlation vs Cointegration Scatter Analysis
# Analyze the relationship between correlation and cointegration
import plotly.express as px

# One record per unordered pair (i < j): absolute return correlation next to
# the cointegration p-value.
comparison_data = [
    {
        'pair': f"{pairs[i]}-{pairs[j]}",
        'pair1': pairs[i],
        'pair2': pairs[j],
        'correlation': abs(correlation_matrix.iloc[i, j]),
        'cointegration_pvalue': cointegration_matrix.iloc[i, j],
    }
    for i in range(n_pairs)
    for j in range(i + 1, n_pairs)
]

comparison_df = pd.DataFrame(comparison_data)

# Scatter of correlation against p-value, coloured by p-value
axis_labels = {
    'correlation': 'Absolute Correlation',
    'cointegration_pvalue': 'Cointegration P-Value',
}
fig = px.scatter(
    comparison_df,
    x='correlation',
    y='cointegration_pvalue',
    hover_data=['pair1', 'pair2'],
    title='🔍 Correlation vs Cointegration Analysis',
    labels=axis_labels,
    color='cointegration_pvalue',
    color_continuous_scale='RdYlGn_r',
)

# Guide lines at the thresholds used in the text report
fig.add_hline(y=0.05, line_dash="dash", line_color="red", annotation_text="p=0.05 threshold")
fig.add_vline(x=0.7, line_dash="dash", line_color="blue", annotation_text="High correlation")

fig.update_layout(
    width=1000,
    height=600,
    template="plotly_dark",
    xaxis_title='Absolute Correlation (Higher = More Correlated)',
    yaxis_title='Cointegration P-Value (Lower = More Cointegrated)',
)

fig.show()

# Split the pairs into three groups of interest and print up to five of each
print("\n🎯 Interesting Relationships:")

abs_correlation = comparison_df['correlation']
coint_p_value = comparison_df['cointegration_pvalue']

# High correlation but not cointegrated
high_corr_no_coint = comparison_df[(abs_correlation > 0.7) & (coint_p_value > 0.05)]
# Cointegrated but not highly correlated
coint_low_corr = comparison_df[(coint_p_value < 0.05) & (abs_correlation < 0.5)]
# Both highly correlated and cointegrated (best for pairs trading)
both_strong = comparison_df[(abs_correlation > 0.7) & (coint_p_value < 0.05)]

relationship_groups = [
    (f"\n📈 Highly correlated but NOT cointegrated ({len(high_corr_no_coint)} pairs):", high_corr_no_coint),
    (f"\n📊 Cointegrated but LOW correlation ({len(coint_low_corr)} pairs):", coint_low_corr),
    (f"\n⭐ BOTH highly correlated AND cointegrated ({len(both_strong)} pairs - best for pairs trading):", both_strong),
]

for heading, group in relationship_groups:
    print(heading)
    if group.empty:
        print("  None found")
        continue
    for _, row in group.head(5).iterrows():
        print(f"  • {row['pair1']} vs {row['pair2']}: corr={row['correlation']:.3f}, p-val={row['cointegration_pvalue']:.3f}")

In [None]:
# 📊 Hierarchical Clustering and Visualization
# Perform clustering analysis and create comprehensive visualizations
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from scipy.spatial.distance import squareform
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff

# ============================================
# DISTANCE MATRIX FROM COINTEGRATION P-VALUES
# ============================================
# The p-value itself is used as the distance: a LOW p-value (strong evidence of
# cointegration) must put two pairs CLOSE together. `1 - p_value` is a
# similarity, and clustering on it groups the least cointegrated pairs.
distance_matrix = cointegration_matrix.values.astype(float).copy()

# linkage() rejects non-finite distances; a missing p-value carries no evidence
# of cointegration, so it is treated as the maximum distance.
distance_matrix = np.where(np.isnan(distance_matrix), 1.0, distance_matrix)

# Ensure diagonal is 0 (distance from self)
np.fill_diagonal(distance_matrix, 0)

# Ensure matrix is symmetric and non-negative
distance_matrix = (distance_matrix + distance_matrix.T) / 2
distance_matrix = np.maximum(distance_matrix, 0)

print("✅ Distance matrix transformation complete")

# ============================================
# HIERARCHICAL CLUSTERING
# ============================================
# Condensed (upper-triangle) form is what scipy expects for precomputed distances.
distances_condensed = squareform(distance_matrix, checks=False)

# Average linkage is used because Ward linkage is only correctly defined for
# Euclidean distances, and p-values are not Euclidean.
Z = linkage(distances_condensed, method='average')

print("✅ Hierarchical clustering complete")

# ============================================
# PLOTLY DENDROGRAM VISUALIZATION
# ============================================
# create_dendrogram() treats its first argument as raw observations and by
# default computes pairwise distances between its ROWS. The callbacks below make
# it reuse the precomputed distances and the linkage `Z`, so the figure shows
# exactly the tree that the cluster assignments are cut from.
fig = ff.create_dendrogram(
    distance_matrix,
    orientation='left',
    labels=pairs,
    distfun=lambda _: distances_condensed,
    linkagefun=lambda _: Z
)

fig.update_layout(
    title='📊 Hierarchical Clustering Dendrogram - Market Relationships',
    width=1000,
    height=800,
    xaxis_title='Distance',
    showlegend=False,
    font=dict(size=10),
    template="plotly_dark"
)

fig.show()

# ============================================
# CLUSTER ASSIGNMENT AND ANALYSIS
# ============================================
# Cut the tree into at most `n_clusters` flat clusters (labels start at 1)
n_clusters = 5
clusters = fcluster(Z, n_clusters, criterion='maxclust')

# One row per trading pair with its cluster label
cluster_df = pd.DataFrame({'trading_pair': pairs, 'cluster': clusters})

print(f"\n📊 Created {n_clusters} clusters with the following distribution:")
for label in range(1, n_clusters + 1):
    in_cluster = cluster_df['cluster'] == label
    cluster_members = cluster_df.loc[in_cluster, 'trading_pair'].tolist()
    # Only the first five members are listed; an ellipsis marks truncation
    overflow_marker = '...' if len(cluster_members) > 5 else ''
    print(f"  • Cluster {label}: {in_cluster.sum()} pairs")
    print(f"    Members: {', '.join(cluster_members[:5])}{overflow_marker}")

# ============================================
# VOLUME METRICS CALCULATION
# ============================================
# Mean volume and its coefficient of variation (std / mean) for every pair
# that took part in the clustering.
volume_metrics = {}
for candle in candles:
    frame = candle.data
    if frame is None or frame.empty or candle.trading_pair not in pairs:
        continue

    volumes = frame['volume']
    avg_volume = volumes.mean()
    if avg_volume > 0:
        volume_stability = volumes.std() / avg_volume
    else:
        # Undefined ratio: flagged with infinity and filtered out before plotting
        volume_stability = float('inf')

    volume_metrics[candle.trading_pair] = {
        'avg_volume': avg_volume,
        'volume_stability': volume_stability,
    }


def _volume_metric_lookup(metric, default):
    """Build a mapper from trading pair name to one stored volume metric."""
    return lambda pair_name: volume_metrics.get(pair_name, {}).get(metric, default)


# Attach the metrics to the cluster table; pairs without metrics get the defaults
cluster_df['avg_volume'] = cluster_df['trading_pair'].map(_volume_metric_lookup('avg_volume', 0))
cluster_df['volume_stability'] = cluster_df['trading_pair'].map(
    _volume_metric_lookup('volume_stability', float('inf'))
)

# ============================================
# VOLUME-BASED CLUSTER VISUALIZATION
# ============================================
# Rows flagged with infinite stability cannot be placed on the y-axis
viz_df = cluster_df[cluster_df['volume_stability'].ne(float('inf'))].copy()

if not viz_df.empty:
    volume_axis_labels = {
        'avg_volume': 'Average Trading Volume (log scale)',
        'volume_stability': 'Volume Stability (Std/Mean)',
        'cluster': 'Cluster',
    }
    fig = px.scatter(
        viz_df,
        x='avg_volume',
        y='volume_stability',
        color='cluster',
        hover_data=['trading_pair'],
        log_x=True,  # Volumes span orders of magnitude
        title='🎯 Market Clusters by Volume Metrics',
        labels=volume_axis_labels,
        color_continuous_scale='viridis',
    )
    fig.update_layout(width=1000, height=600, template="plotly_dark")
    fig.show()

# ============================================
# SIMPLIFIED COINTEGRATION HEATMAP
# ============================================
# Cluster co-membership matrix: entry (i, j) holds the shared cluster label when
# pairs i and j are in the same cluster, and 0 otherwise.
shares_cluster = clusters[:, None] == clusters[None, :]
cluster_matrix = np.where(shares_cluster, clusters[:, None], 0).astype(float)

# Heatmap of the raw p-values; the colour range stops at 0.1
fig = go.Figure(
    data=go.Heatmap(
        z=cointegration_matrix.values,
        x=pairs,
        y=pairs,
        colorscale='RdYlGn_r',
        zmin=0,
        zmax=0.1,
        colorbar={'title': "P-Value", 'x': 1.05},
        name='Cointegration',
    )
)

fig.update_layout(
    title='🔥 Cointegration P-Values Heatmap<br><sub>Green = Strong Cointegration (p < 0.05)</sub>',
    width=900,
    height=900,
    xaxis_tickangle=-45,
    template="plotly_dark",
)
fig.show()

# ============================================
# CLUSTER SUMMARY STATISTICS
# ============================================
# Average p-value over every unordered pair inside each cluster
print("\n📊 Cluster Cointegration Summary:")
for cluster_num in range(1, n_clusters + 1):
    members = cluster_df.loc[cluster_df['cluster'] == cluster_num, 'trading_pair'].tolist()
    if len(members) <= 1:
        continue  # A single-member cluster has no internal pairs

    p_values = [
        cointegration_matrix.loc[first, second]
        for position, first in enumerate(members)
        for second in members[position + 1:]
        if first in cointegration_matrix.index and second in cointegration_matrix.columns
    ]
    if not p_values:
        continue

    avg_p = np.mean(p_values)
    if avg_p < 0.05:
        strength = 'Strong'
    elif avg_p < 0.1:
        strength = 'Moderate'
    else:
        strength = 'Weak'
    print(f"  • Cluster {cluster_num}: avg p-value = {avg_p:.4f} ({strength} cointegration)")

# ============================================
# TOP PAIRS SELECTION BY CLUSTER
# ============================================
def select_top_pairs(cluster_df, n_pairs_per_cluster=2):
    """Return the highest-volume rows of each cluster.

    Args:
        cluster_df: DataFrame with 'cluster', 'avg_volume' and
            'volume_stability' columns.
        n_pairs_per_cluster: Maximum number of rows kept per cluster.

    Returns:
        pd.DataFrame: The kept rows with their original index, clusters in
        order of first appearance and rows ordered by descending 'avg_volume'
        within each cluster. Empty DataFrame when no row qualifies.
    """
    # Rows whose stability is +inf are never eligible
    finite_stability = cluster_df['volume_stability'] != float('inf')

    picks = []
    for label in cluster_df['cluster'].unique():
        candidates = cluster_df[(cluster_df['cluster'] == label) & finite_stability]
        if candidates.empty:
            continue
        keep = min(n_pairs_per_cluster, len(candidates))
        picks.append(candidates.nlargest(keep, 'avg_volume'))

    return pd.concat(picks) if picks else pd.DataFrame()

# Pick the two highest-volume pairs of every cluster and list them per cluster
top_pairs = select_top_pairs(cluster_df, n_pairs_per_cluster=2)

if not top_pairs.empty:
    print("\n🏆 Top Trading Pairs by Cluster (by volume):")
    # groupby iterates the cluster labels in ascending order
    for cluster_num, cluster_result in top_pairs.groupby('cluster'):
        print(f"\n📊 Cluster {cluster_num}:")
        for entry in cluster_result.itertuples(index=False):
            print(f"  • {entry.trading_pair}: vol={entry.avg_volume:,.0f}, stability={entry.volume_stability:.3f}")

In [None]:
# 📋 Cluster Data Inspection
# Examine specific cluster details and characteristics
print(
    "📊 Sample cluster data structure:",
    f"  • Total pairs in analysis: {len(cluster_df)}",
    f"  • Columns: {list(cluster_df.columns)}",
    f"  • Cluster range: {cluster_df['cluster'].min()} to {cluster_df['cluster'].max()}",
    sep="\n",
)

# Print the second row field by field to show the value types involved
sample_row = cluster_df.iloc[1]
print("\n📋 Sample data point (Row 2):")
for col, val in sample_row.items():
    if not isinstance(val, float):
        rendered = f"{val}"
    elif np.isfinite(val):
        rendered = f"{val:.6f}"
    else:
        rendered = f"{val} (infinite/NaN)"
    print(f"  • {col}: {rendered}")

# The bare expression on the last line renders the first rows as the cell output
cluster_df.head()

In [None]:
# 📈 Time Series Cluster Analysis
# Visualize price movements and relationships within clusters over time
def plot_clusters_timeseries(candles, Z, pairs, cut_height=None, n_clusters=None):
    """
    Plot the cumulative return of every clustered trading pair, coloured by cluster.

    Args:
        candles: Iterable of candle objects exposing ``trading_pair`` and a
            ``data`` DataFrame with ``close`` and ``timestamp`` columns.
            Candles whose pair is not in ``pairs`` are ignored.
        Z: Linkage matrix from hierarchical clustering
        pairs: List of trading pair names, in the order used to build ``Z``
        cut_height: Height to cut dendrogram (alternative to n_clusters)
        n_clusters: Number of clusters to create

    Returns:
        clusters: Array of cluster assignments, one per entry of ``pairs``
    """
    # Determine cluster assignments
    if cut_height is not None:
        clusters = fcluster(Z, cut_height, criterion='distance')
    else:
        clusters = fcluster(Z, n_clusters, criterion='maxclust')

    # Look clusters up by name. `candles` can contain entries that never made it
    # into `pairs`, so matching the two lists by position would attach the wrong
    # name and cluster to a series.
    cluster_by_pair = dict(zip(pairs, clusters))

    # Prepare data for visualization
    plot_data = []
    for candle in candles:
        pair = candle.trading_pair
        if pair not in cluster_by_pair:
            continue
        if candle.data is None or candle.data.empty:
            continue

        df = candle.data.copy()
        # Cumulative returns for relative performance comparison (first point = 1)
        df['cum_returns'] = (1 + df['close'].pct_change().fillna(0)).cumprod()
        df['trading_pair'] = pair
        df['cluster'] = f'Cluster {cluster_by_pair[pair]}'
        plot_data.append(df)

    # pd.concat raises on an empty list, so bail out with a message instead
    if not plot_data:
        print("⚠️ No candle data available for the clustered pairs - nothing to plot")
        return clusters

    # Combine all time series data
    combined_df = pd.concat(plot_data, ignore_index=True)

    # Use the same `is not None` test as the cluster logic so a cut height of 0
    # is labelled correctly.
    if cut_height is not None:
        cut_description = "Cut Height: " + str(cut_height)
    else:
        cut_description = "Clusters: " + str(n_clusters)

    # Create interactive line plot
    fig = px.line(
        combined_df,
        x='timestamp',
        y='cum_returns',
        color='cluster',
        line_group='trading_pair',
        hover_data=['trading_pair', 'close'],
        title=f'📊 Cumulative Returns by Cluster ({cut_description})',
        labels={
            'timestamp': 'Time',
            'cum_returns': 'Cumulative Returns (1 = baseline)',
            'cluster': 'Market Cluster'
        }
    )

    fig.update_layout(
        width=1200,
        height=800,
        template="plotly_dark",
        hovermode="x unified"
    )

    fig.show()

    # Calculate and display cluster statistics
    cluster_stats = combined_df.groupby('cluster').agg({
        'cum_returns': ['mean', 'std', 'count'],
        'trading_pair': 'nunique'
    }).round(4)

    print("\n📊 Cluster Performance Statistics:")
    print(cluster_stats)

    return clusters

def plot_clusters_timeseries_normalized(candles, Z, pairs, cut_height=None, n_clusters=None):
    """
    Plot every clustered trading pair's price rebased to 1.0 at its first candle.

    Better for comparing relative performance across different price levels.

    Args:
        candles: Iterable of candle objects exposing ``trading_pair`` and a
            ``data`` DataFrame with ``close`` and ``timestamp`` columns.
            Candles whose pair is not in ``pairs`` are ignored.
        Z: Linkage matrix from hierarchical clustering
        pairs: List of trading pair names, in the order used to build ``Z``
        cut_height: Height to cut dendrogram (alternative to n_clusters)
        n_clusters: Number of clusters to create

    Returns:
        clusters: Array of cluster assignments, one per entry of ``pairs``
    """
    # Determine cluster assignments
    if cut_height is not None:
        clusters = fcluster(Z, cut_height, criterion='distance')
    else:
        clusters = fcluster(Z, n_clusters, criterion='maxclust')

    # Look clusters up by name. `candles` can contain entries that never made it
    # into `pairs`, so matching the two lists by position would attach the wrong
    # name and cluster to a series.
    cluster_by_pair = dict(zip(pairs, clusters))

    # Prepare normalized data
    plot_data = []
    for candle in candles:
        pair = candle.trading_pair
        if pair not in cluster_by_pair:
            continue
        if candle.data is None or candle.data.empty:
            continue

        df = candle.data.copy()
        # Normalize prices to start at 1.0 for comparison
        df['normalized_price'] = df['close'] / df['close'].iloc[0]
        df['trading_pair'] = pair
        df['cluster'] = f'Cluster {cluster_by_pair[pair]}'
        plot_data.append(df)

    # pd.concat raises on an empty list, so bail out with a message instead
    if not plot_data:
        print("⚠️ No candle data available for the clustered pairs - nothing to plot")
        return clusters

    # Combine data
    combined_df = pd.concat(plot_data, ignore_index=True)

    # Use the same `is not None` test as the cluster logic so a cut height of 0
    # is labelled correctly.
    if cut_height is not None:
        cut_description = "Cut Height: " + str(cut_height)
    else:
        cut_description = "Clusters: " + str(n_clusters)

    # Create visualization
    fig = px.line(
        combined_df,
        x='timestamp',
        y='normalized_price',
        color='cluster',
        line_group='trading_pair',
        hover_data=['trading_pair', 'close'],
        title=f'🎯 Normalized Price Movement by Cluster ({cut_description})',
        labels={
            'timestamp': 'Time',
            'normalized_price': 'Normalized Price (1 = start)',
            'cluster': 'Market Cluster'
        }
    )

    fig.update_layout(
        width=1200,
        height=800,
        template="plotly_dark",
        showlegend=True,
        hovermode="x unified"
    )

    fig.show()

    return clusters

print(
    "✅ Time series analysis functions defined",
    "🎯 Generating normalized price movement analysis...",
    sep="\n",
)

# Example usage: rebased price paths for a three-cluster cut of the tree.
# Note that this rebinds the notebook-level `clusters` array.
clusters = plot_clusters_timeseries_normalized(candles=candles, Z=Z, pairs=pairs, n_clusters=3)

In [None]:
# 🎯 Advanced Market Selection System
# Select representative markets from each cluster using comprehensive metrics
def _candle_market_metrics(candle):
    """Summarise one non-empty candle set as a flat dict of volume, volatility and liquidity metrics."""
    df = candle.data

    # Volume metrics (USD-based): per-candle volume times close price
    usd_volume = df['volume'] * df['close']
    avg_usd_volume = usd_volume.mean()
    volume_stability = usd_volume.std() / avg_usd_volume if avg_usd_volume != 0 else float('inf')

    # Price and volatility metrics
    volatility = df['close'].pct_change().dropna().std()
    price_mean = df['close'].mean()

    # Full high-low span relative to the mean close
    price_range = (df['high'].max() - df['low'].min()) / price_mean

    return {
        'trading_pair': candle.trading_pair,
        'avg_usd_volume': avg_usd_volume,
        'volatility': volatility,
        'price_mean': price_mean,
        'n_trades': len(df),  # number of candle rows (key name kept for compatibility)
        'volume_stability': volume_stability,
        'price_range': price_range,
        # Liquidity proxy: high volume + low volatility; epsilon avoids division by zero
        'liquidity_score': avg_usd_volume / (volatility + 1e-10),
    }


def select_representative_markets(candles, Z, pairs, n_clusters, top_n=1):
    """
    Select representative markets from each cluster using a composite score.

    The score combines min-max normalized USD volume (60%), inverse volatility
    (20%) and a liquidity proxy (20%). A scatter plot of all clustered markets
    is shown with the selected ones starred, and a text summary is printed.

    Args:
        candles: Iterable of candle objects exposing ``trading_pair`` and a
            ``data`` DataFrame with ``close``, ``high``, ``low`` and ``volume``
            columns.
        Z: Hierarchical clustering linkage matrix
        pairs: List of trading pair names, in the order used to build ``Z``
        n_clusters: Number of clusters to create
        top_n: Number of markets to select per cluster

    Returns:
        pd.DataFrame: Selected markets with their metrics, sorted by cluster and
        descending composite score. Empty when no candle has data.
    """
    # Get cluster assignments
    clusters = fcluster(Z, n_clusters, criterion='maxclust')

    # Create base DataFrame with cluster assignments
    cluster_df = pd.DataFrame({
        'trading_pair': pairs,
        'cluster': clusters
    })

    print(f"🔄 Calculating comprehensive metrics for {len(candles)} markets...")

    # ============================================
    # COMPREHENSIVE METRICS CALCULATION
    # ============================================
    market_metrics = [
        _candle_market_metrics(candle)
        for candle in candles
        if candle.data is not None and not candle.data.empty
    ]

    # With no metrics the merge below would fail on the missing 'trading_pair'
    # column, so report the problem and return an empty result instead.
    if not market_metrics:
        print("⚠️ No markets selected - check data availability")
        return pd.DataFrame()

    metrics_df = pd.DataFrame(market_metrics)

    # Left merge keeps exactly the clustered pairs, matched by name
    cluster_df = cluster_df.merge(metrics_df, on='trading_pair', how='left')

    # ============================================
    # METRIC NORMALIZATION AND SCORING
    # ============================================
    # Min-max normalize key metrics to a 0-1 scale for fair comparison
    for col in ['avg_usd_volume', 'volatility', 'liquidity_score']:
        min_val = cluster_df[col].min()
        max_val = cluster_df[col].max()
        if max_val > min_val:  # Avoid division by zero
            cluster_df[f'{col}_normalized'] = (cluster_df[col] - min_val) / (max_val - min_val)
        else:
            # Constant (or all-missing) column: no market stands out on this metric
            cluster_df[f'{col}_normalized'] = 0

    # Higher volume (60%) + Lower volatility (20%) + Higher liquidity (20%)
    cluster_df['composite_score'] = (
        cluster_df['avg_usd_volume_normalized'] * 0.6 +
        (1 - cluster_df['volatility_normalized']) * 0.2 +
        cluster_df['liquidity_score_normalized'] * 0.2
    )

    # ============================================
    # MARKET SELECTION BY CLUSTER
    # ============================================
    selected_markets = []
    for cluster_num in range(1, n_clusters + 1):
        cluster_markets = cluster_df[cluster_df['cluster'] == cluster_num].copy()

        if not cluster_markets.empty:
            # Select top markets based on composite score
            selected_markets.append(cluster_markets.nlargest(top_n, 'composite_score'))

    selected_df = pd.concat(selected_markets) if selected_markets else pd.DataFrame()

    # Sort results by cluster and score
    if not selected_df.empty:
        selected_df = selected_df.sort_values(['cluster', 'composite_score'], ascending=[True, False])

    # ============================================
    # VISUALIZATION
    # ============================================
    if not cluster_df.empty:
        # Volume vs volatility for every clustered market, sized by liquidity
        fig = px.scatter(
            cluster_df,
            x='avg_usd_volume',
            y='volatility',
            color='cluster',
            size='liquidity_score',
            hover_data=['trading_pair', 'avg_usd_volume', 'volatility', 'volume_stability', 'composite_score'],
            title=f'🎯 Market Selection Analysis (Top {top_n} per cluster)',
            labels={
                'avg_usd_volume': 'Average USD Volume',
                'volatility': 'Price Volatility',
                'cluster': 'Market Cluster',
                'liquidity_score': 'Liquidity Score'
            },
            log_x=True  # Use log scale for volume
        )

        # Highlight selected markets with star markers
        if not selected_df.empty:
            selected_pairs = selected_df['trading_pair'].tolist()
            for pair in selected_pairs:
                market_data = cluster_df[cluster_df['trading_pair'] == pair]
                if not market_data.empty:
                    fig.add_trace(
                        go.Scatter(
                            x=[market_data['avg_usd_volume'].iloc[0]],
                            y=[market_data['volatility'].iloc[0]],
                            mode='markers',
                            marker=dict(
                                symbol='star',
                                size=15,
                                line=dict(width=2, color='white'),
                                color='yellow'
                            ),
                            name=f'⭐ {pair}',
                            showlegend=False
                        )
                    )

        fig.update_layout(
            width=1000,
            height=600,
            template="plotly_dark",
            xaxis_title='💰 Average USD Volume (log scale)',
            yaxis_title='📊 Price Volatility'
        )
        fig.show()

    # ============================================
    # RESULTS SUMMARY
    # ============================================
    if not selected_df.empty:
        print(f"\n🏆 Selected Representative Markets ({top_n} per cluster):")

        for cluster_num in range(1, n_clusters + 1):
            cluster_results = selected_df[selected_df['cluster'] == cluster_num]
            if not cluster_results.empty:
                print(f"\n📊 Cluster {cluster_num} ({len(cluster_results)} selected):")

                # Format a copy for display; the returned frame keeps numeric values
                display_cols = ['trading_pair', 'avg_usd_volume', 'volatility', 'volume_stability', 'composite_score']
                formatted_results = cluster_results[display_cols].copy()

                formatted_results['avg_usd_volume'] = formatted_results['avg_usd_volume'].apply(lambda x: f"${x:,.0f}")
                formatted_results['volatility'] = formatted_results['volatility'].apply(lambda x: f"{x:.4f}")
                formatted_results['volume_stability'] = formatted_results['volume_stability'].apply(
                    lambda x: f"{x:.4f}" if np.isfinite(x) else "∞"
                )
                formatted_results['composite_score'] = formatted_results['composite_score'].apply(lambda x: f"{x:.4f}")

                print(formatted_results.to_string(index=False))
    else:
        print("⚠️ No markets selected - check data availability")

    return selected_df

print("🎯 Advanced market selection system ready")

# Cut the tree into three clusters and keep the two best-scoring markets of each
selected_markets = select_representative_markets(candles, Z, pairs, n_clusters=3, top_n=2)

selected_count, total_count = len(selected_markets), len(pairs)
print(f"\n✅ Market selection complete: {selected_count} markets selected from {total_count} total pairs")