In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler
import umap
import hdbscan
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load extracted features
print("Loading extracted features...")
features_file = "data/extracted_features.csv"
df = pd.read_csv(features_file)

Loading extracted features...


In [3]:
# Convert Time to datetime
df['Time'] = pd.to_datetime(df['Time'])

print(f"Loaded {df.shape[0]} rows with {df.shape[1]} features")

# Group features by category
liquidity_features = [col for col in df.columns if any(x in col for x in ['bid_ask_spread', 'imbalance', 'depth', 'slope'])]
volatility_features = [col for col in df.columns if any(x in col for x in ['volatility', 'zscore'])]
trend_features = [col for col in df.columns if any(x in col for x in ['return', 'trend', 'rsi'])]
volume_features = [col for col in df.columns if any(x in col for x in ['volume', 'trade', 'vwap'])]

# Print feature counts by category
print(f"Liquidity features: {len(liquidity_features)}")
print(f"Volatility features: {len(volatility_features)}")
print(f"Trend features: {len(trend_features)}")
print(f"Volume features: {len(volume_features)}")

# Feature selection - let's use the most important features from each category
selected_features = [
    # Liquidity
    'bid_ask_spread_bps', 'imbalance_lvl1', 'cum_depth_imbalance', 
    'bid_slope', 'ask_slope', 'mean_bid_price_spacing', 'mean_ask_price_spacing',
    'bid_depth_5lvl', 'ask_depth_5lvl',
    
    # Volatility
    'volatility_10s', 'volatility_30s', 'volatility_60s',
    'zscore_10s', 'zscore_30s', 'zscore_60s',
    
    # Trend
    'return_10s', 'return_30s', 'return_60s',
    'rsi_10s', 'rsi_30s', 'rsi_60s',
    'trend_slope_10s', 'trend_slope_30s', 'trend_slope_60s',
    
    # Volume
    'volume_10s', 'volume_30s', 'volume_60s',
    'volume_imbalance_10s', 'volume_imbalance_30s', 'volume_imbalance_60s',
    'avg_trade_size_30s', 'vwap_shift_30s'
]

# Filter out features that might not exist in the dataframe
existing_features = [f for f in selected_features if f in df.columns]
print(f"Using {len(existing_features)} features for clustering")

# Drop rows with NaN values in selected features
data_for_clustering = df[['Time'] + existing_features].dropna()
print(f"After dropping NaN values: {data_for_clustering.shape[0]} rows")

Loaded 1526373 rows with 75 features
Liquidity features: 37
Volatility features: 8
Trend features: 12
Volume features: 17
Using 32 features for clustering
After dropping NaN values: 1526373 rows


In [4]:
# Separate features and time
X = data_for_clustering[existing_features]
times = data_for_clustering['Time']

# Apply PCA for dimensionality reduction
print("Applying PCA...")
pca = PCA(n_components=0.95)  # Keep enough components to explain 95% of variance
X_pca = pca.fit_transform(X)
print(f"PCA reduced dimensions from {X.shape[1]} to {X_pca.shape[1]}")
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
print(f"Cumulative explained variance: {np.sum(pca.explained_variance_ratio_)}")

# Plot PCA components
plt.figure(figsize=(10, 6))
plt.bar(range(1, len(pca.explained_variance_ratio_) + 1), pca.explained_variance_ratio_)
plt.xlabel('PCA Component')
plt.ylabel('Explained Variance Ratio')
plt.title('PCA Components Explained Variance')
plt.savefig('pca_components.png')
plt.close()

# Apply K-means clustering for different K values
print("Applying KMeans clustering...")
inertia = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_pca)
    inertia.append(kmeans.inertia_)

# Plot elbow curve
plt.figure(figsize=(10, 6))
plt.plot(range(2, 11), inertia, marker='o')
plt.xlabel('Number of clusters (K)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal K')
plt.savefig('elbow_curve.png')
plt.close()

# Select optimal K (this can be adjusted based on the elbow curve)
optimal_k = 4  # Adjust this based on the elbow curve
print(f"Selected optimal K: {optimal_k}")

# Apply K-means with optimal K
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_pca)

# Add cluster labels to the original data
data_for_clustering['cluster'] = clusters

# Try DBSCAN as an alternative clustering method
print("Applying DBSCAN clustering...")
dbscan = DBSCAN(eps=1.0, min_samples=10)
dbscan_clusters = dbscan.fit_predict(X_pca)
data_for_clustering['dbscan_cluster'] = dbscan_clusters

# Analyze clusters
print("\nK-means Cluster Analysis:")
for i in range(optimal_k):
    cluster_data = data_for_clustering[data_for_clustering['cluster'] == i]
    print(f"\nCluster {i} - {len(cluster_data)} samples")
    
    # Analyze key characteristics for each dimension
    
    # 1. Trending vs Mean-reverting
    mean_trend_slope = cluster_data['trend_slope_30s'].mean()
    mean_rsi = cluster_data['rsi_30s'].mean()
    print(f"  Trend characteristics: Slope={mean_trend_slope:.4f}, RSI={mean_rsi:.2f}")
    trend_type = "Trending" if abs(mean_trend_slope) > 0.0001 else "Mean-reverting"
    trend_direction = "Upward" if mean_trend_slope > 0 else "Downward"
    if trend_type == "Trending":
        print(f"  Regime: {trend_direction} {trend_type}")
    else:
        print(f"  Regime: {trend_type}")
    
    # 2. Volatile vs Stable
    mean_volatility = cluster_data['volatility_30s'].mean()
    print(f"  Volatility: {mean_volatility:.6f}")
    volatility_type = "Volatile" if mean_volatility > 0.0001 else "Stable"
    print(f"  Regime: {volatility_type}")
    
    # 3. Liquid vs Illiquid
    mean_spread = cluster_data['bid_ask_spread_bps'].mean()
    mean_depth = (cluster_data['bid_depth_5lvl'].mean() + cluster_data['ask_depth_5lvl'].mean()) / 2
    print(f"  Liquidity: Spread={mean_spread:.2f} bps, Depth={mean_depth:.2f}")
    liquidity_type = "Illiquid" if mean_spread > 1.0 else "Liquid"
    print(f"  Regime: {liquidity_type}")
    
    # Overall regime classification
    print(f"  Overall Market Regime: {trend_direction if trend_type=='Trending' else ''} {trend_type}, {volatility_type}, {liquidity_type}")

# Visualize clusters in 2D PCA space
plt.figure(figsize=(12, 10))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='viridis', alpha=0.7)
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('Market Regime Clusters')
plt.colorbar(label='Cluster')
plt.savefig('cluster_visualization.png')
plt.close()

# Visualize DBSCAN clusters
plt.figure(figsize=(12, 10))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=dbscan_clusters, cmap='viridis', alpha=0.7)
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('DBSCAN Market Regime Clusters')
plt.colorbar(label='Cluster')
plt.savefig('dbscan_cluster_visualization.png')
plt.close()

# Create time series visualization of clusters
plt.figure(figsize=(16, 6))
plt.scatter(data_for_clustering['Time'], data_for_clustering['mid_price'] if 'mid_price' in data_for_clustering.columns else range(len(data_for_clustering)), 
           c=data_for_clustering['cluster'], cmap='viridis', alpha=0.7, s=10)
plt.xlabel('Time')
plt.ylabel('Mid Price' if 'mid_price' in data_for_clustering.columns else 'Index')
plt.title('Market Regimes Over Time')
plt.colorbar(label='Regime Cluster')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('regimes_over_time.png')
plt.close()

# Save clustering results
print("Saving results...")
results_file = "market_regimes.csv"
data_for_clustering.to_csv(results_file, index=False)
print(f"Results saved to {results_file}")

# Create feature importance analysis
print("Analyzing feature importance for each cluster...")
cluster_centers = kmeans.cluster_centers_
feature_importance = pd.DataFrame()

# Map back from PCA to original features
for i in range(optimal_k):
    # Get the contribution of each PCA component to this cluster center
    cluster_center_pca = cluster_centers[i]
    
    # Transform back to original space (approximate)
    # This is a rough approximation using the PCA components and loadings
    importance = np.abs(np.dot(pca.components_.T, cluster_center_pca))
    feature_importance[f'Cluster_{i}'] = importance

feature_importance.index = existing_features

# Save feature importance
feature_importance.to_csv('feature_importance.csv')

# Create heatmap of feature importance
plt.figure(figsize=(14, 10))
sns.heatmap(feature_importance, annot=False, cmap='viridis')
plt.title('Feature Importance by Cluster')
plt.tight_layout()
plt.savefig('feature_importance_heatmap.png')
plt.close()

print("Analysis complete!")

Applying PCA...
PCA reduced dimensions from 32 to 3
Explained variance ratio: [0.75425946 0.18850756 0.04732261]
Cumulative explained variance: 0.9900896343857355
Applying KMeans clustering...
Selected optimal K: 4
Applying DBSCAN clustering...

K-means Cluster Analysis:

Cluster 0 - 398385 samples
  Trend characteristics: Slope=0.0068, RSI=65.43
  Regime: Upward Trending
  Volatility: 0.000073
  Regime: Stable
  Liquidity: Spread=0.03 bps, Depth=0.00
  Regime: Liquid
  Overall Market Regime: Upward Trending, Stable, Liquid

Cluster 1 - 373206 samples
  Trend characteristics: Slope=0.0053, RSI=57.40
  Regime: Upward Trending
  Volatility: 0.000075
  Regime: Stable
  Liquidity: Spread=-0.00 bps, Depth=0.00
  Regime: Liquid
  Overall Market Regime: Upward Trending, Stable, Liquid

Cluster 2 - 393834 samples
  Trend characteristics: Slope=-0.0069, RSI=35.45
  Regime: Downward Trending
  Volatility: 0.000078
  Regime: Stable
  Liquidity: Spread=-0.01 bps, Depth=-0.01
  Regime: Liquid
  Ove