In [38]:
import pandas as pd
import numpy as np
from scipy.signal import argrelextrema
import matplotlib.pyplot as plt
from untrade.client import Client
import ta
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from arch import arch_model  # For GARCH model

from sklearn.cluster import MiniBatchKMeans


In [13]:
# Load the BTC price CSV; the 'datetime' column is parsed as timestamps and used as the index.
# NOTE(review): this is an absolute, machine-specific path — the notebook cannot be
# re-run on another machine without editing it.
btc_csv_path = '/Users/tejasmacipad/Desktop/Final_inter_IIT_submission/BTC/BTC_2019_2023_30m.csv'
data = pd.read_csv(btc_csv_path, index_col='datetime', parse_dates=['datetime'])

# Show the first rows to confirm the columns that were read.
print(data.head())

                     Unnamed: 0      open      high       low     close  \
datetime                                                                  
2019-09-08 17:30:00           0  10000.00  10000.00  10000.00  10000.00   
2019-09-08 18:00:00           1  10000.00  10000.00  10000.00  10000.00   
2019-09-08 18:30:00           2  10000.00  10000.00  10000.00  10000.00   
2019-09-08 19:00:00           3  10344.77  10357.53  10342.90  10354.62   
2019-09-08 19:30:00           4  10354.62  10357.35  10337.43  10340.12   

                      volume  
datetime                      
2019-09-08 17:30:00    0.002  
2019-09-08 18:00:00    0.000  
2019-09-08 18:30:00    0.000  
2019-09-08 19:00:00  136.177  
2019-09-08 19:30:00  335.482  


In [14]:
# Remove repeated candles by timestamp, keeping the first occurrence of each.
# A row-wise `drop_duplicates(keep=False)` compares every column, including the
# `Unnamed: 0` column visible in the preview above (it appears to be a running
# row counter — TODO confirm), so two rows would practically never compare equal;
# and `keep=False` discards *every* copy of a repeated row instead of keeping one,
# which would punch a hole in the time series. De-duplicating on the datetime
# index is also idempotent, so re-running this cell is safe.
data = data[~data.index.duplicated(keep='first')]

Generating technical indicators used to classify the market regime:

1) Bearish Market
2) Bullish Market
3) Sideways Market
4) High Volatility Market
5) Trend Reversal Market

In [50]:
# Independent single-column ('close') copy, so later work never writes back into `data`.
historical_data = data.loc[:, ["close"]].copy()

def feature_engineering(data):
    """Derive the indicator columns used for market-regime clustering.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``close`` column; any other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        A new frame holding ``close`` plus ``return``, ``log_return``,
        ``volatility`` (20-row rolling std of ``return``), ``rsi``,
        ``macd_diff``, ``bollinger_hband``, ``bollinger_lband``,
        ``scaled_return`` (``return`` * 100) and ``garch_volatility``.
        Every row that still contains a NaN in any column is dropped.
    """
    close = data['close']
    mod_data = data[["close"]].copy()

    mod_data['return'] = close.pct_change()
    mod_data['log_return'] = np.log(close / close.shift(1))
    mod_data['volatility'] = mod_data['return'].rolling(window=20).std()
    mod_data['rsi'] = ta.momentum.RSIIndicator(close).rsi()
    mod_data['macd_diff'] = ta.trend.MACD(close).macd_diff()
    bollinger = ta.volatility.BollingerBands(close)
    mod_data['bollinger_hband'] = bollinger.bollinger_hband()
    mod_data['bollinger_lband'] = bollinger.bollinger_lband()

    # The GARCH model is fitted on returns multiplied by 100 (rescale=False keeps
    # that scaling fixed); the fitted volatility is divided by 100 again below.
    mod_data['scaled_return'] = mod_data['return'] * 100

    # Fit on the rows that actually have a return, and remember which rows those
    # were. Writing the result back through the same mask keeps the two aligned
    # even when more than the first return is NaN (e.g. a missing close), where
    # a positional `iloc[1:]` assignment would fail with a length mismatch.
    has_return = mod_data['scaled_return'].notna().to_numpy()
    garch_model = arch_model(mod_data['scaled_return'][has_return], vol='Garch', p=1, q=1, rescale=False)
    garch_fit = garch_model.fit(disp="off")
    mod_data['garch_volatility'] = np.nan
    mod_data.loc[has_return, 'garch_volatility'] = np.asarray(garch_fit.conditional_volatility) / 100

    # Drop rows where any column is still NaN.
    mod_data = mod_data.dropna()

    return mod_data

# --- Build the feature matrix and standardise it for clustering --------------
historical_mod_data = feature_engineering(historical_data)
features = historical_mod_data[['volatility', 'rsi', 'macd_diff', 'bollinger_hband', 'bollinger_lband', 'garch_volatility']]

scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# --- Pick the cluster count in 2..5 with the highest silhouette score --------
# The winning model and its labels are kept as the search goes, so the model
# does not have to be fitted a second time afterwards (same data and
# random_state would reproduce the same fit anyway).
optimal_clusters = None
best_score = -1
best_labels = None
for n_clusters in range(2, 6):
    candidate = KMeans(n_clusters=n_clusters, random_state=42)
    labels = candidate.fit_predict(features_scaled)
    score = silhouette_score(features_scaled, labels)
    if score > best_score:
        best_score = score
        optimal_clusters = n_clusters
        kmeans = candidate
        best_labels = labels

historical_mod_data['regime'] = best_labels

# NOTE(review): the cluster-id -> name assignment below is fixed by hand and is
# not derived from the clusters themselves — confirm it matches the fitted model.
# The search above can select up to 5 clusters while only ids 0-2 have a name;
# any other id is labelled 'Neutral' (the same fallback predict_market_condition
# uses) instead of being left as NaN.
regime_mapping = {0: 'Bullish', 1: 'Bearish', 2: 'Sideways'}
historical_mod_data['regime'] = historical_mod_data['regime'].map(regime_mapping).fillna('Neutral')

# --- Volatility regime: one standard deviation either side of the mean -------
vol_mean = historical_mod_data['garch_volatility'].mean()
vol_std = historical_mod_data['garch_volatility'].std()
high_vol_threshold = vol_mean + vol_std
low_vol_threshold = vol_mean - vol_std
historical_mod_data['volatility_regime'] = np.where(historical_mod_data['garch_volatility'] > high_vol_threshold, 'High Volatility',
                                                   np.where(historical_mod_data['garch_volatility'] < low_vol_threshold, 'Low Volatility',
                                                            'Moderate Volatility'))

Refined Code

In [63]:
# Accumulator for engineered rows; predict_market_condition appends to it on each call.
updating_data = pd.DataFrame(
    columns=[
        'close',
        'return',
        'log_return',
        'volatility',
        'rsi',
        'macd_diff',
        'bollinger_hband',
        'bollinger_lband',
        'scaled_return',
        'garch_volatility',
        'regime',
    ]
)

# Shared model state: `kmeans` stays None until initialize_models assigns a fitted
# model; `scaler` is fitted by initialize_models and reused for prediction.
kmeans = None
scaler = StandardScaler()

def feature_engineering(data):
    """Derive the indicator columns used for market-regime clustering.

    NOTE(review): this re-defines the ``feature_engineering`` from the earlier
    cell with the same logic; the later definition is the one in effect once
    this cell has run.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``close`` column; any other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        A new frame holding ``close`` plus ``return``, ``log_return``,
        ``volatility`` (20-row rolling std of ``return``), ``rsi``,
        ``macd_diff``, ``bollinger_hband``, ``bollinger_lband``,
        ``scaled_return`` (``return`` * 100) and ``garch_volatility``.
        Every row that still contains a NaN in any column is dropped.
    """
    close = data['close']
    mod_data = data[["close"]].copy()
    mod_data['return'] = close.pct_change()
    mod_data['log_return'] = np.log(close / close.shift(1))
    mod_data['volatility'] = mod_data['return'].rolling(window=20).std()
    mod_data['rsi'] = ta.momentum.RSIIndicator(close).rsi()
    mod_data['macd_diff'] = ta.trend.MACD(close).macd_diff()
    bollinger = ta.volatility.BollingerBands(close)
    mod_data['bollinger_hband'] = bollinger.bollinger_hband()
    mod_data['bollinger_lband'] = bollinger.bollinger_lband()
    # The GARCH model is fitted on returns multiplied by 100 (rescale=False keeps
    # that scaling fixed); the fitted volatility is divided by 100 again below.
    mod_data['scaled_return'] = mod_data['return'] * 100

    # Fit on the rows that actually have a return and write the result back
    # through the same mask. A positional `iloc[1:]` assignment only lines up
    # when the first row is the sole NaN return; with any other missing value
    # it fails with a length mismatch.
    has_return = mod_data['scaled_return'].notna().to_numpy()
    garch_model = arch_model(mod_data['scaled_return'][has_return], vol='Garch', p=1, q=1, rescale=False)
    garch_fit = garch_model.fit(disp="off")
    mod_data['garch_volatility'] = np.nan
    mod_data.loc[has_return, 'garch_volatility'] = np.asarray(garch_fit.conditional_volatility) / 100
    # Drop rows where any column is still NaN.
    mod_data = mod_data.dropna()
    return mod_data

def initialize_models(historical_data):
    """Fit the shared scaler, KMeans model and volatility thresholds.

    Runs ``feature_engineering`` on ``historical_data``, standardises the six
    clustering features with the module-level ``scaler``, tries 2 to 5 KMeans
    clusters and keeps the model with the highest silhouette score, then sets
    the volatility thresholds to the mean of ``garch_volatility`` plus/minus
    one standard deviation.

    Parameters
    ----------
    historical_data : pandas.DataFrame
        Passed straight to ``feature_engineering``.

    Side effects
    ------------
    Fits the global ``scaler`` and assigns the globals ``kmeans``,
    ``high_vol_threshold`` and ``low_vol_threshold``. Progress is printed.
    """
    global updating_data, kmeans, scaler, high_vol_threshold, low_vol_threshold

    print("Starting feature engineering...")
    initial_features = feature_engineering(historical_data)

    features = initial_features[['volatility', 'rsi', 'macd_diff',
                               'bollinger_hband', 'bollinger_lband', 'garch_volatility']]

    # Scaling does not depend on the cluster count, so fit the scaler once
    # instead of refitting it on identical data in every loop iteration.
    features_scaled = scaler.fit_transform(features)

    print("Finding optimal number of clusters...")
    optimal_clusters = None
    best_score = -1
    best_model = None

    for n_clusters in range(2, 6):
        candidate = KMeans(n_clusters=n_clusters, random_state=42)
        labels = candidate.fit_predict(features_scaled)
        score = silhouette_score(features_scaled, labels)
        print(f"Clusters: {n_clusters}, Silhouette Score: {score:.3f}")
        if score > best_score:
            best_score = score
            optimal_clusters = n_clusters
            best_model = candidate

    print(f"Optimal number of clusters: {optimal_clusters}")

    # Reuse the already-fitted winner; refitting with the same data and
    # random_state would only repeat the same work.
    kmeans = best_model

    vol_mean = initial_features['garch_volatility'].mean()
    vol_std = initial_features['garch_volatility'].std()
    high_vol_threshold = vol_mean + vol_std
    low_vol_threshold = vol_mean - vol_std


def predict_market_condition(new_data):
    """Classify the most recent row of accumulated data.

    Runs ``feature_engineering`` on ``new_data``, appends the result to the
    global ``updating_data`` and predicts a regime for its last row.

    Parameters
    ----------
    new_data : pandas.DataFrame
        Passed straight to ``feature_engineering``.

    Returns
    -------
    tuple of (str, str)
        ``(market_condition, vol_regime)``. ``market_condition`` is 'Bullish',
        'Bearish' or 'Sideways' for cluster ids 0, 1, 2 and 'Neutral' for any
        other id. ``vol_regime`` is 'High Volatility', 'Low Volatility' or
        'Moderate Volatility'. While fewer than 20 rows have accumulated the
        result is ``("Neutral", "Moderate Volatility")``.

    Raises
    ------
    RuntimeError
        If enough rows are available but ``kmeans`` has not been set yet.
    """
    global updating_data, kmeans, high_vol_threshold, low_vol_threshold

    new_mod_data = feature_engineering(new_data)
    if updating_data.empty:
        # Seed the store directly: concatenating onto the empty, object-typed
        # placeholder frame is deprecated in pandas and can turn the numeric
        # columns into object dtype.
        updating_data = new_mod_data.reindex(columns=updating_data.columns)
    else:
        updating_data = pd.concat([updating_data, new_mod_data])

    if len(updating_data) < 20:
        return "Neutral", "Moderate Volatility"

    if kmeans is None:
        raise RuntimeError("initialize_models() must be called before predict_market_condition()")

    # Only the newest row is classified, so scale just that row rather than the
    # whole accumulated history (the scaler transforms each row independently).
    feature_columns = ['volatility', 'rsi', 'macd_diff',
                       'bollinger_hband', 'bollinger_lband', 'garch_volatility']
    latest_features = updating_data[feature_columns].iloc[[-1]].astype(float)
    latest_scaled = scaler.transform(latest_features)

    predicted_regime = int(kmeans.predict(latest_scaled)[0])
    # NOTE(review): the id -> name assignment is fixed by hand — confirm it
    # matches the clusters the fitted model actually produced.
    regime_mapping = {0: 'Bullish', 1: 'Bearish', 2: 'Sideways'}
    market_condition = regime_mapping.get(predicted_regime, 'Neutral')

    latest_vol = updating_data['garch_volatility'].iloc[-1]
    if latest_vol > high_vol_threshold:
        vol_regime = 'High Volatility'
    elif latest_vol < low_vol_threshold:
        vol_regime = 'Low Volatility'
    else:
        vol_regime = 'Moderate Volatility'

    return market_condition, vol_regime

In [64]:
# Fit the shared scaler, the KMeans regime model and the volatility thresholds on the historical closes.
initialize_models(historical_data)

Starting feature engineering...
Finding optimal number of clusters...
Clusters: 2, Silhouette Score: 0.336
Clusters: 3, Silhouette Score: 0.344
Clusters: 4, Silhouette Score: 0.337
Clusters: 5, Silhouette Score: 0.236
Optimal number of clusters: 3
