# In [None]:
import pandas as pd
import numpy as np
import pickle
import os
from datetime import datetime
from binance.client import Client
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from hmmlearn import hmm
import matplotlib.pyplot as plt

# ==== USER CONFIG ====
# Trading pair passed to the kline download and used in output file names / plot title.
SYMBOL = "ETHUSDT"
# Candle size requested from Binance (5-minute klines).
INTERVAL = Client.KLINE_INTERVAL_5MINUTE
# Directory the regime-labelled CSV is written to (created on demand by the main script).
CSV_FOLDER = "data"
# NOTE(review): MODEL_FOLDER and PCA_EXPORT_FOLDER are not referenced anywhere in the
# visible code — presumably reserved for persisting the fitted model / PCA output.
MODEL_FOLDER = "models"
PCA_EXPORT_FOLDER = "pca_exports"
# Number of hidden states (regimes) for the HMM; also drives the plotting loop.
N_COMPONENTS = 3

# ==== BINANCE CLIENT ====
# Client is built with empty API key/secret strings.
# NOTE(review): presumably sufficient for public market-data endpoints — confirm.
client = Client("", "")

# ==== FETCH ALL HISTORICAL DATA ====
def fetch_all_binance_data(symbol, interval, api_client=None, limit=1000):
    """Download the complete kline history for ``symbol`` page by page.

    Args:
        symbol: Trading pair, e.g. ``"ETHUSDT"``.
        interval: Kline interval string accepted by the Binance client.
        api_client: Object exposing ``get_klines(symbol=..., interval=...,
            limit=..., startTime=...)``. Defaults to the module-level
            ``client``.
        limit: Maximum number of candles requested per page.

    Returns:
        A ``DataFrame`` indexed by ``open_time`` (datetime) with the twelve
        kline fields; ``open``/``high``/``low``/``close``/``volume`` are cast
        to float, ``close_time`` is converted to datetime, and the remaining
        columns are left as returned by the API.
    """
    if api_client is None:
        api_client = client  # fall back to the module-level Binance client

    print("Fetching historical data from Binance...")
    all_data = []
    # Page forward from timestamp 0. Leaving startTime unset makes the
    # endpoint return only the most recent `limit` candles, so the previous
    # version stopped after roughly one page instead of the full history.
    start_ts = 0
    while True:
        klines = api_client.get_klines(
            symbol=symbol, interval=interval, limit=limit, startTime=start_ts
        )
        if not klines:
            break
        all_data.extend(klines)
        if len(klines) < limit:
            # A short page means the newest candle has been reached.
            break
        # Resume one millisecond after the last candle's open time so the
        # next page does not repeat it.
        start_ts = klines[-1][0] + 1

    df = pd.DataFrame(all_data, columns=[
        "open_time", "open", "high", "low", "close", "volume",
        "close_time", "quote_asset_volume", "number_of_trades",
        "taker_buy_base_asset_volume", "taker_buy_quote_asset_volume", "ignore"
    ])
    df["open_time"] = pd.to_datetime(df["open_time"], unit="ms")
    df["close_time"] = pd.to_datetime(df["close_time"], unit="ms")
    price_cols = ["open", "high", "low", "close", "volume"]
    df[price_cols] = df[price_cols].astype(float)
    df.set_index("open_time", inplace=True)
    print(f"Total candles fetched: {len(df)}")
    return df

# ==== FEATURE ENGINEERING ====
def add_features(df):
    """Append the six regime-model features to ``df`` and drop incomplete rows.

    The frame is modified in place (new columns are added and every row that
    contains a NaN is removed) and the same object is returned.

    Columns added: ``Log_Return_1``, ``Log_Return_4``, ``Rolling_Std_12``,
    ``Log_Volume``, ``Close_Position_Range_12`` and ``Signal_Noise_Ratio_12``.
    Requires ``close``, ``high``, ``low`` and ``volume`` columns.
    """
    window = 12
    close = df["close"]

    # Log returns over 1 and 4 bars, plus the 12-bar volatility of the
    # 1-bar return.
    one_bar_return = np.log(close / close.shift(1))
    df["Log_Return_1"] = one_bar_return
    df["Log_Return_4"] = np.log(close / close.shift(4))
    df["Rolling_Std_12"] = one_bar_return.rolling(window).std()

    # +1 keeps the logarithm defined for zero-volume bars.
    df["Log_Volume"] = np.log(df["volume"] + 1)

    # Where the close sits inside the 12-bar low/high range.
    window_low = df["low"].rolling(window).min()
    window_high = df["high"].rolling(window).max()
    df["Close_Position_Range_12"] = (close - window_low) / (window_high - window_low)

    # Drift of the 12-bar mean versus the close 12 bars ago, scaled by the
    # 12-bar standard deviation; the epsilon guards against division by zero.
    close_window = close.rolling(window)
    drift = close_window.mean() - close.shift(window)
    df["Signal_Noise_Ratio_12"] = drift / (close_window.std() + 1e-9)

    df.dropna(inplace=True)
    return df

# ==== TRAIN HMM ====
def train_hmm(df_features, n_components=N_COMPONENTS):
    """Standardize the features, reduce them with PCA and fit a Gaussian HMM.

    Args:
        df_features: DataFrame holding only the feature columns.
        n_components: Number of hidden states for the HMM.

    Returns:
        ``(model, scaler, pca, X_pca)`` — the fitted HMM, the fitted
        ``StandardScaler``, the fitted ``PCA`` and the PCA-projected training
        matrix the HMM was fitted on.
    """
    raw_matrix = df_features.values

    scaler = StandardScaler()
    scaler.fit(raw_matrix)
    standardized = scaler.transform(raw_matrix)

    # A fractional n_components asks PCA to keep as many components as are
    # needed to explain that share of the variance.
    pca = PCA(n_components=0.95)
    pca.fit(standardized)
    X_pca = pca.transform(standardized)

    # Fixed random_state keeps the HMM initialisation reproducible.
    model = hmm.GaussianHMM(
        n_components=n_components,
        covariance_type="full",
        n_iter=100,
        random_state=42,
    )
    model.fit(X_pca)

    return model, scaler, pca, X_pca

# ==== MAIN ====
# Download candles, build features, fit the HMM and export labelled data.
df = fetch_all_binance_data(SYMBOL, INTERVAL)
df = add_features(df)
features = ["Log_Return_1", "Log_Return_4", "Rolling_Std_12",
            "Log_Volume", "Close_Position_Range_12", "Signal_Noise_Ratio_12"]
df_features = df[features]
if df_features.empty:
    # Fail with a clear message instead of an obscure estimator error.
    raise RuntimeError(f"No feature rows available for {SYMBOL}; cannot train the HMM.")

# Train HMM
hmm_model, scaler, pca, X_pca = train_hmm(df_features)

# Predict regimes
# X_pca is already the scaled + PCA-projected version of df_features, so
# reuse it rather than transforming again (which also passed a DataFrame to
# estimators fitted on a plain array).
regimes = hmm_model.predict(X_pca)
df["HMM_REGIME"] = regimes

# Export CSV
os.makedirs(CSV_FOLDER, exist_ok=True)
# Timestamped file name so repeated runs do not overwrite earlier exports.
csv_path = os.path.join(CSV_FOLDER, f"{SYMBOL}_hmm_regimes_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
df.reset_index().to_csv(csv_path, index=False)
print(f"HMM regimes saved to: {csv_path}")


# Plot the close price with one scatter layer per detected regime.
plt.figure(figsize=(15, 6))
base_colors = ['blue', 'red', 'green']
plt.plot(df.index, df['close'], color='lightgray', linewidth=0.6, label='Close Price')
for i in range(N_COMPONENTS):
    regime_data = df[df["HMM_REGIME"] == i]
    # Only three named colors exist; beyond that fall back to matplotlib's
    # "CN" cycle colors so N_COMPONENTS > 3 does not raise IndexError.
    regime_color = base_colors[i] if i < len(base_colors) else f"C{i}"
    plt.scatter(regime_data.index, regime_data['close'], s=5, alpha=0.6,
                label=f'Regime {i}', color=regime_color)
plt.title(f"{SYMBOL} Market Regimes (HMM - {N_COMPONENTS} Components)")
plt.xlabel("Date")
plt.ylabel("Close Price")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
