In [2]:
import ccxt
import pandas as pd
from datetime import datetime
import numpy as np
import talib
from sklearn.model_selection import train_test_split

In [4]:
def fetch_crypto_data(symbol, timeframe, start_date):
    """Download OHLCV candles from Binance and store them as a CSV file.

    Candles are requested page by page (up to 1000 per call), starting at
    00:00:00 UTC of ``start_date`` and continuing until the exchange returns
    an empty page.

    Args:
        symbol: Market symbol in ccxt notation, e.g. ``"ETH/USDT"``.
        timeframe: ccxt timeframe string, e.g. ``"1h"``.
        start_date: First day to fetch, formatted ``YYYY-MM-DD``.

    Returns:
        DataFrame indexed by ``<symbol>_timestamp`` (datetime) with the
        columns ``<symbol>_open``, ``_high``, ``_low``, ``_close`` and
        ``_volume``, where ``/`` in the symbol is replaced by ``:``.

    Side effects:
        Writes ``<symbol>_price_data.csv`` to the current working directory.
    """
    exchange = ccxt.binance()

    # Maximum number of candles Binance returns per API call.
    limit = 1000

    since = exchange.parse8601(f"{start_date}T00:00:00Z")

    # parse_timeframe() yields the candle length in SECONDS; candle timestamps
    # are in milliseconds, so a single * 1000 is the correct conversion.
    # (Multiplying by 60 * 1000 would skip 59 candles after every page.)
    timeframe_ms = exchange.parse_timeframe(timeframe) * 1000

    all_ohlcv = []
    while True:
        ohlcv = exchange.fetch_ohlcv(symbol, timeframe, since, limit)

        # An empty page means there is nothing newer to download.
        if len(ohlcv) == 0:
            break

        all_ohlcv.extend(ohlcv)

        # Resume with the candle right after the last one received.
        since = ohlcv[-1][0] + timeframe_ms

    symbol_modified = symbol.replace("/", ":")
    timestamp_col = f"{symbol_modified}_timestamp"
    columns = [timestamp_col] + [
        f"{symbol_modified}_{field}"
        for field in ("open", "high", "low", "close", "volume")
    ]
    df = pd.DataFrame(all_ohlcv, columns=columns)

    # Exchange timestamps are epoch milliseconds; use them as a datetime index.
    df[timestamp_col] = pd.to_datetime(df[timestamp_col], unit="ms")
    df = df.set_index(timestamp_col)

    # Persist the candles so the feature builder can read them back later.
    df.to_csv(f"{symbol_modified}_price_data.csv")

    return df


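# Example usage: downloads hourly ETH/USDT candles from 2022-01-01 onwards and
# writes "ETH:USDT_price_data.csv" to the working directory.

# df = fetch_crypto_data("ETH/USDT", "1h", "2022-01-01")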

In [4]:
def add_target(df, symbol, day_to_forecast, periods_per_day=24):
    """Append the forward return over ``day_to_forecast`` days as target column.

    For every row ``t`` the target is ``close[t + h] / close[t] - 1`` with
    ``h = day_to_forecast * periods_per_day`` rows. The last ``h`` rows have no
    future price and therefore receive ``NaN``.

    Args:
        df: Frame containing a ``<symbol>_close`` column. It is modified in
            place (the target column is added) and also returned.
        symbol: Market symbol; ``/`` is replaced by ``:`` to build column names.
        day_to_forecast: Forecast horizon in days.
        periods_per_day: Number of rows that make up one day. The default of
            24 matches hourly candles.

    Returns:
        The same ``df`` object with the ``<symbol>_target`` column added.
    """
    symbol = symbol.replace("/", ":")
    horizon = day_to_forecast * periods_per_day
    close = df[f"{symbol}_close"]

    # pct_change(h) is the backward-looking return at each row; shifting it
    # by -h re-aligns it so each row carries the return of the NEXT h rows.
    df[f"{symbol}_target"] = close.pct_change(periods=horizon).shift(-horizon)
    return df

In [5]:
def get_features_and_target(symbol, feature_lags=[3, 9, 16], day_to_forecast=7):
    """Build technical-indicator features plus the forward-return target.

    Loads ``<symbol>_price_data.csv`` from the working directory (``/`` in
    ``symbol`` replaced by ``:``), derives indicator columns together with
    their lagged copies and deltas, appends the target via ``add_target`` and
    finally drops the raw OHLCV columns as well as every row that is
    incomplete or duplicated.

    Args:
        symbol: Market symbol, e.g. ``"BTC/USDT"`` or ``"BTC:USDT"``.
        feature_lags: Row offsets used for the ``*_lag_<n>`` and
            ``*_delta_<n>`` columns. The default list is only iterated, never
            mutated.
        day_to_forecast: Forecast horizon in days, forwarded to ``add_target``.

    Returns:
        DataFrame indexed by ``<symbol>_timestamp`` holding the feature
        columns and ``<symbol>_target``.

    Raises:
        ValueError: If the close, high, low or volume column is missing.
    """
    symbol = symbol.replace("/", ":")

    def col(suffix):
        # Every column of this data set is prefixed with the symbol.
        return f"{symbol}_{suffix}"

    features_df = pd.read_csv(
        f"{symbol}_price_data.csv", parse_dates=True, index_col=f"{symbol}_timestamp"
    )

    # Guard clause: the indicators below need these four price columns.
    required = ("close", "high", "low", "volume")
    if not all(col(name) in features_df.columns for name in required):
        raise ValueError("Required columns are missing in the DataFrame")

    close = features_df[col("close")]
    high = features_df[col("high")]
    low = features_df[col("low")]
    volume = features_df[col("volume")]

    # Moving averages (stored unshifted, unlike the indicators below).
    for window in (9, 20, 50, 200):
        features_df[col(f"sma_{window}")] = talib.SMA(close, timeperiod=window)

    # All following base indicators are stored shifted by one row.

    # RSI
    rsi = talib.RSI(close, timeperiod=14)
    features_df[col("rsi")] = rsi.shift(periods=1)

    # Bollinger Bands (the middle band is not used as a feature)
    upper_band, _middle_band, lower_band = talib.BBANDS(
        close, timeperiod=20, nbdevup=2, nbdevdn=2, matype=0
    )
    features_df[col("bollinger_up")] = upper_band.shift(periods=1)
    features_df[col("bollinger_down")] = lower_band.shift(periods=1)

    # ADX
    adx = talib.ADX(high, low, close, timeperiod=14)
    features_df[col("adx")] = adx.shift(periods=1)

    # MACD: distance between the MACD line and its signal line
    macd, macd_signal, _macd_hist = talib.MACD(
        close, fastperiod=12, slowperiod=26, signalperiod=9
    )
    features_df[col("macd_diff")] = (macd - macd_signal).shift(periods=1)

    # OBV
    obv = talib.OBV(close, volume)
    features_df[col("obv")] = obv.shift(periods=1)

    # Ichimoku Cloud (Conversion Line): midpoint of the 9-row high/low range
    nine_period_high = high.rolling(window=9).max()
    nine_period_low = low.rolling(window=9).min()
    features_df[col("ichimoku_conversion")] = (
        (nine_period_high + nine_period_low) / 2
    ).shift(periods=1)

    # Stochastic Oscillator
    stochastic_k, stochastic_d = talib.STOCH(high, low, close)
    features_df[col("stochastic_k")] = stochastic_k.shift(periods=1)
    features_df[col("stochastic_d")] = stochastic_d.shift(periods=1)

    # Aroon Indicator. TA-Lib returns the pair as (aroondown, aroonup), so the
    # DOWN line has to be unpacked first; otherwise both columns are swapped.
    aroon_down, aroon_up = talib.AROON(high, low, timeperiod=14)
    features_df[col("aroon_up")] = aroon_up.shift(periods=1)
    features_df[col("aroon_down")] = aroon_down.shift(periods=1)

    # Series that receive lagged and delta variants, in output column order.
    # NOTE(review): rsi/obv/stochastic/aroon start from the unshifted series
    # while macd_diff and ichimoku_conversion start from the columns that were
    # already shifted by one row, so their effective offsets differ by one.
    # Kept unchanged - confirm which convention is intended.
    lag_sources = {
        "rsi": rsi,
        "macd_diff": features_df[col("macd_diff")],
        "obv": obv,
        "ichimoku_conversion": features_df[col("ichimoku_conversion")],
        "stochastic_k": stochastic_k,
        "stochastic_d": stochastic_d,
        "aroon_up": aroon_up,
        "aroon_down": aroon_down,
    }
    for lag in feature_lags:
        # Value `lag` rows earlier.
        for name, series in lag_sources.items():
            features_df[col(f"{name}_lag_{lag}")] = series.shift(lag)
        # Change over the last `lag` rows (momentum).
        for name, series in lag_sources.items():
            features_df[col(f"{name}_delta_{lag}")] = series.diff(lag)

    # The target needs the close column, so add it before dropping raw prices.
    features_df = add_target(features_df, symbol, day_to_forecast)

    raw_columns = [col(name) for name in ("open", "high", "low", "close", "volume")]

    # Drop raw prices, turn infinities into NaN, then remove duplicated rows
    # and every row that still has a missing value (indicator warm-up rows and
    # the trailing rows without a future price).
    return (
        features_df.drop(columns=raw_columns)
        .replace([np.inf, -np.inf], np.nan)
        .drop_duplicates()
        .dropna()
    )

In [None]:
# Build the BTC/USDT feature/target table. get_features_and_target reads
# "BTC:USDT_price_data.csv" from the working directory, so that file (as
# written by fetch_crypto_data) must already exist when this cell runs.
full_df = get_features_and_target("BTC/USDT")

# Display the generated column names as the cell output.
full_df.columns

In [None]:
def prepare_data_for_ML(symbol, test_size=0.15, random_state=99):
    """Split the feature table of ``symbol`` into train and test sets.

    Args:
        symbol: Market symbol, e.g. ``"BTC/USDT"``; ``/`` is replaced by ``:``.
        test_size: Share of rows assigned to the test set.
        random_state: Seed passed to ``train_test_split`` for reproducibility.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)`` where the ``X`` parts hold
        the feature columns and the ``y`` parts the ``<symbol>_target`` column.
    """
    symbol = symbol.replace("/", ":")
    df = get_features_and_target(symbol)

    target_col = f"{symbol}_target"
    X = df.drop(columns=target_col)
    y = df[target_col].copy()

    # train_test_split returns the pieces grouped per input array, i.e.
    # (X_train, X_test, y_train, y_test) - unpack in exactly that order.
    # NOTE(review): the split shuffles rows by default; with time-ordered
    # candles and overlapping forward-return targets this may leak future
    # information into training - consider shuffle=False.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return (X_train, y_train, X_test, y_test)