In [2]:
import os
from datetime import datetime
from math import ceil

import matplotlib.pyplot as plt
import MetaTrader5 as mt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import statsmodels as sm
import talib as ta

In [4]:
# Connect to the MetaTrader5 terminal with the FTMO account.
# Credentials are read from the environment (MT5_LOGIN, MT5_PASSWORD and,
# optionally, MT5_SERVER) so they are never stored in the notebook itself.
# NOTE(review): a password was previously committed in this cell; treat it as
# leaked and rotate it.
login_mt5 = int(os.environ['MT5_LOGIN'])
mdp_mt5 = os.environ['MT5_PASSWORD']
server = os.environ.get('MT5_SERVER', 'FTMO-Demo')

# Both calls report failure through their return value, so check it explicitly
# instead of discovering the problem in a later cell.
if not mt.initialize():
    raise RuntimeError(f'MetaTrader5 initialize() failed: {mt.last_error()}')

if not mt.login(login_mt5, mdp_mt5, server):
    raise RuntimeError(f'MetaTrader5 login() failed: {mt.last_error()}')

# Quick sanity check that the terminal answers queries.
type(mt.positions_total())

int

In [3]:
# FX pairs requested from the terminal, laid out one row per base currency.
# The element order is significant only in that it is kept unchanged.
symbols = [
    'AUDCAD', 'AUDJPY', 'AUDNZD', 'AUDCHF', 'AUDUSD',
    'GBPAUD', 'GBPCAD', 'GBPJPY', 'GBPNZD', 'GBPCHF', 'GBPUSD',
    'CADJPY', 'CADCHF',
    'EURAUD', 'EURGBP', 'EURCAD', 'EURJPY', 'EURCHF', 'EURUSD', 'EURNZD',
    'NZDCAD', 'NZDCHF', 'NZDUSD', 'NZDJPY',
    'CHFJPY',
    'USDCAD', 'USDCHF', 'USDJPY',
]

In [4]:
def get_clean_mt5_data(tickers, interval, count=99999):
    """Download the latest bars of several symbols and stack them in one frame.

    Args:
        tickers (iterable of str): Symbols to request from the MetaTrader5 terminal.
        interval: MetaTrader5 timeframe constant (e.g. ``mt.TIMEFRAME_M1``).
        count (int): Number of bars requested per symbol, counted backwards
            from the moment of the call (default 99999).

    Returns:
        DataFrame indexed by ``(symbol, time)`` with the columns returned by
        ``mt.copy_rates_from`` except ``real_volume``.

    Raises:
        RuntimeError: If the terminal returns no bars for one of the symbols.
    """
    # Single reference timestamp so every symbol is requested up to the same instant.
    request_time = datetime.now()

    frames = []
    for ticker in tickers:
        rates = mt.copy_rates_from(ticker, interval, request_time, count)

        # A failed request yields None (or nothing); fail loudly with the
        # terminal's own error instead of a KeyError on the missing 'time' column.
        if rates is None or len(rates) == 0:
            raise RuntimeError(f'No rates returned for {ticker}: {mt.last_error()}')

        frame = pd.DataFrame(rates)
        frame['time'] = pd.to_datetime(frame['time'], unit='s')
        frame['symbol'] = ticker
        frames.append(frame.drop(['real_volume'], axis=1))

    return pd.concat(frames, axis=0).set_index(['symbol', 'time'])

In [5]:
def _window_setting(windows, key, default):
    """Return ``windows[key]`` unless it is missing or None, else ``default``."""
    value = windows.get(key)
    return default if value is None else value


def get_features_data(data,
                      momentums=True,
                      ema_spread=True,
                      ema_accel=True,
                      rsi=True,
                      rsi_lag=True,
                      rsi_longrun=True,
                      rsi_longrun_lag=True,
                      log_returns_lag=True,
                      rolling_std=True,
                      parkinson=True,
                      parkinson_lag=True,
                      tick_volume_lag=True,
                      spread_lag=True,
                      tickers_token=True,
                      frac_diff=False,
                      windows=None,
                      target_window=[1]):
    """
    Build per-symbol technical features and forward log-return targets.

    Every feature is computed inside each ``symbol`` group of the index, so
    values never mix two symbols. The input frame is left untouched; all
    work happens on a copy.

    Args:
    data (DataFrame): Bars indexed by a MultiIndex with a ``symbol`` level, holding
        ``open``, ``high``, ``low``, ``close``, ``tick_volume`` and ``spread`` columns
    momentums (bool): Add ``close`` percentage changes over several horizons (default True)
    ema_spread (bool): Add ``close`` minus its EMA (default True)
    ema_accel (bool): Add percentage changes of the EMA (default True)
    rsi (bool): Add the default-period RSI (default True)
    rsi_lag (bool): Add lagged RSI columns; only used when ``rsi`` is True (default True)
    rsi_longrun (bool): Add a 100-period RSI (default True)
    rsi_longrun_lag (bool): Add lagged 100-period RSI columns; only used when
        ``rsi_longrun`` is True (default True)
    log_returns_lag (bool): Add the one-bar log return and its lags (default True)
    rolling_std (bool): Add rolling standard deviations of the one-bar log return (default True)
    parkinson (bool): Add the per-bar Parkinson volatility (default True)
    parkinson_lag (bool): Add lagged Parkinson columns; only used when ``parkinson`` is True (default True)
    tick_volume_lag (bool): Add lagged tick volume columns (default True)
    spread_lag (bool): Add lagged spread columns (default True)
    tickers_token (bool): Add an integer code per symbol (default True)
    frac_diff (bool): Placeholder, not implemented yet (default False)
    windows (dict): Optional overrides of the default windows/lags, see the example below
    target_window (int or list of int): Horizon(s), in bars, of the forward log-return targets (default [1])

    Returns: tuple ``(features, prices)`` where ``features`` holds ``tick_volume``,
    ``spread``, the feature columns and the ``target_{n}p`` columns with incomplete
    rows removed, and ``prices`` holds the original OHLC, tick volume and spread columns.


    Example of use with a "windows" dictionary (these are the keys that are read):

    windows_example = {
        'momentum_windows': [10, 20, 30, 50],
        'ema_window': 14,
        'ema_accel_windows': [1, 5, 10, 30],
        'rsi_lags': [1, 5, 10, 20, 50],
        'rsi_longrun_lag': [20, 50, 100, 200, 300],
        'log_returns_lags': [1, 2, 3],
        'windows_std': [20, 50],
        'parkinson_lags': [10, 20, 30],
        'tick_volume_lags': [1, 5, 10, 20],
        'spread_lags': [5, 10, 20]
    }

    """

    if windows is None:
        windows = {}
    # The docstring historically advertised an int here; accept both forms.
    if np.isscalar(target_window):
        target_window = [target_window]

    # Work on a copy so the caller's frame does not silently grow feature columns.
    data = data.copy()

    def by_symbol(series):
        # transform/shift on a SeriesGroupBy always return a Series aligned with
        # the original index, including when only one symbol is present.
        return series.groupby(level='symbol', group_keys=False)

    close_by_symbol = by_symbol(data['close'])

    # Momentum
    if momentums:
        for lag in _window_setting(windows, 'momentum_windows', [30, 60, 120, 240, 480, 1060]):
            data[f'momentum_{lag}p'] = close_by_symbol.transform(lambda s: s.pct_change(lag))

    # EMA spread and EMA acceleration share the same EMA, so compute it as soon
    # as either of them is requested.
    if ema_spread or ema_accel:
        ema_window = _window_setting(windows, 'ema_window', 20)
        ema = close_by_symbol.transform(lambda s: ta.EMA(s, timeperiod=ema_window))

        if ema_spread:
            data[f'ema_spread{ema_window}'] = data.close - ema

        if ema_accel:
            ema_by_symbol = by_symbol(ema)
            for lag in _window_setting(windows, 'ema_accel_windows', [1, 5, 10, 30]):
                data[f'ema_accel_{lag}p'] = ema_by_symbol.transform(lambda s: s.pct_change(lag))

    # Default-period RSI and its lags
    if rsi:
        data['rsi'] = close_by_symbol.transform(lambda s: ta.RSI(s))

        if rsi_lag:
            rsi_by_symbol = by_symbol(data['rsi'])
            for lag in _window_setting(windows, 'rsi_lags', [1, 5, 10, 20, 25, 35, 50]):
                data[f'rsi_lag_{lag}'] = rsi_by_symbol.shift(lag)

    # Long-run (100-period) RSI and its lags
    if rsi_longrun:
        data['rsi_longrun'] = close_by_symbol.transform(lambda s: ta.RSI(s, timeperiod=100))

        if rsi_longrun_lag:
            # Lag the long-run RSI itself (not the short one).
            rsi_lr_by_symbol = by_symbol(data['rsi_longrun'])
            for lag in _window_setting(windows, 'rsi_longrun_lag', [10, 20, 30, 50, 100, 150, 200]):
                data[f'rsi_lr_lag_{lag}'] = rsi_lr_by_symbol.shift(lag)

    # One-bar log returns; also the input of the rolling standard deviation, so
    # they are computed whenever either feature is requested.
    if log_returns_lag or rolling_std:
        log_returns = np.log(data.close / close_by_symbol.shift(1))
        log_returns_by_symbol = by_symbol(log_returns)

    if log_returns_lag:
        data['log_returns_lag_0p'] = log_returns
        for lag in _window_setting(windows, 'log_returns_lags', list(range(1, 10))):
            data[f'log_returns_lag_{lag}p'] = log_returns_by_symbol.shift(lag)

    # Rolling standard deviation of the one-bar log returns
    if rolling_std:
        for window_std in _window_setting(windows, 'windows_std', [20, 50, 150, 300]):
            data[f'rolling_std_{window_std}w'] = log_returns_by_symbol.transform(
                lambda s: s.rolling(window_std).std())

    # Parkinson volatility: sqrt( ln(high/low)^2 / (4 ln 2) ). It only uses
    # same-row values, so no grouping is needed for the estimator itself.
    if parkinson:
        data['parkinson'] = np.sqrt(np.log(data.high / data.low) ** 2 / (4 * np.log(2)))

        if parkinson_lag:
            parkinson_by_symbol = by_symbol(data['parkinson'])
            for lag in _window_setting(windows, 'parkinson_lags', list(range(1, 10))):
                data[f'parkinson_{lag}p'] = parkinson_by_symbol.shift(lag)

    # Lagged tick volume
    if tick_volume_lag:
        tick_volume_by_symbol = by_symbol(data['tick_volume'])
        for lag in _window_setting(windows, 'tick_volume_lags', list(range(1, 10))):
            data[f'tick_volume_{lag}p'] = tick_volume_by_symbol.shift(lag)

    # Lagged spread
    if spread_lag:
        spread_by_symbol = by_symbol(data['spread'])
        for lag in _window_setting(windows, 'spread_lags', list(range(1, 10))):
            data[f'spread_lags_{lag}p'] = spread_by_symbol.shift(lag)

    # Integer code per symbol
    if tickers_token:
        data['ticker_token'] = pd.factorize(data.index.get_level_values(0))[0]

    # Fractional differentiation is not available yet
    if frac_diff:
        pass

    # Targets: log return from the current bar to `lag` bars ahead, computed
    # inside each symbol so the last bars of a symbol get NaN rather than a
    # value borrowed from the next symbol.
    for lag in target_window:
        data[f'target_{lag}p'] = np.log(close_by_symbol.shift(-lag) / data.close)

    # Final cleaning: keep the raw prices aside, drop them from the features,
    # remove incomplete rows, then neutralise infinities by forward fill.
    prices = data[['open', 'high', 'low', 'close', 'tick_volume', 'spread']]
    data = data.drop(['open', 'high', 'low', 'close'], axis=1)
    data = data.dropna(how='any')
    data = data.replace([np.inf, -np.inf], np.nan)
    data = data.ffill()

    return data, prices
            
            
        
            
        

In [6]:
def get_VAR_logic_data(data, momentum=True, lags=list(range(1, 11))):
    """Build a VAR-style table: every row carries the features of all symbols.

    Momentum (optional) and lagged log-return features are computed with
    ``get_features_data``; then, for each ``(symbol, time)`` row, the features of
    every symbol at that same ``time`` are attached as ``{feature}_{symbol}``
    columns, followed by the row's own target column.

    Args:
        data (DataFrame): Raw bars, in the format expected by ``get_features_data``.
        momentum (bool): Whether to include the momentum features (default True).
        lags (list of int): Lags of the one-bar log return to include (default 1..10).

    Returns:
        tuple ``(data, prices)``: the wide feature/target table with incomplete
        rows removed after a forward fill, and the ``prices`` frame returned by
        ``get_features_data``.
    """
    windows = {
        'momentum_windows': [30, 60, 120, 240, 480, 960, 2000],
        'log_returns_lags': lags,
    }

    data, prices = get_features_data(data,
                                     momentums=momentum,
                                     ema_spread=False,
                                     ema_accel=False,
                                     rsi=False,
                                     rsi_lag=False,
                                     rsi_longrun=False,
                                     rsi_longrun_lag=False,
                                     log_returns_lag=True,
                                     rolling_std=False,
                                     parkinson=False,
                                     parkinson_lag=False,
                                     tick_volume_lag=False,
                                     spread_lag=False,
                                     tickers_token=False,
                                     frac_diff=False,
                                     windows=windows,
                                     target_window=[1])
    data = data.drop(['tick_volume', 'spread'], axis=1)
    target_name = [column for column in data.columns if 'target' in column][0]
    target = data.pop(target_name)
    feature_columns = list(data.columns)

    # index.levels keeps symbols whose rows were all dropped upstream; prune
    # them so only symbols that still have data are considered.
    tickers = data.index.remove_unused_levels().levels[0]

    # Pivot once to a time-indexed table with one column per (feature, symbol)
    # instead of joining the growing frame once per symbol.
    # Requires unique (symbol, time) pairs.
    wide = data.unstack('symbol')
    ordered = [(column, ticker) for ticker in tickers for column in feature_columns]
    wide = wide.reindex(columns=pd.MultiIndex.from_tuples(ordered))
    wide.columns = [f'{column}_{ticker}' for column, ticker in ordered]

    # Attach the cross-symbol features to every (symbol, time) row, target last.
    data = target.to_frame().join(wide, on='time')
    data = data[list(wide.columns) + [target_name]]
    data = data.ffill()
    data = data.dropna()
    return data, prices
    

In [11]:
# Download one-minute bars for every pair in `symbols` into a single
# (symbol, time)-indexed frame.
data = get_clean_mt5_data(symbols, mt.TIMEFRAME_M1)

In [59]:
# Build the VAR-style feature table. This rebinds `data`, so the raw bars are
# no longer reachable under that name and the cell cannot simply be re-run.
data, prices = get_VAR_logic_data(data)

KeyboardInterrupt: 

In [25]:
# Persist the feature table and the matching prices in the same HDF5 file;
# the store is closed automatically when the `with` block exits.
with pd.HDFStore('data.h5') as store:
    for key, frame in (('data_rsi_filtre2', data), ('prices_rsi_filtre2', prices)):
        store.put(key, frame)

In [6]:
import time

# Rough cost of 100000 trivial loop iterations.
# perf_counter() is the monotonic, highest-resolution clock meant for timing
# short intervals; time() is wall-clock time and can be coarse or jump.
start = time.perf_counter()
for i in range(100000):
    x = 1+1
end = time.perf_counter()
print(end-start)

0.005003452301025391


In [12]:
# Window overrides for this run; every other window keeps its default.
feature_windows = {
    'windows_std': [50],
    'momentum_windows': [30, 60, 120, 240, 480, 1060, 2120, 3180],
}

# Feature set without lagged spread/volume/Parkinson/RSI and without the
# long-run RSI, with targets at several horizons.
data, prices = get_features_data(
    data,
    spread_lag=False,
    tick_volume_lag=False,
    parkinson_lag=False,
    rsi_lag=False,
    rsi_longrun=False,
    rsi_longrun_lag=False,
    windows=feature_windows,
    target_window=[1, 5, 10, 30, 60, 120, 240],
)

In [8]:
# Save the feature table; `key` is passed by keyword because newer pandas
# versions deprecate passing it positionally.
data.to_hdf('data.h5', key='HFT_M1_data')

In [9]:
# Summarise the saved frame: index range, column names and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 4056279 entries, ('AUDCAD', Timestamp('2022-12-29 15:01:00')) to ('USDTRY', Timestamp('2023-04-06 09:38:00'))
Data columns (total 73 columns):
 #   Column              Dtype  
---  ------              -----  
 0   tick_volume         uint64 
 1   spread              int32  
 2   momentum_30p        float64
 3   momentum_60p        float64
 4   momentum_120p       float64
 5   momentum_240p       float64
 6   momentum_480p       float64
 7   momentum_1060p      float64
 8   ema_spread20        float64
 9   ema_accel_1p        float64
 10  ema_accel_5p        float64
 11  ema_accel_10p       float64
 12  ema_accel_30p       float64
 13  rsi                 float64
 14  rsi_lag_1           float64
 15  rsi_lag_5           float64
 16  rsi_lag_10          float64
 17  rsi_lag_20          float64
 18  rsi_lag_25          float64
 19  rsi_lag_35          float64
 20  rsi_lag_50          float64
 21  rsi_longrun         float64
 22  rsi_lr_lag