---
## Generate training (chart) data - PARALLELIZE EXPERIMENT

In [1]:
# required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mplfinance as mpf
import multiprocessing as mp
from PIL import Image
import os
import tqdm
from datetime import datetime
import create_chart
import json

In [2]:
# Currency pairs known to this notebook; each entry is two three-letter
# codes separated by '/'.
curr_pairs = [
    'EUR/USD', 'GBP/USD', 'USD/CHF', 'AUD/USD', 'USD/CAD',
    'NZD/USD', 'EUR/CHF', 'EUR/GBP', 'EUR/AUD', 'EUR/CAD',
]

# This run processes the first pair of the list.
curr_pair = curr_pairs[0]

In [3]:
# Load the stored price table and signal table for the selected pair and
# join them on their index (inner join: only timestamps present in both).
pair_tag    = f'{curr_pair[:3]}_{curr_pair[4:]}'
folder_path = f'./data/fx_data/{pair_tag}/'

fx_signal_data = pd.read_parquet(f'{folder_path}fx_data_{pair_tag}_w_sig.parquet')
fx_data        = pd.read_parquet(f'{folder_path}fx_data_{pair_tag}.parquet')

# the price table carries its timestamps in a 'date' column; index by it
fx_data = fx_data.set_index('date')

fx_final = pd.merge(fx_data, fx_signal_data,
                    left_index=True, right_index=True, how='inner')

# Status report: pair, covered time range, signal counts and frame shape.
separator = '--------------------------'
status_lines = [
    separator,
    f'Curr. Pair: {curr_pair[:3]}-{curr_pair[4:]}',
    '',
    f'Start-time: {fx_final.index.min()},',
    f'End-time: {fx_final.index.max()},',
    '',
    'Signal-Breakdown',
    f'{pd.DataFrame(fx_final.signal.value_counts())}',
    '',
    f'Dataset-size:{fx_final.shape}',
    separator,
]
print('\n'.join(status_lines))


--------------------------
Curr. Pair: EUR-USD

Start-time: 2020-06-25 07:15:00,
End-time: 2020-11-19 13:45:00,

Signal-Breakdown
      signal
HOLD   10225
BUY       10
SELL       7

Dataset-size:(10242, 11)
--------------------------


In [4]:
# Collapse every bid/ask pair into one price per candle field by taking the
# arithmetic mean of the two quotes.
for price_field in ('open', 'close', 'high', 'low'):
    bid_quote = fx_final.loc[:, f'bid{price_field}']
    ask_quote = fx_final.loc[:, f'ask{price_field}']
    fx_final.loc[:, price_field] = (bid_quote + ask_quote) / 2

# keep the rows ordered by their index
fx_final = fx_final.sort_index()

In [5]:
# Keep only the columns used downstream and expose the tick count under the
# name 'volume'.
fx_final = (
    fx_final
    .loc[:, ['open', 'close', 'high', 'low', 'tickqty', 'signal', 'signal_count']]
    .rename(columns={'tickqty': 'volume'})
)

In [6]:
# Exponentially weighted moving averages used as chart indicators.
# Each entry is (new column, source price column, span); the order fixes the
# order in which the columns are appended.
ema_specs = (
    ('ewm_50_m', 'close', 50),
    ('ewm_50_h', 'high',  50),
    ('ewm_50_l', 'low',   50),
    ('ewm_200',  'close', 200),
)

for ema_col, price_col, ema_span in ema_specs:
    smoother = fx_final[price_col].ewm(span=ema_span,
                                       min_periods=0,
                                       adjust=False,
                                       ignore_na=False)
    fx_final[ema_col] = smoother.mean()

In [7]:
# number of candle-stick history to consider for each prediction
look_back_time = 40

# Decide which timestamps still need a chart. If images already exist, only
# timestamps after the newest existing image are used; otherwise the whole
# index is used.
try:
    # get the image labels in the target folder
    file_name_li = os.listdir(folder_path+'train_imgs/')

    # characters 8..23 of every file name are parsed as 'YYYY-mm-dd_HH-MM'
    # NOTE(review): which candle of the chart this timestamp refers to is
    # decided by create_chart, not visible here - confirm.
    file_time_li = [datetime.strptime(file_name[8:24], '%Y-%m-%d_%H-%M') for file_name in file_name_li]

    # get the most recent time (raises ValueError when the folder is empty)
    max_time_existing = max(file_time_li)

    # filter time index for new data (excluding already existing data)
    fx_final_date_filtered = fx_final.index[fx_final.index>max_time_existing]
    candidate_times = fx_final_date_filtered.to_list()

except (OSError, ValueError) as err:
    # OSError: image folder missing/unreadable; ValueError: folder empty or a
    # file name without a parsable timestamp. Only these expected failures
    # trigger the fallback, so Ctrl-C and genuine bugs are no longer hidden.
    print(f'No usable existing images ({type(err).__name__}: {err}); '
          'building chunks for the full dataset.')
    candidate_times = fx_final.index.to_list()

# pair every timestamp with the one `look_back_time` positions later; each
# (start, end) pair delimits one chart window
data_idx_chuncks = list(zip(candidate_times, candidate_times[look_back_time:]))

In [8]:
# Prepare self-contained inputs for the parallel chart rendering step.

# Every row carries the two currency codes and the output folder as constant
# columns, so each slice brings this information along with it.
fx_final = fx_final.assign(curr_1=curr_pair[:3],
                           curr_2=curr_pair[4:],
                           f_path=folder_path)

# One dataframe slice per (start, end) timestamp pair; label slicing with
# .loc includes both end points.
df_chuncks = [fx_final.loc[window_start:window_end]
              for window_start, window_end in tqdm.tqdm(data_idx_chuncks)]

100%|████████████████████████████████████████████████████████████████████████████| 5046/5046 [00:01<00:00, 2607.34it/s]


In [10]:
# create folder to store images; exist_ok makes this a no-op when the folder
# is already there and avoids the check-then-create race of a separate
# os.path.exists() test
os.makedirs(folder_path+'train_imgs', exist_ok=True)

In [11]:
# Render all chunks in parallel with 5 worker processes.
# The context manager shuts the workers down once the block exits, so no
# processes are left behind (the results are fully collected inside it).
# imap yields results in the same order as df_chuncks, which lets tqdm show
# progress while the list is being built.
# NOTE(review): what create_chart_parellel returns per chunk is defined in
# the create_chart module - presumably the chart's timestamp; confirm.
with mp.Pool(processes=5) as pool:
    predicted_dates = list(
        tqdm.tqdm(pool.imap(create_chart.create_chart_parellel, df_chuncks),
                  total=len(df_chuncks))
    )

100%|██████████████████████████████████████████████████████████████████████████████| 5046/5046 [19:13<00:00,  4.37it/s]


In [35]:
# JSON log file of chart-generation runs for this currency pair
log_path = folder_path+curr_pair[:3]+'_'+curr_pair[4:]+"_chart_log.txt"

# read old log datafile; on the very first run there is none yet, so start
# with an empty log instead of failing
try:
    with open(log_path) as json_file:
        log_data = json.load(json_file)
except FileNotFoundError:
    log_data = {}

# add new log data, keyed by the current local time
# NOTE(review): predicted_dates must be JSON-serialisable - confirm against
# the return type of create_chart.create_chart_parellel.
log_data[datetime.now().strftime('%Y-%m-%d %H:%M:%S')] = predicted_dates

# serialise before opening the file: opening with 'w' truncates it, so a
# serialisation error must not be able to wipe the existing log
log_payload = json.dumps(log_data)

# overwrite the log datafile
with open(log_path, 'w') as outfile:
    outfile.write(log_payload)