## Generate charts

In [6]:
# required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mplfinance as mpf
from PIL import Image
import os, shutil
import multiprocessing as mp
import tqdm
from datetime import datetime
import create_chart
import json

In [7]:
def _clear_directory(folder):
    """Delete every file, symlink and sub-directory found directly in `folder`.

    A folder that does not exist yet is treated as already empty. A failure to
    delete an individual entry is reported and skipped, so one locked file does
    not abort the whole run.
    """
    if not os.path.isdir(folder):
        return

    for filename in os.listdir(folder):
        file_path = os.path.join(folder, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print('Failed to delete %s. Reason: %s' % (file_path, e))


def generate_training_data(curr_pair, candle_hist = 40, nb_pool_wrks = 5, chunk_size=1, overwrite=False):
    """Build chart windows for one currency pair and render them in parallel.

    The price and signal parquet files stored under
    ``./data/fx_data/<AAA>_<BBB>/`` are merged on their index, bid/ask prices
    are averaged into single open/close/high/low columns, EMA columns are
    added, and the result is cut into overlapping windows. Each window is
    handed to ``create_chart.create_chart_parellel`` through a process pool.
    Whatever the workers return is appended, keyed by the current timestamp,
    to the JSON log ``<AAA>_<BBB>_chart_log.txt`` in the same folder.

    Parameters
    ----------
    curr_pair : str
        Pair identifier; characters 0-2 and everything from index 4 onward are
        used as the two currency codes (e.g. ``'EUR/USD'``).
    candle_hist : int
        Row offset between the first and the last timestamp of a window.
        Label slicing is inclusive, so with a unique index every window holds
        ``candle_hist + 1`` rows.
    nb_pool_wrks : int
        Number of worker processes in the pool.
    chunk_size : int
        ``chunksize`` forwarded to ``Pool.imap``.
    overwrite : bool
        When true, the content of the ``train_imgs`` folder is deleted first,
        so every window is regenerated.

    Returns
    -------
    None
    """
    # the pair is sliced positionally, the separator at index 3 is skipped
    ccy_1, ccy_2 = curr_pair[:3], curr_pair[4:]

    folder_path = f'./data/fx_data/{ccy_1}_{ccy_2}/'
    img_folder  = folder_path + 'train_imgs/'
    log_path    = folder_path + ccy_1 + '_' + ccy_2 + "_chart_log.txt"

    # delete existing content of the training-data folder
    if overwrite:
        _clear_directory(img_folder)

    # extract saved signal and price data | merge
    fx_signal_data = pd.read_parquet(folder_path + f'fx_data_{ccy_1}_{ccy_2}_w_sig.parquet')
    fx_data        = pd.read_parquet(folder_path + f'fx_data_{ccy_1}_{ccy_2}.parquet').set_index('date')
    fx_final = fx_data.merge(right=fx_signal_data,
                             left_index=True,
                             right_index=True,
                             how='inner')

    # print status
    print('\n------------------------------------\n'
          f'Curr. Pair: {ccy_1}-{ccy_2}\n\n'
          f'Start-time: {fx_final.index.min()},\nEnd-time: {fx_final.index.max()},\n\n'
          f'Signal-Breakdown\n{pd.DataFrame(fx_final.signal.value_counts())}\n\n'
          f'Dataset-size:{fx_final.shape}\n'
          '--------------------------')

    # create single `open', `close', `high', `low' metrics as the bid/ask mid-point
    for field in ('open', 'close', 'high', 'low'):
        fx_final[field] = (fx_final['bid' + field] + fx_final['ask' + field]) / 2
    fx_final = fx_final.sort_index()

    # filter out only the required metrics / structure the dataframe
    fx_final = (fx_final
                .loc[:, ['open', 'close', 'high', 'low', 'tickqty', 'signal', 'signal_count']]
                .rename(columns={'tickqty': 'volume'}))

    # create basic moving-average indicators (50 EMA on close/high/low, 200 EMA on close)
    ema_specs = (('ewm_50_m', 'close', 50),
                 ('ewm_50_h', 'high', 50),
                 ('ewm_50_l', 'low', 50),
                 ('ewm_200', 'close', 200))
    for ema_col, src_col, span in ema_specs:
        fx_final[ema_col] = fx_final[src_col].ewm(span=span,
                                                  min_periods=0,
                                                  adjust=False,
                                                  ignore_na=False).mean()

    # Only timestamps newer than the most recent existing image are processed.
    # With no usable images (missing/empty folder, unparsable names) every
    # timestamp is processed.
    window_times = fx_final.index
    existing_imgs = os.listdir(img_folder) if os.path.isdir(img_folder) else []
    if existing_imgs:
        try:
            # characters 8-23 of each file name hold the chart time stamp
            max_time_existing = max(datetime.strptime(file_name[8:24], '%Y-%m-%d_%H-%M')
                                    for file_name in existing_imgs)
            window_times = fx_final.index[fx_final.index > max_time_existing]
        except (ValueError, TypeError) as err:
            # ValueError: a file name does not match the expected time format
            # TypeError: the index cannot be compared with a naive datetime
            print(f'Could not resume from existing charts ({err}); processing the full dataset.')

    # pair every timestamp with the one `candle_hist` rows later -> (start, end) of a window
    # NOTE(review): in incremental mode windows start at the first *new* timestamp,
    # so they never reach back into already-charted history -- confirm this is intended.
    time_li = window_times.to_list()
    data_idx_chuncks = list(zip(time_li, time_li[candle_hist:]))

    ### structure the data for parallel-processing ###

    # each window carries the pair codes and the target folder along with it
    fx_final["curr_1"] = ccy_1
    fx_final["curr_2"] = ccy_2
    fx_final["f_path"] = folder_path

    # create dataframe chunks based on time indexes (label slicing, both ends inclusive)
    print("\n Partitioning dataset... ", end='\r')
    df_chuncks = [fx_final.loc[start:end] for start, end in tqdm.tqdm(data_idx_chuncks)]

    # create folder to store images
    os.makedirs(folder_path + 'train_imgs', exist_ok=True)

    # generate candle-stick charts (parallelized)
    pool = mp.Pool(processes=nb_pool_wrks)
    print("\n Genarating charts... ", end='\r')
    try:
        predicted_dates = list(tqdm.tqdm(pool.imap(create_chart.create_chart_parellel,
                                                   df_chuncks,
                                                   chunksize=chunk_size),
                                         total=len(df_chuncks)))
    except BaseException:
        # includes KeyboardInterrupt: stop the workers instead of leaving them running
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()

    ## update the chart image creation log ##
    # read old log datafile; start a fresh log when it is missing or not valid JSON
    try:
        with open(log_path) as json_file:
            log_data = json.load(json_file)
    except (FileNotFoundError, json.JSONDecodeError):
        log_data = {}

    # add new log data
    log_data[datetime.now().strftime('%Y-%m-%d %H:%M:%S')] = predicted_dates

    # overwrite the log datafile
    with open(log_path, 'w') as outfile:
        json.dump(log_data, outfile)

    print(f"\nExecution complete for {ccy_1}-{ccy_2} Pair!\n"
          '------------------------------------', end='\r')
    

In [8]:
# Pairs to process; each entry is two three-letter codes joined by a
# single separator character.
curr_pairs = [
    'EUR/USD',
    'GBP/USD',
    'USD/CHF',
    'AUD/USD',
    'USD/CAD',
    'NZD/USD',
    'EUR/CHF',
    'EUR/GBP',
    'EUR/AUD',
    'EUR/CAD',
]

In [9]:
# Run the chart generation once per configured pair. `overwrite=True` clears
# each pair's image folder first, so every run regenerates all charts.
for curr_pair in curr_pairs:
    generate_training_data(
        curr_pair,
        candle_hist=40,
        nb_pool_wrks=6,
        chunk_size=2,
        overwrite=True,
    )


------------------------------------
Curr. Pair: EUR-USD

Start-time: 2020-06-25 07:15:00,
End-time: 2020-11-20 18:15:00,

Signal-Breakdown
      signal
HOLD   10326
SELL      17
BUY       15

Dataset-size:(10358, 11)
--------------------------

 Partitioning dataset... 

100%|██████████████████████████████████████████████████████████████████████████| 10318/10318 [00:04<00:00, 2514.56it/s]



 Genarating charts... 

 77%|███████████████████████████████████████████████████████████▏                 | 7925/10318 [24:49<06:50,  5.83it/s]

KeyboardInterrupt: 