# Features Engineering Pipelines
## Create a data feature engineering pipelines for cleaned datasets

Calculate FD(Fractal Dimension) and Volatility 

## Packages & Configurations

In [1]:
# load packages
import pandas as pd
import numpy as np

import arcticdb as adb

from datetime import datetime, timedelta, timezone
import time


In [2]:
# Configurations
DB_PATH = '/Users/zway/Desktop/BTC_Project/DB'

# Read datasets

In [3]:
# Read dataset(s)
# Open ArcticDB libraries
ac = adb.Arctic(f"lmdb://{DB_PATH}")
cleaned_lib = ac['cleaned_data']
engineered_lib = ac.get_library('engineered_data', create_if_missing=True)

# Load the cleaned BTC data
df = cleaned_lib.read('btc_cleaned_sample').data

df.info(), df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36111 entries, 0 to 36110
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   id         36111 non-null  int64         
 1   Timestamp  36111 non-null  datetime64[ns]
 2   Open       36111 non-null  float64       
 3   High       36111 non-null  float64       
 4   Low        36111 non-null  float64       
 5   Close      36111 non-null  float64       
 6   Volume     36111 non-null  float64       
 7   Avg_price  36111 non-null  float64       
dtypes: datetime64[ns](1), float64(6), int64(1)
memory usage: 2.2 MB


(None,
         id           Timestamp     Open     High      Low    Close    Volume  \
 0  1675787 2023-07-01 00:00:00  30407.1  30418.2  30407.1  30416.9  0.000027   
 1  1675788 2023-07-01 00:01:00  30416.9  30416.9  30392.3  30393.2  0.000016   
 2  1675789 2023-07-01 00:02:00  30393.5  30417.3  30393.5  30402.2  0.000074   
 3  1675790 2023-07-01 00:03:00  30404.1  30420.8  30404.1  30413.4  0.000056   
 4  1675791 2023-07-01 00:04:00  30413.6  30427.2  30413.4  30422.8  0.000039   
 
    Avg_price  
 0   30412.65  
 1   30404.60  
 2   30405.40  
 3   30412.45  
 4   30420.30  )

# Define Features Calculating Functions

Find keltner bands and count the number of times the average price cross keltner bands to find the FD.

In [4]:
def count_crossings_vectorized(prices, lower_bands, upper_bands):
    p1, p2 = prices[:-1], prices[1:] # price at t & price at t + 1
    
    # expand dimensions: make prices and bands into 2D matrices for vectorized broadcasting
    p1_matrix, p2_matrix = p1[None, :], p2[None, :] # Shape (1, T-1)
    lower_matrix, upper_matrix = lower_bands[:, None], upper_bands[:, None] # Shape (B, 1)
    
    cross_up_lower = (p1_matrix < lower_matrix) & (lower_matrix < p2_matrix) # Cross up lower band
    cross_down_lower = (p1_matrix > lower_matrix) & (lower_matrix > p2_matrix) # Cross down lower band
    cross_up_upper = (p1_matrix < upper_matrix) & (upper_matrix < p2_matrix) # Cross up upper band
    cross_down_upper = (p1_matrix > upper_matrix) & (upper_matrix > p2_matrix) # Cross down upper band
    # The result is a B x (T−1) boolean matrix that shows which bands were crossed at which times.
    
    crossing = cross_down_lower| cross_down_upper | cross_up_lower | cross_up_upper
    return crossing.sum()

def compute_keltner_fd(df_window):
    # Compute normalized fd using vectorized crossing count across 1000 keltner bands
    
    if df_window.shape[0] < 3:
        return np.nan

    prices = df_window['Avg_price'].values
    mean_price = prices.mean()
    
    n_range = np.arange(1, 1001)
    deviations = n_range * 0.00001 * mean_price
    
    upper_bands = mean_price + deviations
    lower_bands = mean_price - deviations

    return count_crossings_vectorized(prices, lower_bands, upper_bands)  # Normalize later if needed


Volatility calculated by using formula: Price Range / Mean Price 

Normalized Volatility = Volatility / Global Max Volatility

In [5]:
def compute_features(df_name, df_input, work_lib):
    start_ts = df_input['Timestamp'].min()
    end_ts = df_input['Timestamp'].max()

    intervals = pd.date_range(start=start_ts, end=end_ts, freq='6min')
    
    raw_fd_list = []
    feature_rows = []
    global_max_vol = 0
    
    for i in range(len(intervals) - 1):
        # slice df into window
        t0, t1 = intervals[i], intervals[i+1]
        df_window = df_input[(df_input['Timestamp'] >= t0) & (df_input['Timestamp'] < t1)]
        
        print(f"{i}, Processing interval {t0} - {t1}")
        if df_window.empty:
            feature_rows.append([t0, t1] + [np.nan] * 6)
            raw_fd_list.append(np.nan)
            continue
                
        avg_price = df_window['Avg_price'].values
        
        # find the mean price in this interval
        mean_price = avg_price.mean()
        
        max_price = max(df_window['High'].values)
        min_price = min(df_window['Low'].values)
        
        price_range = (max_price - min_price)
        
        vol = (price_range / mean_price) if mean_price else 0
        global_max_vol = max(global_max_vol, vol)
        norm_vol = (vol / global_max_vol) if global_max_vol else 0
        
        fd = compute_keltner_fd(df_window) # raw fd, haven't normalized
        raw_fd_list.append(fd)
        
        feature_rows.append([t0, t1, mean_price, max_price, min_price, price_range, norm_vol, None])
    
    # Normalize FD
    max_fd = max([fd for fd in raw_fd_list if not np.isnan(fd)], default=1)
    norm_fd_list = [fd / max_fd if not np.isnan(fd) else np.nan for fd in raw_fd_list]
    
    # Insert normalized FD into feature rows
    for i in range(len(feature_rows)):
        feature_rows[i][7] = norm_fd_list[i]
    
    
    columns = ['start_time', 'end_time', 'mean_price', 'max_price', 'min_price', 'price_range', 'norm_vol', 'norm_fd']
    features = pd.DataFrame(feature_rows, columns=columns)
    ac[work_lib].write(df_name, data=features)
    print(f"{df_name} _> {features.shape}")
    return features
        

# Compute

In [6]:
try:
    compute_features('BTC_engineered_sample', df, 'engineered_data')
    print('Feature Calculation Completed!')
except Exception as e:
    print(f"Error processing: {e}")

0, Processing interval 2023-07-01 00:00:00 - 2023-07-01 00:06:00
1, Processing interval 2023-07-01 00:06:00 - 2023-07-01 00:12:00
2, Processing interval 2023-07-01 00:12:00 - 2023-07-01 00:18:00
3, Processing interval 2023-07-01 00:18:00 - 2023-07-01 00:24:00
4, Processing interval 2023-07-01 00:24:00 - 2023-07-01 00:30:00
5, Processing interval 2023-07-01 00:30:00 - 2023-07-01 00:36:00
6, Processing interval 2023-07-01 00:36:00 - 2023-07-01 00:42:00
7, Processing interval 2023-07-01 00:42:00 - 2023-07-01 00:48:00
8, Processing interval 2023-07-01 00:48:00 - 2023-07-01 00:54:00
9, Processing interval 2023-07-01 00:54:00 - 2023-07-01 01:00:00
10, Processing interval 2023-07-01 01:00:00 - 2023-07-01 01:06:00
11, Processing interval 2023-07-01 01:06:00 - 2023-07-01 01:12:00
12, Processing interval 2023-07-01 01:12:00 - 2023-07-01 01:18:00
13, Processing interval 2023-07-01 01:18:00 - 2023-07-01 01:24:00
14, Processing interval 2023-07-01 01:24:00 - 2023-07-01 01:30:00
15, Processing inter

In [7]:
df_features = engineered_lib.read('BTC_engineered_sample').data
print(df_features.head())

           start_time            end_time    mean_price  max_price  min_price  \
0 2023-07-01 00:00:00 2023-07-01 00:06:00  30412.208333    30427.2    30392.3   
1 2023-07-01 00:06:00 2023-07-01 00:12:00  30432.991667    30449.8    30414.9   
2 2023-07-01 00:12:00 2023-07-01 00:18:00  30403.083333    30450.3    30361.4   
3 2023-07-01 00:18:00 2023-07-01 00:24:00  30411.458333    30428.8    30393.2   
4 2023-07-01 00:24:00 2023-07-01 00:30:00  30435.150000    30467.4    30410.1   

   price_range  norm_vol   norm_fd  
0         34.9  1.000000  0.036554  
1         34.9  0.999317  0.043081  
2         88.9  1.000000  0.152742  
3         35.6  0.400340  0.050914  
4         57.3  0.643865  0.060487  


In [8]:
print(df_features)

              start_time            end_time    mean_price  max_price  \
0    2023-07-01 00:00:00 2023-07-01 00:06:00  30412.208333    30427.2   
1    2023-07-01 00:06:00 2023-07-01 00:12:00  30432.991667    30449.8   
2    2023-07-01 00:12:00 2023-07-01 00:18:00  30403.083333    30450.3   
3    2023-07-01 00:18:00 2023-07-01 00:24:00  30411.458333    30428.8   
4    2023-07-01 00:24:00 2023-07-01 00:30:00  30435.150000    30467.4   
...                  ...                 ...           ...        ...   
7432 2023-07-31 23:12:00 2023-07-31 23:18:00  29167.120000    29176.4   
7433 2023-07-31 23:18:00 2023-07-31 23:24:00  29179.450000    29182.6   
7434 2023-07-31 23:24:00 2023-07-31 23:30:00  29169.650000    29179.4   
7435 2023-07-31 23:30:00 2023-07-31 23:36:00  29170.050000    29178.8   
7436 2023-07-31 23:36:00 2023-07-31 23:42:00  29182.700000    29186.5   

      min_price  price_range  norm_vol   norm_fd  
0       30392.3         34.9  1.000000  0.036554  
1       30414.9      

# Sample & New Features
Include Volume & Garman-Klass volatility

In [13]:
def compute_features(df_name, df_input, work_lib):
    start_ts = df_input['Timestamp'].min()
    end_ts = df_input['Timestamp'].max()

    intervals = pd.date_range(start=start_ts, end=end_ts, freq='6min')
    
    raw_fd_list = []
    feature_rows = []
    global_max_gk = 0  # for GK normalization
    
    for i in range(len(intervals) - 1):
        # slice df into window
        t0, t1 = intervals[i], intervals[i+1]
        df_window = df_input[(df_input['Timestamp'] >= t0) & (df_input['Timestamp'] < t1)]
        
        print(f"{i}, Processing interval {t0} - {t1}")
        if df_window.empty:
            feature_rows.append([t0, t1] + [np.nan] * 7)
            raw_fd_list.append(np.nan)
            continue
                
        avg_price = df_window['Avg_price'].values
        
        # find the mean price in this interval
        mean_price = avg_price.mean()
        
        max_price = max(df_window['High'].values)
        min_price = min(df_window['Low'].values)
        
        price_range = (max_price - min_price)
        
        volume_mean = df_window['Volume'].mean()
        
        # Garman-Klass Volatility
        hl_term = 0.5 * (np.log(df_window['High'] / df_window['Low']) ** 2)
        co_term = (2 * np.log(2) - 1) * (np.log(df_window['Close'] / df_window['Open']) ** 2)
        gk_per_row = hl_term - co_term
        gk_vol = np.sqrt(gk_per_row.mean())
        
        global_max_gk = max(global_max_gk, gk_vol)
        norm_gk_vol = gk_vol / global_max_gk if global_max_gk else 0
        
        fd = compute_keltner_fd(df_window) # raw fd, haven't normalized
        raw_fd_list.append(fd)
        
        feature_rows.append([t0, t1, mean_price, max_price, min_price, price_range, volume_mean, norm_gk_vol, None])
    
    # Normalize FD
    max_fd = max([fd for fd in raw_fd_list if not np.isnan(fd)], default=1)
    norm_fd_list = [fd / max_fd if not np.isnan(fd) else np.nan for fd in raw_fd_list]
    
    # Insert normalized FD into feature rows
    for i in range(len(feature_rows)):
        feature_rows[i][8] = norm_fd_list[i]
    
    
    columns = ['start_time', 'end_time', 'mean_price', 'max_price', 'min_price', 'price_range', 'volume_mean', 'norm_gk_vol', 'norm_fd']
    features = pd.DataFrame(feature_rows, columns=columns)
    ac[work_lib].write(df_name, data=features)
    print(f"{df_name} _> {features.shape}")
    return features
        

In [14]:
try:
    compute_features('BTC_engineered_sample_2', df, 'engineered_data') # volume + gk volatility
    print('Feature Calculation Completed!')
except Exception as e:
    print(f"Error processing: {e}")

0, Processing interval 2023-07-01 00:00:00 - 2023-07-01 00:06:00
1, Processing interval 2023-07-01 00:06:00 - 2023-07-01 00:12:00
2, Processing interval 2023-07-01 00:12:00 - 2023-07-01 00:18:00
3, Processing interval 2023-07-01 00:18:00 - 2023-07-01 00:24:00
4, Processing interval 2023-07-01 00:24:00 - 2023-07-01 00:30:00
5, Processing interval 2023-07-01 00:30:00 - 2023-07-01 00:36:00
6, Processing interval 2023-07-01 00:36:00 - 2023-07-01 00:42:00
7, Processing interval 2023-07-01 00:42:00 - 2023-07-01 00:48:00
8, Processing interval 2023-07-01 00:48:00 - 2023-07-01 00:54:00
9, Processing interval 2023-07-01 00:54:00 - 2023-07-01 01:00:00
10, Processing interval 2023-07-01 01:00:00 - 2023-07-01 01:06:00
11, Processing interval 2023-07-01 01:06:00 - 2023-07-01 01:12:00
12, Processing interval 2023-07-01 01:12:00 - 2023-07-01 01:18:00
13, Processing interval 2023-07-01 01:18:00 - 2023-07-01 01:24:00
14, Processing interval 2023-07-01 01:24:00 - 2023-07-01 01:30:00
15, Processing inter

In [15]:
df_features = engineered_lib.read('BTC_engineered_sample_2').data
df_features

Unnamed: 0,start_time,end_time,mean_price,max_price,min_price,price_range,volume_mean,norm_gk_vol,norm_fd
0,2023-07-01 00:00:00,2023-07-01 00:06:00,30412.208333,30427.2,30392.3,34.9,0.000037,1.000000,0.036554
1,2023-07-01 00:06:00,2023-07-01 00:12:00,30432.991667,30449.8,30414.9,34.9,0.000030,0.691906,0.043081
2,2023-07-01 00:12:00,2023-07-01 00:18:00,30403.083333,30450.3,30361.4,88.9,0.000063,1.000000,0.152742
3,2023-07-01 00:18:00,2023-07-01 00:24:00,30411.458333,30428.8,30393.2,35.6,0.000032,0.567270,0.050914
4,2023-07-01 00:24:00,2023-07-01 00:30:00,30435.150000,30467.4,30410.1,57.3,0.000028,0.416537,0.060487
...,...,...,...,...,...,...,...,...,...
7432,2023-07-31 23:12:00,2023-07-31 23:18:00,29167.120000,29176.4,29156.8,19.6,0.000035,0.020867,0.026980
7433,2023-07-31 23:18:00,2023-07-31 23:24:00,29179.450000,29182.6,29176.3,6.3,0.000052,0.041372,
7434,2023-07-31 23:24:00,2023-07-31 23:30:00,29169.650000,29179.4,29163.2,16.2,0.000002,0.030386,
7435,2023-07-31 23:30:00,2023-07-31 23:36:00,29170.050000,29178.8,29163.2,15.6,0.000050,0.014520,0.018712
