In [1]:
# pip install pyarrow
# pip install modin[all]
# pip install distributed
# pip install dask

In [2]:
# 데이터 크기가 커서 저장이 어려움. 따라서 dask dataframe으로 저장.
# pip install "dask[dataframe]" --upgrade


In [3]:
import os
#USE ONLY ONE OF THESE:
# os.environ["MODIN_ENGINE"] = "ray"  # Modin will use Ray
os.environ["MODIN_ENGINE"] = "dask"  # Modin will use Dask

new_dir = 'C:/Users/user/2023-2_TradeMachine/실전/hausdorff_distance_method'
os.chdir(new_dir)

# Import packages
import numpy as np
import csv
import dask.dataframe as dd # This is a main package to process a large csv file.
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd 

df3 = pd.read_csv('C:/Users/user/2023-2_TradeMachine/실전/hausdorff_distance_method/BTC_sum_both_10m.csv')


## 1번 코드 

In [12]:
import pandas as pd
from scipy.spatial.distance import directed_hausdorff

# 하우스도르프 거리 계산을 위해서는 최소한 두 개의 다차원 포인트가 필요하다.
# 여기서는 데이터프레임의 처음 두 열을 사용하여 모의 다차원 포인트 두 개를 생성한다.

# 두 개의 점 집합을 생성 (반으로 나눔)
half = len(df3) // 2 
point1 = df3.loc[:half, ['realized_volatility', 'num_trades']].values
point2 = df3.loc[half:, ['realized_volatility', 'num_trades']].values

# 하우스도르프 거리를 계산 
# 두 점 집합 사이의 최대 하우스도르프 거리를 구함 
hausdorff_distance = max(directed_hausdorff(point1, point2)[0], directed_hausdorff(point2, point1)[0])
# directed_hausdorff(point1, point2)[0] ->  point1 집합의 각 점에서 point2 집합의 점까지의 하우스도르프 거리 중 가장 큰 값을 반환
# 
hausdorff_distance 

251.00000000012872

1번 코드는 직접적인 하우스도르프 거리를 계산한다. 
하우스도르프 거리는 두 점 집합 간의 거리를 측정하는 방법 중 하나다. 
한 집합의 모든 점에서 다른 집합의 점까지의 최대 최소 거리를 의미한다. 

이 방법은 두 집합 간의 가장 먼 점을 기반으로 거리를 측정하므로, 극단적인 값 또는 이상치에 민감하다는 단점이 있다.

## 2번 코드 ('realized_volatility'와 'num_trades')

 'realized_volatility'와 'num_trades' 간의 평균화된 하우스도르프 거리를 계산하고, 
 이 거리는 두 변수 간의 유사성을 측정하는 방법 중 하나임.


변수의 분포가 얼마나 다른지를 측정하는 것이 중요한 경우, 하우스도르프 거리는 유용한 지표가 될 수 있다. 
하우스도르프 거리가 크면 두 분포가 서로 다르다는 것을 의미하고, 하우스도르프 거리가 작으면 두 분포가 유사하다는 것을 의미함.



In [13]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import directed_hausdorff
from sklearn.metrics.pairwise import pairwise_distances

# 평균화된 하우스도르프 거리 함수 정의
# 이 함수는 두 점 집합 간의 '평균 최단 거리'를 계산한다.
def averaged_hausdorff_distance(set1, set2, max_ahd=np.inf):
    # 두 집합 중 하나라도 비어있으면 max_ahd 반환
    # max_ahd는 "최대 가능한 평균화된 하우스도르프 거리"를 의미하는 파라미터"다
    # 두 집합 중 하나가 비어 있을 때 거리를 무한대로 설정하여 두 집합이 "최대한 멀리 떨어져 있다"는 것을 의미
    if len(set1) == 0 or len(set2) == 0:
        return max_ahd
     # 집합을 numpy 배열로 변환
    set1 = np.array(set1)
    set2 = np.array(set2)
    # 각 집합이 2차원인지 확인 (각 점이 N차원이어야 함)
    assert set1.ndim == 2, 'got %s' % set1.ndim
    assert set2.ndim == 2, 'got %s' % set2.ndim
    # 유클리드 거리는 두 점 사이의 직선 거리를 계산하는 가장 일반적인 방법임.
    # 두 집합 간의 모든 점 쌍에 대한 유클리드 거리를 계산 + 두 집합의 점들이 같은 차원을 가져야 함 
    # => pairwise_distances 함수는 첫 번째 집합의 각 점과 두 번째 집합의 각 점 사이의 거리를 계산하고, 이를 2차원 행렬로 반환
    assert set1.shape[1] == set2.shape[1], 'The points in both sets must have the same number of dimensions, got %s and %s.' % (set2.shape[1], set2.shape[1])
    d2_matrix = pairwise_distances(set1, set2, metric='euclidean')
    # 각 점에서 다른 집합의 모든 점에 대한 거리 중 최소값을 찾고, 최소값들의 평균을 계산하여 평균화된 하우스도르프 거리를 얻는다. 
    # 이 방법은 하우스도르프 거리의 한계점인 이상치에 대한 민감도를 줄여주는 효과가 있다. 
    res = np.average(np.min(d2_matrix, axis=0)) + np.average(np.min(d2_matrix, axis=1))
    return res

# 데이터를 정규화
# 이렇게 하면 모든 값이 0과 1 사이로 스케일링되며, 더 나은 비교를 할 수 있다.
df3['realized_volatility'] = df3['realized_volatility'] / df3['realized_volatility'].max()
df3['num_trades'] = df3['num_trades'] / df3['num_trades'].max()

# 각 DataFrame에서 무작위로 하위 집합을 샘플링
# 이는 전체 데이터셋이 너무 큰 경우에 유용하며, 계산 비용을 줄일 수 있음
subset_size = min(1000, len(df3))  # 이 값 조정할 필요 있음
subset1 = df3.sample(subset_size, random_state=1)[['realized_volatility', 'num_trades']].values
subset2 = df3.sample(subset_size, random_state=2)[['realized_volatility', 'num_trades']].values

# Compute the average Hausdorff distance
averaged_distance = averaged_hausdorff_distance(subset1, subset2)
averaged_distance_col = list()
averaged_distance_result = list()
averaged_distance_col.append('num_trades')
averaged_distance_result.append(averaged_distance)

print(averaged_distance)
print(averaged_distance_col)
print(averaged_distance_result)


0.01055017679134983
['num_trades']
[0.01055017679134983]


2번 코드는 평균화된 하우스도르프 거리를 계산한다. 이는 한 집합의 모든 점에서 다른 집합의 점까지의 거리 중 최소 거리를 평균한 값을 사용합니다. 

평균화된 하우스도르프 거리는 이상치 또는 극단적인 값을 포함하는 데이터 집합에 대한 강건성을 높인다. 하우스도르프 거리가 두 데이터 집합 사이의 가장 멀리 떨어진 점들을 기반으로 거리를 측정하는 반면, 평균화된 하우스도르프 거리는 모든 점 간의 거리를 고려하여 더욱 균형 잡힌 거리 측정치를 제공한다. 

이런 특성으로 인해 평균화된 하우스도르프 거리는 특이값이나 이상치의 영향을 크게 받지 않으며, 데이터 분포의 전반적인 특성을 반영하는데 효과적이다.

총정리!

A라는 집합의 특정 점이 B 집합이 가지는 모든 점과의 거리 중 최소 거리를 구하고, A집합의 점들이 갖는 그 최소 거리들 중에 최대 거리를 찾는 것이 hausdorff distance를 구하는 것이다.

hausdorff distance가 클수록 
두 집합이 서로 다른 공간에 위치하고 있으므로,
두 집합 사이의 거리가 멀다는 것을 의미하고, 
두 집합이 서로 많이 다르다는 것을 나타낸다. 


따라서 하우스도르프 거리를 사용하면 두 데이터 집합이 얼마나 비슷한지 혹은 얼마나 다른지를 측정할 수 있다. 


## 2-1 번 코드 ('realized_volatility'와 변수에 대해서)

In [14]:
df3.columns

Index(['window_start', 'window_end', 'realized_volatility', 'num_trades',
       'lowest_return', 'highest_return', 'high_low_gap', 'trade_vol',
       'volume_power', 'time_id', 'dv1_realized_volatility',
       'dv2_lowest_return', 'liq_last_1', 'liq_last_2', 'liq_last_5',
       'liq_last_10', 'liq_last_15', 'bidask_spread_0', 'bidask_spread_1'],
      dtype='object')

In [15]:
numeric_cols = df3.select_dtypes(include=['float64', 'int64']).columns.tolist()
numeric_cols = numeric_cols[1:] # numeric_cols[0] = 'realized_volatility'
print(numeric_cols)

['num_trades', 'lowest_return', 'highest_return', 'high_low_gap', 'trade_vol', 'volume_power', 'dv1_realized_volatility', 'dv2_lowest_return', 'liq_last_1', 'liq_last_2', 'liq_last_5', 'liq_last_10', 'liq_last_15', 'bidask_spread_0', 'bidask_spread_1']


In [16]:
for another_var in numeric_cols:
    # 결측값 제거
    df3 = df3.dropna(subset=['realized_volatility', another_var])

    # 무한대 값을 최대값으로 대체
    df3.replace([np.inf, -np.inf], np.nan, inplace=True)
    df3.fillna(df3.max(), inplace=True)

    # 데이터 정규화
    df3['realized_volatility'] = df3['realized_volatility'] / df3['realized_volatility'].max()
    df3[another_var] = df3[another_var] / df3[another_var].max()

    # 각 DataFrame에서 무작위로 하위 집합을 샘플링
    subset_size = min(1000, len(df3))  # 이 값 조정할 필요 있음
    subset1 = df3.sample(subset_size, random_state=1)[['realized_volatility', another_var]].values
    subset2 = df3.sample(subset_size, random_state=2)[['realized_volatility', another_var]].values

    # Compute the average Hausdorff distance
    averaged_distance = averaged_hausdorff_distance(subset1, subset2)
    averaged_distance_col.append(another_var)
    averaged_distance_result.append(averaged_distance)



In [17]:
print(averaged_distance_col)
print(averaged_distance_result)

['num_trades', 'num_trades', 'lowest_return', 'highest_return', 'high_low_gap', 'trade_vol', 'volume_power', 'dv1_realized_volatility', 'dv2_lowest_return', 'liq_last_1', 'liq_last_2', 'liq_last_5', 'liq_last_10', 'liq_last_15', 'bidask_spread_0', 'bidask_spread_1']
[0.01055017679134983, 0.01055017679134983, 0.05049734373986403, 0.009437152867570735, 0.009517282161400242, 0.008529258965739452, 0.004002339383157829, 0.015212897638206914, 0.05857175675620198, 0.003170885419790807, 0.003800522303705987, 0.0039915623648147745, 0.004140878778354513, 0.004240540286990721, 0.014387991230889664, 0.014490871359933169]


In [18]:
df_HDD_relative_volatility = pd.DataFrame(averaged_distance_result, index=averaged_distance_col, columns=['Hausdorff Distance regarding "realized_volatility"'])
df_HDD_relative_volatility = df_HDD_relative_volatility.sort_values(by='Hausdorff Distance regarding "realized_volatility"')
df_HDD_relative_volatility

Unnamed: 0,"Hausdorff Distance regarding ""realized_volatility"""
liq_last_1,0.003171
liq_last_2,0.003801
liq_last_5,0.003992
volume_power,0.004002
liq_last_10,0.004141
liq_last_15,0.004241
trade_vol,0.008529
highest_return,0.009437
high_low_gap,0.009517
num_trades,0.01055


## 2번 정리 

In [None]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import directed_hausdorff
from sklearn.metrics.pairwise import pairwise_distances

def averaged_hausdorff_distance(set1, set2, max_ahd=np.inf):
    if len(set1) == 0 or len(set2) == 0:
        return max_ahd
    set1 = np.array(set1)
    set2 = np.array(set2)
    assert set1.ndim == 2, 'got %s' % set1.ndim
    assert set2.ndim == 2, 'got %s' % set2.ndim
    assert set1.shape[1] == set2.shape[1], 'The points in both sets must have the same number of dimensions, got %s and %s.' % (set2.shape[1], set2.shape[1])
    d2_matrix = pairwise_distances(set1, set2, metric='euclidean')
    res = np.average(np.min(d2_matrix, axis=0)) + np.average(np.min(d2_matrix, axis=1))
    return res

df3['realized_volatility'] = df3['realized_volatility'] / df3['realized_volatility'].max()
df3['num_trades'] = df3['num_trades'] / df3['num_trades'].max()

subset_size = min(1000, len(df3))  # 이 값 조정할 필요 있음
subset1 = df3.sample(subset_size, random_state=1)[['realized_volatility', 'num_trades']].values
subset2 = df3.sample(subset_size, random_state=2)[['realized_volatility', 'num_trades']].values

averaged_distance = averaged_hausdorff_distance(subset1, subset2)
averaged_distance_col = list()
averaged_distance_result = list()
averaged_distance_col.append('num_trades')
averaged_distance_result.append(averaged_distance)

for another_var in numeric_cols:
    # 결측값 제거
    df3 = df3.dropna(subset=['realized_volatility', another_var])

    # 무한대 값을 최대값으로 대체
    df3.replace([np.inf, -np.inf], np.nan, inplace=True)
    df3.fillna(df3.max(), inplace=True)

    # 데이터 정규화
    df3['realized_volatility'] = df3['realized_volatility'] / df3['realized_volatility'].max()
    df3[another_var] = df3[another_var] / df3[another_var].max()

    # 각 DataFrame에서 무작위로 하위 집합을 샘플링
    subset_size = min(1000, len(df3))  # 이 값 조정할 필요 있음
    subset1 = df3.sample(subset_size, random_state=1)[['realized_volatility', another_var]].values
    subset2 = df3.sample(subset_size, random_state=2)[['realized_volatility', another_var]].values

    # Compute the average Hausdorff distance
    averaged_distance = averaged_hausdorff_distance(subset1, subset2)
    averaged_distance_col.append(another_var)
    averaged_distance_result.append(averaged_distance)



## 3번 코드 (변수간의 유사성 말고, time_id간의 유사성 구하기)

In [26]:
print(min(df3.time_id))
print(max(df3.time_id))

2022-12-16 21:06:00
2023-02-26 04:07:00


In [27]:
sorted(df3.time_id.unique())[0:10] 
# time_id가 1분마다 끊어져 있는 걸 볼 수 있음.

['2022-12-16 21:06:00',
 '2022-12-16 21:07:00',
 '2022-12-16 21:08:00',
 '2022-12-16 21:09:00',
 '2022-12-16 21:10:00',
 '2022-12-16 21:11:00',
 '2022-12-16 21:12:00',
 '2022-12-16 21:13:00',
 '2022-12-16 21:14:00',
 '2022-12-16 21:15:00']

In [28]:
df = df3[0:5000]
# 10만개는 많아서, 5천개만 뽑아서 let me trial first
print(df.time_id.unique())
print(df.time_id.nunique())


['2022-12-16 21:06:00' '2022-12-16 21:07:00' '2022-12-16 21:08:00' ...
 '2022-12-20 08:28:00' '2022-12-20 08:29:00' '2022-12-20 08:30:00']
5000


In [29]:
df.columns

Index(['window_start', 'window_end', 'realized_volatility', 'num_trades',
       'lowest_return', 'highest_return', 'high_low_gap', 'trade_vol',
       'volume_power', 'time_id', 'dv1_realized_volatility',
       'dv2_lowest_return', 'liq_last_1', 'liq_last_2', 'liq_last_5',
       'liq_last_10', 'liq_last_15', 'bidask_spread_0', 'bidask_spread_1'],
      dtype='object')

In [32]:
# 특정 time_id 선택
target_time_id = 1000  # 예시로 선택한 time_id

# 'time_id' 컬럼을 정수로 변환
df3['time_id'] = df3['time_id'].astype(int)
df3['time_id'] = pd.to_datetime(df3['time_id'])

# 5분 전과 5분 후의 time_id 범위 설정
before_time_id = target_time_id - 5
after_time_id = target_time_id + 5

# 해당 범위의 time_id에 해당하는 데이터만 선택
subset_df = df3[(df3['time_id'] >= before_time_id) & (df3['time_id'] <= after_time_id)]

# 결과를 저장할 리스트
result = []

# target_time_id와 다른 time_id 간의 Hausdorff 거리 계산
for time_id in subset_df['time_id'].unique():
    if time_id != target_time_id:
        subset1 = subset_df[subset_df['time_id'] == target_time_id].drop(columns=['time_id']).values
        subset2 = subset_df[subset_df['time_id'] == time_id].drop(columns=['time_id']).values
        averaged_distance = averaged_hausdorff_distance(subset1, subset2)
        result.append((time_id, averaged_distance))

# 결과 정렬 (유사성이 높은 것부터)
result.sort(key=lambda x: x[1])

# 유사성이 가장 높은 time_id 3개 출력
similar_time_ids = [x[0] for x in result[:3]]
print(similar_time_ids)


ValueError: invalid literal for int() with base 10: '2022-12-16 21:06:00'

## 4번 코드 (BTC raw 데이터로 다시)

In [34]:
df_00 = pd.read_csv("C:/Users/user/2023-2_TradeMachine/실전/Data/BTC/BTC/0000.csv")

In [35]:
df_00.columns

Index(['Unnamed: 0', 'type_websocket', 'datetime', 'code', 'opening_price',
       'high_price', 'low_price', 'trade_price', 'prev_closing_price',
       'change', 'change_price', 'signed_change_price', 'change_rate',
       'signed_change_rate', 'trade_volume', 'acc_trade_volume',
       'acc_trade_volume_24h', 'acc_trade_price', 'acc_trade_price_24h',
       'trade_date', 'trade_time', 'trade_timestamp', 'ask_bid',
       'acc_ask_volume', 'acc_bid_volume', 'highest_52_week_price',
       'highest_52_week_date', 'lowest_52_week_price', 'lowest_52_week_date',
       'market_state', 'is_trading_suspended', 'delisting_date',
       'sequential_id', 'total_ask_size', 'total_bid_size', 'orderbook_ap_0',
       'orderbook_as_0', 'orderbook_bp_0', 'orderbook_bs_0', 'orderbook_ap_1',
       'orderbook_as_1', 'orderbook_bp_1', 'orderbook_bs_1', 'orderbook_ap_2',
       'orderbook_as_2', 'orderbook_bp_2', 'orderbook_bs_2', 'orderbook_ap_3',
       'orderbook_as_3', 'orderbook_bp_3', 'orderbook

In [37]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
df_00.head()

Unnamed: 0.1,Unnamed: 0,type_websocket,datetime,code,opening_price,high_price,low_price,trade_price,prev_closing_price,change,change_price,signed_change_price,change_rate,signed_change_rate,trade_volume,acc_trade_volume,acc_trade_volume_24h,acc_trade_price,acc_trade_price_24h,trade_date,trade_time,trade_timestamp,ask_bid,acc_ask_volume,acc_bid_volume,highest_52_week_price,highest_52_week_date,lowest_52_week_price,lowest_52_week_date,market_state,is_trading_suspended,delisting_date,market_warning,timestamp,stream_type,sys_datetime,sequential_id,total_ask_size,total_bid_size,orderbook_ap_0,orderbook_as_0,orderbook_bp_0,orderbook_bs_0,orderbook_ap_1,orderbook_as_1,orderbook_bp_1,orderbook_bs_1,orderbook_ap_2,orderbook_as_2,orderbook_bp_2,orderbook_bs_2,orderbook_ap_3,orderbook_as_3,orderbook_bp_3,orderbook_bs_3,orderbook_ap_4,orderbook_as_4,orderbook_bp_4,orderbook_bs_4,orderbook_ap_5,orderbook_as_5,orderbook_bp_5,orderbook_bs_5,orderbook_ap_6,orderbook_as_6,orderbook_bp_6,orderbook_bs_6,orderbook_ap_7,orderbook_as_7,orderbook_bp_7,orderbook_bs_7,orderbook_ap_8,orderbook_as_8,orderbook_bp_8,orderbook_bs_8,orderbook_ap_9,orderbook_as_9,orderbook_bp_9,orderbook_bs_9,orderbook_ap_10,orderbook_as_10,orderbook_bp_10,orderbook_bs_10,orderbook_ap_11,orderbook_as_11,orderbook_bp_11,orderbook_bs_11,orderbook_ap_12,orderbook_as_12,orderbook_bp_12,orderbook_bs_12,orderbook_ap_13,orderbook_as_13,orderbook_bp_13,orderbook_bs_13,orderbook_ap_14,orderbook_as_14,orderbook_bp_14,orderbook_bs_14
0,13630,orderbook,2022-12-16 21:05:36.000,KRW-BTC,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1671192336839,,2022-12-16 21:05:36.962,,12.572064,43.355116,22570000.0,2.299561,22568000.0,0.437095,22571000.0,1.359031,22558000.0,0.000441,22578000.0,0.444919,22557000.0,0.223807,22580000.0,0.748174,22556000.0,0.918411,22581000.0,0.377,22555000.0,3.401517,22585000.0,0.289745,22554000.0,0.039866,22586000.0,2.27448,22552000.0,0.004878,22587000.0,1.239113,22551000.0,0.698997,22591000.0,0.560116,22550000.0,36.872502,22592000.0,1.279305,22549000.0,0.004347,22593000.0,0.580447,22548000.0,0.031437,22594000.0,0.138398,22547000.0,0.144656,22595000.0,0.3,22545000.0,0.392714,22596000.0,0.041776,22544000.0,0.143039,22598000.0,0.64,22543000.0,0.041408
1,28121,trade,2022-12-16 21:05:37.000,KRW-BTC,,,,22568000.0,22942000.0,FALL,374000.0,,,,0.003899,,,,,,12:05:37,1671192000000.0,ASK,,,,,,,,,,,1671192337188,REALTIME,2022-12-16 21:05:37.278,1671192000000000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,28122,ticker,2022-12-16 21:05:37.000,KRW-BTC,22952000.0,23050000.0,22500000.0,22568000.0,22942000.0,FALL,374000.0,-374000.0,0.016302,-0.016302,0.003899,2409.572175,3704.241755,54810870000.0,84658410000.0,20221216.0,120537,1671192000000.0,ASK,1469.562531,940.009644,62850000.0,2021-12-24,21509000.0,2022-11-14,ACTIVE,0.0,,NONE,1671192337188,REALTIME,2022-12-16 21:05:37.281,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,28123,orderbook,2022-12-16 21:05:37.000,KRW-BTC,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1671192337200,,2022-12-16 21:05:37.354,,12.572064,43.351216,22570000.0,2.299561,22568000.0,0.433195,22571000.0,1.359031,22558000.0,0.000441,22578000.0,0.444919,22557000.0,0.223807,22580000.0,0.748174,22556000.0,0.918411,22581000.0,0.377,22555000.0,3.401517,22585000.0,0.289745,22554000.0,0.039866,22586000.0,2.27448,22552000.0,0.004878,22587000.0,1.239113,22551000.0,0.698997,22591000.0,0.560116,22550000.0,36.872502,22592000.0,1.279305,22549000.0,0.004347,22593000.0,0.580447,22548000.0,0.031437,22594000.0,0.138398,22547000.0,0.144656,22595000.0,0.3,22545000.0,0.392714,22596000.0,0.041776,22544000.0,0.143039,22598000.0,0.64,22543000.0,0.041408
4,28125,orderbook,2022-12-16 21:05:37.000,KRW-BTC,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1671192337379,,2022-12-16 21:05:37.506,,12.572064,43.352103,22570000.0,2.299561,22568000.0,0.433195,22571000.0,1.359031,22558000.0,0.000441,22578000.0,0.444919,22557000.0,0.223807,22580000.0,0.748174,22556000.0,0.919297,22581000.0,0.377,22555000.0,3.401517,22585000.0,0.289745,22554000.0,0.039866,22586000.0,2.27448,22552000.0,0.004878,22587000.0,1.239113,22551000.0,0.698997,22591000.0,0.560116,22550000.0,36.872502,22592000.0,1.279305,22549000.0,0.004347,22593000.0,0.580447,22548000.0,0.031437,22594000.0,0.138398,22547000.0,0.144656,22595000.0,0.3,22545000.0,0.392714,22596000.0,0.041776,22544000.0,0.143039,22598000.0,0.64,22543000.0,0.041408


In [59]:
pd.DataFrame(df_00.isnull().sum()).head()

Unnamed: 0,0
Unnamed: 0,0
type_websocket,0
datetime,0
code,0
opening_price,5162


### 1분 혹은 10초 간격으로 만들기

In [64]:
import pandas as pd

df_00 = pd.read_csv("C:/Users/user/2023-2_TradeMachine/실전/Data/BTC/BTC/0000.csv")


# 'datetime' 컬럼이 있는지 확인
if 'datetime' in df_00.columns:
    # 'datetime' 컬럼을 datetime 형식으로 변환
    df_00['datetime'] = pd.to_datetime(df_00['datetime'])
    # 1분 간격으로 새 변수 생성
    df_00['datetime_1min'] = df_00['datetime'].dt.round('1min')

else:
    print("'datetime' 컬럼이 데이터프레임에 없습니다.")


In [79]:
'''
print(min(df_00['datetime_1min']))
print(max(df_00['datetime_1min']))
print(-------------------------)
print(df_00['datetime_1min'].nunique())
print(sorted(df_00['datetime_1min'].unique()))
print(df_00.shape) # 22490 행 중에 113개의 1분 마다의 데이터가 있네
'''

"\nprint(min(df_00['datetime_1min']))\nprint(max(df_00['datetime_1min']))\nprint(-------------------------)\nprint(df_00['datetime_1min'].nunique())\nprint(sorted(df_00['datetime_1min'].unique()))\nprint(df_00.shape) # 22490 행 중에 113개의 1분 마다의 데이터가 있네\n"

In [53]:
print(sorted(df_00['datetime_1min'].unique()))


[Timestamp('2022-12-16 21:06:00'), Timestamp('2022-12-16 21:07:00'), Timestamp('2022-12-16 21:08:00'), Timestamp('2022-12-16 21:09:00'), Timestamp('2022-12-16 21:10:00'), Timestamp('2022-12-16 21:11:00'), Timestamp('2022-12-16 21:12:00'), Timestamp('2022-12-16 21:13:00'), Timestamp('2022-12-16 21:14:00'), Timestamp('2022-12-16 21:15:00'), Timestamp('2022-12-16 21:16:00'), Timestamp('2022-12-16 21:17:00'), Timestamp('2022-12-16 21:18:00'), Timestamp('2022-12-16 21:19:00'), Timestamp('2022-12-16 21:20:00'), Timestamp('2022-12-16 21:21:00'), Timestamp('2022-12-16 21:22:00'), Timestamp('2022-12-16 21:23:00'), Timestamp('2022-12-16 21:24:00'), Timestamp('2022-12-16 21:25:00'), Timestamp('2022-12-16 21:26:00'), Timestamp('2022-12-16 21:27:00'), Timestamp('2022-12-16 21:28:00'), Timestamp('2022-12-16 21:29:00'), Timestamp('2022-12-16 21:30:00'), Timestamp('2022-12-16 21:31:00'), Timestamp('2022-12-16 21:32:00'), Timestamp('2022-12-16 21:33:00'), Timestamp('2022-12-16 21:34:00'), Timestamp('20

### datetime_1min 기준으로 hausdorff distannce와 유사성 높은 datetime_1min 값들 구하기 

datetime_1min은 datetime을 1분 간격으로 전처리한 변수인데

유사성이 높은, 즉, datetime_1min 간의 trade_price의 hausdorff distance가 작은 순서대로 datetime_1min 10개 추출

그런데 10개 미만으로 추출되는 경우가 있다. 이거는 trade_price의 null값이 있는 경우이거나, 특정 시간대에 데이터가 없는 경우다. 

In [70]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import directed_hausdorff
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import MinMaxScaler

# CSV 파일 불러오기
df_00 = pd.read_csv("C:/Users/user/2023-2_TradeMachine/실전/Data/BTC/BTC/0000.csv")

# 'datetime' 컬럼을 datetime 형식으로 변환하고 1분 간격으로 새 변수 생성
df_00['datetime'] = pd.to_datetime(df_00['datetime'])
df_00['datetime_1min'] = df_00['datetime'].dt.round('1min')

# 'trade_price' 컬럼 스케일링 (normalize)
df_00 = df_00[np.isfinite(df_00['trade_price'])]
scaler = MinMaxScaler()
df_00['trade_price'] = scaler.fit_transform(df_00[['trade_price']])

# 특정 datetime_1min 선택
target_datetime = pd.Timestamp('2022-12-16 21:06:00')  # 예시로 선택한 datetime

# 5분 전과 5분 후의 datetime 범위 설정
before_datetime = target_datetime - pd.Timedelta(minutes=5)
after_datetime = target_datetime + pd.Timedelta(minutes=5)

# 해당 범위의 datetime_1min에 해당하는 데이터만 선택
subset_df = df_00[(df_00['datetime_1min'] >= before_datetime) & (df_00['datetime_1min'] <= after_datetime)]

# 결과를 저장할 리스트
datetime_ids = []
hausdorff_distances = []

# target_datetime과 다른 datetime_1min 간의 Hausdorff 거리 계산
for datetime_1min in subset_df['datetime_1min'].unique():
    if datetime_1min != target_datetime:
        subset1 = subset_df[subset_df['datetime_1min'] == target_datetime][['trade_price']].values
        subset2 = subset_df[subset_df['datetime_1min'] == datetime_1min][['trade_price']].values
        averaged_distance = directed_hausdorff(subset1, subset2)[0]  # Hausdorff 거리 계산
        datetime_ids.append(datetime_1min)
        hausdorff_distances.append(averaged_distance)

# 결과 정렬 (유사성이 높은 것부터)
sorted_result = sorted(zip(datetime_ids, hausdorff_distances), key=lambda x: x[1])

# 유사성이 가장 높은 datetime_1min 10개와 해당 Hausdorff 거리 값 추출
similar_datetime_ids = [[x[0] for x in sorted_result[:10]], [x[1] for x in sorted_result[:10]]]

print(similar_datetime_ids)



[[Timestamp('2022-12-16 21:07:00'), Timestamp('2022-12-16 21:08:00'), Timestamp('2022-12-16 21:09:00'), Timestamp('2022-12-16 21:10:00'), Timestamp('2022-12-16 21:11:00')], [0.0, 0.0458715596330137, 0.0458715596330137, 0.05504587155962781, 0.05504587155962781]]


In [73]:
result_df = pd.DataFrame(columns=['datetime_1min', 'HD_datetime', 'HD_result'])

for target_datetime in df_00['datetime_1min'].unique():
    before_datetime = target_datetime - pd.Timedelta(minutes=5)
    after_datetime = target_datetime + pd.Timedelta(minutes=5)
    subset_df = df_00[(df_00['datetime_1min'] >= before_datetime) & (df_00['datetime_1min'] <= after_datetime)]
    
    datetime_ids = []
    hausdorff_distances = []
    
    for datetime_1min in subset_df['datetime_1min'].unique():
        if datetime_1min != target_datetime:
            subset1 = subset_df[subset_df['datetime_1min'] == target_datetime][['trade_price']].values
            subset2 = subset_df[subset_df['datetime_1min'] == datetime_1min][['trade_price']].values
            averaged_distance = directed_hausdorff(subset1, subset2)[0]
            datetime_ids.append(datetime_1min)
            hausdorff_distances.append(averaged_distance)
    
    sorted_result = sorted(zip(datetime_ids, hausdorff_distances), key=lambda x: x[1])
    similar_datetimes = [x[0] for x in sorted_result[:10]]
    similar_distances = [x[1] for x in sorted_result[:10]]

    result_df.loc[len(result_df)] = [target_datetime, similar_datetimes, np.round(similar_distances,3)]

df_00 = df_00.merge(result_df, on='datetime_1min', how='left')

df_00.head()


Unnamed: 0.1,Unnamed: 0,type_websocket,datetime,code,opening_price,high_price,low_price,trade_price,prev_closing_price,change,change_price,signed_change_price,change_rate,signed_change_rate,trade_volume,acc_trade_volume,acc_trade_volume_24h,acc_trade_price,acc_trade_price_24h,trade_date,trade_time,trade_timestamp,ask_bid,acc_ask_volume,acc_bid_volume,highest_52_week_price,highest_52_week_date,lowest_52_week_price,lowest_52_week_date,market_state,is_trading_suspended,delisting_date,market_warning,timestamp,stream_type,sys_datetime,sequential_id,total_ask_size,total_bid_size,orderbook_ap_0,orderbook_as_0,orderbook_bp_0,orderbook_bs_0,orderbook_ap_1,orderbook_as_1,orderbook_bp_1,orderbook_bs_1,orderbook_ap_2,orderbook_as_2,orderbook_bp_2,orderbook_bs_2,orderbook_ap_3,orderbook_as_3,orderbook_bp_3,orderbook_bs_3,orderbook_ap_4,orderbook_as_4,orderbook_bp_4,orderbook_bs_4,orderbook_ap_5,orderbook_as_5,orderbook_bp_5,orderbook_bs_5,orderbook_ap_6,orderbook_as_6,orderbook_bp_6,orderbook_bs_6,orderbook_ap_7,orderbook_as_7,orderbook_bp_7,orderbook_bs_7,orderbook_ap_8,orderbook_as_8,orderbook_bp_8,orderbook_bs_8,orderbook_ap_9,orderbook_as_9,orderbook_bp_9,orderbook_bs_9,orderbook_ap_10,orderbook_as_10,orderbook_bp_10,orderbook_bs_10,orderbook_ap_11,orderbook_as_11,orderbook_bp_11,orderbook_bs_11,orderbook_ap_12,orderbook_as_12,orderbook_bp_12,orderbook_bs_12,orderbook_ap_13,orderbook_as_13,orderbook_bp_13,orderbook_bs_13,orderbook_ap_14,orderbook_as_14,orderbook_bp_14,orderbook_bs_14,datetime_1min,HD_datetime_x,HD_result_x,HD_datetime_y,HD_result_y,HD_datetime,HD_result
0,28121,trade,2022-12-16 21:05:37,KRW-BTC,,,,0.458716,22942000.0,FALL,374000.0,,,,0.003899,,,,,,12:05:37,1671192000000.0,ASK,,,,,,,,,,,1671192337188,REALTIME,2022-12-16 21:05:37.278,1671192000000000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2022-12-16 21:06:00,"[2022-12-16 21:07:00, 2022-12-16 21:08:00, 202...","[0.0, 0.0458715596330137, 0.0458715596330137, ...","[2022-12-16 21:07:00, 2022-12-16 21:08:00, 202...","[0.0, 0.05, 0.05, 0.06, 0.06]","[2022-12-16 21:07:00, 2022-12-16 21:08:00, 202...","[0.0, 0.046, 0.046, 0.055, 0.055]"
1,28122,ticker,2022-12-16 21:05:37,KRW-BTC,22952000.0,23050000.0,22500000.0,0.458716,22942000.0,FALL,374000.0,-374000.0,0.016302,-0.016302,0.003899,2409.572175,3704.241755,54810870000.0,84658410000.0,20221216.0,120537,1671192000000.0,ASK,1469.562531,940.009644,62850000.0,2021-12-24,21509000.0,2022-11-14,ACTIVE,0.0,,NONE,1671192337188,REALTIME,2022-12-16 21:05:37.281,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2022-12-16 21:06:00,"[2022-12-16 21:07:00, 2022-12-16 21:08:00, 202...","[0.0, 0.0458715596330137, 0.0458715596330137, ...","[2022-12-16 21:07:00, 2022-12-16 21:08:00, 202...","[0.0, 0.05, 0.05, 0.06, 0.06]","[2022-12-16 21:07:00, 2022-12-16 21:08:00, 202...","[0.0, 0.046, 0.046, 0.055, 0.055]"
2,13641,trade,2022-12-16 21:05:37,KRW-BTC,,,,0.458716,22942000.0,FALL,374000.0,,,,0.017509,,,,,,12:05:37,1671192000000.0,ASK,,,,,,,,,,,1671192337961,REALTIME,2022-12-16 21:05:38.051,1671192000000000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2022-12-16 21:06:00,"[2022-12-16 21:07:00, 2022-12-16 21:08:00, 202...","[0.0, 0.0458715596330137, 0.0458715596330137, ...","[2022-12-16 21:07:00, 2022-12-16 21:08:00, 202...","[0.0, 0.05, 0.05, 0.06, 0.06]","[2022-12-16 21:07:00, 2022-12-16 21:08:00, 202...","[0.0, 0.046, 0.046, 0.055, 0.055]"
3,13642,ticker,2022-12-16 21:05:37,KRW-BTC,22952000.0,23050000.0,22500000.0,0.458716,22942000.0,FALL,374000.0,-374000.0,0.016302,-0.016302,0.017509,2409.589684,3704.241755,54811260000.0,84658410000.0,20221216.0,120537,1671192000000.0,ASK,1469.58004,940.009644,62850000.0,2021-12-24,21509000.0,2022-11-14,ACTIVE,0.0,,NONE,1671192337961,REALTIME,2022-12-16 21:05:38.061,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2022-12-16 21:06:00,"[2022-12-16 21:07:00, 2022-12-16 21:08:00, 202...","[0.0, 0.0458715596330137, 0.0458715596330137, ...","[2022-12-16 21:07:00, 2022-12-16 21:08:00, 202...","[0.0, 0.05, 0.05, 0.06, 0.06]","[2022-12-16 21:07:00, 2022-12-16 21:08:00, 202...","[0.0, 0.046, 0.046, 0.055, 0.055]"
4,28174,trade,2022-12-16 21:05:41,KRW-BTC,,,,0.458716,22942000.0,FALL,374000.0,,,,0.062169,,,,,,12:05:41,1671192000000.0,ASK,,,,,,,,,,,1671192341773,REALTIME,2022-12-16 21:05:41.893,1671192000000000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2022-12-16 21:06:00,"[2022-12-16 21:07:00, 2022-12-16 21:08:00, 202...","[0.0, 0.0458715596330137, 0.0458715596330137, ...","[2022-12-16 21:07:00, 2022-12-16 21:08:00, 202...","[0.0, 0.05, 0.05, 0.06, 0.06]","[2022-12-16 21:07:00, 2022-12-16 21:08:00, 202...","[0.0, 0.046, 0.046, 0.055, 0.055]"


예에에에에에에 됐다 

## 4번 코드 정리 (+ 함수 설정) 

In [76]:
def preprocess_data(file_path):
    df = pd.read_csv(file_path)
    df['datetime'] = pd.to_datetime(df['datetime'])
    df['datetime_1min'] = df['datetime'].dt.round('1min')
    df = df[np.isfinite(df['trade_price'])]
    scaler = MinMaxScaler()
    df['trade_price'] = scaler.fit_transform(df[['trade_price']])
    return df

def calculate_hausdorff(target_datetime, df):
    before_datetime = target_datetime - pd.Timedelta(minutes=5)
    after_datetime = target_datetime + pd.Timedelta(minutes=5)
    subset_df = df[(df['datetime_1min'] >= before_datetime) & (df['datetime_1min'] <= after_datetime)]
    datetime_ids = []
    hausdorff_distances = []
    for datetime_1min in subset_df['datetime_1min'].unique():
        if datetime_1min != target_datetime:
            subset1 = subset_df[subset_df['datetime_1min'] == target_datetime][['trade_price']].values
            subset2 = subset_df[subset_df['datetime_1min'] == datetime_1min][['trade_price']].values
            distance = directed_hausdorff(subset1, subset2)[0]
            datetime_ids.append(datetime_1min)
            hausdorff_distances.append(distance)
    sorted_result = sorted(zip(datetime_ids, hausdorff_distances), key=lambda x: x[1])
    similar_datetimes = [x[0] for x in sorted_result[:10]] # 유사성 높은 datetime_1min을 10개가 아닌 다른 숫자로 넣고 싶으면, 10을 수정하세요
    similar_distances = np.round([x[1] for x in sorted_result[:10]], 3)
    return similar_datetimes, similar_distances

def apply_hausdorff(df):
    result_df = pd.DataFrame(columns=['datetime_1min', 'HD_datetime', 'HD_result'])
    for target_datetime in df['datetime_1min'].unique():
        similar_datetimes, similar_distances = calculate_hausdorff(target_datetime, df)
        result_df.loc[len(result_df)] = [target_datetime, similar_datetimes, similar_distances]
    df = df.merge(result_df, on='datetime_1min', how='left')
    return df

file_path = "C:/Users/user/2023-2_TradeMachine/실전/Data/BTC/BTC/0000.csv"
df_00 = preprocess_data(file_path)
df_00 = apply_hausdorff(df_00)
df_00.head()


Unnamed: 0.1,Unnamed: 0,type_websocket,datetime,code,opening_price,high_price,low_price,trade_price,prev_closing_price,change,change_price,signed_change_price,change_rate,signed_change_rate,trade_volume,acc_trade_volume,acc_trade_volume_24h,acc_trade_price,acc_trade_price_24h,trade_date,trade_time,trade_timestamp,ask_bid,acc_ask_volume,acc_bid_volume,highest_52_week_price,highest_52_week_date,lowest_52_week_price,lowest_52_week_date,market_state,is_trading_suspended,delisting_date,market_warning,timestamp,stream_type,sys_datetime,sequential_id,total_ask_size,total_bid_size,orderbook_ap_0,orderbook_as_0,orderbook_bp_0,orderbook_bs_0,orderbook_ap_1,orderbook_as_1,orderbook_bp_1,orderbook_bs_1,orderbook_ap_2,orderbook_as_2,orderbook_bp_2,orderbook_bs_2,orderbook_ap_3,orderbook_as_3,orderbook_bp_3,orderbook_bs_3,orderbook_ap_4,orderbook_as_4,orderbook_bp_4,orderbook_bs_4,orderbook_ap_5,orderbook_as_5,orderbook_bp_5,orderbook_bs_5,orderbook_ap_6,orderbook_as_6,orderbook_bp_6,orderbook_bs_6,orderbook_ap_7,orderbook_as_7,orderbook_bp_7,orderbook_bs_7,orderbook_ap_8,orderbook_as_8,orderbook_bp_8,orderbook_bs_8,orderbook_ap_9,orderbook_as_9,orderbook_bp_9,orderbook_bs_9,orderbook_ap_10,orderbook_as_10,orderbook_bp_10,orderbook_bs_10,orderbook_ap_11,orderbook_as_11,orderbook_bp_11,orderbook_bs_11,orderbook_ap_12,orderbook_as_12,orderbook_bp_12,orderbook_bs_12,orderbook_ap_13,orderbook_as_13,orderbook_bp_13,orderbook_bs_13,orderbook_ap_14,orderbook_as_14,orderbook_bp_14,orderbook_bs_14,datetime_1min,HD_datetime,HD_result
0,28121,trade,2022-12-16 21:05:37,KRW-BTC,,,,0.458716,22942000.0,FALL,374000.0,,,,0.003899,,,,,,12:05:37,1671192000000.0,ASK,,,,,,,,,,,1671192337188,REALTIME,2022-12-16 21:05:37.278,1671192000000000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2022-12-16 21:06:00,"[2022-12-16 21:07:00, 2022-12-16 21:08:00, 202...","[0.0, 0.046, 0.046, 0.055, 0.055]"
1,28122,ticker,2022-12-16 21:05:37,KRW-BTC,22952000.0,23050000.0,22500000.0,0.458716,22942000.0,FALL,374000.0,-374000.0,0.016302,-0.016302,0.003899,2409.572175,3704.241755,54810870000.0,84658410000.0,20221216.0,120537,1671192000000.0,ASK,1469.562531,940.009644,62850000.0,2021-12-24,21509000.0,2022-11-14,ACTIVE,0.0,,NONE,1671192337188,REALTIME,2022-12-16 21:05:37.281,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2022-12-16 21:06:00,"[2022-12-16 21:07:00, 2022-12-16 21:08:00, 202...","[0.0, 0.046, 0.046, 0.055, 0.055]"
2,13641,trade,2022-12-16 21:05:37,KRW-BTC,,,,0.458716,22942000.0,FALL,374000.0,,,,0.017509,,,,,,12:05:37,1671192000000.0,ASK,,,,,,,,,,,1671192337961,REALTIME,2022-12-16 21:05:38.051,1671192000000000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2022-12-16 21:06:00,"[2022-12-16 21:07:00, 2022-12-16 21:08:00, 202...","[0.0, 0.046, 0.046, 0.055, 0.055]"
3,13642,ticker,2022-12-16 21:05:37,KRW-BTC,22952000.0,23050000.0,22500000.0,0.458716,22942000.0,FALL,374000.0,-374000.0,0.016302,-0.016302,0.017509,2409.589684,3704.241755,54811260000.0,84658410000.0,20221216.0,120537,1671192000000.0,ASK,1469.58004,940.009644,62850000.0,2021-12-24,21509000.0,2022-11-14,ACTIVE,0.0,,NONE,1671192337961,REALTIME,2022-12-16 21:05:38.061,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2022-12-16 21:06:00,"[2022-12-16 21:07:00, 2022-12-16 21:08:00, 202...","[0.0, 0.046, 0.046, 0.055, 0.055]"
4,28174,trade,2022-12-16 21:05:41,KRW-BTC,,,,0.458716,22942000.0,FALL,374000.0,,,,0.062169,,,,,,12:05:41,1671192000000.0,ASK,,,,,,,,,,,1671192341773,REALTIME,2022-12-16 21:05:41.893,1671192000000000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2022-12-16 21:06:00,"[2022-12-16 21:07:00, 2022-12-16 21:08:00, 202...","[0.0, 0.046, 0.046, 0.055, 0.055]"


## 4번 코드 정리 (코드 설명 주석처리) -> 4번 코드 정리와 코드 똑같음 설명만 주석에 추가함 

In [77]:
##### 주석 있는 버전  (위 코드와 같음)


# 함수 1: 데이터 전처리
def preprocess_data(file_path):
    # 파일을 불러온다.
    df = pd.read_csv(file_path)
    # 'datetime' 컬럼을 datetime 형식으로 변환
    df['datetime'] = pd.to_datetime(df['datetime'])
    # 'datetime' 컬럼을 1분 간격으로 반올림하여 'datetime_1min' 변수를 생성
    df['datetime_1min'] = df['datetime'].dt.round('1min') # => 여기도 1분 말고 10초로 나중에 수정하려면 해도 됨. 10초는 '10s'임.
    # 'trade_price' 컬럼에서 무한대 값을 제거
    df = df[np.isfinite(df['trade_price'])]
    # MinMaxScaler를 사용하여 'trade_price' 컬럼을 정규화
    scaler = MinMaxScaler()
    df['trade_price'] = scaler.fit_transform(df[['trade_price']])
    
    return df




# 함수 2: 특정 datetime_1min에 대한 Hausdorff 거리 계산
def calculate_hausdorff(target_datetime, df):
    # 주어진 target_datetime 기준으로 5분 전과 5분 후의 datetime을 계산
    before_datetime = target_datetime - pd.Timedelta(minutes=5)
    after_datetime = target_datetime + pd.Timedelta(minutes=5)
    
    # 해당 범위 내의 datetime_1min을 선택
    subset_df = df[(df['datetime_1min'] >= before_datetime) & (df['datetime_1min'] <= after_datetime)]
    
    # 결과를 저장할 리스트를 초기화
    datetime_ids = []
    hausdorff_distances = []
    
    # target_datetime과 다른 datetime_1min 간의 Hausdorff 거리를 계산함
    for datetime_1min in subset_df['datetime_1min'].unique():
        if datetime_1min != target_datetime:
            subset1 = subset_df[subset_df['datetime_1min'] == target_datetime][['trade_price']].values
            subset2 = subset_df[subset_df['datetime_1min'] == datetime_1min][['trade_price']].values
            distance = directed_hausdorff(subset1, subset2)[0]  # Hausdorff 거리 계산
            datetime_ids.append(datetime_1min)
            hausdorff_distances.append(distance)
    
    # 결과를 유사성이 높은 순서대로 정렬
    sorted_result = sorted(zip(datetime_ids, hausdorff_distances), key=lambda x: x[1])
    
    # 유사성이 가장 높은 datetime_1min 10개와 해당 Hausdorff 거리 값 추출 (10 대신 다른 숫자로 수정 가능)
    similar_datetimes = [x[0] for x in sorted_result[:10]] # 여기랑 
    similar_distances = np.round([x[1] for x in sorted_result[:10]], 3) # 여기  => 10말고 80으로 수정하면 됨
    
    return similar_datetimes, similar_distances




# 함수 3: 전체 DataFrame에 Hausdorff 거리 적용
def apply_hausdorff(df):
    # 결과를 저장할 DataFrame을 초기화함 (datetime_1min, HD_datetime, HD_result 컬럼 포함)
    result_df = pd.DataFrame(columns=['datetime_1min', 'HD_datetime', 'HD_result'])
    
    # DataFrame의 모든 고유한 datetime_1min 값에 대해 Hausdorff 거리를 계산
    for target_datetime in df['datetime_1min'].unique():
        similar_datetimes, similar_distances = calculate_hausdorff(target_datetime, df)
        
        # 결과를 result_df에 추가
        result_df.loc[len(result_df)] = [target_datetime, similar_datetimes, similar_distances]
    
    # 원래 DataFrame df와 result_df를 'datetime_1min' 기준으로 병합
    df = df.merge(result_df, on='datetime_1min', how='left')
    
    return df




# 파일 경로
file_path = "C:/Users/user/2023-2_TradeMachine/실전/Data/BTC/BTC/0000.csv"
# 데이터 전처리
df_00 = preprocess_data(file_path)
# Hausdorff 거리 적용
df_00 = apply_hausdorff(df_00)
# 결과 출력
df_00.head()


Unnamed: 0.1,Unnamed: 0,type_websocket,datetime,code,opening_price,high_price,low_price,trade_price,prev_closing_price,change,change_price,signed_change_price,change_rate,signed_change_rate,trade_volume,acc_trade_volume,acc_trade_volume_24h,acc_trade_price,acc_trade_price_24h,trade_date,trade_time,trade_timestamp,ask_bid,acc_ask_volume,acc_bid_volume,highest_52_week_price,highest_52_week_date,lowest_52_week_price,lowest_52_week_date,market_state,is_trading_suspended,delisting_date,market_warning,timestamp,stream_type,sys_datetime,sequential_id,total_ask_size,total_bid_size,orderbook_ap_0,orderbook_as_0,orderbook_bp_0,orderbook_bs_0,orderbook_ap_1,orderbook_as_1,orderbook_bp_1,orderbook_bs_1,orderbook_ap_2,orderbook_as_2,orderbook_bp_2,orderbook_bs_2,orderbook_ap_3,orderbook_as_3,orderbook_bp_3,orderbook_bs_3,orderbook_ap_4,orderbook_as_4,orderbook_bp_4,orderbook_bs_4,orderbook_ap_5,orderbook_as_5,orderbook_bp_5,orderbook_bs_5,orderbook_ap_6,orderbook_as_6,orderbook_bp_6,orderbook_bs_6,orderbook_ap_7,orderbook_as_7,orderbook_bp_7,orderbook_bs_7,orderbook_ap_8,orderbook_as_8,orderbook_bp_8,orderbook_bs_8,orderbook_ap_9,orderbook_as_9,orderbook_bp_9,orderbook_bs_9,orderbook_ap_10,orderbook_as_10,orderbook_bp_10,orderbook_bs_10,orderbook_ap_11,orderbook_as_11,orderbook_bp_11,orderbook_bs_11,orderbook_ap_12,orderbook_as_12,orderbook_bp_12,orderbook_bs_12,orderbook_ap_13,orderbook_as_13,orderbook_bp_13,orderbook_bs_13,orderbook_ap_14,orderbook_as_14,orderbook_bp_14,orderbook_bs_14,datetime_1min,HD_datetime,HD_result
0,28121,trade,2022-12-16 21:05:37,KRW-BTC,,,,0.458716,22942000.0,FALL,374000.0,,,,0.003899,,,,,,12:05:37,1671192000000.0,ASK,,,,,,,,,,,1671192337188,REALTIME,2022-12-16 21:05:37.278,1671192000000000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2022-12-16 21:06:00,"[2022-12-16 21:07:00, 2022-12-16 21:08:00, 202...","[0.0, 0.046, 0.046, 0.055, 0.055]"
1,28122,ticker,2022-12-16 21:05:37,KRW-BTC,22952000.0,23050000.0,22500000.0,0.458716,22942000.0,FALL,374000.0,-374000.0,0.016302,-0.016302,0.003899,2409.572175,3704.241755,54810870000.0,84658410000.0,20221216.0,120537,1671192000000.0,ASK,1469.562531,940.009644,62850000.0,2021-12-24,21509000.0,2022-11-14,ACTIVE,0.0,,NONE,1671192337188,REALTIME,2022-12-16 21:05:37.281,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2022-12-16 21:06:00,"[2022-12-16 21:07:00, 2022-12-16 21:08:00, 202...","[0.0, 0.046, 0.046, 0.055, 0.055]"
2,13641,trade,2022-12-16 21:05:37,KRW-BTC,,,,0.458716,22942000.0,FALL,374000.0,,,,0.017509,,,,,,12:05:37,1671192000000.0,ASK,,,,,,,,,,,1671192337961,REALTIME,2022-12-16 21:05:38.051,1671192000000000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2022-12-16 21:06:00,"[2022-12-16 21:07:00, 2022-12-16 21:08:00, 202...","[0.0, 0.046, 0.046, 0.055, 0.055]"
3,13642,ticker,2022-12-16 21:05:37,KRW-BTC,22952000.0,23050000.0,22500000.0,0.458716,22942000.0,FALL,374000.0,-374000.0,0.016302,-0.016302,0.017509,2409.589684,3704.241755,54811260000.0,84658410000.0,20221216.0,120537,1671192000000.0,ASK,1469.58004,940.009644,62850000.0,2021-12-24,21509000.0,2022-11-14,ACTIVE,0.0,,NONE,1671192337961,REALTIME,2022-12-16 21:05:38.061,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2022-12-16 21:06:00,"[2022-12-16 21:07:00, 2022-12-16 21:08:00, 202...","[0.0, 0.046, 0.046, 0.055, 0.055]"
4,28174,trade,2022-12-16 21:05:41,KRW-BTC,,,,0.458716,22942000.0,FALL,374000.0,,,,0.062169,,,,,,12:05:41,1671192000000.0,ASK,,,,,,,,,,,1671192341773,REALTIME,2022-12-16 21:05:41.893,1671192000000000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2022-12-16 21:06:00,"[2022-12-16 21:07:00, 2022-12-16 21:08:00, 202...","[0.0, 0.046, 0.046, 0.055, 0.055]"
