### LIBRARY IMPORT

In [1]:
import numpy as np
import pandas as pd

from contextlib import contextmanager
import time  
from sklearn.neighbors import NearestNeighbors  
from sklearn.preprocessing import minmax_scale  
from typing import Dict, List, Optional, Tuple  
import seaborn as sns 
import gc
import traceback 

import matplotlib.pyplot as plt

from scipy.stats import kendalltau

import os
os.environ["MODIN_ENGINE"] = "dask"  # Modin will use Dask
working_directory = 'D:\\OneDrive - 한동대학교\\PROJECT\\트머프로젝트\\'  ## 서로 다른 환경에서는 이곳을 수정해야 함.
os.chdir(working_directory)



### CHECK NULL VALUE

In [2]:
# combined_result_df = pd.read_csv("./DB/professor_BTC_sum_both_10m.csv")
combined_result_df_raw = pd.read_csv("./output/BTC_sum_both_10m.csv")
print("# of rows of combined_Result_Df:", combined_result_df_raw.shape[0])

combined_result_df_raw['window_start'] = pd.to_datetime(combined_result_df_raw['window_start'])  # Convert to datetime

# Define the time range
start_time = pd.to_datetime('00:00:00').time()
end_time = pd.to_datetime('06:00:00').time()

# Filter and drop rows
filtered_df = combined_result_df_raw[~combined_result_df_raw['window_start'].apply(lambda x: start_time <= x.time() <= end_time)]
print("# of rows of filtered_df:", filtered_df.shape[0])

combined_result_df = filtered_df.dropna()
print("# of rows of filtered_df:", combined_result_df.shape[0])

# 평균이 아닌 이전 값으로 Null 채우기
# combined_result_df = combined_result_df.fillna(method='ffill') 


# of rows of combined_Result_Df: 102336
# of rows of filtered_df: 76766
# of rows of filtered_df: 76696


### Add Feature  

In [3]:
combined_result_df['liq_last_1'] = np.log10(combined_result_df['liq_last_1'] + 0.00001)
combined_result_df['liq_last_2'] = np.log10(combined_result_df['liq_last_2'] + 0.00001)
combined_result_df['liq_last_5'] = np.log10(combined_result_df['liq_last_5'] + 0.00001)
combined_result_df['liq_last_10'] = np.log10(combined_result_df['liq_last_10'] + 0.00001)
combined_result_df['liq_last_15'] = np.log10(combined_result_df['liq_last_15'] + 0.00001)
combined_result_df['trade_vol'] = np.log10(combined_result_df['trade_vol'] + 0.00001)
combined_result_df['num_trades'] = np.log10(combined_result_df['num_trades'] + 0.00001)

combined_result_df['trade.tau'] = np.sqrt(1 / combined_result_df['num_trades'])
combined_result_df['tvpl1'] = combined_result_df['trade_vol'] / combined_result_df['liq_last_1']
combined_result_df['tvpl2'] = combined_result_df['trade_vol'] / combined_result_df['liq_last_2'] 
combined_result_df['tvpl5'] = combined_result_df['trade_vol'] / combined_result_df['liq_last_5'] 
combined_result_df['tvpl10'] = combined_result_df['trade_vol'] / combined_result_df['liq_last_10'] 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

### NUMERIC FEATURES & CALCULATE CORR 

In [18]:
main_feature_list = list(combined_result_df.columns)
main_feature_list.remove('window_start')
main_feature_list.remove('window_end')
main_feature_list.remove('time_id')
main_feature_list.remove('volume_power')
main_feature_list.remove('dv1_realized_volatility')
main_feature_list.remove('dv2_lowest_return')
main_feature_list.remove('dv3_highest_return')
main_feature_list.remove('dv4_realized_volatility_30s')
main_feature_list.remove('prices_30s_for_NN')
main_feature_list.remove('window_end_150_ticker')
main_feature_list.remove('window_end_300_ticker')
main_feature_list.remove('window_end_450_ticker')
main_feature_list.remove('window_end_150_orderbook')
main_feature_list.remove('window_end_300_orderbook')
main_feature_list.remove('window_end_450_orderbook')
main_feature_list.remove('volume_power_150')
main_feature_list.remove('volume_power_300')


In [5]:
pd.set_option('display.max_columns', None)

combined_result_df.head(3)

Unnamed: 0,window_start,window_end,realized_volatility,num_trades,lowest_return,highest_return,high_low_gap,trade_vol,volume_power,end_price,prices_30s_for_NN,time_id,BB_width_w20,BB_width_w40,BB_width_w10,dv1_realized_volatility,dv2_lowest_return,dv3_highest_return,window_end_150_ticker,realized_volatility_150,num_trades_150,lowest_return_150,highest_return_150,high_low_gap_150,trade_vol_150,volume_power_150,window_end_300_ticker,realized_volatility_300,num_trades_300,lowest_return_300,highest_return_300,high_low_gap_300,trade_vol_300,volume_power_300,window_end_450_ticker,realized_volatility_450,num_trades_450,lowest_return_450,highest_return_450,high_low_gap_450,trade_vol_450,volume_power_450,liq_last_1,liq_last_2,liq_last_5,liq_last_10,liq_last_15,ep_liq_5,bidask_spread_0,bidask_spread_1,window_end_150_orderbook,liq_last_1_150,liq_last_2_150,liq_last_5_150,liq_last_10_150,liq_last_15_150,bidask_spread_0_150,bidask_spread_1_150,window_end_300_orderbook,liq_last_1_300,liq_last_2_300,liq_last_5_300,liq_last_10_300,liq_last_15_300,bidask_spread_0_300,bidask_spread_1_300,window_end_450_orderbook,liq_last_1_450,liq_last_2_450,liq_last_5_450,liq_last_10_450,liq_last_15_450,bidask_spread_0_450,bidask_spread_1_450,tvpl,tvpl_epliq5,dv4_realized_volatility_30s,trade.tau,tvpl1,tvpl2,tvpl5,tvpl10
38,2022-12-16 21:44:00,2022-12-16 21:54:00,0.000105,2.575188,-8.9e-05,0.001815,0.001904,1.138501,1.442885,22599000.0,datetime\n2022-12-16 21:44:00 0.000000\n202...,2022-12-16 21:44:00,48144.956559,47257.071103,55194.202594,0.000154,-0.00031,0.001238,2022-12-16 21:54:00,4.6e-05,85,-8.9e-05,8.9e-05,0.000177,3.814411,1.452098,2022-12-16 21:54:00,0.0001,147,-8.9e-05,0.000354,0.000443,5.543539,1.719981,2022-12-16 21:54:00,9.2e-05,209,-8.9e-05,0.001284,0.001373,9.178568,3.329562,-2.503398,-2.492577,-2.45735,-2.43752,-2.411095,0.001029,2.030769,10.261539,2022-12-16 21:54:00,0.001555,0.001703,0.002171,0.002716,0.003101,2.0,9.0,2022-12-16 21:54:00,0.000238,0.000819,0.002223,0.003171,0.003458,6.061539,8.061539,2022-12-16 21:54:00,0.000152,0.000271,0.000473,0.000754,0.000952,3.434782,7.26087,4289.730793,13374.653117,0.003451,0.623154,-0.454782,-0.456757,-0.463304,-0.467074
39,2022-12-16 21:45:00,2022-12-16 21:55:00,0.000107,2.557507,-8.9e-05,0.001815,0.001904,1.100893,1.262884,22596000.0,datetime\n2022-12-16 21:45:00 0.000000\n202...,2022-12-16 21:45:00,50564.809898,50311.440306,54273.382058,0.000156,-0.000177,0.001371,2022-12-16 21:55:00,7.8e-05,68,-8.9e-05,0.000354,0.000443,1.921894,4.441761,2022-12-16 21:55:00,0.000106,126,-8.9e-05,0.000399,0.000487,3.760741,3.587342,2022-12-16 21:55:00,0.000116,259,-8.9e-05,0.001815,0.001904,9.098996,2.272833,-2.887379,-2.786359,-2.687438,-2.633461,-2.614329,0.000948,2.098361,13.0,2022-12-16 21:55:00,0.000299,0.000458,0.0007,0.001334,0.001723,5.636363,9.0,2022-12-16 21:55:00,0.001734,0.002134,0.002249,0.002762,0.003344,2.071429,7.196429,2022-12-16 21:55:00,7.3e-05,0.000174,0.000356,0.00041,0.000702,4.693182,11.454545,7760.955603,13305.005263,0.003887,0.625305,-0.381278,-0.395101,-0.409644,-0.41804
40,2022-12-16 21:46:00,2022-12-16 21:56:00,0.000106,2.562293,0.0,0.001815,0.001815,1.124306,1.278754,22599000.0,datetime\n2022-12-16 21:46:00 0.000089\n202...,2022-12-16 21:46:00,53873.536126,53013.302636,51861.353627,0.000159,-4.4e-05,0.001238,2022-12-16 21:56:00,0.000141,69,0.0,0.000354,0.000354,2.159312,2.504304,2022-12-16 21:56:00,0.000109,125,0.0,0.000753,0.000753,4.498119,5.960898,2022-12-16 21:56:00,0.000119,286,0.0,0.001815,0.001815,9.661851,1.848649,-2.787476,-2.78578,-2.635687,-2.586336,-2.562354,0.000952,1.4875,9.4125,2022-12-16 21:56:00,0.000284,0.001064,0.001406,0.00193,0.002127,6.533333,8.533334,2022-12-16 21:56:00,7e-05,0.000121,0.000282,0.000583,0.000783,2.34375,8.859375,2022-12-16 21:56:00,0.003831,0.003906,0.004177,0.004337,0.004587,2.861111,9.388889,8179.864864,13988.353243,0.004303,0.62472,-0.403342,-0.403587,-0.42657,-0.43471


In [6]:
combined_result_df.columns

Index(['window_start', 'window_end', 'realized_volatility', 'num_trades',
       'lowest_return', 'highest_return', 'high_low_gap', 'trade_vol',
       'volume_power', 'end_price', 'prices_30s_for_NN', 'time_id',
       'BB_width_w20', 'BB_width_w40', 'BB_width_w10',
       'dv1_realized_volatility', 'dv2_lowest_return', 'dv3_highest_return',
       'window_end_150_ticker', 'realized_volatility_150', 'num_trades_150',
       'lowest_return_150', 'highest_return_150', 'high_low_gap_150',
       'trade_vol_150', 'volume_power_150', 'window_end_300_ticker',
       'realized_volatility_300', 'num_trades_300', 'lowest_return_300',
       'highest_return_300', 'high_low_gap_300', 'trade_vol_300',
       'volume_power_300', 'window_end_450_ticker', 'realized_volatility_450',
       'num_trades_450', 'lowest_return_450', 'highest_return_450',
       'high_low_gap_450', 'trade_vol_450', 'volume_power_450', 'liq_last_1',
       'liq_last_2', 'liq_last_5', 'liq_last_10', 'liq_last_15', 'ep_liq_5'

In [7]:
# data = combined_result_df

# # Drop non-numeric columns or columns with missing values
# main_feature_list = data.select_dtypes(include=[np.number]).columns

# # Calculate correlation matrix using kendalltau method
# correlation_matrix = data[main_feature_list].corr(method=lambda x, y: kendalltau(x, y).correlation)

# # main_feature_list.remove('dv1_realized_volatility')

# new_index = main_feature_list.difference(['dv1_realized_volatility', 'dv2_lowest_resturn', 'dv3_highest_return', 'dv4_realized_volatility_30s'])
# main_feature_list = new_index


In [8]:
# Check correlation table (Which variable is more correlated with the main DV?)
main_feature_list2 = list(combined_result_df.columns)
data = combined_result_df
correlation_matrix = data[main_feature_list2].corr(method=lambda x, y: kendalltau(x, y).correlation)
pd.set_option('display.max_rows', None)
sorted_df = correlation_matrix.sort_values(by='dv1_realized_volatility', ascending=False)
sorted_df['dv1_realized_volatility']

dv1_realized_volatility        1.000000
bidask_spread_1                0.261702
realized_volatility            0.257841
bidask_spread_0                0.235108
realized_volatility_450        0.207903
realized_volatility_300        0.173606
bidask_spread_1_450            0.172044
dv3_highest_return             0.158075
tvpl                           0.156133
bidask_spread_1_300            0.148512
tvpl_epliq5                    0.148384
bidask_spread_0_450            0.144913
realized_volatility_150        0.139846
high_low_gap                   0.139254
bidask_spread_0_300            0.130962
high_low_gap_450               0.128161
high_low_gap_300               0.122152
bidask_spread_1_150            0.121751
high_low_gap_150               0.117816
BB_width_w10                   0.116845
bidask_spread_0_150            0.102077
dv4_realized_volatility_30s    0.098961
BB_width_w20                   0.077383
highest_return                 0.070202
highest_return_450             0.062350


In [9]:
tempa = combined_result_df[main_feature_list].isnull()

tempa.sum().sum()

0

In [11]:
print(tempa.sum().sum())

0


### BUILD NEIGHBORS

In [19]:
N_NEIGHBORS_MAX = 65 

class Neighbors:
    def __init__(self, 
                 name: str, 
                 pivot: pd.DataFrame, 
                 p: float, 
                 metric: str = 'minkowski', 
                 metric_params: object = None, 
                 exclude_self: bool = True,
                 ):
        self.name = name
        self.exclude_self = exclude_self
        self.p = p
        self.metric = metric

        nn = NearestNeighbors(
            n_neighbors=N_NEIGHBORS_MAX, 
            p=p, 
            metric=metric, 
            metric_params=metric_params
        )
        
        nn.fit(pivot)
        _, self.neighbors = nn.kneighbors(pivot, return_distance=True)

        self.columns = self.index = self.feature_values = self.feature_col = None

    def rearrange_feature_values(self, df: pd.DataFrame, feature_col: str) -> None:
        raise NotImplementedError()

    def make_nn_feature(self, n=5, agg=np.mean) -> pd.DataFrame:
        assert self.feature_values is not None, "should call rearrange_feature_values beforehand"

        start = 1 if self.exclude_self else 0

        pivot_aggs = pd.DataFrame(
            agg(self.feature_values[start:n,:,0], axis=0), 
            columns=self.columns, 
            index=self.index
        )

        dst = pivot_aggs.reset_index() # unstack().
        # print("dst.shape:", dst.shape)
        new_column_names = ['time_id', f'{self.feature_col}_nn{n}_{self.name}_{agg.__name__}'] # 3개를 예측했는데 2개만 들어왔다??
        dst.columns = new_column_names 
        return dst
    

class TimeIdNeighbors(Neighbors):
    def rearrange_feature_values(self, df: pd.DataFrame, feature_col: str) -> None:
        # feature_pivot = df.pivot(index='time_id', values=feature_col)
        # feature_pivot = feature_pivot.fillna(feature_pivot.mean())

        feature_df = df[['time_id', feature_col]]
        feature_df.set_index('time_id', inplace=True)
        feature_df = feature_df.fillna(feature_df.mean())

        feature_values = np.zeros((N_NEIGHBORS_MAX, feature_df.shape[0], 1))

        for i in range(N_NEIGHBORS_MAX):
            feature_values[i, :, 0] += feature_df.values[self.neighbors[:, i], 0]

        self.columns = list(feature_df.columns)
        self.index = list(feature_df.index)
        self.feature_values = feature_values
        self.feature_col = feature_col

    def __repr__(self) -> str:
        return f"time-id NN (name={self.name}, metric={self.metric}, p={self.p})"


### PROGRESS CHECK FUNCTION

In [20]:
@contextmanager
def timer(name: str):
    s = time.time()
    yield
    elapsed = time.time() - s
    print(f'[{name}] {elapsed: .3f}초')

def print_trace(name: str = ''):
    print(f'{name or "익명"}에서 에러가 발생했습니다.')
    print(traceback.format_exc())


In [22]:
df_pv[main_feature_list].std()

realized_volatility        7.206466e-05
num_trades                 3.267403e-01
lowest_return              1.323066e-03
highest_return             1.231322e-03
high_low_gap               1.668138e-03
trade_vol                  3.747567e-01
end_price                  3.819627e+06
BB_width_w20               7.406247e+04
BB_width_w40               1.009046e+05
BB_width_w10               5.474241e+04
realized_volatility_150    9.522369e-05
num_trades_150             2.389702e+02
lowest_return_150          6.735843e-04
highest_return_150         6.483234e-04
high_low_gap_150           8.454146e-04
trade_vol_150              1.425755e+01
realized_volatility_300    8.317475e-05
num_trades_300             4.477292e+02
lowest_return_300          9.434780e-04
highest_return_300         8.840137e-04
high_low_gap_300           1.183505e-03
trade_vol_300              2.645044e+01
realized_volatility_450    7.657907e-05
num_trades_450             6.484936e+02
lowest_return_450          1.151436e-03


In [23]:
df_pv = combined_result_df.copy()
df_pv = df_pv.drop(['window_start', 'window_end','volume_power'], axis=1)

# Standard All Feature
df_pv[main_feature_list] = scaler.fit_transform(df_pv[main_feature_list])

### GET NN CLASS

In [24]:
from sklearn.preprocessing import StandardScaler

# StandardScaler 객체 생성
scaler = StandardScaler()

USE_ONE_FEATURE_C = True
USE_ONE_FEATURE_M_1 = True
USE_ONE_FEATURE_M_2 = True

USE_TWO_FEATURES = True

USE_ALL_FEATURES = True
USE_SEVALRAL_FEATURES = True

# Top 5 Related Feature
top_5_high_feat = list(correlation_matrix['realized_volatility'].sort_values().keys())[:5]
top_5_low_feat = list(correlation_matrix['realized_volatility'].sort_values().keys())[-6:-1]


# Top 5 Absolute Related Feature

sorted_data = correlation_matrix['realized_volatility'].abs().sort_values(ascending=False)

top_5_high_abs_feat = list(sorted_data.head(6).keys())[1:]
top_5_low_abs_feat = list(sorted_data.tail(5).keys())

# time_id_neighbors List 
time_id_neighbors: List[Neighbors] = []

with timer('knn fit'):
    df_pv = combined_result_df.copy()
    df_pv = df_pv.drop(['window_start', 'window_end','volume_power'], axis=1)
    
    # Standard All Feature
    df_pv[main_feature_list] = scaler.fit_transform(df_pv[main_feature_list])

    # USE ONLY ONE FACTOR
    ## Canberra Distance
    if USE_ONE_FEATURE_C :
        for feat in main_feature_list :
            df_nn = df_pv[['time_id',feat]]
            df_nn.set_index('time_id', inplace=True)
            df_nn = df_nn.fillna(df_nn.mean())

            time_id_neighbors.append(
                TimeIdNeighbors(
                    feat + '_c', 
                    df_nn, 
                    p=2, 
                    metric='canberra', 
                    exclude_self=True
                )
            )
    ## Manhattan Distance
    
    if USE_ONE_FEATURE_M_1:
        for feat in main_feature_list :
            df_nn = df_pv[['time_id',feat]]
            df_nn.set_index('time_id', inplace=True)
            df_nn = df_nn.fillna(df_nn.mean())

            time_id_neighbors.append(
                TimeIdNeighbors(feat + '_m_p1', df_nn, p=1)
            )

    ## Euclidean Distance

    if USE_ONE_FEATURE_M_2:
        for feat in main_feature_list :
            df_nn = df_pv[['time_id',feat]]
            df_nn.set_index('time_id', inplace=True)
            df_nn = df_nn.fillna(df_nn.mean())

            time_id_neighbors.append(
                TimeIdNeighbors(feat + '_m_p2', df_nn, p=2)
            )

    # TWO FACTOR

    if USE_TWO_FEATURES:
        feature_list = ['time_id','realized_volatility','bidask_spread_0']
        df_nn = df_pv[feature_list]
        df_nn.set_index('time_id', inplace=True)
        df_nn = df_nn.fillna(df_nn.mean())

        ## Canberra

        time_id_neighbors.append(
                TimeIdNeighbors(
                    feat + 'two_c', 
                    df_nn, 
                    p=2, 
                    metric='canberra', 
                    exclude_self=True
                )
            )
        ## Euclidean Distance
        time_id_neighbors.append(
            TimeIdNeighbors(
                'two_m', 
                df_nn, 
                p=2, 
                exclude_self=True
            )
        )

    # USE SEVALRAL FACTOR
    if USE_SEVALRAL_FEATURES:
        ## High Related Feature 
        feature_list = ['time_id']
        feature_list += top_5_high_feat
        df_nn = df_pv[feature_list]
        df_nn.set_index('time_id', inplace=True)
        df_nn = df_nn.fillna(df_nn.mean())

        ### Euclidean Distance
        time_id_neighbors.append(
            TimeIdNeighbors(
                'sev_high_nn_m', 
                df_nn, 
                p=2, 
                exclude_self=True
            )
        )

        ## Low Related Feature

        feature_list = ['time_id']
        feature_list += top_5_low_feat
        df_nn = df_pv[feature_list]
        df_nn.set_index('time_id', inplace=True)
        df_nn = df_nn.fillna(df_nn.mean())        

        time_id_neighbors.append(
            TimeIdNeighbors(
                'sev_low_nn_m', 
                df_nn, 
                p=2, 
                exclude_self=True
            )
        )

        ## High Abs Related Feature

        feature_list = ['time_id']
        feature_list += top_5_high_abs_feat
        df_nn = df_pv[feature_list]
        df_nn.set_index('time_id', inplace=True)
        df_nn = df_nn.fillna(df_nn.mean())
        
        time_id_neighbors.append(
            TimeIdNeighbors(
                'sev_high_abs_nn_m', 
                df_nn, 
                p=2, 
                exclude_self=True
            )
        )

        ## Low Abs Related Feature

        feature_list = ['time_id']
        feature_list += top_5_low_abs_feat
        df_nn = df_pv[feature_list]
        df_nn.set_index('time_id', inplace=True)
        df_nn = df_nn.fillna(df_nn.mean())

        time_id_neighbors.append(
            TimeIdNeighbors(
                'sev_low_abs_nn_m', 
                df_nn, 
                p=2, 
                exclude_self=True
            )
        )


    # USE ALL FACTOR

    if USE_ALL_FEATURES:
        df_nn = df_pv.copy()
        df_nn = df_nn.drop(['dv1_realized_volatility'], axis=1)
        df_nn.set_index('time_id', inplace=True)
        df_nn = df_nn.fillna(df_nn.mean())

        time_id_neighbors.append(
            TimeIdNeighbors(
                'all_nn_m_p1', 
                df_nn, 
                p=1, 
                exclude_self=True
            )
        )

        time_id_neighbors.append(
            TimeIdNeighbors(
                'all_nn_m_p2', 
                df_nn, 
                p=2, 
                exclude_self=True
            )
        )


KeyboardInterrupt: 

#### Aggregate Features With NN

In [None]:
def make_nearest_neighbor_feature(df: pd.DataFrame) -> pd.DataFrame:
    df2 = combined_result_df.copy()
    print(df2.shape)

    ### time_id를 기준으로 얻어진 neighbor를 대상으로 feature 만들기
    feature_cols = {
        'realized_volatility': [np.mean, np.min, np.max, np.std],
        'lowest_return': [np.max, np.mean, np.min],
        'num_trades': [np.mean],
        'trade.tau': [np.mean],
        'trade_vol': [np.mean],
        'dv1_realized_volatility': [np.mean],
        'bidask_spread_1': [np.mean],
        'bidask_spread_0': [np.mean],
        'tvpl': [np.mean],
        'tvpl_epliq5': [np.mean],
        'high_low_gap': [np.mean],
        'BB_width_w10': [np.mean],
        'BB_width_w20': [np.mean],
        'high_low_gap': [np.mean],
       
    }

    time_id_neigbor_sizes = [2, 4, 8, 16, 32, 48, 64]

    ndf: Optional[pd.DataFrame] = None
    
    # 새로운 feature를 기존 df에 추가하는 함수
    def _add_ndf(ndf: Optional[pd.DataFrame], dst: pd.DataFrame) -> pd.DataFrame:
        if ndf is None:
            return dst
        else:
            ndf[dst.columns[-1]] = dst[dst.columns[-1]].astype(np.float32)
            #columns_to_convert = [dst.columns[-1]]  # 열 변환 대상을 선택하거나 여러 열을 지정할 수 있음
            #converted_columns = dst[columns_to_convert].astype(np.float32)
            #ndf = pd.concat([ndf, converted_columns], axis=1)

            return ndf

    # neighbor time_id
    for feature_col in feature_cols.keys():
        try: 
            for nn in time_id_neighbors:
                nn.rearrange_feature_values(df2, feature_col)


            time_id_ns = time_id_neigbor_sizes

            for agg in feature_cols[feature_col]:
                for n in time_id_ns:
                    try:
                        for nn in time_id_neighbors:
                            dst = nn.make_nn_feature(n, agg)
                            ndf = _add_ndf(ndf, dst)
                    except Exception:
                        WHERE_ERROR = feature_col
                        print_trace('time-id nn')
                        pass
        except Exception:
            print_trace('time-id nn')

    if ndf is not None:
        df2 = pd.merge(df2, ndf, on=['time_id'], how='left')
    
    print(df2.shape)

    return df2

In [1]:
gc.collect()

with timer('make nearest neighbor feature'):
    df3 = make_nearest_neighbor_feature(combined_result_df)

gc.collect()

NameError: name 'gc' is not defined

In [None]:
coin_list = 'BTC'
df3.to_csv(working_directory + "output\\{}_sum_plus_nn_features.csv".format(coin_list), index=False)

## NN PREDICTION
- NN을 기준으로 예측한 dv1_rv 값
- NN을 기준으로 비슷한 경우들의 dv1_rv 값들의 평균 (비슷했던 경우들의 평균)

아래 과정은 PASS 가능

In [None]:
def rmspe(y_true, y_pred):
    return  (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))

In [None]:
target_value = "dv1_realized_volatility"
predict_list = []
for item in df3.columns:
    if target_value in item:
        predict_list.append(item)

predict_list = predict_list[1:]


predict = {}
for item in predict_list :
    predict[item] = rmspe(
        np.array(df3["dv1_realized_volatility"]),
        np.array(df3[item])
    )

In [None]:
sorted_items = sorted(predict.items(), key=lambda x: x[1])

for key, value in sorted_items[:10]:
    print(key, value)

In [None]:
# Save DF
# df3.to_parquet("my.parquet")

### FEATURE SELECTION
