In [None]:
import numpy as np
import pandas as pd
import os
import glob
import lightgbm as lgbm
import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Load the training labels and preview the first rows.
train=pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
train.head()
# The file only holds the identifier, the time and the target.

In [None]:
# Read the order-book parquet partition of stock_id=0 and preview the first rows.
df0_book=pd.read_parquet('../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=0')
df0_book.head()

# **Book_data**
* 時間 
* 最具/第二最具競爭力的購買水平的標準化價格
* 最具/第二最具競爭力的銷售水平的標準化價格
* 最具/第二最具競爭力的買入級別的股票數量。
* 最具/第二最具競爭力的賣出水平上的股票數量。



In [None]:
# Read the trade parquet partition of stock_id=0 and preview the first rows.
df0_trade=pd.read_parquet('../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0')
df0_trade.head()

# **Trade_data**
* 時間
* 一秒鐘內發生的已執行交易的平均價格。價格已標準化，平均值已按每筆交易中交易的股票數量加權。
* 交易的股票總數。
* 交易訂單的數量

--------------------------------------------------------------------------------

# What do we have?

* **買賣差價bid/ask spread**

由於不同股票在市場上的交易水平不同，我們採用最佳報價和最佳買入價的比率來計算買賣價差。
買賣價差的公式可以寫成以下形式：

$BidAskSpread = \frac{BestOffer}{BestBid} - 1$

* **加權平均價格(WAP)Weighted averaged price**

訂單簿也是股票估值的主要來源之一。
公平的帳面估值必須考慮兩個因素：訂單的水平和規模the level and the size of orders。
在本次比賽中，我們使用加權平均價格 (WAP) 來計算瞬時股票估值併計算已實現的波動率作為我們的**target**。
WAP的公式可以寫成如下，它考慮了最高價格和數量信息：

$WAP = \frac{BidPrice_{1}*AskSize_{1} + AskPrice_{1}*BidSize_{1}}{BidSize_{1} + AskSize_{1}}$
WAP =（買入價格 × 賣出規模 + 賣出價格 × 買入規模）/（買入規模 + 賣出規模）

* **小知識**

如果兩個訂單簿的買入價和賣出價分別處於相同的價格水平，則賣單（offer）數量較多的訂單簿會得到較低的股票估值：訂單簿中有更多的潛在賣家，而更多的賣家意味著市場上的供應更多，**因而導致股票估值較低**。


在大多數情況下，在連續交易時段內，訂單簿不應出現買入報價（bid）高於賣出報價（offer，又稱 ask）的情況。換句話說，買入價和賣出價通常不應該交叉。

* **對數收益率Log returns**

我們如何比較昨天和今天的股票價格？
我們可以將價格變動除以股票的起始價格來解決上述問題，也就是計算價格變化的百分比，這又稱為股票收益率（stock return）。


Log returns有幾個優點：
1. 它具有時間可加性：相鄰時段的對數收益率可以直接相加
2. 常規回報（regular returns）不能低於 -100%，而Log returns不受限制


Log returns在這邊我們寫成：
$r_{t_1, t_2} = \log \left( \frac{S_{t_2}}{S_{t_1}} \right)$

其中 $S_t$ 是股票 $S$ 在時間 $t$ 的價格。
通常我們在看這10分鐘的對數收益率，我們寫成：$r_t = r_{t - 10 min, t}$

* **實際波動率Realized volatility**

當我們交易期權時，我們模型的一個有價值的輸入是**對數收益率的標準差（standard deviation of the stock log returns）**。在更長或更短的時間間隔內計算的對數收益率標準差會有所不同，因此它通常被標準化為 1 年期，**年化標準差稱為波動率**。

在本次比賽中，您將獲得 10 分鐘的賬面數據，我們要求您預測接下來 10 分鐘的波動性。波動率將按如下方式測量：
$\sigma = \sqrt{\sum_{t}r_{t-1, t}^2}$

* **ＲＭＳＰＥ**

這個比賽的重點！

$\sqrt{\frac{1}{n}\sum^{n}_{i=1}((y_i-\hat{y_i})/y_i)^2}$

# **Feature Engineering**

In [None]:
def wap1(df):
    """Return the level-1 weighted average price of an order-book frame.

    Each side's price is weighted by the size quoted on the opposite side:
    ``(bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1)``.
    """
    weighted_prices = df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']
    total_size = df['bid_size1'] + df['ask_size1']
    return weighted_prices / total_size
def wap2(df):
    """Return the level-2 weighted average price of an order-book frame.

    Same formula as the level-1 version, but built from the ``*2`` columns:
    ``(bid_price2 * ask_size2 + ask_price2 * bid_size2) / (bid_size2 + ask_size2)``.
    """
    weighted_prices = df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']
    total_size = df['bid_size2'] + df['ask_size2']
    return weighted_prices / total_size
def log_return(list_stock_prices):
    """Return the first difference of the natural log of a price series.

    The first element of the result is NaN because it has no predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()
def realized_volatility(series):
    """Return the square root of the sum of squared values of ``series``."""
    squared = series ** 2
    sum_of_squares = np.sum(squared)
    return np.sqrt(sum_of_squares)
def count_unique(series):
    """Return how many distinct values ``series`` contains."""
    distinct_values = np.unique(series)
    return distinct_values.size


In [None]:
# Define how the order-book data is processed.
def book_prep(path, last_seconds=(150, 300, 450), bucket_length=600):
    """Build one row of order-book features per ``time_id`` for a single stock.

    Parameters
    ----------
    path : str
        Location of one ``stock_id=<id>`` parquet partition. The text after
        the last ``=`` is used as the stock id when building ``row_id``.
    last_seconds : iterable of int, optional
        Lengths of the trailing windows that each get their own copy of every
        aggregate. Defaults to ``(150, 300, 450)``.
    bucket_length : int, optional
        Value each window length is subtracted from to obtain the first
        ``seconds_in_bucket`` kept for that window. Defaults to ``600``.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` with the flattened aggregate columns
        (``<column>_<aggregate>``), the same columns suffixed with
        ``_<window start>`` for every trailing window, and a ``row_id`` column
        formatted as ``"<stock_id>-<time_id>"``.
    """
    df = pd.read_parquet(path)

    # transform() returns a result aligned with df's own index. groupby.apply()
    # prepends the group key to the index on pandas >= 2.0, which makes the
    # column assignment fail.
    df['wap'] = wap1(df)
    df['log_return'] = df.groupby('time_id')['wap'].transform(log_return)
    df['wap2'] = wap2(df)
    df['log_return2'] = df.groupby('time_id')['wap2'].transform(log_return)

    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / (df['ask_price1'] + df['bid_price1'])
    df['price_spread2'] = (df['ask_price2'] - df['bid_price2']) / (df['ask_price2'] + df['bid_price2'])
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df['total_volume'] = df['ask_size1'] + df['ask_size2'] + df['bid_size1'] + df['bid_size2']
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))

    # Aggregations applied to every time_id group. The string 'mean' yields
    # the same '<column>_mean' names as passing np.mean.
    create_feature_dict = {
        'log_return': [realized_volatility],
        'log_return2': [realized_volatility],
        'wap': ['mean'],
        'wap2': ['mean'],
        'price_spread': ['mean'],
        'price_spread2': ['mean'],
        'bid_spread': ['mean'],
        'ask_spread': ['mean'],
        'volume_imbalance': ['mean'],
        'total_volume': ['mean'],
    }

    # Aggregate over the whole bucket. Flattening the two-level column index
    # turns the ('time_id', '') key into 'time_id_'.
    df_feature = df.groupby('time_id').agg(create_feature_dict).reset_index()
    df_feature.columns = ['_'.join(col) for col in df_feature.columns]

    # Repeat the aggregation on the tail of each bucket.
    for window in last_seconds:
        start = bucket_length - window
        df_window = (
            df[df['seconds_in_bucket'] >= start]
            .groupby('time_id')
            .agg(create_feature_dict)
            .reset_index()
        )
        df_window.columns = ['_'.join(col) for col in df_window.columns]
        df_window = df_window.add_suffix(f'_{start}')
        # Left merge keeps time_ids that have no rows inside the window (NaN features).
        df_feature = df_feature.merge(
            df_window, how='left', left_on='time_id_', right_on=f'time_id__{start}'
        )
        df_feature = df_feature.drop(columns=[f'time_id__{start}'])

    # Build row_id. Splitting from the right keeps the stock id correct even
    # when an earlier path component contains '='.
    stock_id = path.rstrip('/').rsplit('=', 1)[-1]
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature = df_feature.drop(columns=['time_id_'])
    return df_feature

In [None]:
%%time
# data_dir is a notebook-level name; preprocessor() also reads it when building paths.
data_dir = '../input/optiver-realized-volatility-prediction/'
path = data_dir + "book_train.parquet/stock_id=0"
book_prep(path)
# Inspect what the processed book features look like and how long the step takes.

In [None]:
# First take a look at the raw trade data.
look=pd.read_parquet('../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0')
look

In [None]:
# Build features from the trade data.
def trade_prep(path, last_seconds=(150, 300, 450), bucket_length=600):
    """Build one row of trade features per ``time_id`` for a single stock.

    Parameters
    ----------
    path : str
        Location of one ``stock_id=<id>`` parquet partition. The text after
        the last ``=`` is used as the stock id when building ``row_id``.
    last_seconds : iterable of int, optional
        Lengths of the trailing windows that each get their own copy of every
        aggregate. Defaults to ``(150, 300, 450)``.
    bucket_length : int, optional
        Value each window length is subtracted from to obtain the first
        ``seconds_in_bucket`` kept for that window. Defaults to ``600``.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` present in the trade data. Every feature
        column is prefixed with ``trade_``; ``row_id`` is formatted as
        ``"<stock_id>-<time_id>"``.
    """
    df = pd.read_parquet(path)

    # transform() returns a result aligned with df's own index. groupby.apply()
    # prepends the group key to the index on pandas >= 2.0, which makes the
    # column assignment fail.
    df['log_return'] = df.groupby('time_id')['price'].transform(log_return)

    # The strings 'sum' / 'mean' yield the same column names as np.sum / np.mean.
    aggregate_dictionary = {
        'log_return': [realized_volatility],
        'seconds_in_bucket': [count_unique],
        'size': ['sum'],
        'order_count': ['mean'],
    }

    # Aggregate over the whole bucket. Flattening the two-level column index
    # turns the ('time_id', '') key into 'time_id_'.
    df_feature = df.groupby('time_id').agg(aggregate_dictionary).reset_index()
    df_feature.columns = ['_'.join(col) for col in df_feature.columns]

    # Repeat the aggregation on the tail of each bucket.
    for window in last_seconds:
        start = bucket_length - window
        df_window = (
            df[df['seconds_in_bucket'] >= start]
            .groupby('time_id')
            .agg(aggregate_dictionary)
            .reset_index()
        )
        df_window.columns = ['_'.join(col) for col in df_window.columns]
        df_window = df_window.add_suffix(f'_{start}')
        # Left merge keeps time_ids that have no trades inside the window (NaN features).
        df_feature = df_feature.merge(
            df_window, how='left', left_on='time_id_', right_on=f'time_id__{start}'
        )
        df_feature = df_feature.drop(columns=[f'time_id__{start}'])

    df_feature = df_feature.add_prefix('trade_')

    # Splitting from the right keeps the stock id correct even when an earlier
    # path component contains '='.
    stock_id = path.rstrip('/').rsplit('=', 1)[-1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature = df_feature.drop(columns=['trade_time_id_'])

    return df_feature

In [None]:
%%time
# Run the trade feature pipeline on the stock_id=0 partition and time it.
data_dir = '../input/optiver-realized-volatility-prediction/'
path = data_dir + "trade_train.parquet/stock_id=0"
trade_prep(path)

# Joblib
Joblib 是一組在 Python 中提供輕量級流水線的工具。特別是：

* 函數的透明磁盤緩存和延遲重新求值（memoize 模式）
* 簡單的並行計算

Joblib 經過優化，特別是在大數據上快速且穩健，並且對numpy數組進行了特定優化。它是 BSD 許可的。
* 避免兩次計算相同的事情：代碼經常一次又一次地重新運行，例如在對計算量大的工作（如在科學開發中）進行原型設計時，但為緩解此問題而手工製作的解決方案容易出錯，並且經常導致不可重複的結果。
* 透明地持久化到磁盤：有效地持久化包含大數據的任意對像是很困難的。使用 joblib 的緩存機制避免了手寫持久化，並將磁盤上的文件隱式鏈接到原始 Python 對象的執行上下文。因此，joblib 的持久性有利於恢復應用程序狀態或計算作業，例如在崩潰後。

In [None]:
from joblib import Parallel, delayed
def preprocessor(list_stock_ids, is_train = True):
    """Build the combined book + trade feature table for several stocks.

    Parameters
    ----------
    list_stock_ids : iterable
        Stock ids whose ``stock_id=<id>`` partitions are processed.
    is_train : bool, optional
        ``True`` reads the ``*_train.parquet`` datasets, ``False`` the
        ``*_test.parquet`` ones.

    Returns
    -------
    pandas.DataFrame
        The per-stock feature frames stacked vertically with a fresh index.
        Book features are left-joined with trade features on ``row_id``, so
        rows without trade data keep NaN in the ``trade_`` columns.

    Notes
    -----
    Paths are built from the notebook-level ``data_dir`` variable.
    """
    split = 'train' if is_train else 'test'

    def for_joblib(stock_id):
        # Paths of the two partitions belonging to this stock.
        file_path_book = f'{data_dir}book_{split}.parquet/stock_id={stock_id}'
        file_path_trade = f'{data_dir}trade_{split}.parquet/stock_id={stock_id}'
        # Return the merged frame directly. Concatenating it with an empty
        # DataFrame first adds nothing and is deprecated in recent pandas.
        return pd.merge(
            book_prep(file_path_book),
            trade_prep(file_path_trade),
            on='row_id',
            how='left',
        )

    # One task per stock, spread over all available cores.
    frames = Parallel(n_jobs=-1, verbose=1)(
        delayed(for_joblib)(stock_id) for stock_id in list_stock_ids
    )

    return pd.concat(frames, ignore_index=True)

In [None]:
# Quick check: build and combine the features of stock 0 and stock 1.
list_stock_ids = [0,1]
preprocessor(list_stock_ids, is_train = True)

*  Training_data

In [None]:
# Reload the training labels and collect the distinct stock ids to process.
train = pd.read_csv(data_dir + 'train.csv')
train_ids = train.stock_id.unique()

In [None]:
%%time
df_train = preprocessor(list_stock_ids= train_ids, is_train = True)
# Build and combine the features of the whole training set.

In [None]:
# Merge the features with the target.
# row_id uses the same "<stock_id>-<time_id>" format that book_prep() / trade_prep() emit.
train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
# NOTE(review): train is narrowed in place, so re-running this cell without reloading train fails on the missing stock_id column.
train = train[['row_id','target']]
df_train = train.merge(df_train, on = ['row_id'], how = 'left')

*  Test_data

In [None]:
# Load the test index file and collect the distinct stock ids to process.
test = pd.read_csv(data_dir + 'test.csv')
test_ids = test.stock_id.unique()

In [None]:
%%time
# Build the test-set features from the *_test.parquet datasets.
df_test = preprocessor(list_stock_ids= test_ids, is_train = False)

In [None]:
# Attach the features to every test row; relies on test already having a row_id column (none is built here).
df_test = test.merge(df_test, on = ['row_id'], how = 'left')

# Ready To Train

In [None]:
from sklearn.model_selection import KFold
# Recover the stock id (as a string) from the "<stock_id>-<time_id>" row_id.
df_train['stock_id']=df_train['row_id'].apply(lambda x:x.split('-')[0])
df_test['stock_id']=df_test['row_id'].apply(lambda x:x.split('-')[0])

# Test rows: encode each stock with its mean target over the full training set.
stock_id_target_mean=df_train.groupby('stock_id')['target'].mean()
df_test['stock_id_target_enc']=df_test['stock_id'].map(stock_id_target_mean)

# Train rows: out-of-fold encoding, so a row's own target never contributes to
# its encoded value. A fixed random_state keeps the shuffled folds, and hence
# the encoded feature, identical from run to run.
tmp=np.repeat(np.nan,df_train.shape[0])
kf=KFold(n_splits=10,shuffle=True,random_state=42)
for idx_1,idx_2 in kf.split(df_train):
    target_mean = df_train.iloc[idx_1].groupby('stock_id')['target'].mean()
    tmp[idx_2]=df_train['stock_id'].iloc[idx_2].map(target_mean)
df_train['stock_id_target_enc'] = tmp

In [None]:
# Convert stock_id to int.
df_train['stock_id'] = df_train['stock_id'].astype(int)
df_test['stock_id'] = df_test['stock_id'].astype(int)

In [None]:
# Feature matrix (everything except the row key and the label) and the label vector.
X=df_train.drop(['row_id','target'],axis=1)
y=df_train['target']

In [None]:
# Define RMSPE (root mean squared percentage error).
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of ``y_pred`` against ``y_true``.

    Every error is divided by its true value before squaring.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

def feval_RMSPE(preds, lgbm_train):
    """Custom evaluation callback that reports RMSPE rounded to 5 decimals.

    ``lgbm_train`` must expose ``get_label()``. The returned tuple is
    ``('RMSPE', value, False)``; the trailing ``False`` marks the metric as
    lower-is-better.
    """
    score = rmspe(y_true = lgbm_train.get_label(), y_pred = preds)
    return 'RMSPE', round(score, 5), False
# Hyper-parameters passed to lgbm.train.
params = {
      "objective": "rmse", 
      "metric": "rmse", 
      "boosting_type": "gbdt",
      # Stop when the validation score has not improved for 30 rounds.
      'early_stopping_rounds': 30,
      'learning_rate': 0.01,
      # L1 / L2 regularisation strengths.
      'lambda_l1': 1,
      'lambda_l2': 1,
  }

In [None]:
# Set up what the cross-validation loop needs.
# A fixed random_state makes the shuffled folds repeatable from run to run.
kf = KFold(n_splits=4,shuffle=True,random_state=42)
oof = pd.DataFrame()         # placeholder; not filled by any cell shown in this notebook
models = []                  # fitted boosters, one appended per fold
scores = 0.0                 # running average of the fold RMSPEs

In [None]:
# Start training: one LightGBM model per cross-validation fold.
for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):

    print("Fold :", fold+1)

    # Build the training and validation sets. KFold yields positional indices,
    # so select with .iloc; label-based .loc is only correct while the index
    # happens to be 0..n-1.
    X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
    X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]

    # RMSPE weights
    # https://www.kaggle.com/c/optiver-realized-volatility-prediction/discussion/250324
    # Weighting each row by 1 / target^2 makes the squared-error objective
    # equal to squared percentage error.
    weights = 1/np.square(y_train)
    lgbm_train = lgbm.Dataset(X_train,y_train,weight = weights)

    weights = 1/np.square(y_valid)
    lgbm_valid = lgbm.Dataset(X_valid,y_valid,reference = lgbm_train,weight = weights)

    # Fit the model
    model = lgbm.train(params=params,
                      train_set=lgbm_train,
                      valid_sets=[lgbm_train, lgbm_valid],
                      num_boost_round=1000,
                      feval=feval_RMSPE,
                      verbose_eval=200,
                      categorical_feature = ['stock_id']
                     )

    # Predict the validation rows with the best iteration
    y_pred = model.predict(X_valid, num_iteration=model.best_iteration)
    # Report the RMSPE of this fold
    RMSPE = round(rmspe(y_true = y_valid, y_pred = y_pred),4)
    print(f'Performance of the　prediction: , RMSPE: {RMSPE}')

    # Record the score and the model. Dividing by the splitter's own fold count
    # keeps `scores` an average if n_splits is ever changed.
    scores += RMSPE / kf.get_n_splits()
    models.append(model)
    print("*" * 100)
    

In [None]:
# Show the result: the RMSPE averaged over the folds.
scores

In [None]:
# Prepare the test data.
# y_pred starts as the row_id column only; the predicted target is attached in a later cell.
y_pred = df_test[['row_id']]
X_test = df_test.drop(['time_id', 'row_id'], axis = 1)
X_test

In [None]:
target = np.zeros(len(X_test))
# Predict the test set with every fold model, each at its best iteration, and
# average the results. Columns are selected in the order of X, the training
# feature matrix; X_valid exists only as a leftover of the last training-loop
# iteration.
for model in models:
    pred = model.predict(X_test[X.columns], num_iteration=model.best_iteration)
    target += pred / len(models)

In [None]:
# Attach the averaged predictions as the target column and display the result.
y_pred = y_pred.assign(target = target)
y_pred

In [None]:
# Write the submission file without the DataFrame index.
y_pred.to_csv('submission.csv',index = False)