# PIP

In [1]:
! pip install pyarrow
! pip install xgboost --upgrade
! pip install protobuf




# Import

In [1]:
import os
import joblib
import numpy as np
import pandas as pd
import polars as pl
import lightgbm as lgb
import xgboost as xgb
import catboost as cbt
from joblib import Parallel, delayed
import gc

In [2]:
from xgboost import XGBRegressor

import sys

sys.path.extend(['../', '../../'])
import kaggle_evaluation.jane_street_inference_server 

In [3]:
# Global mode switch read by reform_data(), the validation-slice cell and train():
# True  -> fit new models and dump them to ./models
# False -> skip data preparation and load pre-trained models from model_path
TRAINING = False

# Define some useful functions

In [6]:
"""
優化 df 的內存使用量
"""



def reduce_mem_usage(df, float16_as32=True):
    """Downcast the numeric columns of ``df`` in place to reduce memory usage.

    Signed-integer columns are cast to the narrowest of int8/int16/int32 that
    holds their observed min and max (bounds inclusive). Float columns whose
    values fit the float16 range are cast to float32, or to float16 when
    ``float16_as32`` is False; float columns that fit the float32 range are
    cast to float32. Every other column (object, category, bool, unsigned
    integers, datetimes, pandas extension dtypes, and float columns whose range
    cannot be verified, e.g. all-NaN or infinite values) is left untouched.

    Args:
        df (DataFrame): frame to optimise; it is modified in place.
        float16_as32 (bool): keep float16-range columns as float32 for extra
            precision instead of casting them to float16.

    Returns:
        DataFrame: the same ``df`` object, after downcasting.
    """
    bytes_per_mb = 1024**2

    # Report the initial memory footprint (MB).
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    f16_info, f32_info = np.finfo(np.float16), np.finfo(np.float32)
    small_float = np.float32 if float16_as32 else np.float16

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy signed-int ('i') and float ('f') columns are safe to
        # downcast by range; anything else keeps its dtype.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min, c_max = df[col].min(), df[col].max()

        if col_type.kind == 'i':
            # First (narrowest) integer type whose inclusive range covers the data.
            # An empty column yields NaN bounds, so no candidate matches.
            for candidate in (np.int8, np.int16, np.int32):
                bounds = np.iinfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif f16_info.min < c_min and c_max < f16_info.max:
            df[col] = df[col].astype(small_float)
        elif f32_info.min < c_min and c_max < f32_info.max:
            df[col] = df[col].astype(np.float32)
        # else: range is outside float32 or unknown (NaN/inf bounds) -> keep dtype.

    # Report the footprint after optimisation and the relative saving.
    end_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df


In [7]:

def reform_data(df, skip_dates=500, num_valid_dates=100, feature_names=None):
    """Filter the training frame by date and split its dates into train/validation.

    The function only does work when the module-level ``TRAINING`` flag is
    true; otherwise it prints a notice and returns ``(None, None, None)``.

    Args:
        df (DataFrame): frame containing a ``date_id`` column.
        skip_dates (int): rows with ``date_id`` below this value are dropped.
        num_valid_dates (int): number of the largest distinct ``date_id``
            values held out for validation. Values <= 0 give an empty
            validation set; values above the number of available dates put
            every date into validation.
        feature_names (list[str] | None): accepted for backward compatibility;
            it is not used by this function.

    Returns:
        tuple: ``(df, train_dates, valid_dates)`` where ``df`` is the filtered
        frame with a fresh index and the two date arrays are sorted ascending,
        or ``(None, None, None)`` when ``TRAINING`` is false.
    """
    if not TRAINING:
        print("未啟用訓練模式，未加載任何數據。")
        return None, None, None

    # Keep only rows on or after the first usable date.
    df = df[df['date_id'] >= skip_dates].reset_index(drop=True)

    # Sort explicitly so "last N dates" does not depend on row order.
    dates = np.sort(df['date_id'].unique())

    # Split by an explicit index: slicing with -0 would otherwise select
    # every date as validation when num_valid_dates is 0.
    n_valid = min(max(int(num_valid_dates), 0), len(dates))
    split_at = len(dates) - n_valid
    train_dates = dates[:split_at]
    valid_dates = dates[split_at:]

    return df, train_dates, valid_dates



# Data load

In [7]:
# Directory holding the partitioned training parquet files.
TRAIN_DIR = r'C:\Users\User\Desktop\kaggle\Jane-street-Real-Time-Market-Data-Forecasting\train.parquet'

# Read partitions 0..8 and concatenate them once; concatenating inside the
# loop re-copies the accumulated frame on every iteration.
# NOTE(review): the range stops at partition 8 - confirm no further partition exists.
partition_frames = [
    pd.read_parquet(TRAIN_DIR + r'\partition_id=' + str(partition_id))
    for partition_id in range(9)
]
df = pd.concat(partition_frames, ignore_index=True)
del partition_frames  # the per-partition frames are no longer needed

In [8]:
# Drop rows with date_id < 500 and hold out the last 100 dates for validation.
# With TRAINING = False, reform_data returns (None, None, None).
data, train_date, valid_date = reform_data(df, skip_dates=500, num_valid_dates=100)

未啟用訓練模式，未加載任何數據。


In [9]:
# Display the filtered frame returned by reform_data (None when TRAINING is False).
data

In [10]:
# Display the training dates returned by reform_data (None when TRAINING is False).
train_date

In [11]:
# Display the validation dates returned by reform_data (None when TRAINING is False).
valid_date

In [8]:
# Fold count: for fold i, train() leaves out every training date whose position
# satisfies `position % N_fold == i`.
N_fold = 5

In [9]:
# Model input columns: feature_00, feature_01, ..., feature_78.
feature_names = [
    f"feature_{feature_idx:02d}"
    for feature_idx in range(79)
]

# Model

In [10]:
# Create the directory that stores freshly trained models. os.makedirs is
# shell-independent and does nothing if the directory already exists.
os.makedirs('models', exist_ok=True)

# Directory from which pre-trained models are loaded when TRAINING is False.
model_path = r'C:\Users\User\Desktop\kaggle\Jane-street-Real-Time-Market-Data-Forecasting\version1\models'

In [11]:
# In training mode, slice out the validation features, target and weights.
if TRAINING:
    # Build the boolean row mask once instead of scanning date_id three times.
    valid_mask = df['date_id'].isin(valid_date)
    X_valid = df.loc[valid_mask, feature_names]
    y_valid = df.loc[valid_mask, 'responder_6']
    w_valid = df.loc[valid_mask, 'weight']

In [12]:


# Fitted (or loaded) models, appended in the order train() is called.
models = []


# Train one fold of a model, or load its pre-trained counterpart from disk.
def train(model_dict, model_name='lgb', fold=None):
    """Train (or load) one fold of ``model_name`` and append it to ``models``.

    In training mode the fold's training rows are every row of the global
    ``df`` whose date is in ``train_date`` at a position ``p`` with
    ``p % N_fold != fold``. The fitted model is appended to ``models`` and
    dumped to ``./models/{model_name}_{fold}.model``. Outside training mode
    the model is loaded from ``{model_path}/{model_name}_{fold}.model``.

    Args:
        model_dict (dict): maps a model name to an unfitted estimator template.
        model_name (str): 'lgb', 'cbt', or any other key (treated as XGBoost).
        fold (int | None): fold index. When None, the module-level loop
            variable ``i`` is used, so existing ``train(model_dict, name)``
            calls inside a ``for i in ...`` loop keep working.

    Returns:
        None. The result is the side effect on ``models`` (and on disk).

    Note:
        joblib.load unpickles the file; only load model files you trust.
    """
    import copy  # local import keeps this cell self-contained

    fold_idx = i if fold is None else fold

    if not TRAINING:
        models.append(joblib.load(f'{model_path}/{model_name}_{fold_idx}.model'))
        return

    selected_dates = [date for ii, date in enumerate(train_date) if ii % N_fold != fold_idx]

    # Fit a fresh copy per fold. Fitting the shared template would make every
    # entry in `models` for this model type point at the same, last-fitted object.
    model = copy.deepcopy(model_dict[model_name])

    # One mask for features, target and weights.
    train_mask = df['date_id'].isin(selected_dates)
    X_train = df.loc[train_mask, feature_names]
    y_train = df.loc[train_mask, 'responder_6']
    w_train = df.loc[train_mask, 'weight']

    if model_name == 'lgb':
        # eval_sample_weight makes the validation metric weighted, matching
        # the CatBoost and XGBoost branches below.
        model.fit(X_train, y_train, sample_weight=w_train,
                  eval_metric=[r2_lgb],
                  eval_set=[(X_valid, y_valid)],
                  eval_sample_weight=[w_valid],
                  callbacks=[lgb.early_stopping(100), lgb.log_evaluation(10)])

    elif model_name == 'cbt':
        evalset = cbt.Pool(X_valid, y_valid, weight=w_valid)
        model.fit(X_train, y_train, sample_weight=w_train,
                  eval_set=[evalset],
                  verbose=10,
                  early_stopping_rounds=100)

    else:
        # NOTE(review): this replaces the configured model_dict entry with a
        # default XGBRegressor, so its n_estimators / device / eval_metric /
        # early-stopping settings are ignored. Kept as in the original;
        # confirm whether the override is intentional.
        model = XGBRegressor()
        model.fit(X_train, y_train,
                  sample_weight=w_train,
                  eval_set=[(X_valid, y_valid)],
                  sample_weight_eval_set=[w_valid],
                  verbose=10)

    models.append(model)
    joblib.dump(model, f'./models/{model_name}_{fold_idx}.model')

    # Release the fold's training slices before the next call.
    del X_train, y_train, w_train
    gc.collect()

# Custom R2 evaluation metric for XGBoost.
def r2_xgb(y_true, y_pred, sample_weight=None):
    """Return the negated weighted zero-mean R2 of ``y_pred`` against ``y_true``.

    R2 is ``1 - avg_w((y_pred - y_true)^2) / avg_w(y_true^2)``; a tiny epsilon
    in the denominator guards against division by zero. The value is negated
    so that a lower result means a better fit.

    Args:
        y_true (ndarray): target values.
        y_pred (ndarray): predicted values.
        sample_weight (ndarray | None): per-sample weights; None (the default)
            gives every sample equal weight, so the metric also works when it
            is called without weights.

    Returns:
        float: ``-R2``.
    """
    residual_mean = np.average((y_pred - y_true) ** 2, weights=sample_weight)
    target_mean = np.average((y_true) ** 2, weights=sample_weight)
    r2 = 1 - residual_mean / (target_mean + 1e-38)
    return -r2

# Custom R2 evaluation metric for LightGBM.
def r2_lgb(y_true, y_pred, sample_weight):
    """Compute the weighted zero-mean R2 as a LightGBM eval-metric triple.

    Args:
        y_true (ndarray): target values.
        y_pred (ndarray): predicted values.
        sample_weight (ndarray | None): per-sample weights passed to np.average.

    Returns:
        tuple: ``('r2', score, True)`` - metric name, value, and a flag saying
        that higher is better.
    """
    squared_error = (y_pred - y_true) ** 2
    squared_target = (y_true) ** 2
    numerator = np.average(squared_error, weights=sample_weight)
    denominator = np.average(squared_target, weights=sample_weight) + 1e-38
    score = 1 - numerator / denominator
    return 'r2', score, True

# Custom R2 evaluation metric for CatBoost.
class r2_cbt(object):
    """Custom-metric object computing a weighted zero-mean R2.

    ``evaluate`` accumulates the weighted squared error and the weighted
    squared target; ``get_final_error`` turns those two sums into R2.
    """

    def is_max_optimal(self):
        """Larger metric values are better."""
        return True

    def evaluate(self, approxes, target, weight):
        """Return ``(weighted squared error, weighted squared target)``.

        Args:
            approxes: sequence holding exactly one indexable of predictions.
            target: indexable of true values, same length as the predictions.
            weight: indexable of per-sample weights, or None for weight 1.0.
        """
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        predicted = approxes[0]
        n_samples = len(predicted)

        squared_error_total = 0.0
        squared_target_total = 0.0

        for idx in range(n_samples):
            if weight is None:
                sample_w = 1.0
            else:
                sample_w = weight[idx]
            squared_target_total += sample_w * (target[idx] ** 2)
            squared_error_total += sample_w * ((predicted[idx] - target[idx]) ** 2)

        return squared_error_total, squared_target_total

    def get_final_error(self, error, weight):
        """Combine the two sums from ``evaluate`` into R2 (epsilon avoids 0-division)."""
        return 1 - error / (weight + 1e-38)




In [13]:
# Model templates keyed by the short name that train() receives.
# The xgb and cbt entries use the custom R2 metrics (r2_xgb / r2_cbt) defined in this notebook.
model_dict = {
    'lgb': lgb.LGBMRegressor(n_estimators=500, device='gpu', gpu_use_dp=True, objective='l2'),
    'xgb': xgb.XGBRegressor(n_estimators=2000, learning_rate=0.1, max_depth=6, tree_method='hist', device="cuda", objective='reg:squarederror', eval_metric=r2_xgb, disable_default_eval_metric=True, early_stopping_rounds=100),
    'cbt': cbt.CatBoostRegressor(iterations=1000, learning_rate=0.05, task_type='GPU', loss_function='RMSE', eval_metric=r2_cbt()),
}

# Train (or load) every model type for each fold.
# train() reads the loop variable `i` as a global, so it must keep this name.
# NOTE(review): range(1, N_fold) starts at 1, so fold 0 is never trained or
# loaded - confirm this is intended before changing it, because the saved
# model filenames are indexed by `i`.
for i in range(1,N_fold):
    train(model_dict, 'xgb')
    train(model_dict, 'lgb')
    train(model_dict, 'cbt')

# Prediction

In [14]:
def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Predict ``responder_6`` for one batch by averaging every model in ``models``.

    Args:
        test: batch of rows to score; must contain ``row_id`` and every column
            listed in ``feature_names``.
        lags: previous-day responders, or None. When given, it is stored in the
            global ``lags_`` so later calls can access it; it is not used as a
            feature here.

    Returns:
        A polars DataFrame with columns ``row_id`` and ``responder_6`` and one
        row per row of ``test``.

    Raises:
        RuntimeError: if ``models`` is empty. Averaging zero predictions would
            otherwise produce NaN instead of a real forecast.
    """
    # All the responders from the previous day are passed in at time_id == 0.
    # We save them in a global variable for access at every time_id.
    global lags_
    if lags is not None:
        lags_ = lags

    if not models:
        raise RuntimeError('No models are loaded; run the model training/loading cell before predict().')

    feat = test[feature_names].to_numpy()

    # Simple ensemble: unweighted mean of every model's prediction.
    per_model = [model.predict(feat) for model in models]
    pred = np.mean(per_model, axis=0)

    predictions = test.select('row_id').with_columns(
        pl.Series('responder_6', np.asarray(pred).ravel())
    )

    # The predict function must return a DataFrame
    assert isinstance(predictions, pl.DataFrame | pd.DataFrame)
    # with columns 'row_id', 'responder_6'
    assert list(predictions.columns) == ['row_id', 'responder_6']
    # and as many rows as the test data.
    assert len(predictions) == len(test)

    return predictions

In [15]:
# Load the sample test batch (date_id=0) and preview its first rows.
test_parquet_path = r'C:\Users\User\Desktop\kaggle\Jane-street-Real-Time-Market-Data-Forecasting\test.parquet\date_id=0\part-0.parquet'
test_data = pl.read_parquet(test_parquet_path)
test_data.head()

row_id,date_id,time_id,symbol_id,weight,is_scored,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,…,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50,feature_51,feature_52,feature_53,feature_54,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_61,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78
i64,i16,i16,i8,f32,bool,f32,f32,f32,f32,f32,f32,f32,f32,f32,f64,f64,f64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
0,0,0,0,3.169998,True,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,,-0.0,,-0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,…,,-0.0,,-0.0,0.0,-0.0,0.0,0.0,,0.0,,,-0.0,,-0.0,0.0,,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,,,0.0,0.0,-0.0,-0.0
1,0,0,1,2.165993,True,0.0,-0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,,-0.0,,-0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,…,,-0.0,,-0.0,0.0,0.0,0.0,0.0,,0.0,,,-0.0,,-0.0,0.0,,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0,,,0.0,0.0,0.0,0.0
2,0,0,2,3.06555,True,0.0,-0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,,-0.0,,-0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,-0.0,…,,-0.0,,-0.0,0.0,-0.0,-0.0,-0.0,,0.0,,,-0.0,,-0.0,0.0,,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,,,0.0,0.0,-0.0,-0.0
3,0,0,3,2.698642,True,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,,-0.0,,-0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,…,,-0.0,,-0.0,0.0,-0.0,0.0,-0.0,,-0.0,,,-0.0,,-0.0,0.0,,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,,,0.0,0.0,-0.0,-0.0
4,0,0,4,1.80333,True,0.0,-0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,,-0.0,,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,…,,-0.0,,-0.0,0.0,0.0,0.0,0.0,,0.0,,,-0.0,,-0.0,0.0,,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0,,,0.0,0.0,0.0,0.0


In [16]:
# Load the sample lagged-responder batch (date_id=0) and preview its first rows.
lags_parquet_path = r'C:\Users\User\Desktop\kaggle\Jane-street-Real-Time-Market-Data-Forecasting\lags.parquet\date_id=0\part-0.parquet'
lags_data = pl.read_parquet(lags_parquet_path)
lags_data.head()

date_id,time_id,symbol_id,responder_0_lag_1,responder_1_lag_1,responder_2_lag_1,responder_3_lag_1,responder_4_lag_1,responder_5_lag_1,responder_6_lag_1,responder_7_lag_1,responder_8_lag_1
i16,i16,i8,f32,f32,f32,f32,f32,f32,f32,f32,f32
0,0,0,-0.442215,-0.322407,0.143594,-0.92689,-0.782236,-0.036595,-1.305746,-0.795677,-0.143724
0,0,1,-0.651829,-1.70784,-0.893942,-1.065488,-1.871338,-0.615652,-1.162801,-1.205924,-1.245934
0,0,2,-0.656373,-0.264575,-0.892879,-1.511886,-1.03348,-0.378265,-1.57429,-1.863071,-0.027343
0,0,3,-0.188186,-0.19097,-0.70149,0.098453,-1.015506,-0.054984,0.329152,-0.965471,0.576635
0,0,4,-0.257462,-0.471325,-0.29742,0.074018,-0.324194,-0.597093,0.219856,-0.276356,-0.90479


In [17]:
# Smoke-test predict() locally on the sample test and lags batches.
prediction = predict(test_data, lags_data)

In [18]:
# Display the row_id / responder_6 frame returned by predict().
prediction

row_id,responder_6
i64,f64
0,0.020846
1,0.020846
2,0.020846
3,0.020846
4,0.020846
…,…
34,0.020846
35,0.020846
36,0.010585
37,0.020846


# Submission

In [19]:
# Wrap predict() in the competition's inference-server object.
# NOTE(review): nothing visible here starts the server or a local gateway -
# confirm that a later cell does so.
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)