In [9]:
import pandas as pd
import polars as pl
import numpy as np
import gc
from matplotlib import pyplot as plt
import matplotlib.cm as cm
from sklearn.model_selection import StratifiedGroupKFold

In [10]:
class CONFIG:
    """Notebook-wide settings: target column, lag columns and split parameters."""

    # Responder column that the "label" column is derived from.
    target_col = "responder_6"
    # Join keys followed by the nine responder columns responder_0 .. responder_8.
    lag_cols_original = ["date_id", "symbol_id", *(f"responder_{i}" for i in range(9))]
    # Maps each responder column to its "_lag_1" name; the two join keys are skipped.
    lag_cols_rename = {name: f"{name}_lag_1" for name in lag_cols_original[2:]}
    # Fraction of the row count used to size the validation part of the split.
    valid_ratio = 0.05
    # Only rows with date_id strictly greater than this value are kept.
    start_dt = 1000

In [11]:
# Lazily scan the training parquet, restricted to the partitions listed below.
partition_ids = [4, 5]
train = (
    pl.scan_parquet("../../data/train.parquet")
    .filter(pl.col("partition_id").is_in(partition_ids))
    # Prepend a running row number as "id". It is assigned before the date
    # filter below, so the surviving ids need not start at 0.
    .select(pl.int_range(pl.len(), dtype=pl.UInt32).alias("id"), pl.all())
    # Add an integer "label" column: the target doubled, then cast to Int32.
    .with_columns((pl.col(CONFIG.target_col) * 2).cast(pl.Int32).alias("label"))
    # Keep only rows whose date_id is strictly greater than CONFIG.start_dt.
    .filter(pl.col("date_id").gt(CONFIG.start_dt))
)

In [12]:
# Build the lag table: one row of "_lag_1" responder values per (date_id, symbol_id).
lags = (
    train.select(pl.col(CONFIG.lag_cols_original))
    .rename(CONFIG.lag_cols_rename)
    # Shift date_id forward by one so each row joins onto the following date_id.
    .with_columns(pl.col("date_id") + 1)
    # Keep the last row encountered for every key pair, in first-appearance order.
    .group_by(["date_id", "symbol_id"], maintain_order=True)
    .last()
)

In [13]:
# Attach the lagged responder columns to every training row, then materialise.
processed_train = train.join(lags, how="left", on=["date_id", "symbol_id"])
train_df = processed_train.collect()
print(f'shape of train_df: {train_df.shape}')
if train_df.is_empty():
    raise ValueError("The train dataframe is empty. Please check the input data.")

# Locate the split row: valid_records rows are counted back from the end of the frame.
# Note that len_of_val is the row index where the validation part starts,
# not the number of validation rows.
len_train = train_df.height
valid_records = int(len_train * CONFIG.valid_ratio)
len_of_val = len_train - valid_records
if not len_of_val < len_train:
    raise IndexError(f"Index {len_of_val} is out of bounds. The dataset has only {len_train} rows.")

# date_id found at the split row; it is reported as the last training date.
last_train_dt = train_df.get_column("date_id")[len_of_val]
print(f"Last training date: {last_train_dt}")

shape of train_df: (649528, 104)
Last training date: 1019


In [14]:
# date_id stored at the split row of the collected frame (row index len_of_val).
last_tr_dt = train_df.get_column("date_id")[len_of_val]
print(f"len_train ={len_train}")
print(f"len_ofl_vali ={len_of_val}")  # len_of_val is the split row index
print(f"\n---> Last offline train date ={last_tr_dt}\n")

# Split on whole dates: everything up to and including last_tr_dt is training,
# everything after it is validation.
# The filters are built on the already-collected train_df instead of the lazy
# processed_train plan, so later .collect() calls do not re-read the parquet
# file and redo the join. It also guarantees the split runs on the same rows
# that last_tr_dt was read from. Both results are still LazyFrames.
training_data = train_df.lazy().filter(pl.col("date_id").le(last_tr_dt))
validation_data = train_df.lazy().filter(pl.col("date_id").gt(last_tr_dt))

len_train =649528
len_ofl_vali =617052

---> Last offline train date =1019



In [15]:
# Show the column names of both splits. Resolving the schema is enough for
# this, so the frames are not materialised just to read their column names.
print(training_data.collect_schema().names())
print(validation_data.collect_schema().names())

['id', 'date_id', 'time_id', 'symbol_id', 'weight', 'feature_00', 'feature_01', 'feature_02', 'feature_03', 'feature_04', 'feature_05', 'feature_06', 'feature_07', 'feature_08', 'feature_09', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29', 'feature_30', 'feature_31', 'feature_32', 'feature_33', 'feature_34', 'feature_35', 'feature_36', 'feature_37', 'feature_38', 'feature_39', 'feature_40', 'feature_41', 'feature_42', 'feature_43', 'feature_44', 'feature_45', 'feature_46', 'feature_47', 'feature_48', 'feature_49', 'feature_50', 'feature_51', 'feature_52', 'feature_53', 'feature_54', 'feature_55', 'feature_56', 'feature_57', 'feature_58', 'feature_59', 'feature_60', 'feature_61', 'feature_62', 'feature_63', 'feature_64', 'feature_65', 'feature_66', 'feature_6

In [16]:
# Materialise each split and write it out as parquet, partitioned by date_id.
training_data.collect().write_parquet("training.parquet", partition_by="date_id")
validation_data.collect().write_parquet("validation.parquet", partition_by="date_id")

---------