In [1]:
## Config
MARKET_DATA_PATH="/mnt/disk2/peiling.chen/Attention-CLX-stock-prediction/601988.SH.csv"
NEWS_DATA_DIR="../../resources/data/CSI300news_chunked_summarized_senti"

In [None]:
import polars as pl
import pandas as pd
import sys; sys.path.append("../..")
from finutils.models import *
from finutils.alpha import *

def transform_data(df: pl.DataFrame):
    print("Columns before transformation:", df.columns)  # 打印列名检查
    df = build_alpha158(df)
    df = build_label(df)
    return df

# 更新 MARKET_DATA_PATH 为实际文件路径
MARKET_DATA_PATH = "/mnt/disk2/peiling.chen/Attention-CLX-stock-prediction/601988.SH.csv"

df = (
    pl.read_csv(MARKET_DATA_PATH)
    .with_columns([
        pl.col("trade_date").cast(pl.Utf8).str.strptime(pl.Date, "%Y%m%d").alias("datetime"),
        pl.col("ts_code").alias("instrument"),
        (pl.col("amount") / (pl.col("vol") + 1e-12)).alias("vwap")
    ])
    .filter(
        (pl.col("datetime") >= pl.date(2007, 1, 4)) &
        (pl.col("datetime") <= pl.date(2022, 3, 17))
    )
    .sort(["instrument", "datetime"])
    .with_columns([
        pl.col("instrument").str.slice(2).alias("instrument_short")
    ])
    .select([
        pl.col("datetime"),
        pl.col("close"),
        pl.col("open"),
        pl.col("high"),
        pl.col("low"),
        pl.col("vol").alias("volume"),  # 确保这里使用的是 pl.col("vol")
        pl.col("vwap")
    ])
)

# 打印结果以检查
print(df.head())


# 构造 alpha 因子 + 标签
df = transform_data(df)

# 打印结果以检查
print(df.head())


# 转为 pandas DataFrame 后处理
df_pd = df.to_pandas()
df_pd = df_pd.assign(
    datetime=pd.to_datetime(df_pd['datetime']),
).set_index(['datetime'])

# 保存为 CSV 文件
df_pd.to_csv('/mnt/disk2/peiling.chen/finbot/finbot/modules/forecast/alpha158_features.csv')


shape: (5, 7)
┌────────────┬───────┬──────┬──────┬──────┬──────────┬──────────┐
│ datetime   ┆ close ┆ open ┆ high ┆ low  ┆ volume   ┆ vwap     │
│ ---        ┆ ---   ┆ ---  ┆ ---  ┆ ---  ┆ ---      ┆ ---      │
│ date       ┆ f64   ┆ f64  ┆ f64  ┆ f64  ┆ f64      ┆ f64      │
╞════════════╪═══════╪══════╪══════╪══════╪══════════╪══════════╡
│ 2007-01-04 ┆ 5.63  ┆ 5.69 ┆ 5.97 ┆ 5.37 ┆ 7.2813e6 ┆ 0.580231 │
│ 2007-01-05 ┆ 5.07  ┆ 5.3  ┆ 5.34 ┆ 5.07 ┆ 7.8673e6 ┆ 0.514837 │
│ 2007-01-08 ┆ 5.08  ┆ 4.87 ┆ 5.14 ┆ 4.83 ┆ 5.5658e6 ┆ 0.496564 │
│ 2007-01-09 ┆ 5.18  ┆ 5.06 ┆ 5.19 ┆ 4.95 ┆ 4.3345e6 ┆ 0.507175 │
│ 2007-01-10 ┆ 5.1   ┆ 5.25 ┆ 5.29 ┆ 5.05 ┆ 3.7212e6 ┆ 0.514682 │
└────────────┴───────┴──────┴──────┴──────┴──────────┴──────────┘
Columns before transformation: ['datetime', 'close', 'open', 'high', 'low', 'volume', 'vwap']
shape: (5, 170)
┌────────────┬───────┬──────┬──────┬───┬───────────────────┬────────┬──────────────┬───────────┐
│ datetime   ┆ close ┆ open ┆ high ┆ … ┆ BollingerBan

KeyError: "None of ['instrument'] are in the columns"

In [10]:
import pandas as pd
import sys; sys.path.append("../..")
from finutils.alpha import *
import os


df_news = pd.concat([(pd
    .read_json(os.path.join(NEWS_DATA_DIR, filename))
    .assign(instrument=filename.split('.')[0],datetime=lambda x: x['date'].dt.date)
    )
    for filename in os.listdir(NEWS_DATA_DIR)
    if filename.endswith(".json")
], ignore_index=True)

df_senti = build_senti_alpha(df_news,method="标签众数")
df_senti = (df_senti
    .assign(datetime=pd.to_datetime(df_senti['datetime']))
    .fillna({'SENTI': 0})
    .set_index(['instrument', 'datetime'])
)

In [12]:
## 特征拼接
df_tot=pd.merge(df_pd, df_senti, left_index=True, right_index=True, how='left')
df_tot.fillna({'SENTI':0}, inplace=True)

In [13]:
import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# 获取所有特征列，去除 'ts_code' 和 'trade_date'
feature_cols = [col for col in df_tot.columns if col not in ['ts_code', 'trade_date']]

# 只选择数值型列
df_numerical = df_tot[feature_cols].select_dtypes(include=[float, int])

# 用中位数填补缺失值
df_numerical = df_numerical.apply(lambda col: col.fillna(col.median()))

# 检查并去除任何包含 Inf 或 -Inf 的行
df_numerical = df_numerical[np.isfinite(df_numerical).all(axis=1)]

# 增加常数项
X_with_const = add_constant(df_numerical)

# 计算每个特征的VIF
vif_data = pd.DataFrame()
vif_data["Feature"] = X_with_const.columns
vif_data["VIF"] = [variance_inflation_factor(X_with_const.values, i) for i in range(X_with_const.shape[1])]

# 输出VIF结果
print(vif_data)


  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)


               Feature           VIF
0                const  0.000000e+00
1                 open  3.528640e+04
2                 high  6.601279e+04
3                  low  5.390689e+04
4                close  9.007199e+15
..                 ...           ...
182  BollingerBandDiff  2.585943e+02
183               SMA5  1.491450e+02
184       PriceChannel  7.753066e+01
185             LABEL0  1.084257e+00
186              SENTI           NaN

[187 rows x 2 columns]


  return 1 - self.ssr/self.centered_tss


In [14]:
# 定义 VIF 区间
bins = [0, 5, 10, np.inf]
labels = ["<5", "5-10", ">=10"]

# 添加 VIF 区间列
vif_data["VIF_Range"] = pd.cut(vif_data["VIF"], bins=bins, labels=labels, right=False)

# 统计各区间的特征数量
vif_summary = vif_data["VIF_Range"].value_counts().sort_index()

# 显示结果
print(vif_summary)


VIF_Range
<5       23
5-10      7
>=10    154
Name: count, dtype: int64


In [16]:
# 获取 VIF 在 5-10 区间的特征名称（去除常数项）
vif_selected_features = vif_data.loc[
    (vif_data["VIF"] >= 5) & (vif_data["VIF"] < 10) & (vif_data["Feature"] != "const"),
    "Feature"
].tolist()

# 让 datetime 成为第一列，其余特征依次排列
ordered_cols = ["datetime"] + [col for col in vif_selected_features if col != "datetime"]
df_filtered = df_filtered[ordered_cols]

# 保存到指定路径
output_path = "/mnt/disk2/peiling.chen/finbot/finbot/modules/forecast/filtered_alpha158_features_new.csv"
df_filtered.to_csv(output_path, index=False)

print(f"已保存筛选后的特征至：{output_path}")


已保存筛选后的特征至：/mnt/disk2/peiling.chen/finbot/finbot/modules/forecast/filtered_alpha158_features_new.csv


In [19]:
import pandas as pd

# 读取 alpha 特征数据
alpha_path = "/mnt/disk2/peiling.chen/finbot/finbot/modules/forecast/filtered_alpha158_features_new.csv"
df_alpha = pd.read_csv(alpha_path)
df_alpha["datetime"] = pd.to_datetime(df_alpha["datetime"])

# 读取 ARIMA 残差数据
arima_path = "/mnt/disk2/peiling.chen/Attention-CLX-stock-prediction/ARIMA_residuals1.csv"
df_arima = pd.read_csv(arima_path)

# 将 trade_date 转换为 datetime 格式
df_arima["trade_date"] = pd.to_datetime(df_arima["trade_date"])

# 重命名第二列为 factor_x
df_arima.rename(columns={df_arima.columns[1]: "factor_x"}, inplace=True)

# 将 trade_date 列重命名为 datetime 以便合并
df_arima["datetime"] = df_arima["trade_date"]

# 删除原来的 trade_date 列
df_arima.drop(columns=["trade_date"], inplace=True)

# 合并数据，按 datetime 列合并
df_merged = pd.merge(df_alpha, df_arima, on="datetime", how="left")

# 保存结果
output_path = "/mnt/disk2/peiling.chen/Attention-CLX-stock-prediction/merged_data.csv"
df_merged.to_csv(output_path, index=False)

print(f"已成功保存合并后的数据至：{output_path}")


已成功保存合并后的数据至：/mnt/disk2/peiling.chen/Attention-CLX-stock-prediction/merged_data.csv
