In [1]:
import pandas as pd

# Load the QC table. Judging by the displayed head it holds one row per
# (symbol, date) with quality metrics such as n_rows, dup_ts_ratio and crossed_ratio.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
qc = pd.read_parquet("D:\\Quant\\Research\\OFI\\data\\features\\qc_all.parquet")

# Candidate ETF universe (12 tickers) to be screened on data quality.
etfs = [
    "511360.XSHG","511090.XSHG","511380.XSHG","518880.XSHG","510500.XSHG",
    "159919.XSHE","510300.XSHG","510310.XSHG","159915.XSHE","510050.XSHG",
    "513090.XSHG","588000.XSHG"
]

# Independent copy so columns added later do not touch (or warn about) `qc`.
qc12 = qc[qc["symbol"].isin(etfs)].copy()

# isin() silently ignores requested tickers that have no rows in the table,
# which would shrink the universe without any visible signal. Report them.
missing_etfs = sorted(set(etfs) - set(qc12["symbol"]))
if missing_etfs:
    print(f"WARNING: no QC rows for {len(missing_etfs)} requested symbol(s): {missing_etfs}")

qc12.shape, qc12.head(13)


((16122, 12),
          symbol        date  n_rows  dup_ts_ratio  crossed_ratio  \
 0   159915.XSHE  2020-01-02    4649           0.0            0.0   
 1   159915.XSHE  2020-01-03    4654           0.0            0.0   
 2   159915.XSHE  2020-01-06    4682           0.0            0.0   
 3   159915.XSHE  2020-01-07    4673           0.0            0.0   
 4   159915.XSHE  2020-01-08    4718           0.0            0.0   
 5   159915.XSHE  2020-01-09    4707           0.0            0.0   
 6   159915.XSHE  2020-01-10    4699           0.0            0.0   
 7   159915.XSHE  2020-01-13    4702           0.0            0.0   
 8   159915.XSHE  2020-01-14    4705           0.0            0.0   
 9   159915.XSHE  2020-01-15    4663           0.0            0.0   
 10  159915.XSHE  2020-01-16    4658           0.0            0.0   
 11  159915.XSHE  2020-01-17    4656           0.0            0.0   
 12  159915.XSHE  2020-01-20    4716           0.0            0.0   
 
     bad_price_cnt

In [2]:
# A (symbol, date) row is flagged as a bad day when any one of the QC rules trips.
crossed_flag = qc12["crossed_ratio"].gt(0.001)   # >0.1% crossed/locked quotes: suspected book misalignment
dup_ts_flag = qc12["dup_ts_ratio"].gt(0.05)      # >5% duplicated timestamps: event ordering is unstable
bad_price_flag = qc12["bad_price_cnt"].gt(0)     # any <=0 or NaN price: hard error
qc12["bad_day"] = crossed_flag | dup_ts_flag | bad_price_flag

# Per-symbol roll-up: day counts, bad-day count/share, and medians of the daily metrics.
per_symbol = qc12.groupby("symbol")
summary = per_symbol.agg(
    days=pd.NamedAgg(column="date", aggfunc="count"),
    bad_days=pd.NamedAgg(column="bad_day", aggfunc="sum"),
    bad_ratio=pd.NamedAgg(column="bad_day", aggfunc="mean"),
    rel_spread_med=pd.NamedAgg(column="rel_spread_median", aggfunc="median"),
    crossed_med=pd.NamedAgg(column="crossed_ratio", aggfunc="median"),
    dup_ts_med=pd.NamedAgg(column="dup_ts_ratio", aggfunc="median"),
)

# Cleanest symbols first; ties on bad_ratio are broken by the tighter median relative spread.
summary = summary.sort_values(by=["bad_ratio", "rel_spread_med"])

summary


Unnamed: 0_level_0,days,bad_days,bad_ratio,rel_spread_med,crossed_med,dup_ts_med
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
511380.XSHG,1394,0,0.0,9e-05,0.0,0.0
159919.XSHE,1455,0,0.0,0.000247,0.0,0.0
159915.XSHE,1454,0,0.0,0.000436,0.0,0.0
510300.XSHG,1455,1,0.000687,0.000246,0.0,0.0
518880.XSHG,1455,1,0.000687,0.000246,0.0,0.0
510050.XSHG,1455,1,0.000687,0.000357,0.0,0.0
588000.XSHG,1246,2,0.001605,0.000929,0.0,0.0
510310.XSHG,1455,3,0.002062,0.000492,0.0,0.0
510500.XSHG,1455,4,0.002749,0.000161,0.0,0.0
513090.XSHG,1401,10,0.007138,0.000845,0.0,0.0


In [3]:
# `summary` is already ordered best-first, so the first six index labels are the chosen universe.
picked = summary.index[:6].tolist()
picked


['511380.XSHG',
 '159919.XSHE',
 '159915.XSHE',
 '510300.XSHG',
 '518880.XSHG',
 '510050.XSHG']

In [None]:
from pathlib import Path
import json

# Persist the selected universe together with the bad-day thresholds, plus the
# per-symbol QC summary, so downstream steps can reuse them.
config_path = Path("configs/ofi_universe.json")
summary_path = Path("data/features/qc_summary_12etf.parquet")

# Create every output directory up front. to_parquet does not create missing
# parent folders, so a fresh working directory would otherwise fail on the last line.
for out_path in (config_path, summary_path):
    out_path.parent.mkdir(parents=True, exist_ok=True)

config = {
    "universe": picked,
    # These values repeat the literals used where `bad_day` is computed; keep them in sync.
    "bad_day_rules": {
        "crossed_ratio_gt": 0.001,
        "dup_ts_ratio_gt": 0.05,
        "bad_price_cnt_gt": 0
    }
}

with config_path.open("w", encoding="utf-8") as f:
    json.dump(config, f, ensure_ascii=False, indent=2)

summary.to_parquet(summary_path)
