# R data practice by polars

In [9]:
import polars as pl
import numpy as np
data = (
    pl.read_csv(
        "data/stock-market-data.csv", 
        has_header= True, 
        infer_schema_length = int(1e12)
    )
)
data[:5]

symbol,date,pre_close,open,high,low,close,volume,amount,adj_factor,capt,index_w50,index_w300,index_w500,industry
str,i64,f64,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,str
"""600000.SH""",20120104,8.49,8.54,8.56,8.39,8.41,34201379,290229551.0,6.655275,125500000000.0,0.046409,0.021259,0.0,"""BANKS"""
"""600000.SH""",20120105,8.41,8.47,8.82,8.47,8.65,132116203,1144800000.0,6.655275,129080000000.0,0.046409,0.021259,0.0,"""BANKS"""
"""600000.SH""",20120106,8.65,8.63,8.78,8.62,8.71,61778687,537043761.0,6.655275,129980000000.0,0.046409,0.021259,0.0,"""BANKS"""
"""600000.SH""",20120109,8.71,8.72,8.99,8.68,8.95,80136249,711429611.0,6.655275,133560000000.0,0.046409,0.021259,0.0,"""BANKS"""
"""600000.SH""",20120110,8.95,8.95,9.1,8.88,9.07,72004632,647206633.0,6.655275,135350000000.0,0.046409,0.021259,0.0,"""BANKS"""


## 1. 哪些股票的代码中包含"8"这个数字？

In [None]:
(
    data
    .filter(
        pl.col("symbol").str.contains("8")
    )
    ["symbol"]
    .unique()
    .to_list()
    [:5]
)

['300128.SZ', '601558.SH', '000810.SZ', '300028.SZ', '000819.SZ']

## 2. 每天上涨和下跌的股票各有多少？

In [None]:

data2 = (
    data
        .with_columns(
            pl.when(pl.col("close") - pl.col("pre_close") > 0).then(pl.lit("上涨"))
            .when(pl.col("close") - pl.col("pre_close") == 0).then(pl.lit("不变"))
            .otherwise(pl.lit("下跌"))
            .alias("tag")
        )
        .filter(
            pl.col("tag") != "不变"
        )
    )

(
    data2
    .group_by(["date", "tag"])
    .agg(pl.col("symbol")
        .unique()
        .len()
        )
    .sort("date")
    [:5]
)


date,tag,symbol
i64,str,u32
20120104,"""上涨""",191
20120104,"""下跌""",2007
20120105,"""上涨""",132
20120105,"""下跌""",2071
20120106,"""上涨""",1444


## 3. 每天每个交易所上涨、下跌的股票各有多少？

In [None]:
data3 = (
    data
    .with_columns(
        pl.col("symbol")
            .str.slice(-2)
            .alias("exchange")
            )
)

data3 = (
    data3
    .with_columns(
        pl.when(pl.col("close") - pl.col("pre_close") > 0).then(pl.lit("上涨"))
        .when(pl.col("close") - pl.col("pre_close") == 0).then(pl.lit("不变"))
        .otherwise(pl.lit("下跌"))
        .alias("tag")
    )
    .filter(
        pl.col("tag") != "不变"
    )
)

(
    data3
    .group_by(["date", "exchange"])
    .agg(pl.n_unique("symbol"))
    .sort("date")
    [:5]
)

date,exchange,symbol
i64,str,u32
20120104,"""SZ""",1319
20120104,"""SH""",879
20120105,"""SZ""",1323
20120105,"""SH""",880
20120106,"""SH""",842


## 4. 沪深300成分股中，每天上涨、下跌的股票各有多少？

In [None]:
data4 = (
    data
    .filter(
        pl.col("index_w300") > 0 
    )
)
data4 = (
    data4
    .with_columns(
        pl.when(pl.col("close") - pl.col("pre_close") > 0).then(pl.lit("上涨"))
        .when(pl.col("close") - pl.col("pre_close") == 0).then(pl.lit("不变"))
        .otherwise(pl.lit("下跌"))
        .alias("tag")
    )
    .filter(
        pl.col("tag") != "不变"
    )
)
(
    data4
    .group_by(["date", "tag"])
    .agg(pl.col("symbol").n_unique())
    .sort("date")
    [:5]
)

date,tag,symbol
i64,str,u32
20120104,"""上涨""",20
20120104,"""下跌""",275
20120105,"""上涨""",50
20120105,"""下跌""",242
20120106,"""上涨""",202


## 5. 每天每个行业各有多少只股票？

In [None]:
(
    data
    .group_by(["date", "industry"])
    .agg(pl.col("symbol").n_unique())
    .sort("date")
    [:5]
)

date,industry,symbol
i64,str,u32
20120104,"""CONSSERV""",34
20120104,"""MATERIAL""",48
20120104,"""INDCONG""",13
20120104,"""UTILITIE""",74
20120104,"""PERSPRD""",25


## 6. 股票数最大的行业和总成交额最大的行业是否总是同一个行业？

In [None]:
data6= (
    data
    .group_by(["date", "industry"])
    .agg(
        pl.col("symbol")
        .n_unique()
        .alias("symbol_num"),
        pl.col("amount")
        .sum()
        .alias("amount_sum")
    )
)

(
    data6
    .filter(
        (pl.col('symbol_num') == pl.max("symbol_num").over("date"))
        & (pl.col('amount_sum') == pl.max('amount_sum').over("date"))
    )
    .sort("date")
)

date,industry,symbol_num,amount_sum
i64,str,u32,f64
20120104,"""HDWRSEMI""",224,6.8781e9
20120106,"""HDWRSEMI""",224,5.9793e9
20120213,"""HDWRSEMI""",227,1.4603e10
20120216,"""HDWRSEMI""",227,1.5065e10
20120217,"""HDWRSEMI""",229,1.1451e10
…,…,…,…
20120305,"""HDWRSEMI""",231,1.9803e10
20120307,"""HDWRSEMI""",231,1.7551e10
20120308,"""HDWRSEMI""",231,1.6576e10
20120418,"""HDWRSEMI""",237,1.4666e10


## 7. 每天涨幅超过5%、跌幅超过5%的股票各有多少？

In [None]:
(
    data
    .with_columns(
        (pl.col("close")/pl.col("pre_close") - 1)
        .alias("ret")
    )
    .filter(
        (pl.col("ret") > 0.05)
        | (pl.col("ret") < -0.05)
    )
    .with_columns(
        pl.when(
            pl.col("ret") > 0.05
        )
        .then(pl.lit("up5%+"))
        .otherwise(pl.lit("down5%+"))
        .alias("tag")
    )
    .group_by(
        ['date', 'tag']
    )
    .agg(
        pl.col("symbol")
        .n_unique()
        .alias("symbol_num")
    )
    .sort(
        "date"
    )[:5]
)

date,tag,symbol_num
i64,str,u32
20120104,"""down5%+""",277
20120104,"""up5%+""",17
20120105,"""down5%+""",885
20120105,"""up5%+""",10
20120106,"""up5%+""",52


## 8. 每天涨幅前10的股票的总成交额和跌幅前10的股票的总成交额比例是多少？

In [None]:
(
    data
    .with_columns(
        (pl.col("close")/pl.col("pre_close") - 1)
        .alias("ret")
    )
    .sort(
        ['date', 'ret'],
        descending = [False, True]
    )
    .group_by(
        "date"
    )
    .agg(
        pl.col("amount").head(10).sum()/
        pl.col("amount").tail(10).sum()
    )[:5]
)
    # .filter(
    #     ~pl.col('date')\
    #     .cum_count()\
    #     .over('date')\
    #     .is_between(10, pl.col('date').count().over('date') - 10, closed = 'left')
    # )\

date,amount
i64,f64
20120104,1.445576
20120105,1.723228
20120106,1.346791
20120109,2.842344
20120110,0.562402


## 9. 每天开盘涨停的股票与收盘涨停的股票各有多少？

In [None]:
(
    data
    .group_by(
        ["date"]
    )
    .agg(
        pl.col("symbol")
            .filter((pl.col("open")/pl.col("pre_close") - 1 > 0.015))
            .n_unique()
            .alias("openlimit_num"),
        pl.col("symbol")
            .filter((pl.col("close")/pl.col("pre_close") - 1 > 0.015))
            .n_unique()
            .alias("closelimit_num")
    )
    .sort(
        ["date"]
    )[:5]
)

date,openlimit_num,closelimit_num
i64,u32,u32
20120104,325,70
20120105,27,60
20120106,56,743
20120109,73,2142
20120110,95,2125


## 10. 每天统计最近3天出现过开盘涨停与收盘涨停的股票各有多少只？

In [None]:
(
    data
    .with_columns(
        pl.when(pl.col("open")/pl.col("pre_close") - 1 > 0.015)
            .then(1)
            .otherwise(0)
            .alias("openlimit"),
        pl.when(pl.col("close")/pl.col("pre_close") - 1 > 0.015)
            .then(1)
            .otherwise(0)
            .alias("closelimit")
    )
    .filter(
        (pl.col("openlimit") == 1) | (pl.col("closelimit") == 1)
    )
    .sort(
        ["date"]
    )
    .group_by(
        ['date']
    )
    .agg(
        pl.col("symbol")
            .filter(pl.col("openlimit") == 1)
            .unique()
            .alias("openlimit_symbol"),
        pl.col("symbol")
            .filter(pl.col("closelimit") == 1)
            .unique()
            .alias("closelimit_symbol")
    )
    .with_row_index(
            name = "index",
            offset = 1
    )
    .rolling(
            index_column = "index",
            period = "3i"
    )
    .agg(
        [
            pl.col("date").tail(1).alias("date"),
            pl.col("openlimit_symbol").list.explode().n_unique().alias("openlimit_num_3d"),
            pl.col("closelimit_symbol").list.explode().n_unique().alias("closelimit_num_3d")
        ]
    )
    .explode("date")[:5]
)
        

index,date,openlimit_num_3d,closelimit_num_3d
u32,i64,u32,u32
1,20120104,325,70
2,20120105,348,122
3,20120106,394,836
4,20120109,143,2174
5,20120110,208,2233


## 11. 股票每天的成交额变化率和收益率的相关性如何？

In [None]:
(
    data
    .sort(
        ["symbol", "date"]
    )
    .select(
        date = pl.col("date"),
        symbol = pl.col("symbol"),
        amount = (pl.col("amount")/pl.col("amount").shift(1) - 1).over("symbol"),
        ret = pl.col("close")/pl.col("pre_close") - 1
    )
    .filter(
        ~ pl.col('amount').is_infinite()&
        ~ pl.col('amount').is_nan()&
        ~ pl.col("ret").is_infinite()
    )
    .select(
        corr = pl.corr("amount", "ret", propagate_nans=True)
    )
)

corr
f64
0.220969


## 12. 每天每个行业的总成交额变化率和行业收益率的相关性如何？

In [None]:
(
    data
    .sort(
        ['industry', 'symbol', 'date']
    )
    .select(
        date = pl.col("date"),
        industry = pl.col("industry"),
        amount_ind = pl.col('amount').sum().over(['industry', 'date']),
        ret = (pl.col("close")/pl.col("pre_close") - 1),
        capt_weight = pl.col("capt")/(pl.col("capt").sum()).over(['industry', 'date'])
    )
    .select(
        pl.col("industry"),
        pl.col("date"),
        ret_ind = (pl.col("ret")*pl.col("capt_weight")).sum().over(["industry", "date"]),
        amount_ind_change = (pl.col("amount_ind")/(pl.col("amount_ind").shift(1)) - 1).over("industry")
    )
    .unique()
    .select(
        pl.corr("ret_ind", "amount_ind_change", propagate_nans=True)
    )
)

ret_ind
f64
0.377926


## 13. 每天市场的总成交额变化率和市场收益率的相关性如何？

In [None]:
(
    data
    .sort(
        ["date", "symbol"]
    )
    .select(
        pl.col("date"),
        pl.col("symbol"),
        pl.col("capt"),
        mkt_amount = pl.col("amount").sum().over("date"),
        ret = pl.col("close")/pl.col("pre_close") - 1
    )
    .select(
        pl.col("date"),
        pl.col("mkt_amount"),
        mkt_ret = (pl.col("ret") * pl.col("capt")/(pl.col("capt").sum())).sum().over("date")
    )
    .unique()
    .sort(
        "date"
    )
    .with_columns(
        mkt_amount_change = pl.col("mkt_amount")/pl.col("mkt_amount").shift(1) - 1
    )
    .select(
        pl.corr("mkt_amount_change", "mkt_ret")
    )
)

mkt_amount_change
f64
0.442459


## 14. 每天市场的总成交额的变化率和所有股票收益率的标准差相关性如何？ 

In [None]:
(
    data
    .select(
        date = pl.col("date"),
        amount_sum = pl.col("amount").sum().over("date"),
        ret_std = (pl.col("close")/pl.col("pre_close") - 1).std().over("date")
    )
    .with_columns(
        amount_sum_change = (pl.col("amount_sum")/pl.col("amount_sum").shift(1) - 1)
    )
    .select(
        pl.corr("amount_sum_change", "ret_std")
    )
)

amount_sum_change
f64
-0.192982


## 15. 每天每个行业的总成交额变化率和行业内股票收益率的标准差相关性如何？

In [None]:
(
    data
    .select(
        date = pl.col("date"),
        industry = pl.col("industry"),
        amount_ind = pl.col("amount").sum().over(["industry", "date"]),
        ret_ind = (pl.col("close")/pl.col("pre_close") - 1).std().over(["industry", "date"])
    )
    .with_columns(
        amount_ind_change = (pl.col("amount_ind")/pl.col("amount_ind").shift(1) - 1).over(["industry"])
    )
    .group_by(
        "industry"
    )
    .agg(
        pl.corr("amount_ind_change", "ret_ind")
    )
)    
    

industry,amount_ind_change
str,f64
"""AIRLINE""",0.209565
"""TRDDIST""",0.161375
"""AUTO""",0.110954
"""MEDIA""",0.231141
"""AERODEF""",0.269046
…,…
"""BLDPROD""",0.062142
"""BANKS""",0.526664
"""RETAIL""",0.060697
"""COMSERV""",0.269397


## 16. 上证50、沪深300、中证500指数成分股中，沪股和深股各有多少？

In [None]:
(
    data
    .filter(
        (pl.col("index_w50") > 0)
        |(pl.col("index_w300") > 0)
        |(pl.col("index_w500") > 0)
    )
    .select(
        date = pl.col("date"),
        symbol = pl.col("symbol"),
        index_w50 = pl.col("index_w50"),
        index_w300 = pl.col("index_w300"),
        index_w500 = pl.col("index_w500"),
        exchange = pl.when(pl.col("symbol").str.contains("SH")).then(pl.lit("上证")).otherwise(pl.lit("深证"))
    )
    .unique()
    .group_by(
        ["date", "exchange"]
    )
    .agg(
        index_w50_num = pl.col("symbol").filter(pl.col("index_w50") > 0).n_unique(),
        index_w300_num = pl.col("symbol").filter(pl.col("index_w300") > 0).n_unique(),
        index_w500_num = pl.col("symbol").filter(pl.col("index_w500") > 0).n_unique()
    )
)

date,exchange,index_w50_num,index_w300_num,index_w500_num
i64,str,u32,u32,u32
20120215,"""深证""",0,95,229
20120507,"""上证""",50,205,270
20120504,"""深证""",0,95,230
20120117,"""上证""",50,205,271
20120314,"""深证""",0,95,229
…,…,…,…,…
20120217,"""上证""",50,205,271
20120629,"""深证""",0,95,230
20120306,"""深证""",0,95,229
20120417,"""上证""",50,205,270


## 17. 上证50、沪深300、中证500指数成分股中，行业分布如何？

In [None]:
(
    data
    .filter(
        (pl.col("index_w50") > 0) |
        (pl.col("index_w300") > 0) |
        (pl.col("index_w500") > 0)
    )
    .select(
        date = pl.col("date"),
        symbol = pl.col("symbol"),
        industry = pl.col("industry"),
        index_w50 = pl.col("index_w50"),
        index_w300 = pl.col("index_w300"),
        index_w500 = pl.col("index_w500")
    )
    .group_by(
        ["date", "industry"]
    )
    .agg(
        index_w50_ind_num = pl.col("symbol").filter(pl.col("index_w50")> 0).n_unique(),
        index_w300_ind_num = pl.col("symbol").filter(pl.col("index_w300")> 0).n_unique(),
        index_w500_ind_num = pl.col("symbol").filter(pl.col("index_w500")> 0).n_unique()
    )
    .sort(
        "date"
    )
)

date,industry,index_w50_ind_num,index_w300_ind_num,index_w500_ind_num
i64,str,u32,u32,u32
20120104,"""MATERIAL""",0,0,11
20120104,"""CONMAT""",1,5,10
20120104,"""TRDDIST""",0,5,9
20120104,"""CNSTENG""",2,11,13
20120104,"""BANKS""",11,16,0
…,…,…,…,…
20120629,"""AUTO""",1,11,18
20120629,"""DVFININS""",5,18,1
20120629,"""INDCONG""",0,1,4
20120629,"""FOODPROD""",0,10,24


## 18. 每天上证50、沪深300、中证500指数成分股的总成交额各是多少

In [None]:
(
    data
    .group_by(
        "date"
    )
    .agg(
        amount_w50 = pl.col("amount").filter(pl.col("index_w50") > 0).sum(),
        amount_w300 = pl.col("amount").filter(pl.col("index_w300") > 0).sum(),
        amount_w500 = pl.col("amount").filter(pl.col("index_w500") > 0).sum()
    )
    .sort(
        "date"
    )
)

date,amount_w50,amount_w300,amount_w500
i64,f64,f64,f64
20120104,1.1372e10,3.2573e10,1.4525e10
20120105,1.5175e10,3.7635e10,1.5625e10
20120106,1.1458e10,3.1079e10,1.3829e10
20120109,1.9783e10,5.1478e10,1.9514e10
20120110,2.5841e10,7.5249e10,3.0515e10
…,…,…,…
20120625,1.5737e10,4.2504e10,2.2532e10
20120626,1.2170e10,3.3897e10,2.1483e10
20120627,9.9458e9,3.1438e10,2.1639e10
20120628,1.1597e10,3.4419e10,2.2602e10


## 19. 上证50、沪深300、中证500指数日收益率的历史波动率是多少？

In [None]:
(
    data
    .select(
        date = pl.col("date"),
        index_50 = (pl.col("close") * pl.col("index_w50")).sum().over("date"),
        index_300 = (pl.col("close") * pl.col("index_w300")).sum().over("date"),
        index_500 = (pl.col("close") * pl.col("index_w500")).sum().over("date")
    )
    .select(
        (pl.col("index_50")/pl.col("index_50").shift(1) - 1).std(),
        (pl.col("index_300")/pl.col("index_300").shift(1) - 1).std(),
        (pl.col("index_500")/pl.col("index_500").shift(1) - 1).std()
    )
)

index_50,index_300,index_500
f64,f64,f64
0.026437,0.019721,0.019045


## 20. 上证50、沪深300、中证500指数日收益率的相关系数矩阵？

In [None]:
(
    data
    .select(
        date = pl.col("date"),
        index_50 = (pl.col("close") * pl.col("index_w50")).sum().over("date"),
        index_300 = (pl.col("close") * pl.col("index_w300")).sum().over("date"),
        index_500 = (pl.col("close") * pl.col("index_w500")).sum().over("date")
    )
    .select(
        index_50 =  (pl.col("index_50")/pl.col("index_50").shift(1) - 1),
        index_300 =  (pl.col("index_300")/pl.col("index_300").shift(1) - 1),
        index_500 =  (pl.col("index_500")/pl.col("index_500").shift(1) - 1)
    )
    .drop_nulls()
    .corr()
)

index_50,index_300,index_500
f64,f64,f64
1.0,0.958448,0.724968
0.958448,1.0,0.85127
0.724968,0.85127,1.0


## 21. 上证50、沪深300、去除上证50的沪深300指数日收益率的相关系数矩阵？

In [None]:
(
    data
    .with_columns(
        index_w300_without_w50 =
            (pl.col("index_w300") > 0) & (pl.col("index_w50") == 0)
    )
    .with_columns(
        index_w300_without_w50 =
            pl.when((pl.col("index_w50") == 0) & (pl.col("index_w300") > 0)).then(pl.col("index_w300")).otherwise(0).over("date")
    )
    .select(
        date = pl.col("date"),
        index_50 = (pl.col("close") * pl.col("index_w50")).sum().over("date"),
        index_300 = (pl.col("close") * pl.col("index_w300")).sum().over("date"),
        index_300_without_50 = (pl.col("close")*pl.col("index_w300_without_w50")).sum().over("date")
    )
    .select(
        index_50 = pl.col("index_50")/pl.col("index_50").shift(1) - 1,
        index_300 = pl.col("index_300")/pl.col("index_300").shift(1) - 1,
        index_300_without_50 = pl.col("index_300_without_50")/pl.col("index_300_without_50").shift(1) - 1 
    )
    .drop_nulls()
    .corr()
)

index_50,index_300,index_300_without_50
f64,f64,f64
1.0,0.958448,0.618703
0.958448,1.0,0.800002
0.618703,0.800002,1.0


## 22. 每天沪深300指数成分占比最大的10只股票是哪些？

In [None]:
(
    data
    .sort(
        ["date", "index_w300"],
        descending = [False, True]
    )
    .group_by(
        "date"
    )
    .agg(
        pl.col("symbol").head(10)
    )
    .explode(
        "symbol"
    )
)

date,symbol
i64,str
20120104,"""600036.SH"""
20120104,"""600016.SH"""
20120104,"""601318.SH"""
20120104,"""601328.SH"""
20120104,"""601166.SH"""
…,…
20120629,"""601166.SH"""
20120629,"""600000.SH"""
20120629,"""600030.SH"""
20120629,"""000002.SZ"""


## 23. 各个行业的平均每日股票数量从大到小排序是什么？

In [None]:
(
    data
    .group_by(
        ["date", "industry"]
    )
    .agg(
        pl.col("symbol").count()
    )
    .select(
        industry = pl.col("industry"),
        symbol_num = pl.col("symbol").mean().over("industry")
    )
    .unique()
    .sort(
        "symbol_num",
        descending = True
    )
)

industry,symbol_num
str,f64
"""HDWRSEMI""",233.34188
"""CHEM""",223.034188
"""MACH""",204.820513
"""HEALTH""",176.760684
"""ELECEQP""",134.239316
…,…
"""BANKS""",16.0
"""INDCONG""",12.65812
"""AIRLINE""",12.196581
"""MARINE""",12.0


## 24. 每个行业每天成交额最大的一只股票代码是什么？

In [None]:
(
    data
    .with_columns(
        amount_max = pl.col("amount").max().over(['industry', 'date'])
    )
    .group_by(
        ["date", "industry"]
    )
    .agg(
        pl.col("symbol").filter(pl.col("amount") == pl.col("amount_max"))
    )
    .sort(["date", "industry"])
)

date,industry,symbol
i64,str,list[str]
20120104,"""AERODEF""","[""000768.SZ""]"
20120104,"""AIRLINE""","[""600029.SH""]"
20120104,"""AUTO""","[""600104.SH""]"
20120104,"""BANKS""","[""600036.SH""]"
20120104,"""BEV""","[""600519.SH""]"
…,…,…
20120629,"""REALEST""","[""600048.SH""]"
20120629,"""RETAIL""","[""600739.SH""]"
20120629,"""SOFTWARE""","[""300104.SZ""]"
20120629,"""TRDDIST""","[""600366.SH""]"


## 25. 每个行业每天最大成交额是最小成交额的几倍？

In [None]:
(
    data
    .group_by(
        ["date", "industry"]
    )
    .agg(
        (pl.col("amount").max())/(pl.col("amount").min())
    )
    .sort(
        ["date", "industry"]
    )
)

date,industry,amount
i64,str,f64
20120104,"""AERODEF""",12.819244
20120104,"""AIRLINE""",47.384072
20120104,"""AUTO""",inf
20120104,"""BANKS""",11.622244
20120104,"""BEV""",inf
…,…,…
20120629,"""REALEST""",inf
20120629,"""RETAIL""",inf
20120629,"""SOFTWARE""",inf
20120629,"""TRDDIST""",inf


## 26. 每个行业每天成交额最大的5只股票的成交额总和是多少？

In [None]:
(
    data
    .sort(['date', 'industry', 'amount'], descending = [False, False, True])
    .group_by(["date", 'industry'])
    .agg(
        pl.col("amount").head(5).sum()
    )
    .sort(['date', "industry"])
)

date,industry,amount
i64,str,f64
20120104,"""AERODEF""",3.8476e8
20120104,"""AIRLINE""",3.2177e8
20120104,"""AUTO""",8.5999e8
20120104,"""BANKS""",2.0891e9
20120104,"""BEV""",1.8691e9
…,…,…
20120629,"""REALEST""",1.5692e9
20120629,"""RETAIL""",7.6047e8
20120629,"""SOFTWARE""",8.2100e8
20120629,"""TRDDIST""",8.37608599e8


## 27. 每个行业每天成交额超过该行业中股票成交额80%分位数的股票的平均收益率是多少？

In [None]:
(
    data
    .select(symbol = pl.col('symbol'),
            date = pl.col('date'),
            industry = pl.col('industry'),
            ret = (pl.col('pre_close')/pl.col('close')) - 1,
            amount = pl.col('amount'),
            amount_80p = pl.col('amount').quantile(0.8).over(['date','industry'])
    )
    .filter(pl.col('amount') > pl.col('amount_80p'))
    .group_by(["date", "industry"])
    .agg(
        aver_ret = pl.col("ret").mean()
    )
    .sort(['date', "industry"])
)

date,industry,aver_ret
i64,str,f64
20120104,"""AERODEF""",-0.003143
20120104,"""AIRLINE""",0.035986
20120104,"""AUTO""",0.018819
20120104,"""BANKS""",0.007952
20120104,"""BEV""",0.060174
…,…,…
20120629,"""REALEST""",-0.001288
20120629,"""RETAIL""",-0.012304
20120629,"""SOFTWARE""",-0.020639
20120629,"""TRDDIST""",0.005932


## 28. 每天成交额最大的10%的股票的平均收益率和成交额最小的10%的股票的平均收益率的相关系数是多少？

In [None]:
(
    data
    .with_columns(
        ret = pl.col('close')/pl.col('pre_close') - 1,
        amount_10p_top = pl.col('amount').quantile(0.9).over('date'),
        amount_10p_bottom = pl.col('amount').quantile(0.1).over('date')
    )
    .select(
        ret_amount_10p_top = pl.col("ret").filter(pl.col("amount") >= pl.col("amount_10p_top")).mean().over("date"),
        ret_amount_10p_bottom = pl.col('ret').filter(pl.col("amount") <= pl.col("amount_10p_bottom")).mean().over('date')
    )
    .select(
        pl.corr(a = "ret_amount_10p_top", b = "ret_amount_10p_bottom")
    )
)

ret_amount_10p_top
f64
0.888088


## 29. 每天哪些行业的平均成交额高于全市场平均成交额？

In [None]:
(
    data
    .with_columns(
        amount_mkt_aver = pl.col('amount').mean().over('date'),
        amount_ind_aver = pl.col('amount').mean().over(['date', 'industry'])
    )
    .filter(
        pl.col('amount_mkt_aver') < pl.col('amount_ind_aver')
    )
    .select(
        pl.col('date'),
        pl.col('industry')
    )
    .unique()
    .sort(["date", "industry"])
)

date,industry
i64,str
20120104,"""AERODEF"""
20120104,"""BANKS"""
20120104,"""BEV"""
20120104,"""CONMAT"""
20120104,"""DVFININS"""
…,…
20120629,"""INDCONG"""
20120629,"""MARINE"""
20120629,"""MEDIA"""
20120629,"""MTLMIN"""


## 30. 每天每个股票对市场的超额收益率是多少？

In [None]:
import polars_ds as pds
(
    data
    .with_columns(
        stkcd_ret = (pl.col("close")/pl.col("pre_close") - 1).over("date"),
        stkcd_weight = (pl.col("capt")/(pl.col("capt").sum())).over('date')
    )
    .with_columns(
        mkt_ret = (pl.col("stkcd_ret")*pl.col("stkcd_weight")).sum().over("date")
    )
    .select(
        "stkcd_ret",
        "mkt_ret",
        "symbol",
        "date",
        pds.lin_reg(
            pl.col("mkt_ret"),
            target = pl.col("stkcd_ret"),
            add_bias = True,
            return_pred = True
        )
        .over(pl.col("symbol"))
        .alias("prediction")
    )
    .unnest("prediction")
    .select(
        "symbol",
        "date",
        pl.col("resid").alias("stkcd_ret_excess")
    )
)


symbol,date,stkcd_ret_excess
str,i64,f64
"""600000.SH""",20120104,0.00258
"""600000.SH""",20120105,0.039481
"""600000.SH""",20120106,0.002335
"""600000.SH""",20120109,0.005234
"""600000.SH""",20120110,-0.008163
…,…,…
"""300331.SZ""",20120629,0.0
"""300332.SZ""",20120628,1.1102e-16
"""300332.SZ""",20120629,2.7756e-17
"""300333.SZ""",20120628,-1.3878e-17


## 31. 每天每个股票对市场去除自身的超额收益率是多少？

In [None]:
import polars_ds as pds
(
    data
    .with_columns(
        stkcd_ret = (pl.col("close")/pl.col("pre_close") - 1).over("date"),
        stkcd_weight = (pl.col("capt")/(pl.col("capt").sum())).over('date'),
        stkcd_weighted_ret = (pl.col("close")/pl.col("pre_close") - 1)*(pl.col("capt")/pl.col("capt").sum().over('date'))
    )
    .with_columns(
        total_mkt_ret = pl.col("stkcd_weighted_ret").sum().over('date')
    )
    .with_columns(
        mkt_ret_ex_stock = (pl.col("total_mkt_ret") - pl.col("stkcd_weighted_ret"))/(1 - pl.col("stkcd_weight"))
    )
    .select(
        "symbol",
        "date",
        pds.lin_reg(
            pl.col("mkt_ret_ex_stock"),
            target = pl.col("stkcd_ret"),
            add_bias = True,
            return_pred = True
        )
        .over(pl.col("symbol"))
        .alias("prediction")
    )
    .unnest("prediction")
    .select(
        "symbol",
        "date",
        pl.col("resid").alias("stkcd_ret_ex_stock")
    )[:5]
)


symbol,date,stkcd_ret_ex_stock
str,i64,f64
"""600000.SH""",20120104,0.002536
"""600000.SH""",20120105,0.039663
"""600000.SH""",20120106,0.002371
"""600000.SH""",20120109,0.005372
"""600000.SH""",20120110,-0.008107


## 32. 每天每个股票对行业的超额收益率是多少？

In [None]:
import polars_ds as pds
(
    data
    .with_columns(
        stkcd_ret = (pl.col("close")/pl.col("pre_close") - 1).over("date"),
        stkcd_ind_weight = (pl.col("capt")/(pl.col("capt").sum())).over(["date", "industry"]),
    )
    .with_columns(
        ind_ret = (pl.col("stkcd_ret")*pl.col("stkcd_ind_weight")).sum().over(["date", "industry"])
    )
    .select(
        "symbol",
        "date",
        "industry",
        pds.lin_reg(
            pl.col("ind_ret"),
            target = pl.col("stkcd_ret"),
            add_bias = True,
            return_pred = True
        )
        .over(pl.col("symbol"))
        .alias("prediction")
    )
    .unnest("prediction")
    .select(
        "symbol",
        "date",
        "industry",
        pl.col("resid").alias("stkcd_ret_ex_ind")
    )[:5]
)   

symbol,date,industry,stkcd_ret_ex_ind
str,i64,str,f64
"""600000.SH""",20120104,"""BANKS""",-0.000246
"""600000.SH""",20120105,"""BANKS""",0.008228
"""600000.SH""",20120106,"""BANKS""",-0.000771
"""600000.SH""",20120109,"""BANKS""",0.009151
"""600000.SH""",20120110,"""BANKS""",0.0016


## 33. 每天每个股票对市场的去除自身的超额收益率是多少？


In [None]:
import polars_ds as pds
(
    data
    .with_columns(
        stkcd_ret = (pl.col("close")/pl.col("pre_close") - 1).over("date"),
        stkcd_ind_weight = (pl.col("capt")/(pl.col("capt").sum())).over(["date", "industry"]),
        stkcd_ind_weighted_ret = (pl.col("close")/pl.col("pre_close") - 1)*(pl.col("capt")/pl.col("capt").sum().over(["date", "industry"]))
    )
    .with_columns(
        total_ind_ret = pl.col("stkcd_ind_weighted_ret").sum().over(["date", "industry"])
    )
    .with_columns(
        ind_ret_ex_stock = (pl.col("total_ind_ret") - pl.col("stkcd_ind_weighted_ret"))/(1 - pl.col("stkcd_ind_weight"))
    )
    .select(
        "symbol",
        "date",
        "industry",
        pds.lin_reg(
            pl.col("ind_ret_ex_stock"),
            target = pl.col("stkcd_ret"),
            add_bias = True,
            return_pred = True
        )
        .over(pl.col('symbol'))
        .alias("prediction")
    ).unnest("prediction")
    .select(
         "symbol",
        "date",
        "industry",
        pl.col("resid").alias("stkcd_ret_ex_ind")
    )[:5]
)

symbol,date,industry,stkcd_ret_ex_ind
str,i64,str,f64
"""600000.SH""",20120104,"""BANKS""",-0.000392
"""600000.SH""",20120105,"""BANKS""",0.009031
"""600000.SH""",20120106,"""BANKS""",-0.00071
"""600000.SH""",20120109,"""BANKS""",0.009998
"""600000.SH""",20120110,"""BANKS""",0.001874


## 34. 每个股票每天对市场的超额收益率与对行业的超额收益率的相关系数如何？

In [None]:
import polars_ds as pds

data_mkt = (
    data
    .with_columns(
        stkcd_ret = (pl.col("close")/pl.col("pre_close") - 1).over("date"),
        stkcd_weight = (pl.col("capt")/(pl.col("capt").sum())).over('date')
    )
    .with_columns(
        mkt_ret = (pl.col("stkcd_ret")*pl.col("stkcd_weight")).sum().over("date")
    )
    .select(
        "stkcd_ret",
        "mkt_ret",
        "symbol",
        "date",
        pds.lin_reg(
            pl.col("mkt_ret"),
            target = pl.col("stkcd_ret"),
            add_bias = True,
            return_pred = True
        )
        .over(pl.col("symbol"))
        .alias("prediction")
    )
    .unnest("prediction")
    .select(
        "symbol",
        "date",
        pl.col("resid").alias("stkcd_ret_excess")
    )
)
data_ind = (
    data
    .with_columns(
        stkcd_ret = (pl.col("close")/pl.col("pre_close") - 1).over("date"),
        stkcd_ind_weight = (pl.col("capt")/(pl.col("capt").sum())).over(["date", "industry"]),
    )
    .with_columns(
        ind_ret = (pl.col("stkcd_ret")*pl.col("stkcd_ind_weight")).sum().over(["date", "industry"])
    )
    .select(
        "symbol",
        "date",
        "industry",
        pds.lin_reg(
            pl.col("ind_ret"),
            target = pl.col("stkcd_ret"),
            add_bias = True,
            return_pred = True
        )
        .over(pl.col("symbol"))
        .alias("prediction")
    )
    .unnest("prediction")
    .select(
        "symbol",
        "date",
        "industry",
        pl.col("resid").alias("stkcd_ind_ret_excess")
    )
)

data_mkt.join(
    data_ind,
    on = ["date", "symbol"],
    how = "inner"
).select(
    "symbol",
    pl.corr(a = "stkcd_ret_excess", b = "stkcd_ind_ret_excess").over('symbol').alias("corr")
).unique(
).sort("symbol")[:5]

symbol,corr
str,f64
"""000001.SZ""",0.761699
"""000002.SZ""",0.767403
"""000004.SZ""",0.834381
"""000005.SZ""",0.99309
"""000006.SZ""",0.923893


## 35. 每天有哪些行业的平均收益率超过市场平均收益率？

In [None]:
(
    data
    .with_columns(
        stkcd_ret = (pl.col("close")/pl.col("pre_close") - 1).over("date"),
        stkcd_weight = (pl.col("capt")/(pl.col("capt").sum())).over(["date", "industry"]),
    )
    .with_columns(
        ind_ret = (pl.col("stkcd_ret")*pl.col("stkcd_weight")).sum().over(["date", "industry"]),
        mkt_ret = (pl.col("stkcd_ret")*pl.col("stkcd_weight")).sum().over("date")
    )
    .filter(
        pl.col("ind_ret") > pl.col("mkt_ret")
    )
    .select(
        "date",
        "industry"
    )
    .unique()
    .sort(["date", "industry"])
)

date,industry
i64,str
20120104,"""AERODEF"""
20120104,"""AIRLINE"""
20120104,"""AUTO"""
20120104,"""BANKS"""
20120104,"""BEV"""
…,…
20120628,"""REALEST"""
20120628,"""RETAIL"""
20120628,"""SOFTWARE"""
20120628,"""TRDDIST"""


## 36. 每天每个行业对市场的超额收益率是多少？

In [None]:
import polars_ds as pds
(
    data
    .with_columns(
        stkcd_ret = (pl.col("close")/pl.col("pre_close") - 1).over("date"),
        stkcd_weight = (pl.col("capt")/(pl.col("capt").sum())).over(["date", "industry"]),
    )
    .with_columns(
        ind_ret = (pl.col("stkcd_ret")*pl.col("stkcd_weight")).sum().over(["date", "industry"]),
        mkt_ret = (pl.col("stkcd_ret")*pl.col("stkcd_weight")).sum().over("date")
    )
    .select(
        "date",
        "industry",
        "ind_ret",
        "mkt_ret"
    )
    .unique()
    .sort(["industry", "date"])
    .with_columns(
        pds.lin_reg(
            pl.col("mkt_ret"),
            target = pl.col("ind_ret"),
            add_bias = True,
            return_pred = True
        )
        .over(pl.col("industry"))
        .alias("prediction")
    )
    .unnest("prediction")
    .select(
        "date",
        "industry",
        pl.col("resid").alias("ind_ret_excess")
    )[:5]
)


date,industry,ind_ret_excess
i64,str,f64
20120104,"""AERODEF""",0.013798
20120105,"""AERODEF""",0.003314
20120106,"""AERODEF""",-0.023299
20120109,"""AERODEF""",-0.005169
20120110,"""AERODEF""",0.002565


## 37. 每天每个行业去除本行业后的市场超额收益率是多少？

In [6]:
import polars_ds as pds
(
    data
    .with_columns(
        stkcd_ret = (pl.col("close")/pl.col("pre_close") - 1),
        stkcd_ind_weight = (pl.col("capt")/(pl.col("capt").sum())).over(["date", "industry"]),
        stkcd_mkt_weight = (pl.col("capt")/(pl.col("capt").sum())).over(["date"])
    )
    .with_columns(
        ind_ret = (pl.col("stkcd_ret")*pl.col("stkcd_ind_weight")).sum().over(["date", "industry"]),
        market_ret = (pl.col("stkcd_ret")*pl.col("stkcd_mkt_weight")).sum().over(["date"]),
        ind_mkt_weight = (pl.col("stkcd_ind_weight")/(pl.col("stkcd_mkt_weight").sum())).over(["date"])
    )
    .with_columns(
        mkt_ret_ex_ind = (pl.col("market_ret") - pl.col("ind_ret"))/(1 - pl.col("ind_mkt_weight"))
    )
    .select(
        "date",
        "industry",
        "ind_ret",
        "mkt_ret_ex_ind"
    )
    .unique()
    .sort(["industry", "date"])
    .with_columns(
        pds.lin_reg(
            pl.col("mkt_ret_ex_ind"),
            target = pl.col("ind_ret"),
            add_bias = True,
            return_pred = True
        )
        .over(pl.col("industry"))
        .alias("prediction")
    )
    .unnest("prediction")
    .select(
        "date",
        "industry",
        pl.col("resid").alias("ind_ret_excess")
    )[:5]
)

date,industry,ind_ret_excess
i64,str,f64
20120104,"""AERODEF""",-0.018141
20120104,"""AERODEF""",-0.017508
20120104,"""AERODEF""",-0.017
20120104,"""AERODEF""",-0.019237
20120104,"""AERODEF""",-0.017052


## 38. 每天分别有多少股票是最近连续3个交易日上涨、下跌的？

In [12]:
(
    data
    .with_columns(
        up = pl.when(pl.col("close") > pl.col("pre_close")).then(1).otherwise(0),
        down = pl.when(pl.col("close") < pl.col("pre_close")).then(1).otherwise(0)
    )
    .with_columns(
        up_num_3 = pl.col("up").rolling_sum(3).over("symbol"),
        down_num_3 = pl.col("down").rolling_sum(3).over("symbol")
    )
    .select(
        date = pl.col("date"),
        up_num_3 = pl.col("symbol").filter(pl.col("up_num_3") == 3).n_unique().over("date"),
        down_num_3 = pl.col("symbol").filter(pl.col("down_num_3") == 3).n_unique().over("date")
    )
    .unique()
    .sort(["date", "up_num_3", "down_num_3"])
)



date,up_num_3,down_num_3
i64,u32,u32
20120104,0,0
20120105,0,0
20120106,16,535
20120109,71,11
20120110,1401,2
…,…,…
20120625,26,866
20120626,30,663
20120627,48,511
20120628,90,470


## 39. 每天分别有多少股票是最近连续3个交易日收益率超过当天市场平均收益率？

In [13]:
(
    data
    .with_columns(
        stkcd_ret = pl.col("close")/pl.col("pre_close") - 1,
        stkcd_weight = (pl.col("capt")/(pl.col("capt").sum())).over("date")
    )
    .with_columns(
        mkt_ret = (pl.col("stkcd_ret")*pl.col("stkcd_weight")).sum().over("date")
    )
    .with_columns(
        stkcd_ex_mkt = pl.when(pl.col("stkcd_ret") > pl.col("mkt_ret")).then(1).otherwise(0)
    )
    .with_columns(
        stkcd_ex_mkt_num_3 = pl.col("stkcd_ex_mkt").rolling_sum(3).over("symbol")
    )
    .select(
        date = pl.col("date"),
        stkcd_ex_mkt_num_3 = pl.col("symbol").filter(pl.col("stkcd_ex_mkt_num_3") == 3).n_unique().over("date")
    )
    .unique()
    .sort(["date", "stkcd_ex_mkt_num_3"])
)

date,stkcd_ex_mkt_num_3
i64,u32
20120104,0
20120105,0
20120106,81
20120109,95
20120110,785
…,…
20120625,251
20120626,258
20120627,242
20120628,275


## 40. 每天分别有多少股票是最新5个交易日中至少有4个交易日的收益率超过当天市场平均收益率？

In [15]:
(
    data
    .with_columns(
        stkcd_ret = pl.col("close")/pl.col("pre_close") - 1,
        stkcd_weight = (pl.col("capt")/(pl.col("capt").sum())).over("date")
    )
    .with_columns(
        mkt_ret = (pl.col("stkcd_ret")*pl.col("stkcd_weight")).sum().over("date")
    )
    .with_columns(
        stkcd_ex_mkt = pl.when(pl.col("stkcd_ret") > pl.col("mkt_ret")).then(1).otherwise(0)
    )
    .with_columns(
        stkcd_ex_mkt_num_5 = pl.col("stkcd_ex_mkt").rolling_sum(5).over("symbol")
    )
    .select(
        date = pl.col("date"),
        stkcd_ex_mkt_num_5 = pl.col("symbol").filter(pl.col("stkcd_ex_mkt_num_5") >= 4).n_unique().over("date")
    )
    .unique()
    .sort(["date", "stkcd_ex_mkt_num_5"])
)

date,stkcd_ex_mkt_num_5
i64,u32
20120104,0
20120105,0
20120106,0
20120109,0
20120110,235
…,…
20120625,405
20120626,338
20120627,365
20120628,375


## 41. 每个月中，个股月收益超过市场月收益1倍以上的股票有哪些？

In [39]:
(
    data
    .with_columns(
        date_ym = pl.col("date").cast(str).str.slice(0, 6),
        stkcd_weight = (pl.col("capt")/(pl.col("capt").sum().over("date")))
    )
    .with_columns(
        mkt_close = (pl.col("close")*pl.col("stkcd_weight")).sum().over("date"),
        stkcd_m_ret = (pl.col("close").last()/pl.col("close").first() - 1).over(["symbol", "date_ym"]),
    )
    .with_columns(
        mkt_m_ret = (pl.col("mkt_close").last().over(["symbol", "date_ym"])/pl.col("mkt_close").first().over(["symbol", "date_ym"]) - 1)
    )
    .select(
        symbol = pl.col("symbol"),
        date_ym = pl.col("date_ym"),
        stkcd_m_ret = pl.col("stkcd_m_ret"),
        mkt_m_ret = pl.col("mkt_m_ret")
    )
    .unique()
    .filter(
        pl.col("stkcd_m_ret") > 2*pl.col("mkt_m_ret")
    )
    .sort(["date_ym", "symbol"])
)


symbol,date_ym,stkcd_m_ret,mkt_m_ret
str,str,f64,f64
"""000001.SZ""","""201201""",0.097625,0.023567
"""000006.SZ""","""201201""",0.081281,0.023567
"""000008.SZ""","""201201""",0.060423,0.023567
"""000009.SZ""","""201201""",0.277228,0.023567
"""000012.SZ""","""201201""",0.146172,0.023567
…,…,…,…
"""601991.SH""","""201206""",0.110454,-0.034187
"""601996.SH""","""201206""",0.135563,-0.034187
"""601998.SH""","""201206""",-0.056604,-0.034187
"""603000.SH""","""201206""",-0.007499,-0.034187


## 42. 每个月中，个股月收益超过行业月收益1倍以上的股票有哪些？

In [7]:
(
    data
    .with_columns(
        date_ym = pl.col("date").cast(str).str.slice(0, 6),
        stkcd_ind_weight = (pl.col("capt")/(pl.col("capt").sum().over(["date", "industry"])))
    )
    .with_columns(
        stkcd_m_ret = (pl.col("close").last().over(["symbol", "date_ym"]))/(pl.col("close").first().over(["symbol", "date_ym"])) - 1,
        ind_close = (pl.col("close")*pl.col("stkcd_ind_weight")).sum().over(["date", "industry"]),
    )
    .with_columns(
        ind_m_ret = (pl.col("ind_close").last().over(["symbol", "date_ym"])/pl.col("ind_close").first().over(["symbol", "date_ym"]) - 1)
    )
    .filter(
        pl.col("stkcd_m_ret") > 2 * pl.col("ind_m_ret")
    )
    .select(
        symbol = pl.col("symbol"),
        date_ym = pl.col("date_ym"),
        stkcd_m_ret = pl.col("stkcd_m_ret"),
        ind_m_ret = pl.col("ind_m_ret")
    )
    .unique()
    .sort(["date_ym", "symbol"])
)

symbol,date_ym,stkcd_m_ret,ind_m_ret
str,str,f64,f64
"""000004.SZ""","""201201""",0.045512,-0.05855
"""000006.SZ""","""201201""",0.081281,0.037408
"""000007.SZ""","""201201""",0.014472,-0.020458
"""000016.SZ""","""201201""",-0.012698,-0.010977
"""000017.SZ""","""201201""",0.132841,-0.062537
…,…,…,…
"""603000.SH""","""201206""",-0.007499,-0.069421
"""603001.SH""","""201206""",0.055708,0.007003
"""603002.SH""","""201206""",-0.09396,-0.106101
"""603333.SH""","""201206""",-0.11546,-0.102132


## 43. 每个股票的收益率对市场收益率的相关系数最高的10个股票是哪些？

In [11]:
(
    data
    .with_columns(
        stkcd_ret = pl.col("close")/pl.col("pre_close") - 1,
        stkcd_weight = (pl.col("capt")/(pl.col("capt").sum())).over("date")
    )
    .with_columns(
        mkt_ret = (pl.col("stkcd_ret")*pl.col("stkcd_weight")).sum().over("date")
    )
    .select(
        "symbol",
        "date",
        pl.corr("stkcd_ret", "mkt_ret").over("symbol").alias("corr")
    )
    .unique()
    .filter(
        ~pl.col("corr").is_nan()
    )
    .sort(["date", "corr"], descending = True)
    [:10]
)


symbol,date,corr
str,i64,f64
"""300331.SZ""",20120629,1.0
"""600508.SH""",20120629,0.927955
"""601101.SH""",20120629,0.909776
"""000685.SZ""",20120629,0.897915
"""601666.SH""",20120629,0.893953
"""002082.SZ""",20120629,0.89325
"""002212.SZ""",20120629,0.89192
"""601001.SH""",20120629,0.891893
"""000089.SZ""",20120629,0.891447
"""601168.SH""",20120629,0.888806


## 44. 每个行业日收益率的历史波动率是多少？（用日收益率计算标准差）

In [12]:
(
    data
    .with_columns(
        stkcd_ret = pl.col("close")/pl.col("pre_close") - 1,
        stkcd_ind_weight = (pl.col("capt")/(pl.col("capt").sum())).over(["date", "industry"])
    )
    .with_columns(
        ind_ret = (pl.col("stkcd_ret")*pl.col("stkcd_ind_weight")).sum().over(["date", "industry"])
    )
    .select(
        industry = pl.col("industry"),
        ind_vol = pl.col("ind_ret").std().over("industry")
    )
    .unique()
    .sort(["industry", "ind_vol"])
)

industry,ind_vol
str,f64
"""AERODEF""",0.017359
"""AIRLINE""",0.017032
"""AUTO""",0.01499
"""BANKS""",0.008073
"""BEV""",0.016319
…,…
"""REALEST""",0.017284
"""RETAIL""",0.015696
"""SOFTWARE""",0.0178
"""TRDDIST""",0.017118


## 45. 各个行业的日收益率的相关系数矩阵如何？哪两个行业相关性最高、最低？

In [24]:
(
    data
    .with_columns(
        stkcd_ret = pl.col("close")/pl.col("pre_close") - 1,
        stkcd_ind_weight = (pl.col("capt")/(pl.col("capt").sum())).over(["date", "industry"])
    )
    .with_columns(
        ind_ret = (pl.col("stkcd_ret")*pl.col("stkcd_ind_weight")).sum().over(["date", "industry"])
    )
    .select(
        industry = pl.col("industry"),
        date = pl.col("date"),
        ind_ret = pl.col("ind_ret")
    )
    .pivot(
        "industry",
        index = "date",
        values = "ind_ret",
        aggregate_function = "first"
    )
    .select(
        pl.exclude("date")
    )
    .corr()
)

BANKS,RDRLTRAN,MTLMIN,AUTO,REALEST,UTILITIE,MARINE,ENERGY,AIRLINE,DVFININS,MACH,MEDIA,AERODEF,CNSTENG,HDWRSEMI,TRDDIST,CONSSERV,HEALTH,BEV,HOUSEDUR,LEISLUX,CHEM,ELECEQP,MATERIAL,FOODPROD,INDCONG,RETAIL,BLDPROD,CONMAT,PERSPRD,SOFTWARE,COMSERV
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1.0,0.718676,0.626276,0.606801,0.585352,0.608247,0.581819,0.760872,0.654099,0.701848,0.60012,0.519133,0.465783,0.627839,0.558363,0.630844,0.522336,0.473658,0.383583,0.572934,0.544504,0.560457,0.59309,0.481197,0.506564,0.5047,0.559214,0.492436,0.572595,0.529474,0.478655,0.49313
0.718676,1.0,0.831253,0.874041,0.794202,0.81905,0.846844,0.856754,0.850983,0.829757,0.855312,0.776697,0.696612,0.862651,0.821953,0.882893,0.7894,0.6761,0.49379,0.78476,0.837324,0.842615,0.838604,0.770055,0.752622,0.775722,0.825533,0.77538,0.81591,0.766715,0.73035,0.771725
0.626276,0.831253,1.0,0.823657,0.709485,0.786093,0.822583,0.848265,0.796893,0.771522,0.87193,0.790843,0.720849,0.864339,0.848576,0.915207,0.808085,0.707666,0.54891,0.774076,0.851209,0.907845,0.877598,0.790778,0.801223,0.817896,0.795802,0.803298,0.824352,0.790274,0.778547,0.762735
0.606801,0.874041,0.823657,1.0,0.828531,0.825702,0.809432,0.804894,0.818696,0.799564,0.903551,0.811508,0.800998,0.897974,0.877243,0.878464,0.853259,0.74409,0.533968,0.855163,0.871725,0.879147,0.879306,0.802517,0.811961,0.820294,0.86246,0.853847,0.838886,0.837697,0.822306,0.849278
0.585352,0.794202,0.709485,0.828531,1.0,0.744441,0.726297,0.742121,0.73425,0.750546,0.831989,0.724874,0.691247,0.85542,0.766616,0.823417,0.837292,0.653898,0.490988,0.796468,0.811534,0.785785,0.796434,0.775337,0.725734,0.80031,0.771229,0.794407,0.839655,0.723175,0.701977,0.784265
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.492436,0.77538,0.803298,0.853847,0.794407,0.797645,0.780425,0.730419,0.721121,0.71691,0.885429,0.853001,0.764393,0.87636,0.925031,0.881484,0.897564,0.825389,0.584932,0.846976,0.925965,0.924797,0.930992,0.865592,0.880963,0.859329,0.876892,1.0,0.819237,0.892116,0.88467,0.908562
0.572595,0.81591,0.824352,0.838886,0.839655,0.746566,0.797809,0.774167,0.767873,0.776073,0.921058,0.732443,0.739309,0.916646,0.809606,0.851671,0.837624,0.682142,0.5254,0.775264,0.829256,0.851568,0.845613,0.794888,0.766511,0.823928,0.786332,0.819237,1.0,0.750224,0.754734,0.788934
0.529474,0.766715,0.790274,0.837697,0.723175,0.811728,0.740172,0.740646,0.732164,0.73053,0.872875,0.853583,0.770809,0.849566,0.921828,0.874387,0.899692,0.895213,0.647726,0.835697,0.922456,0.89953,0.91234,0.834067,0.930906,0.814293,0.920144,0.892116,0.750224,1.0,0.895815,0.893613
0.478655,0.73035,0.778547,0.822306,0.701977,0.783975,0.74961,0.680883,0.733805,0.722164,0.871498,0.880056,0.783704,0.829065,0.955865,0.845122,0.900991,0.847162,0.613208,0.80672,0.925557,0.907963,0.929265,0.837999,0.900352,0.834709,0.867108,0.88467,0.754734,0.895815,1.0,0.91585


## 46. 各个行业的收益率对市场收益率的相关系数由高到低排列如何？

In [26]:
(
    data
    .with_columns(
        stkcd_ret = pl.col("close")/pl.col("pre_close") - 1,
        stkcd_ind_weight = (pl.col("capt")/(pl.col("capt").sum())).over(["date", "industry"]),
        stkcd_mkt_weight = (pl.col("capt")/(pl.col("capt").sum())).over("date")
    )
    .with_columns(
        ind_ret = (pl.col("stkcd_ret")*pl.col("stkcd_ind_weight")).sum().over(["date", "industry"]),
        mkt_ret = (pl.col("stkcd_ret")*pl.col("stkcd_mkt_weight")).sum().over("date")
    )
    .select(
        industry = pl.col("industry"),
        corr = pl.corr("ind_ret", "mkt_ret").over("industry")
    )
    .unique()
    .sort(["corr", "industry"], descending = [True, False])
)


industry,corr
str,f64
"""TRDDIST""",0.951901
"""MACH""",0.950965
"""CHEM""",0.945402
"""ELECEQP""",0.944826
"""CNSTENG""",0.944062
…,…
"""REALEST""",0.847987
"""HEALTH""",0.825455
"""AERODEF""",0.795422
"""BANKS""",0.738928


## 47. 每个月总成交额比上个月下降幅度最大的行业是哪个？

In [34]:
(
    data
    .with_columns(
        date_ym = pl.col("date").cast(str).str.slice(0, 6)
    )
    .select(
        industry = pl.col("industry"),
        date_ym = pl.col("date_ym"),
        ind_amount = pl.col("amount").sum().over(["date_ym", "industry"])
    )
    .unique()
    .sort(["industry", "date_ym"])
    .with_columns(
        ind_amount_diff = pl.col("ind_amount")-pl.col("ind_amount").shift(1).over("industry")
    )
    .filter(
        pl.col("ind_amount_diff") == pl.col("ind_amount_diff").min().over("date_ym")
    )
    .select(
        industry = pl.col("industry"),
        date_ym = pl.col("date_ym")
    )
    .unique()
    .sort(["industry", "date_ym"])
)

industry,date_ym
str,str
"""CONMAT""","""201203"""
"""MARINE""","""201202"""
"""MTLMIN""","""201204"""
"""MTLMIN""","""201206"""
"""REALEST""","""201205"""


## 48. 数据当中各个股票的最大回撤幅度是多少？（最大回撤是从一个高点到低点的降幅的最大值）

In [47]:
(
    data
    .with_columns(
        future_min = pl.col("low")
            .rolling_min(
                window_size = 117,
                min_periods = 1
            ).over("symbol")
    )
    .with_columns(
        drawdown = (pl.col("high")/pl.col("low") - 1)
    )
    .group_by("symbol")
    .agg(
        max_drawdown = pl.col("drawdown").max()
    )
    .sort(["symbol"])
)


symbol,max_drawdown
str,f64
"""000001.SZ""",0.045177
"""000002.SZ""",0.063712
"""000004.SZ""",0.105495
"""000005.SZ""",0.124088
"""000006.SZ""",0.135246
…,…
"""603002.SH""",0.12855
"""603123.SH""",0.230435
"""603128.SH""",0.094364
"""603333.SH""",0.103484


## 49. 每只股票的胜率是多少？（胜率是每天收益率为正数的概率）

In [50]:
(
    data
    .with_columns(
        stkcd_ret = pl.col("close")/pl.col("pre_close") - 1,
    )
    .group_by("symbol")
    .agg(
        win_rate = pl.col("stkcd_ret").filter(pl.col("stkcd_ret") > 0).count()/pl.col("stkcd_ret").count()
    )
    .sort(["symbol"])
)


symbol,win_rate
str,f64
"""000001.SZ""",0.418803
"""000002.SZ""",0.470085
"""000004.SZ""",0.512821
"""000005.SZ""",0.153846
"""000006.SZ""",0.512821
…,…
"""603002.SH""",0.433333
"""603123.SH""",0.317073
"""603128.SH""",0.478261
"""603333.SH""",0.358974


## 50. 每只股票的盈亏比是多少？（盈亏比是正收益之和与负收益之和的比值的绝对值）

In [51]:
(
    data
    .with_columns(
        stkcd_ret = pl.col("close")/pl.col("pre_close") - 1,
    )
    .group_by("symbol")
    .agg(
        win_lose_ratio = pl.col("stkcd_ret").filter(pl.col("stkcd_ret") > 0).sum()/pl.col("stkcd_ret").filter(pl.col("stkcd_ret") < 0).abs().sum()
    )
    .sort(["symbol"])
)

symbol,win_lose_ratio
str,f64
"""000001.SZ""",0.971861
"""000002.SZ""",1.279686
"""000004.SZ""",1.082869
"""000005.SZ""",0.553243
"""000006.SZ""",1.371993
…,…
"""603002.SH""",3.327521
"""603123.SH""",1.415443
"""603128.SH""",1.266408
"""603333.SH""",1.020868


## 51. 市场的胜率是多少？（市场收益率为正的概率）

In [53]:
(
    data
    .with_columns(
        stkcd_ret = pl.col("close")/pl.col("pre_close") - 1,
        stkcd_weight = (pl.col("capt")/(pl.col("capt").sum())).over("date")
    )
    .with_columns(
        mkt_ret = (pl.col("stkcd_ret")*pl.col("stkcd_weight")).sum().over("date"),
        date = pl.col("date")
    )
    .unique()
    .select(
        mkt_win_rate = pl.col("mkt_ret").filter(pl.col("mkt_ret") > 0).count()/pl.col("mkt_ret").count()
    )
)

mkt_win_rate
f64
0.486139


## 52. 市场的盈亏比是多少？（市场中每个股票的市值加权正收益和市值加权负收益之比）

In [55]:
(
    data
    .with_columns(
        stkcd_ret = pl.col("close")/pl.col("pre_close") - 1,
        stkcd_weight = (pl.col("capt")/(pl.col("capt").sum())).over("date")
    )
    .select(
        date = pl.col("date"),
        mkt_ret = (pl.col("stkcd_ret")*pl.col("stkcd_weight")).sum().over("date")
    )
    .unique()
    .select(
        mkt_win_lose_ratio = pl.col("mkt_ret").filter(pl.col("mkt_ret") > 0).sum()/pl.col("mkt_ret").filter(pl.col("mkt_ret") < 0).abs().sum()
    )
)

mkt_win_lose_ratio
f64
1.147373


## 53. 每个行业的胜率是多少？

In [56]:
(
    data
    .with_columns(
        stkcd_ret = pl.col("close")/pl.col("pre_close") - 1,
        stkcd_ind_weight = (pl.col("capt")/(pl.col("capt").sum())).over(["date", "industry"])
    )
    .with_columns(
        ind_ret = (pl.col("stkcd_ret")*pl.col("stkcd_ind_weight")).sum().over(["date", "industry"])
    )
    .group_by("industry")
    .agg(
        ind_win_rate = pl.col("ind_ret").filter(pl.col("ind_ret") > 0).count()/pl.col("ind_ret").count()
    )
    .sort(["industry"])
)

industry,ind_win_rate
str,f64
"""AERODEF""",0.517012
"""AIRLINE""",0.477225
"""AUTO""",0.468606
"""BANKS""",0.461538
"""BEV""",0.546176
…,…
"""REALEST""",0.529745
"""RETAIL""",0.52108
"""SOFTWARE""",0.553402
"""TRDDIST""",0.547009


## 54. 每个行业的盈亏比是多少？（行业盈亏比是行业内每个股票的市值加权的正收益率和市值加权的负收益率之比）

In [57]:
(
    data
    .with_columns(
        stkcd_ret = pl.col("close")/pl.col("pre_close") - 1,
        stkcd_ind_weight = (pl.col("capt")/(pl.col("capt").sum())).over(["date", "industry"])
    )
    .with_columns(
        ind_ret = (pl.col("stkcd_ret")*pl.col("stkcd_ind_weight")).sum().over(["date", "industry"])
    )
    .group_by("industry")
    .agg(
        ind_win_lose_ratio = pl.col("ind_ret").filter(pl.col("ind_ret") > 0).sum()/pl.col("ind_ret").filter(pl.col("ind_ret") < 0).abs().sum()
    )
    .sort(["industry"])

)

industry,ind_win_lose_ratio
str,f64
"""AERODEF""",0.982985
"""AIRLINE""",1.070115
"""AUTO""",1.113772
"""BANKS""",1.015873
"""BEV""",1.279969
…,…
"""REALEST""",1.365078
"""RETAIL""",1.101875
"""SOFTWARE""",1.062945
"""TRDDIST""",1.201983


## 55. 是否存在股票的月成交额超过所在行业当月中某天一天总成交额的情况？

In [61]:
(
    data
    .with_columns(
        date_ym = pl.col("date").cast(str).str.slice(0, 6),
        ind_d_amount = pl.col("amount").sum().over(["date", "industry"])
    )
    .with_columns(
        stkcd_m_amount = pl.col("amount").sum().over(["date_ym", "symbol"])
    )
    .filter(
        pl.col("stkcd_m_amount") > pl.col("ind_d_amount")
    )
    .select(
        "symbol",
        "date_ym",
        "stkcd_m_amount",
        "date",
        "ind_d_amount"
    )
    .unique()
    .sort(["symbol", "date_ym"])
)


symbol,date_ym,stkcd_m_amount,date,ind_d_amount
str,str,f64,i64,f64
"""000001.SZ""","""201201""",2.9727e9,20120116,2.8639e9
"""000001.SZ""","""201202""",7.8830e9,20120213,5.2303e9
"""000001.SZ""","""201202""",7.8830e9,20120210,5.1547e9
"""000001.SZ""","""201202""",7.8830e9,20120223,5.7850e9
"""000001.SZ""","""201202""",7.8830e9,20120203,7.1481e9
…,…,…,…,…
"""603128.SH""","""201206""",2.0597e9,20120613,5.7742e8
"""603128.SH""","""201206""",2.0597e9,20120625,5.7251e8
"""603128.SH""","""201206""",2.0597e9,20120627,9.1629e8
"""603128.SH""","""201206""",2.0597e9,20120604,7.7235e8


## 56. 每天每个行业编入、编出的股票各有多少？

In [41]:
(
    data
    .sort(["symbol", "date"])
    .group_by(["date", "industry"])
    .agg(
        symbol = pl.col("symbol")
    )
    .sort(["industry", "date"])
    .with_columns(
        symbol_prev = pl.col("symbol").shift(1).over("industry")
    )
    .filter(
        pl.col("symbol_prev").is_not_null()
    )
    .select(
        "date",
        "industry",
        industry_in = pl.col("symbol").list.set_intersection(pl.col("symbol_prev")).list.set_symmetric_difference(pl.col("symbol")).len().over(["date", "industry"]) - 1,
        industry_out = pl.col("symbol_prev").list.set_intersection(pl.col("symbol")).list.set_symmetric_difference(pl.col("symbol_prev")).len().over(["date", "industry"]) - 1
    )
    .sort(["industry", "date"])
)

date,industry,industry_in,industry_out
i64,str,u32,u32
20120105,"""AERODEF""",0,0
20120106,"""AERODEF""",0,0
20120109,"""AERODEF""",0,0
20120110,"""AERODEF""",0,0
20120111,"""AERODEF""",0,0
…,…,…,…
20120625,"""UTILITIE""",0,0
20120626,"""UTILITIE""",0,0
20120627,"""UTILITIE""",0,0
20120628,"""UTILITIE""",0,0


## 57. 每天每个行业内股票收益率的标准差是多少？

In [66]:
(
    data
    .with_columns(
        stkcd_ret = pl.col("close")/pl.col("pre_close") - 1
    )
    .group_by(["date", "industry"])
    .agg(
        ind_ret_std = pl.col("stkcd_ret").std()
    )
    .sort(["date", "industry"])
)


date,industry,ind_ret_std
i64,str,f64
20120104,"""AERODEF""",0.028369
20120104,"""AIRLINE""",0.011949
20120104,"""AUTO""",0.024044
20120104,"""BANKS""",0.008446
20120104,"""BEV""",0.02569
…,…,…
20120629,"""REALEST""",0.018059
20120629,"""RETAIL""",0.015018
20120629,"""SOFTWARE""",0.023618
20120629,"""TRDDIST""",0.024542


## 58. 每天每个行业内股票收益率的标准差的相关性如何？

In [68]:
(
    data
    .with_columns(
        stkcd_ret = pl.col("close")/pl.col("pre_close") - 1
    )
    .group_by(["date", "industry"])
    .agg(
        ind_ret_std = pl.col("stkcd_ret").std()
    )
    .pivot(
        "industry",
        index = "date",
        values = "ind_ret_std",
        aggregate_function = "first"
    )
    .select(
        pl.exclude("date")
    )
    .corr()
)
## 59. 每天计算出成交额的 z-score （减去均值除以标准差）, 该指标能解释下一天个股超额收益率的多少比例？

CNSTENG,BEV,CONMAT,MARINE,ENERGY,BANKS,SOFTWARE,CHEM,TRDDIST,AIRLINE,RETAIL,HDWRSEMI,HOUSEDUR,MTLMIN,REALEST,FOODPROD,MATERIAL,PERSPRD,INDCONG,HEALTH,ELECEQP,AERODEF,COMSERV,CONSSERV,BLDPROD,MACH,AUTO,RDRLTRAN,DVFININS,UTILITIE,MEDIA,LEISLUX
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1.0,0.211685,0.234814,0.178241,0.085793,-0.010451,0.169888,0.161459,0.279617,0.30127,0.292674,0.273154,0.261188,0.218982,0.214717,0.219454,0.008943,0.243457,0.261523,0.138424,0.232307,0.216828,0.201579,0.145106,0.31394,0.323047,0.324969,0.255077,0.086317,0.239629,0.020133,0.198186
0.211685,1.0,0.118357,0.091399,0.175524,0.060737,0.122781,0.114483,0.162351,0.022503,0.205947,0.18187,0.190613,0.367896,0.079432,0.368608,0.101116,0.198618,0.233059,0.088106,0.325258,0.024979,0.13949,0.17656,0.352289,0.225041,0.263742,0.053848,0.130485,0.173413,0.154699,0.155075
0.234814,0.118357,1.0,0.371161,0.234726,0.203366,0.036888,0.146228,0.267208,0.170596,0.201671,0.254507,0.292689,0.414449,0.349023,0.40125,-0.1032,0.220347,0.266069,0.19814,0.345226,0.390574,0.288058,0.14413,0.210249,0.45348,0.289746,0.200052,0.063296,0.282961,0.049189,0.207454
0.178241,0.091399,0.371161,1.0,0.282463,0.138152,0.059437,0.362684,0.311701,0.269741,0.291913,0.198822,0.264346,0.184248,0.344272,0.289301,-0.005328,0.225455,0.182146,0.186337,0.43471,0.139021,0.275018,0.298295,0.15774,0.343466,0.398957,0.336982,0.128921,0.240108,0.044119,0.224438
0.085793,0.175524,0.234726,0.282463,1.0,0.094106,0.207162,0.098007,0.233825,0.162216,0.114812,0.100129,0.260636,0.298089,0.119741,0.279178,-0.034098,0.092908,0.042534,0.236946,0.270104,0.178778,0.079359,0.236467,0.152216,0.227108,0.299223,0.096474,0.04324,0.323743,0.044128,0.100461
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.255077,0.053848,0.200052,0.336982,0.096474,0.132156,0.085293,0.542983,0.500018,0.238502,0.306438,0.253383,0.338366,0.347636,0.492157,0.339467,-0.000864,0.336614,0.185453,0.200296,0.424786,0.23462,0.319976,0.269201,0.405853,0.50416,0.460378,1.0,0.121807,0.330821,-0.041206,0.219908
0.086317,0.130485,0.063296,0.128921,0.04324,0.053598,-0.042095,0.094325,0.188584,0.192232,0.7349,-0.0284,0.073474,0.143136,0.074021,0.149385,-0.051507,0.110229,0.110727,0.211297,0.14321,0.013695,0.048544,0.027886,0.064916,0.155739,0.214957,0.121807,1.0,0.205193,-0.040323,0.091705
0.239629,0.173413,0.282961,0.240108,0.323743,0.263463,0.267398,0.236075,0.36098,0.132329,0.385845,0.276219,0.492923,0.471838,0.357547,0.554736,0.032338,0.286892,0.319275,0.418128,0.457004,0.257165,0.32081,0.286851,0.34981,0.43198,0.322791,0.330821,0.205193,1.0,0.063469,0.377272
0.020133,0.154699,0.049189,0.044119,0.044128,-0.038437,0.016342,-0.002682,0.095836,0.004517,0.056686,0.067494,0.089544,0.063296,0.070418,0.153969,-0.006598,0.043118,0.094028,0.081821,0.063065,0.055022,0.064871,-0.032764,0.002654,0.077862,0.101366,-0.041206,-0.040323,0.063469,1.0,0.112526


## 59. 每天计算出成交额的 z-score （减去均值除以标准差）, 该指标能解释下一天个股超额收益率的多少比例？

In [16]:
import polars_ds as pds
(
    data
    .with_columns(
        stkcd_ret = pl.col("close")/pl.col("pre_close") - 1,
        amount_z_score = (pl.col("amount") - pl.col("amount").mean().over("date"))/pl.col("amount").std().over("date"),
    )
    .select(
        "symbol",
        "date",
        pl.col("amount_z_score").shift(1).over("symbol").alias("amount_z_score"),
        "stkcd_ret"
    )
    .with_columns(
        pds.lin_reg(
            pl.col("amount_z_score"),
            target = pl.col("stkcd_ret"),
            add_bias = True,
            return_pred = True
        )
        .alias("prediction")
    )
    .unnest("prediction")
    .filter(
        ~pl.col("pred").is_nan()
    )
    .select(
        1 - (pl.col("resid").pow(2).sum())
        /(pl.col("stkcd_ret") - pl.col("stkcd_ret").mean().pow(2)).sum()
    )
)

literal
f64
0.263594


## 60. 每个股票的收益率和300、500指数收益率可以回归出一个截距项和2个beta，这两个beta的分布如何？

In [51]:
import polars_ds as pds
(
    data
    .with_columns(
        stkcd_ret = pl.col("close")/pl.col("pre_close") - 1
    )
    .select(
        date = pl.col("date"),
        symbol = pl.col("symbol"),
        stkcd_ret = pl.col("stkcd_ret"),
        index_300_ret = (pl.col("stkcd_ret") * pl.col("index_w300")).sum().over("date"),
        index_500_ret = (pl.col("stkcd_ret") * pl.col("index_w500")).sum().over("date")
    )
    .select(
        "symbol",
        pds.lin_reg(    
            pl.col("index_300_ret"), pl.col("index_500_ret"),
            target = pl.col("stkcd_ret"),
            add_bias = False
        )
        .over("symbol")
        .alias("betas")
    )
    .unique()
    .select(
        [
        pl.col("betas").list.get(0).alias("beta_300"),
        pl.col("betas").list.get(1).alias("beta_500")
        ]
    )
    .select(
        beta_300_mean = pl.col("beta_300").mean(),
        beta_500_mean = pl.col("beta_500").mean(),
        beta_300_var = pl.col("beta_300").var(),
        beta_500_var = pl.col("beta_500").var()
    )
)

beta_300_mean,beta_500_mean,beta_300_var,beta_500_var
f64,f64,f64,f64
-0.114669,1.083144,1.572318,1.77177


## 61. 每天开盘后到最高价涨幅最大的100只股票同样也是全天(昨收到今收)涨幅最大的100只股票的比例是多少?

In [75]:
(
    data
    .with_columns(
        open_high = pl.col("high") / pl.col("open") - 1,
        close_preclose = pl.col("close") / pl.col("pre_close") - 1
    )
    .group_by("date")
    .agg(
        # date = pl.col("date"),
        open_high_100 = pl.col("symbol").sort_by("open_high", descending = True).head(100),
        close_preclose_100 = pl.col("symbol").sort_by("close_preclose", descending = True).head(100)
    )
    .sort("date")
    .select(
        date = pl.col("date"),
        diff = pl.col("open_high_100").list.set_intersection(pl.col("close_preclose_100")).list.len()
    )
    .select(
        pl.col("date").filter(pl.col("diff") == 100).count()/pl.col("date").count()
    )
)

date
f64
0.0


## 62. 每天计算最近三天每天对市场的超额收益率都排进当天前100的股票有哪些?

In [112]:
import polars_ds as pds
(
    data
    .with_columns(
        stkcd_ret = pl.col("close")/pl.col("pre_close") - 1,
        stkcd_weight = ((pl.col("capt")/(pl.col("capt").sum())) - 1).over('date')
    )
    .select(
        date = pl.col("date"),
        symbol = pl.col("symbol"),
        stkcd_ret = pl.col("stkcd_ret"),
        mkt_ret = (pl.col("stkcd_ret") * pl.col("stkcd_weight")).sum().over("date")
    )
    .with_columns(
        pds.lin_reg(
            pl.col("mkt_ret"),
            target = pl.col("stkcd_ret"),
            add_bias = True,
            return_pred = True
        )
        .over("symbol")
        .alias("prediction")
    )
    .unnest("prediction")
    .sort(["date", "resid"], descending = [False, True])
    .with_columns(
        rank = pl.col("resid").rank(descending=True).over("date")
    )
    .with_columns(
        is_top_100 = pl.when(pl.col("rank") <= 100).then(1).otherwise(0)
    )
    .sort(["symbol", "date"])
    .with_columns(
        rolling_top_100 = pl.col("is_top_100").rolling_sum(window_size=3).over("symbol")
    )
    .filter(
        pl.col("rolling_top_100") == 3
    )
    .group_by("date")
    .agg(
        pl.col("symbol")
    )
    .sort("date")
)

date,symbol
i64,list[str]
20120109,"[""600462.SH""]"
20120110,"[""000791.SZ""]"
20120111,"[""000034.SZ"", ""000552.SZ"", … ""600792.SH""]"
20120112,"[""000034.SZ"", ""000552.SZ"", … ""600792.SH""]"
20120113,"[""000552.SZ"", ""000856.SZ"", … ""600792.SH""]"
…,…
20120621,"[""002180.SZ"", ""300205.SZ"", ""300328.SZ""]"
20120625,"[""002180.SZ"", ""002684.SZ"", ""300250.SZ""]"
20120627,"[""000971.SZ"", ""002320.SZ"", … ""600896.SH""]"
20120628,"[""002596.SZ"", ""300084.SZ""]"


## 63. 每天计算最近三天每天对行业的超额收益率都排进当天行业前30%的股票有哪些?

In [114]:
import polars_ds as pds
(
    data
    .with_columns(
        stkcd_ret = pl.col("close")/pl.col("pre_close") - 1,
        stkcd_weight = ((pl.col("capt")/(pl.col("capt").sum())) - 1).over(['date', "industry"])
    )
    .select(
        date = pl.col("date"),
        symbol = pl.col("symbol"),
        industry = pl.col("industry"),
        stkcd_ret = pl.col("stkcd_ret"),
        ind_ret = (pl.col("stkcd_ret") * pl.col("stkcd_weight")).sum().over(["date", "industry"])
    )
    .with_columns(
        pds.lin_reg(
            pl.col("ind_ret"),
            target = pl.col("stkcd_ret"),
            add_bias = True,
            return_pred = True
        )
        .over("symbol")
        .alias("prediction")
    )
    .unnest("prediction")
    .sort(["date", "resid"], descending = [False, True])
    .with_columns(
        rank = pl.col("resid").rank(descending=True).over("date")
    )
    .with_columns(
        is_top_100 = pl.when(pl.col("rank") <= 100).then(1).otherwise(0)
    )
    .sort(["symbol", "date"])
    .with_columns(
        rolling_top_100 = pl.col("is_top_100").rolling_sum(window_size=3).over("symbol")
    )
    .filter(
        pl.col("rolling_top_100") == 3
    )
    .group_by("date")
    .agg(
        pl.col("symbol")
    )
    .sort("date")
)

date,symbol
i64,list[str]
20120109,"[""002164.SZ"", ""600462.SH""]"
20120110,"[""000791.SZ""]"
20120111,"[""000034.SZ"", ""000791.SZ"", … ""600792.SH""]"
20120112,"[""000034.SZ"", ""000552.SZ"", … ""600792.SH""]"
20120113,"[""000552.SZ"", ""000856.SZ"", … ""600792.SH""]"
…,…
20120625,"[""002180.SZ"", ""002549.SZ"", … ""300326.SZ""]"
20120626,"[""600401.SH""]"
20120627,"[""000971.SZ"", ""002320.SZ"", … ""600679.SH""]"
20120628,"[""002596.SZ"", ""300084.SZ""]"
