# Import Data

In [2]:
# We only need two packages here
library(data.table)
library(stringr)

# set `data_path` to your dir
data_path <- "C:/Users/rossz/OneDrive/App/R/stock-datatable-key/data"
setwd(data_path)

# read into data
data <- readRDS("stock-market-data.rds")
data[1:5] # show top 5 obs

symbol,date,pre_close,open,high,low,close,volume,amount,adj_factor,capt,index_w50,index_w300,index_w500,industry
600000.SH,20120104,8.49,8.54,8.56,8.39,8.41,34201379,290229551,6.655275,125500555680,0.04640928,0.02125936,0,BANKS
600000.SH,20120105,8.41,8.47,8.82,8.47,8.65,132116203,1144753023,6.655275,129082022192,0.04640928,0.02125936,0,BANKS
600000.SH,20120106,8.65,8.63,8.78,8.62,8.71,61778687,537043761,6.655275,129977388820,0.04640928,0.02125936,0,BANKS
600000.SH,20120109,8.71,8.72,8.99,8.68,8.95,80136249,711429611,6.655275,133558855331,0.04640928,0.02125936,0,BANKS
600000.SH,20120110,8.95,8.95,9.1,8.88,9.07,72004632,647206633,6.655275,135349588587,0.04640928,0.02125936,0,BANKS


# Answer Keys

## 1. 哪些股票的代码中包含"8"这个数字？

In [3]:
data[str_detect(symbol, "8"), unique(symbol)]

## 2. 每天上涨和下跌的股票各有多少？

In [8]:
data[, 
     .(num = uniqueN(symbol)), 
     keyby = .(date, updown = ifelse(close - pre_close > 0, "UP", "DOWN"))]

date,updown,num
20120104,DOWN,2129
20120104,UP,191
20120105,DOWN,2188
20120105,UP,132
20120106,DOWN,879
20120106,UP,1444
20120109,DOWN,121
20120109,UP,2202
20120110,DOWN,105
20120110,UP,2220


## 3. 每天每个交易所上涨、下跌的股票各有多少？

In [5]:
data[, .(num = uniqueN(symbol)), by = .(date, situ = ifelse(close - pre_close > 0, "UP", "DOWN"), market_type = str_sub(symbol, start = -2, end = -1))]

date,situ,market_type,num
20120104,DOWN,SH,836
20120105,UP,SH,74
20120106,UP,SH,583
20120109,UP,SH,881
20120110,UP,SH,888
20120111,DOWN,SH,580
20120112,UP,SH,365
20120113,DOWN,SH,867
20120116,DOWN,SH,871
20120117,UP,SH,884


#### *4. 沪深300成分股中，每天上涨、下跌的股票各有多少？*

In [6]:
data[index_w300 > 0, .(num = uniqueN(symbol)), by = .(date, situ = ifelse(close - pre_close > 0, "UP", "DOWN"))]

date,situ,num
20120104,DOWN,280
20120105,UP,50
20120106,UP,202
20120109,UP,292
20120110,UP,295
20120111,DOWN,220
20120112,UP,112
20120113,DOWN,285
20120116,DOWN,284
20120117,UP,295


#### *5. 每天每个行业各有多少只股票？*


In [6]:
data[, .(stk_num = uniqueN(symbol)), by = .(date, industry)]

date,industry,stk_num
<int>,<chr>,<int>
20120104,BANKS,16
20120105,BANKS,16
20120106,BANKS,16
20120109,BANKS,16
20120110,BANKS,16
20120111,BANKS,16
20120112,BANKS,16
20120113,BANKS,16
20120116,BANKS,16
20120117,BANKS,16


#### *6. 股票数最大的行业和总成交额最大的行业是否总是同一个行业？*

##### 6.1 *Key 1*

In [3]:
data1 <- data[, .(trd_amount = sum(amount), stk_num = uniqueN(symbol)), by = .(date, industry)]
data1[data1[, .I[trd_amount == max(trd_amount)], by = date]$V1]$industry == data1[data1[, .I[stk_num == max(stk_num)], by = date]$V1]$industry

##### 6.2 *Key 2*

In [4]:
data[, .(trd_amount = sum(amount), stk_num = uniqueN(symbol)), by = .(date, industry)
    ][, .SD[trd_amount == max(trd_amount) & stk_num == max(stk_num), .(industry)], keyby = .(date)]

date,industry
<int>,<chr>
20120104,HDWRSEMI
20120106,HDWRSEMI
20120213,HDWRSEMI
20120216,HDWRSEMI
20120217,HDWRSEMI
20120220,HDWRSEMI
20120221,HDWRSEMI
20120222,HDWRSEMI
20120301,HDWRSEMI
20120302,HDWRSEMI


#### *7. 每天涨幅超过5%、跌幅超过5%的股票各有多少？*

In [10]:
data[(close - pre_close)/pre_close > 0.05 | (close - pre_close)/pre_close < -0.05, .(symbol_amount = uniqueN(symbol)), by = .(tag = ifelse((close - pre_close)/pre_close > 0.05, "up5%ext", "down5%ext"), date)]

tag,date,symbol_amount
<chr>,<int>,<int>
up5%ext,20120227,47
up5%ext,20120228,50
up5%ext,20120319,109
up5%ext,20120117,1430
up5%ext,20120106,51
up5%ext,20120111,71
down5%ext,20120116,514
up5%ext,20120314,23
up5%ext,20120320,23
down5%ext,20120328,1112


#### *8. 每天涨幅前10的股票的总成交额和跌幅前10的股票的总成交额比例是多少？*

In [9]:
data[, .SD, keyby = .(date, violet = ((close - pre_close)/pre_close))
    ][, .(ratio = sum(amount[1:10])/sum(amount[(.N-10):.N])), by = date]

date,ratio
<int>,<dbl>
20120104,0.4872509
20120105,0.4865540
20120106,0.6407234
20120109,0.3406203
20120110,1.7591140
20120111,0.6983067
20120112,2.2585934
20120113,0.2505291
20120116,1.2686356
20120117,0.3230138


#### *9. 每天开盘涨停的股票与收盘涨停的股票各有多少？（涨停按照收益率超过1.5%的标准计算）*

In [13]:
data[(close/pre_close - 1) > 0.015 | (open/pre_close - 1) > 0.015, .SD, by = .(date, symbol, tag = ifelse((close/pre_close - 1) > 0.015, "closelimit", "openlimit"))
    ][, .(symbol_amount = uniqueN(symbol)), keyby = .(date, tag)]

date,tag,symbol_amount
<int>,<chr>,<int>
20120104,closelimit,70
20120104,openlimit,297
20120105,closelimit,60
20120105,openlimit,22
20120106,closelimit,743
20120106,openlimit,29
20120109,closelimit,2142
20120109,openlimit,3
20120110,closelimit,2125
20120110,openlimit,1


#### *10. 每天统计最近3天出现过开盘涨停、跌停的股票各有多少只？*

In [16]:
data[(close/pre_close - 1) > 0.015 | (open/pre_close - 1) > 0.015, .SD, by = .(date, symbol, tag = ifelse((close/pre_close - 1) > 0.015, "closelimit", "openlimit"))
    ][, uniqueN(symbol), keyby = .(tag, date)
    ][, .(symbol_amount = {
        a <- vector()
        for (i in 4:.N) {
            a[i] <- sum(V1[(i-3):(i-1)])
        }
        a
    }, date), by = .(tag)]

tag,symbol_amount,date
<chr>,<int>,<int>
closelimit,,20120104
closelimit,,20120105
closelimit,,20120106
closelimit,873,20120109
closelimit,2945,20120110
closelimit,5010,20120111
closelimit,4621,20120112
closelimit,2737,20120113
closelimit,668,20120116
closelimit,354,20120117


#### *11. 股票每天的成交额变化率和收益率的相关性如何？*

In [17]:
data[, .(amount_change = {
    a <- vector()
    for (i in 2:.N) {
        a[i] <- amount[i]/amount[i-1] - 1
    }
    a
    }, ret = close/pre_close - 1, symbol = symbol)
    ][is.finite(amount_change), na.omit(.SD)
    ][, cor(amount_change, ret)]

#### *12. 每天每个行业的总成交额变化率和行业收益率的相关性如何？*

In [19]:
data[, .(ind_amount = sum(amount), weight = capt/sum(capt), ret = close/pre_close - 1), keyby = .(date, industry)
    ][, .(ind_ret = sum(weight * ret), ind_amount = ind_amount), keyby = .(industry, date)
    ][, unique(.SD)
    ][, .(ind_amount_change = {
        a <- vector()
        for (i in 2:.N) {
            a[i] <- ind_amount[i]/ind_amount[i-1] - 1
        }
        a
    }, ind_ret = ind_ret), keyby = .(industry)
    ][!is.na(ind_amount_change), cor(ind_amount_change, ind_ret)]

#### *13. 每天市场的总成交额变化率和市场收益率相关性如何？*

In [21]:
data[, .(mkt_amount = sum(amount), weight = capt/sum(capt), ret = close/pre_close - 1), keyby = date
    ][, .(mkt_ret = sum(weight * ret), mkt_amount = mkt_amount), keyby = date
    ][, unique(.SD)
    ][, .(mkt_amount_change = {
        a <- vector()
        for (i in 2:.N) {
            a[i] <- mkt_amount[i]/mkt_amount[i-1]-1
        }
        a
    }, mkt_ret = mkt_ret)
    ][!is.na(mkt_amount_change), cor(mkt_amount_change, mkt_ret)]

#### *14. 每天市场的总成交额的变化率和所有股票收益率的标准差相关性如何？*

In [23]:
data[, .(mkt_amount = sum(amount), ret = close/pre_close - 1, symbol = symbol), keyby = date
    ][, .(ret_sd = unique(sd(ret)), mkt_amount = unique(mkt_amount)), keyby = date
    ][, .(mkt_amount_change = {
        a <- vector()
        for (i in 2:.N) {
            a[i] <- mkt_amount[i]/mkt_amount[i-1] - 1
        }
        a
    }, ret_sd = ret_sd)
    ][!is.na(mkt_amount_change), cor(mkt_amount_change, ret_sd)]

#### *15. 每天每个行业的总成交额变化率和行业内股票收益率的标准差相关性如何？*

In [24]:
data[, .(ind_amount = sum(amount), ret = close/pre_close - 1, symbol = symbol), keyby = .(industry, date)
    ][, .(ind_ret_sd = unique(sd(ret)), ind_amount = unique(ind_amount)), keyby = .(industry, date)
    ][, .(ind_amount_change = {
        a <- vector()
        for (i in 2:.N) {
            a[i] <- ind_amount[i]/ind_amount[i-1] - 1
        }
        a
    }, ind_ret_sd = ind_ret_sd), keyby = industry
    ][!is.na(ind_amount_change), cor(ind_ret_sd, ind_amount_change)]

#### *16. 上证50、沪深300、中证500指数成分股中，沪股和深股各有多少？*

In [25]:
list(data[index_w50 > 0, .(stkcd_amount = uniqueN(symbol), type = "index_w50"), by = .(type = ifelse(str_detect(symbol, "SH"), "SH", "SZ"))], 
    data[index_w300 > 0, .(stkcd_amount = uniqueN(symbol), type = "index_w300"), by = .(type = ifelse(str_detect(symbol, "SH"), "SH", "SZ"))], 
    data[index_w500 > 0, .(stkcd_amount = uniqueN(symbol), type = "index_w500"), by = .(type = ifelse(str_detect(symbol, "SH"), "SH", "SZ"))]) %>% rbindlist()

type,stkcd_amount,type
<chr>,<int>,<chr>.1
SH,54,index_w50
SH,205,index_w300
SZ,95,index_w300
SH,297,index_w500
SZ,248,index_w500


#### *17. 上证50、沪深300、中证500指数成分股中，行业分布如何？*

In [26]:
list(data[index_w50 > 0, .(stkcd_amount = uniqueN(symbol), type = "index_w50"), by = .(industry)], 
    data[index_w300 > 0, .(stkcd_amount = uniqueN(symbol), type = "index_w300"), by = .(industry)], 
    data[index_w500 > 0, .(stkcd_amount = uniqueN(symbol), type = "index_w500"), by = .(industry)]) %>% rbindlist()

industry,stkcd_amount,type
<chr>,<int>,<chr>
BANKS,11,index_w50
MTLMIN,10,index_w50
ENERGY,7,index_w50
DVFININS,6,index_w50
MACH,4,index_w50
REALEST,2,index_w50
HDWRSEMI,1,index_w50
TRDDIST,1,index_w50
CNSTENG,3,index_w50
ELECEQP,1,index_w50


#### *18. 每天上证50、沪深300、中证500指数成分股的总成交额各是多少？*

In [27]:
list(data[index_w50 > 0, .(amount = sum(amount) %>% unique(), type = "index_w50"), by = .(date)], 
    data[index_w300 > 0, .(amount = sum(amount) %>% unique(), type = "index_w300"), by = .(date)], 
    data[index_w500 > 0, .(amount = sum(amount) %>% unique(), type = "index_w500"), by = .(date)]) %>% rbindlist()

date,amount,type
<int>,<dbl>,<chr>
20120104,11372037211,index_w50
20120105,15174601949,index_w50
20120106,11457969686,index_w50
20120109,19783087939,index_w50
20120110,25840613156,index_w50
20120111,17961005017,index_w50
20120112,16699936305,index_w50
20120113,16871980689,index_w50
20120116,12436137122,index_w50
20120117,24730323911,index_w50


#### *19. 上证50、沪深300、中证500指数日收益率的历史波动率是多少？*

In [29]:
list(data[, .(index_ret = sum(index_w50 * (close/pre_close - 1))), keyby = .(date)
        ][, .(vol = sd(index_ret), type = "index_w50")],
    data[, .(index_ret = sum(index_w300 * (close/pre_close - 1))), keyby = .(date)
        ][, .(vol = sd(index_ret), type = "index_w300")],
    data[, .(index_ret = sum(index_w500 * (close/pre_close - 1))), keyby = .(date)
        ][, .(vol = sd(index_ret), type = "index_w500")]) %>% rbindlist()

vol,type
<dbl>,<chr>
0.01234205,index_w50
0.01336797,index_w300
0.01607531,index_w500


#### *20. 上证50、沪深300、中证500指数日收益率的相关系数矩阵？*

In [31]:
data[, .(index_w50_ret = sum(index_w50 * (close/pre_close - 1)), index_w300_ret = sum(index_w300 * (close/pre_close - 1)), index_w500_ret = sum(index_w500 * (close/pre_close - 1))), keyby = date
    ][, cor(.SD[, -1])]

Unnamed: 0,index_w50_ret,index_w300_ret,index_w500_ret
index_w50_ret,1.0,0.9760849,0.8476975
index_w300_ret,0.9760849,1.0,0.933358
index_w500_ret,0.8476975,0.933358,1.0


#### *21. 上证50、沪深300、去除上证50的沪深300指数日收益率的相关系数矩阵？*

In [32]:
data[, .(index_w50_ret = sum(index_w50 * (close/pre_close - 1)), index_w300_ret = sum(index_w300 * (close/pre_close - 1)), index_w300_50 = ifelse(index_w50 == 0 & index_w300 > 0, index_w300, 0), close = close, pre_close = pre_close), keyby = date       ][, .(index_w50_ret = unique(index_w50_ret), index_w300_ret = unique(index_w300_ret), index_w300_50_ret = sum(index_w300_50 * (close/pre_close - 1)) %>% unique()), keyby = date
    ][, cor(.SD[, -1])]

Unnamed: 0,index_w50_ret,index_w300_ret,index_w300_50_ret
index_w50_ret,1.0,0.9760849,0.9294381
index_w300_ret,0.9760849,1.0,0.9874054
index_w300_50_ret,0.9294381,0.9874054,1.0


#### *22. 每天沪深300指数成分占比最大的10只股票是哪些？*

In [34]:
data[order(date, index_w300), .(symbol = symbol[1:10]), by = .(date)]

date,symbol
<int>,<chr>
20120104,600004.SH
20120104,600006.SH
20120104,600007.SH
20120104,600011.SH
20120104,600012.SH
20120104,600017.SH
20120104,600018.SH
20120104,600020.SH
20120104,600021.SH
20120104,600022.SH


#### *23. 各个行业的平均每日股票数量从大到小排序是什么？*

In [35]:
data[, .(stkcd_amount = uniqueN(symbol)), keyby = .(date, industry)
    ][order(date, -stkcd_amount)]

date,industry,stkcd_amount
<int>,<chr>,<int>
20120104,HDWRSEMI,224
20120104,CHEM,219
20120104,MACH,201
20120104,HEALTH,172
20120104,ELECEQP,131
20120104,REALEST,131
20120104,MTLMIN,128
20120104,LEISLUX,106
20120104,FOODPROD,99
20120104,AUTO,85


#### *24. 每个行业每天成交额最大的一只股票代码是什么？*

In [36]:
data[order(-amount), .(symbol = symbol[1]), keyby = .(date, industry)]

date,industry,symbol
<int>,<chr>,<chr>
20120104,AERODEF,000768.SZ
20120104,AIRLINE,600029.SH
20120104,AUTO,600104.SH
20120104,BANKS,600036.SH
20120104,BEV,600519.SH
20120104,BLDPROD,601636.SH
20120104,CHEM,600160.SH
20120104,CNSTENG,601117.SH
20120104,COMSERV,000544.SZ
20120104,CONMAT,600585.SH


#### *25. 每个行业每天最大成交额是最小成交额的几倍？*

In [37]:
data[order(-amount) & amount > 0, .(times = amount[1]/amount[.N]), keyby = .(date, industry)]

date,industry,times
<int>,<chr>,<dbl>
20120104,AERODEF,1.6822610
20120104,AIRLINE,17.0340980
20120104,AUTO,0.3134478
20120104,BANKS,4.5180305
20120104,BEV,0.2922401
20120104,BLDPROD,0.7188041
20120104,CHEM,1.0349111
20120104,CNSTENG,0.5009316
20120104,COMSERV,1.0394275
20120104,CONMAT,4.2984205


#### *26. 每个行业每天成交额最大的5只股票和成交额总和是多少？*

In [38]:
data[order(-amount), .(symbol = symbol[1:5], amount = sum(amount[1:5])), keyby = .(date, industry)]

date,industry,symbol,amount
<int>,<chr>,<chr>,<dbl>
20120104,AERODEF,000768.SZ,384758246
20120104,AERODEF,600316.SH,384758246
20120104,AERODEF,002025.SZ,384758246
20120104,AERODEF,600893.SH,384758246
20120104,AERODEF,600118.SH,384758246
20120104,AIRLINE,600029.SH,321770512
20120104,AIRLINE,600221.SH,321770512
20120104,AIRLINE,601111.SH,321770512
20120104,AIRLINE,600115.SH,321770512
20120104,AIRLINE,000099.SZ,321770512


#### *27. 每个行业每天成交额超过该行业中股票成交额80%分位数的股票的平均收益率是多少？*

In [39]:
data[, .(symbol = symbol, ret = close/pre_close - 1, amount = amount, industry, date)
    ][, .SD[amount > quantile(amount, 0.8)], keyby = .(date, industry)
    ][, .(aver_ret = mean(ret)), keyby = .(date, industry)]

date,industry,aver_ret
<int>,<chr>,<dbl>
20120104,AERODEF,0.005714392
20120104,AIRLINE,-0.024726379
20120104,AUTO,-0.017813417
20120104,BANKS,-0.007846674
20120104,BEV,-0.056254460
20120104,BLDPROD,-0.031741456
20120104,CHEM,-0.032022583
20120104,CNSTENG,-0.026837227
20120104,COMSERV,-0.032827341
20120104,CONMAT,-0.032062156


#### *28. 每天成交额最大的10%的股票的平均收益率和成交额最小的10%的股票的平均收益率的相关系数是多少？*

In [40]:
data[, .(symbol = symbol, ret = close/pre_close - 1, amount = amount, date)
    ][, .(ret_aver = mean(ret)), keyby = .(date, tag = ifelse(amount > quantile(amount, 0.9), "max10%"，ifelse(amount < quantile(amount, 0.1), "min10%", "others")))
    ][, .SD[-3], keyby = .(date)]

date,tag,ret_aver
<int>,<chr>,<dbl>
20120104,max10%,-1.810573e-02
20120104,min10%,-1.790767e-02
20120105,max10%,-2.175878e-02
20120105,min10%,-3.048809e-02
20120106,max10%,5.994575e-03
20120106,min10%,4.869812e-03
20120109,max10%,4.670463e-02
20120109,min10%,2.662675e-02
20120110,max10%,4.483562e-02
20120110,min10%,1.404222e-02


#### *29. 每天哪些行业的平均成交额高于全市场平均成交额？*

In [41]:
data[, .(amount_aver = mean(amount), industry = industry, amount = amount), keyby = .(date)
    ][, .(amount_ind_aver = mean(amount), amount_aver), keyby = .(date, industry)
    ][amount_ind_aver > amount_aver, unique(.SD[, 1:2])]

date,industry
<int>,<chr>
20120104,AERODEF
20120104,BANKS
20120104,BEV
20120104,CONMAT
20120104,DVFININS
20120104,ENERGY
20120104,HOUSEDUR
20120104,INDCONG
20120104,MEDIA
20120104,MTLMIN


#### *30. 每天每个股票对市场的超额收益率是多少？*

In [42]:
data[, .(stkcd_ret = close/pre_close - 1, weight = capt/sum(capt), symbol), keyby = date
    ][, .(mkt_ret = sum(weight*stkcd_ret), stkcd_ret, symbol), keyby = date
    ][, .(alpha = coef(lm(stkcd_ret ~ mkt_ret))[1], beta = coef(lm(stkcd_ret ~ mkt_ret))[2], mkt_ret, stkcd_ret, symbol, date)
    ][, .(abnr_ret = stkcd_ret - alpha - beta * mkt_ret), keyby = .(date, symbol)]

date,symbol,abnr_ret
<int>,<chr>,<dbl>
20120104,000001.SZ,-0.0075757072
20120104,000002.SZ,0.0012644428
20120104,000004.SZ,-0.0022436150
20120104,000005.SZ,0.0200060760
20120104,000006.SZ,0.0102499784
20120104,000007.SZ,0.0701580517
20120104,000008.SZ,-0.0302091320
20120104,000009.SZ,-0.0533884194
20120104,000010.SZ,0.0200060760
20120104,000011.SZ,0.0021778750


#### *31. 每天每个股票对市场去除自身的超额收益率是多少？*

In [43]:
data[, .(stkcd_ret = close/pre_close - 1, symbol, weight =  capt/(sum(capt) - capt)),keyby = date
    ][, .(mkt_ret = sum(weight*stkcd_ret), stkcd_ret, symbol), keyby = date
    ][, .(alpha = coef(lm(stkcd_ret ~ mkt_ret))[1], beta = coef(lm(stkcd_ret ~ mkt_ret))[2], mkt_ret, stkcd_ret, symbol, date)
    ][, .(abnr_ret = stkcd_ret - alpha - beta * mkt_ret), keyby = .(date, symbol)]

date,symbol,abnr_ret
<int>,<chr>,<dbl>
20120104,000001.SZ,-0.0077629463
20120104,000002.SZ,0.0010772037
20120104,000004.SZ,-0.0024308541
20120104,000005.SZ,0.0198188369
20120104,000006.SZ,0.0100627393
20120104,000007.SZ,0.0699708126
20120104,000008.SZ,-0.0303963711
20120104,000009.SZ,-0.0535756585
20120104,000010.SZ,0.0198188369
20120104,000011.SZ,0.0019906359


#### *32. 每天每个股票对行业的超额收益率是多少？*

In [44]:
data[, .(stkcd_ret = close/pre_close - 1, symbol, weight = capt/sum(capt)), keyby = .(industry,date)
    ][, .(ind_ret = sum(weight*stkcd_ret), stkcd_ret, symbol), keyby = .(industry, date)
    ][, .(alpha = coef(lm(stkcd_ret ~ ind_ret))[1], beta = coef(lm(stkcd_ret ~ ind_ret))[2], ind_ret, stkcd_ret, symbol, date), keyby = .(industry)
    ][, .(abnr_ret = stkcd_ret - alpha - beta * ind_ret), keyby = .(date, symbol)]

date,symbol,abnr_ret
<int>,<chr>,<dbl>
20120104,000001.SZ,-0.0193325437
20120104,000002.SZ,0.0009002525
20120104,000004.SZ,0.0070727953
20120104,000005.SZ,0.0196418857
20120104,000006.SZ,0.0098857881
20120104,000007.SZ,0.0681032179
20120104,000008.SZ,-0.0279492664
20120104,000009.SZ,-0.0412176610
20120104,000010.SZ,0.0278375982
20120104,000011.SZ,0.0018136847


#### *33. 每天每个股票对行业去除自身的超额收益率是多少？*

In [45]:
data[, .(stkcd_ret = close/pre_close - 1, symbol, weight = capt/(sum(capt) - capt)), keyby = .(industry,date)
    ][, .(ind_ret = sum(weight*stkcd_ret), stkcd_ret, symbol), keyby = .(industry, date)
    ][, .(alpha = coef(lm(stkcd_ret ~ ind_ret))[1], beta = coef(lm(stkcd_ret ~ ind_ret))[2], ind_ret, stkcd_ret, symbol, date), keyby = .(industry)
    ][, .(abnr_ret = stkcd_ret - alpha - beta * ind_ret), keyby = .(date, symbol)]

date,symbol,abnr_ret
<int>,<chr>,<dbl>
20120104,000001.SZ,-0.0196847604
20120104,000002.SZ,0.0006789611
20120104,000004.SZ,0.0071187283
20120104,000005.SZ,0.0194205943
20120104,000006.SZ,0.0096644967
20120104,000007.SZ,0.0679612938
20120104,000008.SZ,-0.0286016284
20120104,000009.SZ,-0.0411302307
20120104,000010.SZ,0.0277827474
20120104,000011.SZ,0.0015923933


#### *34. 每个股票每天对市场的超额收益率与对行业的超额收益率的相关系数如何？*

In [46]:
mkt <- data[, .(stkcd_ret = close/pre_close - 1, weight = capt/sum(capt), symbol), keyby = date
    ][, .(mkt_ret = sum(weight/100*stkcd_ret), stkcd_ret, symbol), keyby = date
    ][, .(alpha = coef(lm(stkcd_ret ~ mkt_ret))[1], beta = coef(lm(stkcd_ret ~ mkt_ret))[2], mkt_ret, stkcd_ret, symbol, date)
    ][, .(abnr_mkt_ret = stkcd_ret - alpha - beta * mkt_ret), keyby = .(date, symbol)]
ind <- data[, .(stkcd_ret = close/pre_close - 1, symbol, weight = capt/sum(capt)), keyby = .(industry,date)
    ][, .(ind_ret = sum(weight/100*stkcd_ret), stkcd_ret, symbol), keyby = .(industry, date)
    ][, .(alpha = coef(lm(stkcd_ret ~ ind_ret))[1], beta = coef(lm(stkcd_ret ~ ind_ret))[2], ind_ret, stkcd_ret, symbol, date), keyby = .(industry)
    ][, .(abnr_ind_ret = stkcd_ret - alpha - beta * ind_ret), keyby = .(date, symbol)]
cor(mkt[ind, on = .(date, symbol)][, 3:4])
rm(mkt, ind)

Unnamed: 0,abnr_mkt_ret,abnr_ind_ret
abnr_mkt_ret,1.0,0.946769
abnr_ind_ret,0.946769,1.0


#### *35. 每天有哪些行业的平均收益率超过市场平均收益率？*

In [47]:
data[, .(stkcd_ret = close/pre_close - 1, symbol, ind_weight = capt/sum(capt), capt), keyby = .(industry,date)
    ][, .(ind_ret = sum(ind_weight*stkcd_ret), stkcd_ret, symbol, capt), keyby = .(industry, date)
    ][, .(ind_ret, mkt_weight = capt/sum(capt), stkcd_ret, industry), keyby = date
    ][, .(ind_ret, mkt_ret = sum(mkt_weight*stkcd_ret), industry), keyby = date
    ][ind_ret > mkt_ret, unique(.SD)]

date,ind_ret,mkt_ret,industry
<int>,<dbl>,<dbl>,<chr>
20120104,-0.0084033878,-0.015816484,AERODEF
20120104,-0.0130576592,-0.015816484,AUTO
20120104,-0.0067372496,-0.015816484,BANKS
20120104,-0.0037138310,-0.015816484,ENERGY
20120104,-0.0113401442,-0.015816484,HOUSEDUR
20120104,-0.0099071718,-0.015816484,RDRLTRAN
20120104,-0.0092269648,-0.015816484,UTILITIE
20120105,-0.0120803353,-0.014367639,AIRLINE
20120105,-0.0085605288,-0.014367639,AUTO
20120105,0.0150350080,-0.014367639,BANKS


#### *36. 每天每个行业对市场的超额收益率是多少？*

In [48]:
data[, .(stkcd_ret = close/pre_close - 1, symbol, ind_weight = capt/sum(capt), capt), keyby = .(industry,date)
    ][, .(ind_ret = sum(ind_weight*stkcd_ret), stkcd_ret, symbol, capt), keyby = .(industry, date)
    ][, .(ind_ret, mkt_weight = capt/sum(capt), stkcd_ret, industry), keyby = date
    ][, .(ind_ret, mkt_ret = sum(mkt_weight*stkcd_ret), industry), keyby = date
    ][, unique(.SD)
    ][, .(alpha = coef(lm(ind_ret ~ mkt_ret))[1], beta = coef(lm(ind_ret ~ mkt_ret))[2], ind_ret, mkt_ret, industry, date)
    ][, .(abnr_ret = ind_ret - alpha - beta*mkt_ret, date), by = industry]

industry,abnr_ret,date
<chr>,<dbl>,<int>
AERODEF,0.0098993621,20120104
AERODEF,-0.0087391716,20120105
AERODEF,-0.0282474476,20120106
AERODEF,-0.0062830802,20120109
AERODEF,0.0053427162,20120110
AERODEF,0.0033921876,20120111
AERODEF,0.0051834662,20120112
AERODEF,-0.0276084669,20120113
AERODEF,-0.0160809883,20120116
AERODEF,0.0035080100,20120117


#### *37. 每天每个行业对去除本行业后的市场超额收益是多少？*

In [49]:
data[, .(stkcd_ret = close/pre_close - 1, symbol, ind_weight = capt/sum(capt), capt), keyby = .(industry,date)
    ][, .(ind_ret = sum(ind_weight*stkcd_ret), stkcd_ret, symbol, capt), keyby = .(industry, date)
    ][, .(ind_capt = sum(capt), ind_ret), keyby = .(industry, date)
    ][, unique(.SD)
    ][, .(ind_mkt_weight = ind_capt/(sum(ind_capt)-ind_capt), ind_ret, industry), keyby = date
    ][, .(mkt_ret = sum(ind_mkt_weight * ind_ret), ind_ret, industry), by = date
    ][, .(alpha = coef(lm(ind_ret ~ mkt_ret))[1], beta = coef(lm(ind_ret ~ mkt_ret))[2], ind_ret, mkt_ret, industry, date)
    ][, .(abnr_ret = ind_ret - alpha - beta*mkt_ret, date), by = industry]

industry,abnr_ret,date
<chr>,<dbl>,<int>
AERODEF,0.0094213482,20120104
AERODEF,-0.0101032674,20120105
AERODEF,-0.0286978408,20120106
AERODEF,-0.0065539745,20120109
AERODEF,0.0057905729,20120110
AERODEF,0.0036557266,20120111
AERODEF,0.0050883770,20120112
AERODEF,-0.0286276061,20120113
AERODEF,-0.0164406859,20120116
AERODEF,0.0034710088,20120117


#### *38. 每天分别有多少股票是最近连续3个交易日上涨、下跌的？*

In [50]:
data[, .(stkcd_ret = close/pre_close - 1), keyby = .(symbol, date)
    ][, {
        l <- list()
        b1 <- stkcd_ret > 0
        b2 <- stkcd_ret < 0
        for (t in 4:.N) {
            l[[t]] <- as.list(c(r3day_up = mean(c(b1[t-1], b1[t-2], b1[t-3])), r3day_dn = mean(c(b2[t-1], b1[t-2], b1[t-3])), date = date[t]))
        }
        rbindlist(l)
    }    
    , keyby = .(symbol)
    ][!is.na(date), .(stkcd_amount = uniqueN(symbol)), keyby = .(date, tag = ifelse(r3day_up == 1, "r3day_up", ifelse(r3day_dn == 1, "r3day_dn", "others")))
    ][tag == "r3day_dn"|tag == "r3day_up"]

date,tag,stkcd_amount
<dbl>,<chr>,<int>
20120109,r3day_dn,17
20120109,r3day_up,16
20120110,r3day_dn,1
20120110,r3day_up,71
20120111,r3day_dn,6
20120111,r3day_up,1401
20120112,r3day_dn,1126
20120112,r3day_up,932
20120113,r3day_dn,641
20120113,r3day_up,272


#### *39. 每天分别有多少股票是最近连续3个交易日收益率超过当天市场平均收益率？*

In [51]:
data[, .(stkcd_ret = close/pre_close - 1, mkt_weight = capt/sum(capt), symbol), keyby = .(date)
    ][, .(mkt_ret = sum(stkcd_ret * mkt_weight), symbol, stkcd_ret), keyby = .(date)
    ][, .(r3day_ret = ifelse(stkcd_ret > mkt_ret, 1, 0)), keyby = .(symbol, date)
    ][, {
        l <- list()
        for (t in 4:.N) {
            l[[t]] <- as.list(c(r3day_ret = mean(c(r3day_ret[t-1], r3day_ret[t-2], r3day_ret[t-3])), date = date[t]))
        }
        rbindlist(l)
    }, keyby = symbol
    ][r3day_ret == 1 & !is.na(date), .(stkcd_amount = uniqueN(symbol)), keyby = date]

date,stkcd_amount
<dbl>,<int>
20120109,81
20120110,95
20120111,785
20120112,905
20120113,392
20120116,153
20120117,145
20120118,50
20120119,169
20120120,238


#### *40. 每天分别有多少股票是最新5个交易日中至少有4个交易日的收益率超过当天市场平均收益率？*

In [53]:
data[, .(stkcd_ret = close/pre_close - 1, mkt_weight = capt/sum(capt), symbol), keyby = .(date)
    ][, .(mkt_ret = sum(stkcd_ret * mkt_weight), symbol, stkcd_ret), keyby = .(date)
    ][, .(r3day_ret = ifelse(stkcd_ret > mkt_ret, 1, 0)), keyby = .(symbol, date)
    ][, {
        l <- list()
        for (t in 6:.N) {
            l[[t]] <- as.list(c(r3day_ret = mean(c(r3day_ret[t-1], r3day_ret[t-2], r3day_ret[t-3], r3day_ret[t-4], r3day_ret[t-5])), date = date[t]))
        }
        rbindlist(l)
    }, keyby = symbol
    ][r3day_ret > 0.8 & !is.na(date), .(stkcd_amount = uniqueN(symbol)), keyby = date]

date,stkcd_amount
<dbl>,<int>
20120111,14
20120112,22
20120113,184
20120116,48
20120117,18
20120118,13
20120119,8
20120120,12
20120130,35
20120131,36


#### *41. 每个月中，个股月收益超过市场月收益1倍以上的股票有哪些？*

In [54]:
data[, .(stkcd_ret = close/pre_close - 1, mkt_weight = capt/sum(capt), symbol), keyby = .(date)
    ][, .(mkt_ret = sum(mkt_weight * stkcd_ret), stkcd_ret, symbol, date_ym = str_sub(date, start = 1, end = 6)), keyby = .(date)
    ][, .(stkcd_m_ret = mean(stkcd_ret), date, mkt_ret), keyby = .(symbol, date_ym)
    ][, .(stkcd_m_ret, symbol, date, mkt_m_ret = mean(mkt_ret)), keyby = .(date_ym)
    ][stkcd_m_ret > 2*mkt_m_ret, .(symbol = unique(symbol)), keyby = .(date_ym)]

date_ym,symbol
<chr>,<chr>
201201,000009.SZ
201201,000012.SZ
201201,000017.SZ
201201,000018.SZ
201201,000030.SZ
201201,000034.SZ
201201,000039.SZ
201201,000040.SZ
201201,000043.SZ
201201,000059.SZ


#### *42. 每个月中，个股月收益超过行业月收益1倍以上的股票有哪些？*

In [56]:
data[, .(stkcd_ret = close/pre_close - 1, ind_weight = capt/sum(capt), symbol), keyby = .(industry, date)
    ][, .(ind_ret = sum(ind_weight * stkcd_ret), stkcd_ret, symbol, date_ym = str_sub(date, start = 1, end = 6)), keyby = .(industry, date)
    ][, .(stkcd_m_ret = mean(stkcd_ret), date, ind_ret), keyby = .(symbol, date_ym)
    ][, .(stkcd_m_ret, symbol, date, ind_m_ret = mean(ind_ret)), keyby = .(date_ym)
    ][stkcd_m_ret > 2*ind_m_ret, .(symbol = unique(symbol)), keyby = .(date_ym)]

date_ym,symbol
<chr>,<chr>
201201,000001.SZ
201201,000006.SZ
201201,000007.SZ
201201,000009.SZ
201201,000012.SZ
201201,000017.SZ
201201,000018.SZ
201201,000020.SZ
201201,000021.SZ
201201,000022.SZ


#### *43. 每个股票的收益率对市场收益率的相关系数最高的10个股票是哪些？*

In [57]:
data[, .(stkcd_ret = close/pre_close - 1, mkt_weight = capt/sum(capt), symbol), keyby = .(date)
    ][, .(stkcd_ret, mkt_ret = sum(mkt_weight * stkcd_ret), symbol), keyby = .(date)
    ][, .(cor_coef = cor(stkcd_ret, mkt_ret)), keyby = symbol
    ][order(-cor_coef), .SD[1:10]]

"the standard deviation is zero"

symbol,cor_coef
<chr>,<dbl>
300331.SZ,1.0
600508.SH,0.9279554
601101.SH,0.9097758
000685.SZ,0.8979148
601666.SH,0.8939529
002082.SZ,0.8932496
002212.SZ,0.89192
601001.SH,0.8918933
000089.SZ,0.8914474
601168.SH,0.8888058


#### *44. 每个行业日收益率的历史波动率是多少？（用日收益率计算标准差）*

In [58]:
data[, .(stkcd_ret = close/pre_close - 1, ind_weight = capt/sum(capt), symbol), keyby = .(industry, date)
    ][, .(ind_ret = sum(stkcd_ret * ind_weight)), keyby = .(industry, date)
    ][, .(ind_vol = sd(ind_ret)), keyby = .(industry)]

industry,ind_vol
<chr>,<dbl>
AERODEF,0.017499519
AIRLINE,0.017127041
AUTO,0.015096513
BANKS,0.008105859
BEV,0.016407569
BLDPROD,0.016608454
CHEM,0.017265568
CNSTENG,0.015309948
COMSERV,0.017256929
CONMAT,0.021499559


#### *45. 各个行业的日收益率的相关系数矩阵如何？哪两个行业相关性最高、最低？*

In [59]:
cor.coef <- data[, .(stkcd_ret = close/pre_close - 1, ind_weight = capt/sum(capt), symbol, capt), keyby = .(industry, date)
    ][, .(ind_ret = sum(stkcd_ret * ind_weight)), keyby = .(industry, date)
    ][,  dcast(.SD, date ~ industry, value.var = "ind_ret")
    ][, cor(.SD[, -1])]
which(max(cor.coef[cor.coef != 1]) == cor.coef, arr.ind = TRUE)
which(min(cor.coef) == cor.coef, arr.ind = TRUE)
rm(cor.coef)

Unnamed: 0,row,col
HDWRSEMI,16,13
ELECEQP,13,16


Unnamed: 0,row,col
BEV,5,4
BANKS,4,5


#### *46. 各个行业的收益率对市场收益率的相关系数由高到低排列如何？*

In [60]:
data[, .(stkcd_ret = close/pre_close - 1, symbol, ind_weight = capt/sum(capt), capt), keyby = .(industry,date)
    ][, .(ind_ret = sum(ind_weight*stkcd_ret), stkcd_ret, symbol, capt), keyby = .(industry, date)
    ][, .(ind_ret, mkt_weight = capt/sum(capt), stkcd_ret, industry), keyby = date
    ][, .(ind_ret, mkt_ret = sum(mkt_weight*stkcd_ret), industry), keyby = date
    ][, unique(.SD)
    ][, .(cor_coef = cor(ind_ret, mkt_ret)), keyby = .(industry)
    ][order(-cor_coef)]

industry,cor_coef
<chr>,<dbl>
TRDDIST,0.9519013
MACH,0.9511527
CHEM,0.9455305
ELECEQP,0.9451663
CNSTENG,0.9446049
LEISLUX,0.9349492
HDWRSEMI,0.9285804
ENERGY,0.9156957
AUTO,0.9156441
RETAIL,0.91432


#### *47. 每个月总成交额比上个月下降幅度最大的行业是哪个？*

In [61]:
data[, .(date_ym = str_sub(date, start = 1, end = 6), amount), keyby = .(industry, date)
    ][, .(ind_m_amount = sum(amount)), keyby = .(industry, date_ym)
    ][, .(dn_m_range = {
        a <- vector()
        for (t in 2:.N) {
            a[t] <- ind_m_amount[t] - ind_m_amount[t-1]
        }
        a 
    }, date_ym = date_ym), keyby = industry
    ][!is.na(dn_m_range), .SD[min(dn_m_range) == dn_m_range & dn_m_range < 0], keyby = date_ym]

date_ym,industry,dn_m_range
<chr>,<chr>,<dbl>
201203,CONMAT,-21752549725
201204,MTLMIN,-165775634730
201205,REALEST,-4528375663
201206,MTLMIN,-123271994315


#### *48. 数据当中各个股票的最大回撤幅度是多少？（最大回撤是从一个高点到低点的降幅的最大值）*

In [62]:
data[, .(max_rtrt = max(high) - min(low)), keyby = symbol]

symbol,max_rtrt
<chr>,<dbl>
000001.SZ,3.33
000002.SZ,2.43
000004.SZ,2.93
000005.SZ,1.74
000006.SZ,3.00
000007.SZ,10.72
000008.SZ,5.98
000009.SZ,5.32
000010.SZ,4.95
000011.SZ,4.26


#### *49. 每只股票的胜率是多少？（胜率是每天收益率为正数的概率）*

In [63]:
data[, .(stkcd_ret = close/pre_close - 1), keyby = .(symbol, date)
    ][, .(gain_ratio = sum(stkcd_ret > 0)/.N), keyby = symbol]

symbol,gain_ratio
<chr>,<dbl>
000001.SZ,0.4188034
000002.SZ,0.4700855
000004.SZ,0.5128205
000005.SZ,0.1538462
000006.SZ,0.5128205
000007.SZ,0.5555556
000008.SZ,0.4444444
000009.SZ,0.4957265
000010.SZ,0.3076923
000011.SZ,0.5299145


#### *50. 每只股票的盈亏比是多少？（盈亏比是正收益之和与负收益之和的比值的绝对值）*

In [64]:
data[, .(stkcd_ret = close/pre_close - 1), keyby = .(symbol, date)
    ][, .(stkcd_ret_pn = sum(stkcd_ret)), keyby = .(symbol, tag = ifelse(stkcd_ret > 0, "gain", "loss"))
    ][, .(gl_ratio = abs(stkcd_ret_pn[1]/stkcd_ret_pn[.N])), keyby = symbol]

symbol,gl_ratio
<chr>,<dbl>
000001.SZ,0.9718614
000002.SZ,1.2796856
000004.SZ,1.0828690
000005.SZ,0.5532427
000006.SZ,1.3719934
000007.SZ,1.8032631
000008.SZ,1.7014617
000009.SZ,0.9844944
000010.SZ,0.9652698
000011.SZ,1.1896667


#### *51. 市场的胜率是多少？（市场收益率为正的概率）*

In [65]:
data[, .(stkcd_ret = close/pre_close - 1, mkt_weight = capt/sum(capt)), keyby = date
    ][, .(mkt_ret = sum(stkcd_ret * mkt_weight)), keyby = date
    ][, .(gain_ratio = sum(mkt_ret > 0)/.N)]

gain_ratio
<dbl>
0.4871795


#### *52. 市场的盈亏比是多少？（市场中每个股票的市值加权正收益和市值加权负收益之比）*

In [66]:
data[, .(stkcd_ret = close/pre_close - 1, mkt_weight = capt/sum(capt)), keyby = date
    ][, .(mkt_ret = sum(stkcd_ret * mkt_weight)), keyby = date
    ][, .(mkt_ret_pn = sum(mkt_ret)), keyby = .(tag = ifelse(mkt_ret > 0, "gain", "loss"))
    ][, .(gl_ratio = abs(mkt_ret_pn[1]/mkt_ret_pn[.N]))]

gl_ratio
<dbl>
1.147373


#### *53. 每个行业的胜率是多少？*

In [67]:
data[, .(stkcd_ret = close/pre_close - 1, ind_weight = capt/sum(capt)), keyby = .(industry, date)
    ][, .(ind_ret = sum(stkcd_ret * ind_weight)), keyby = .(industry, date)
    ][, .(gain_ratio = sum(ind_ret > 0)/.N), keyby = industry]

industry,gain_ratio
<chr>,<dbl>
AERODEF,0.5213675
AIRLINE,0.4786325
AUTO,0.4700855
BANKS,0.4615385
BEV,0.5470085
BLDPROD,0.5384615
CHEM,0.4957265
CNSTENG,0.5470085
COMSERV,0.5213675
CONMAT,0.5299145


#### *54. 每个行业的盈亏比是多少？（行业盈亏比是行业内每个股票的市值加权的正收益率和市值加权的负收益率之比）*

In [68]:
data[, .(stkcd_ret = close/pre_close - 1, ind_weight = capt/sum(capt)), keyby = .(industry, date)
    ][, .(ind_ret = sum(stkcd_ret * ind_weight)), keyby = .(industry, date)
    ][, .(ind_ret_pn = sum(ind_ret)), keyby = .(industry, tag = ifelse(ind_ret > 0, "gain", "loss"))
    ][, .(gl_ratio = abs(ind_ret_pn[1]/ind_ret_pn[.N])), keyby = industry]

industry,gl_ratio
<chr>,<dbl>
AERODEF,0.9973617
AIRLINE,1.0715381
AUTO,1.120609
BANKS,1.0158729
BEV,1.2812672
BLDPROD,1.2698791
CHEM,1.1149151
CNSTENG,1.2415376
COMSERV,1.1990343
CONMAT,1.0274717


#### *55. 是否存在股票的月成交额超过所在行业当月中某天一天总成交额的情况？*

In [69]:
data[, .(date_ym = str_sub(date, start = 1, end = 6), amount, industry), keyby = .(symbol, date)
    ][, .(stkcd_m_amount = sum(amount), amount, industry, date), keyby = .(date_ym, symbol)
    ][, .(stkcd_m_amount, ind_d_amount = sum(amount), date_ym, symbol), keyby = .(industry, date)
    ][stkcd_m_amount > ind_d_amount]

industry,date,stkcd_m_amount,ind_d_amount,date_ym,symbol
<chr>,<int>,<dbl>,<dbl>,<chr>,<chr>
AERODEF,20120104,1460111191,493331236,201201,000768.SZ
AERODEF,20120104,976641005,493331236,201201,002025.SZ
AERODEF,20120104,1125117546,493331236,201201,600118.SH
AERODEF,20120104,1522650678,493331236,201201,600316.SH
AERODEF,20120104,675067956,493331236,201201,600879.SH
AERODEF,20120104,1231862218,493331236,201201,600893.SH
AERODEF,20120105,1460111191,548805394,201201,000768.SZ
AERODEF,20120105,976641005,548805394,201201,002025.SZ
AERODEF,20120105,1125117546,548805394,201201,600118.SH
AERODEF,20120105,1522650678,548805394,201201,600316.SH


#### *56. 每天每个行业编入、编出的股票各有多少？*

In [70]:
data[, .(symbol_list = list(symbol)), keyby = .(industry, date)
    ][, {
        l <- list()
        for (i in 2:.N) {
            intec <- intersect(symbol_list[[i]], symbol_list[[i-1]]) %>% length()
            len1 <- length(symbol_list[[i]])
            len2 <- length(symbol_list[[i-1]])
            l[[i]] <- list(export = len2-intec, import = len1-intec)
        }
        rbindlist(l)
    }, keyby = industry]

industry,export,import
<chr>,<int>,<int>
AERODEF,0,0
AERODEF,0,0
AERODEF,0,0
AERODEF,0,0
AERODEF,0,0
AERODEF,0,0
AERODEF,0,0
AERODEF,0,0
AERODEF,0,0
AERODEF,0,0


#### *57. 每天每个行业内股票收益率的标准差是多少？*

In [71]:
data[, .(stkcd_ret = close/pre_close - 1), keyby = .(symbol, industry, date)
    ][, .(ret_sd = sd(stkcd_ret)), keyby = .(industry, date)]

industry,date,ret_sd
<chr>,<int>,<dbl>
AERODEF,20120104,0.028368795
AERODEF,20120105,0.029020206
AERODEF,20120106,0.031931409
AERODEF,20120109,0.013774933
AERODEF,20120110,0.014172288
AERODEF,20120111,0.012716938
AERODEF,20120112,0.015646738
AERODEF,20120113,0.015327578
AERODEF,20120116,0.013199788
AERODEF,20120117,0.012681424


#### *58. 每天每个行业内股票收益率的标准差的相关性如何？*

In [72]:
data[, .(stkcd_ret = close/pre_close - 1), keyby = .(symbol, industry, date)
    ][, .(ret_sd = sd(stkcd_ret)), keyby = .(industry, date)
    ][, dcast(.SD, date ~ industry, value.var = "ret_sd")
    ][, cor(.SD[, -1])]

Unnamed: 0,AERODEF,AIRLINE,AUTO,BANKS,BEV,BLDPROD,CHEM,CNSTENG,COMSERV,CONMAT,...,MATERIAL,MEDIA,MTLMIN,PERSPRD,RDRLTRAN,REALEST,RETAIL,SOFTWARE,TRDDIST,UTILITIE
AERODEF,1.0,0.115230313,0.33195634,0.080215236,0.02497865,0.235179203,0.188611021,0.21682816,0.25731553,0.39057378,...,-0.0394749594,0.05502238,0.23774679,0.07375119,0.234620048,0.24081523,0.19727892,0.461432767,0.12838253,0.25716471
AIRLINE,0.11523031,1.0,0.22235548,0.024461977,0.02250268,0.119219798,0.167029633,0.3012702,0.12523655,0.17059602,...,-0.0658453902,0.004517255,0.15329979,0.20779311,0.2385023137,0.23494478,0.27578735,0.009783945,0.32039923,0.13232896
AUTO,0.33195634,0.222355484,1.0,0.072839622,0.26374233,0.520673263,0.369540478,0.32496927,0.57458415,0.28974566,...,0.1203428296,0.101366286,0.34657502,0.40223562,0.4603776921,0.45936853,0.4289623,0.208918543,0.36367545,0.32279054
BANKS,0.08021524,0.024461977,0.07283962,1.0,0.06073737,0.160753394,0.157010691,-0.01045057,0.01772396,0.20336602,...,-0.0382677674,-0.038437239,0.13709376,0.14952446,0.1321562769,0.19262828,0.11673246,-0.042061979,0.07288153,0.26346287
BEV,0.02497865,0.022502678,0.26374233,0.06073737,1.0,0.352289128,0.11448273,0.21168533,0.13948967,0.11835709,...,0.1011156571,0.154698742,0.3678963,0.19861833,0.0538476134,0.07943199,0.20594709,0.122780772,0.16235076,0.17341258
BLDPROD,0.2351792,0.119219798,0.52067326,0.160753394,0.35228913,1.0,0.414073336,0.31393958,0.3659904,0.21024854,...,0.0446785532,0.002654193,0.38414423,0.27733789,0.4058525744,0.32102245,0.27553625,0.22212433,0.2729603,0.34981005
CHEM,0.18861102,0.167029633,0.36954048,0.157010691,0.11448273,0.414073336,1.0,0.16145852,0.12885738,0.14622828,...,0.0257120843,-0.002682042,0.33883559,0.18941091,0.5429834091,0.36418893,0.30988914,0.149509809,0.4211665,0.23607496
CNSTENG,0.21682816,0.301270197,0.32496927,-0.010450569,0.21168533,0.313939577,0.161458518,1.0,0.20157868,0.23481372,...,0.00894301,0.020132783,0.21898239,0.24345692,0.2550771013,0.2147173,0.2926741,0.169888413,0.27961681,0.239629
COMSERV,0.25731553,0.125236551,0.57458415,0.017723962,0.13948967,0.3659904,0.128857378,0.20157868,1.0,0.28805764,...,-0.0338347782,0.064870736,0.32010187,0.29391535,0.3199755964,0.24826972,0.16413563,0.169654134,0.21244947,0.32080986
CONMAT,0.39057378,0.17059602,0.28974566,0.203366025,0.11835709,0.210248543,0.146228278,0.23481372,0.28805764,1.0,...,-0.1031998953,0.049188757,0.41444866,0.22034691,0.2000524703,0.34902288,0.20167059,0.036887861,0.26720844,0.28296124


#### *59. 每天计算出成交额的 z-score （减去均值除以标准差）, 该指标能解释下一天个股超额收益率的多少比例？*

In [73]:
data[, .(stkcd_ret = close/pre_close - 1, weight = capt/sum(capt), symbol, amount), keyby = date
    ][, .(mkt_ret = sum(weight*stkcd_ret), stkcd_ret, symbol, amount), keyby = date
    ][, .(alpha = coef(lm(stkcd_ret ~ mkt_ret))[1], beta = coef(lm(stkcd_ret ~ mkt_ret))[2], mkt_ret, stkcd_ret, symbol, date, amount)
    ][, .(abnr_ret = stkcd_ret - alpha - beta * mkt_ret, amount), keyby = .(date, symbol)
    ][, .(abnr_ret, zscore = (amount - mean(amount))/ sd(amount), symbol), keyby = date
    ][, .(abnr_lead_ret = shift(abnr_ret, n = 1L, type = "lead"), zscore, symbol), keyby = symbol
    ][, summary(lm(abnr_lead_ret ~ zscore))$r.squared]

#### *60. 每个股票的收益率和300、500指数收益率可以回归出一个截距项和2个beta，这两个beta的分布如何？*

In [74]:
data[, .(stkcd_ret = close/pre_close - 1, ind_w300_ret = sum((close/pre_close - 1) * index_w300), ind_w500_ret = sum((close/pre_close - 1) * index_w500), symbol), keyby = date
    ][, .(coef_beta = coef(lm(stkcd_ret ~ ind_w300_ret + ind_w500_ret))[-1], tag = c("ind_w300_ret", "ind_w500_ret")), keyby = symbol
    ][!is.na(coef_beta), .(sample_mean = mean(coef_beta), sample_sd = sd(coef_beta)), keyby = tag]

tag,sample_mean,sample_sd
<chr>,<dbl>,<dbl>
ind_w300_ret,-0.1583057,0.9317869
ind_w500_ret,1.1264276,0.8311509


#### *61. 每天开盘后到最高价涨幅最大的100只股票同样也是全天(昨收到今收)涨幅最大的100只股票的比例是多少?*

In [75]:
data[, .(high_open = high/open - 1, clo_pre = close/pre_close - 1), by = .(symbol, date)
    ][order(date, -high_open), .(ho_symbol = list(symbol[1:100])), keyby = date
    ][data[, .(high_open = high/open - 1, clo_pre = close/pre_close - 1), by = .(symbol, date)    
        ][order(date, -clo_pre), .(cp_symbol = list(symbol[1:100])), keyby = date], on = .(date)
    ][, {
        l <- list()
        for (i in 1:.N) {
            l[[i]] <- list(intec = intersect(cp_symbol[[i]], ho_symbol[[i]]))
        }
        rbindlist(l)
    }, keyby = date
    ][, .(ratio = .N/100), keyby = date]

date,ratio
<int>,<dbl>
20120104,0.52
20120105,0.52
20120106,0.61
20120109,0.70
20120110,0.59
20120111,0.55
20120112,0.51
20120113,0.63
20120116,0.46
20120117,0.32


#### *62. 每天计算最近三天每天对市场的超额收益率都排进当天前100的股票有哪些?*

In [76]:
data[, .(stkcd_ret = close/pre_close - 1, weight = capt/sum(capt), symbol), keyby = date
    ][, .(mkt_ret = sum(weight*stkcd_ret), stkcd_ret, symbol), keyby = date
    ][, .(alpha = coef(lm(stkcd_ret ~ mkt_ret))[1], beta = coef(lm(stkcd_ret ~ mkt_ret))[2], mkt_ret, stkcd_ret, symbol, date)
    ][, .(abnr_ret = stkcd_ret - alpha - beta * mkt_ret), keyby = .(date, symbol)
    ][order(date, -abnr_ret), .(symbol = list(symbol[1:100])), keyby = date
    ][, {
        l <- list()
        for (t in 4:.N) {
            l[[t]] <- list(symbol = Reduce(intersect, list(symbol[[t]], symbol[[t-1]], symbol[[t-2]])), date = date[[t]])
        }
        rbindlist(l)
    }]

"Column 1 ['symbol'] of item 4 is length 0. This (and 10 others like it) has been filled with NA (NULL for list columns) to make each item uniform."

symbol,date
<chr>,<int>
,20120109
000791.SZ,20120110
002118.SZ,20120111
600971.SH,20120111
000791.SZ,20120111
000552.SZ,20120111
600792.SH,20120111
600740.SH,20120111
000034.SZ,20120111
000835.SZ,20120111


#### *63. 每天计算最近三天每天对行业的超额收益率都排进当天行业前30%的股票有哪些?*

In [77]:
data[, .(stkcd_ret = close/pre_close - 1, symbol, weight = capt/sum(capt)), keyby = .(industry,date)
    ][, .(ind_ret = sum(weight*stkcd_ret), stkcd_ret, symbol), keyby = .(industry, date)
    ][, .(alpha = coef(lm(stkcd_ret ~ ind_ret))[1], beta = coef(lm(stkcd_ret ~ ind_ret))[2], ind_ret, stkcd_ret, symbol, date), keyby = .(industry)
    ][, .(abnr_ret = stkcd_ret - alpha - beta * ind_ret), keyby = .(date, symbol)
    ][order(date, -abnr_ret) & abnr_ret > quantile(abnr_ret, 0.7), .(symbol = list(symbol)), keyby = date
    ][, {
        l <- list()
        for (t in 4:.N) {
            l[[t]] <- list(symbol = Reduce(intersect, list(symbol[[t]], symbol[[t-1]], symbol[[t-2]])), date = date[[t]])
        }
        rbindlist(l)
    }]

symbol,date
<chr>,<int>
000039.SZ,20120109
000045.SZ,20120109
000411.SZ,20120109
000422.SZ,20120109
000518.SZ,20120109
000528.SZ,20120109
000607.SZ,20120109
000623.SZ,20120109
000666.SZ,20120109
000707.SZ,20120109
