### [`1. Python多因子选股策略实践`](https://datawhalechina.github.io/whale-quant/#/./ch04_%E9%87%8F%E5%8C%96%E9%80%89%E8%82%A1%E7%AD%96%E7%95%A5/ch04_%E9%87%8F%E5%8C%96%E9%80%89%E8%82%A1%E7%AD%96%E7%95%A5?id=_45-python%e5%a4%9a%e5%9b%a0%e5%ad%90%e9%80%89%e8%82%a1%e7%ad%96%e7%95%a5%e5%ae%9e%e8%b7%b5)

多因子选股模型是一个用来选择股票投资组合的策略，它考虑了多个与预期收益相关的因子。这种模型的理念基于这样的理论：单一因子可能无法全面捕捉到市场的所有变化，而多个因子的组合可以提供更全面、更稳定的预测。

多因子选股模型的核心思想是通过多个因子的组合来选择股票：先利用历史数据计算各因子的取值，再用这些因子值预测股票未来的表现，以期获得更全面、更稳定的预测结果。这些因子可以包括基本面因子、技术分析因子、宏观经济因子等。例如：

- `基本面因子`：包括市盈率（PE）、市净率（PB）、营业收入增长率等
- `技术分析因子`：包括动量（Momentum）、波动率（Volatility）等
- `宏观经济因子`：包括利率、通货膨胀率等

在实现多因子选股模型时，需要进行以下步骤：

- `确定目标和约束条件`：明确多因子模型要达到的投资目标收益率、风险水平等要求。同时考虑实际的投资约束，例如组合数目限制、行业比例限制等。

- `选择因子并计算`：根据目标和约束条件，选择合适的股票因子，如 PE、PB 等。收集数据，计算得到每只股票的各因子值。

- `异常值处理`：检查数据中的异常值和错报数据，进行处理和滤除，保证因子值的质量。

- `因子标准化`：因为不同因子的取值范围差异很大，需要进行标准化处理，例如去均值和缩放等。

- `确定因子权重`：根据因子的重要性给予不同权重，通常通过统计方法（比如主成分分析）来确定。

- `构建多因子模型`：结合因子值和权重，建立多因子评分模型，得到各股票的综合评分。

- `股票筛选和组合优化`：根据评分进行股票筛选，并进行组合优化，获得符合目标和约束条件的优化组合。

- `回测和调整模型`：使用历史数据回测多因子模型的效果，根据结果进行调整和改进。


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Load the factor data and split it into features and target.
df = pd.read_csv('stock_data.csv')
X = df[['PE', 'PB', 'ROE']]  # factor columns used as features
y = df['Returns']  # target variable

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# StandardScaler returns plain NumPy arrays, which have no `.index`, so the
# row labels of the test set must be captured before scaling.
test_index = X_test.index

# Standardize the factors. The scaler is fitted on the training rows only and
# then applied unchanged to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit a linear regression of returns on the standardized factors.
model = LinearRegression()
model.fit(X_train, y_train)

# The fitted coefficients are read as the factor weights.
print('Factor weights:', model.coef_)

# Predict returns for the held-out rows.
y_pred = model.predict(X_test)

# Pair every prediction with the row label it belongs to.
predicted_returns = pd.DataFrame({
    'Stock': test_index,
    'Predicted return': y_pred
})

# Keep only the rows whose predicted return exceeds 0.1.
selected_stocks = predicted_returns[predicted_returns['Predicted return'] > 0.1]

# print('Selected stocks:', selected_stocks)

### [`2. 【手把手教你】Python量化Fama-French三因子模型`](https://www.shangyexinzhi.com/article/391399.html)


In [1]:
import asyncio
from pylab import mpl
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# import matplotlib as mpl
import datetime
import time
# Apply seaborn's defaults first, then override the matplotlib settings so
# that Chinese text and the minus sign render correctly in figures.
sns.set()
mpl.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

In [2]:
from data_api import TSDataAPI
from stock_downloader import FetchStockData, FetchStockDataAsync
from db_client import PostgreSQLClient, DBOperations

In [3]:
# Instantiate the project's PostgreSQL client (constructed with no arguments).
postgresql_client = PostgreSQLClient()

In [4]:
# Query the distinct trade dates stored in `basic_daily`, oldest first.
trade_dates_sql = """
    SELECT DISTINCT trade_date
    FROM basic_daily
    ORDER BY trade_date ASC;
    """
cur = postgresql_client.conn.cursor()
cur.execute(trade_dates_sql)

In [5]:
# Fetch every row produced by the query executed on `cur`.
# NOTE(review): despite the `df_` prefix this is presumably a list of row
# tuples rather than a DataFrame — confirm against the DB driver.
df_result = cur.fetchall()

In [7]:
# Show the first column (trade_date) of every fetched row.
[row[0] for row in df_result]

['20231009',
 '20231010',
 '20231011',
 '20231012',
 '20231013',
 '20231016',
 '20231017',
 '20231018',
 '20231019',
 '20231020',
 '20231023',
 '20231024',
 '20231025',
 '20231026',
 '20231027',
 '20231030',
 '20231031',
 '20231101',
 '20231102',
 '20231103',
 '20231106',
 '20231107',
 '20231108']

In [4]:
# Get the API handle and request the trading calendar of exchange "SSE" for
# 2023-11-01 .. 2023-11-07, filtered with is_open=1.
pro = TSDataAPI().pro
calendar_query = {
    'start_date': '20231101',
    'end_date': '20231107',
    'exchange': "SSE",
    'is_open': 1,
}
df_cal = pro.trade_cal(**calendar_query)

In [18]:
# Display the calendar dates; the recorded output lists them newest first.
df_cal.cal_date

0    20231107
1    20231106
2    20231103
3    20231102
4    20231101
Name: cal_date, dtype: object

In [5]:
# Fetch the daily quotes for the first calendar entry (20231107 in the
# recorded calendar output above).
df_daily = pro.daily(trade_date=df_cal.cal_date.values[0])

In [6]:
# Preview the first rows of the daily quotes.
df_daily.head()

Unnamed: 0,ts_code,trade_date,open,high,low,close,pre_close,change,pct_chg,vol,amount
0,600232.SH,20231107,5.77,5.84,5.76,5.84,5.77,0.07,1.2132,35945.0,20878.532
1,600233.SH,20231107,13.8,13.8,13.51,13.6,13.83,-0.23,-1.6631,77051.6,104751.329
2,600234.SH,20231107,7.46,7.5,7.36,7.49,7.45,0.04,0.5369,19573.0,14572.93
3,600235.SH,20231107,5.82,5.88,5.76,5.86,5.83,0.03,0.5146,45619.0,26597.841
4,600236.SH,20231107,5.56,5.64,5.55,5.61,5.56,0.05,0.8993,106508.4,59713.484


In [17]:
# Render the column names of the daily quotes as one comma-separated string.
", ".join(df_daily.columns)

'ts_code, trade_date, open, high, low, close, pre_close, change, pct_chg, vol, amount'

In [10]:
# Fetch the daily-basic indicators (pe, pb, total_mv, circ_mv, ... per the
# recorded output) for the same first calendar entry.
df_daily_basic = pro.daily_basic(trade_date=df_cal.cal_date.values[0])

In [11]:
# Preview the first rows of the daily-basic indicators.
df_daily_basic.head()

Unnamed: 0,ts_code,trade_date,close,turnover_rate,turnover_rate_f,volume_ratio,pe,pe_ttm,pb,ps,ps_ttm,dv_ratio,dv_ttm,total_share,float_share,free_share,total_mv,circ_mv
0,688553.SH,20231107,15.11,0.5981,0.7715,0.71,25.7062,62.4035,1.726,4.2863,6.8655,,,42360.0,29113.5369,22568.8373,640059.6,439905.5426
1,300109.SZ,20231107,19.89,1.3016,1.4042,0.95,22.0149,15.3198,1.8546,4.3189,4.0977,2.6165,2.524,32260.1837,28515.1984,26430.6791,641655.0538,567167.2962
2,000070.SZ,20231107,9.33,10.4718,16.8278,2.81,632.8899,,3.8748,2.0041,1.7791,0.0,,90034.476,88857.8626,55295.3633,840021.6611,829043.8581
3,300566.SZ,20231107,18.07,3.3782,4.0515,1.2,83.9213,42.7907,2.772,2.4052,2.2754,0.5499,0.275,26374.155,22819.8783,19027.4107,476580.9809,412355.2009
4,600227.SH,20231107,2.87,0.8245,0.9985,0.63,,,1.9075,1.9303,2.1514,0.0,,169313.4201,127817.9547,105540.0845,485929.5157,366837.53


In [12]:
# Inspect the column dtypes; in the recorded output only ts_code and
# trade_date are object columns, the rest are float64.
df_daily_basic.dtypes

ts_code             object
trade_date          object
close              float64
turnover_rate      float64
turnover_rate_f    float64
volume_ratio       float64
pe                 float64
pe_ttm             float64
pb                 float64
ps                 float64
ps_ttm             float64
dv_ratio           float64
dv_ttm             float64
total_share        float64
float_share        float64
free_share         float64
total_mv           float64
circ_mv            float64
dtype: object

In [4]:
# Build the two downloader objects around the same API handle `pro`; the
# first is called directly, the second is scheduled as an asyncio task.
fetchStockData = FetchStockData(tspro=pro)
fetchStockDataAsync = FetchStockDataAsync(tspro=pro)

In [5]:
# Run the downloader over the calendar; the recorded run processed the five
# dates one after another in about 14.9 s (roughly 3 s per date).
fetchStockData(df=df_cal)

20231107
fetch_daily_data execution time - 2.901522159576416
20231106
fetch_daily_data execution time - 3.0413129329681396
20231103
fetch_daily_data execution time - 2.8988940715789795
20231102
fetch_daily_data execution time - 3.0667920112609863
20231101
fetch_daily_data execution time - 2.960963726043701
__call__ execution time - 14.87171220779419


In [7]:
loop = asyncio.get_event_loop()
# run_main = partial(batch_process_df, df_cal, fetch_daily_data)
task = loop.create_task(fetchStockDataAsync(df=df_cal))
await task
results = task.result()

__call__ execution time - 0.0


In [24]:
from functools import partial
# Bind the calendar and the per-date fetch function to the batch driver, then
# run the resulting coroutine as a task on the current event loop.
# NOTE(review): `batch_process_df` and `fetch_daily_data` are not defined in
# the cells shown here — presumably they come from the project modules. The
# recorded run of this cell ended in a ConnectionError.
loop = asyncio.get_event_loop()
run_main = partial(batch_process_df, df_cal, fetch_daily_data)
task = loop.create_task(run_main())
await task
results = task.result()

ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))

In [None]:
import pickle

# Serialize `results` into a pickle file in the working directory.
pickle_file = 'tushare_results.pkl'
with open(pickle_file, 'wb') as handle:
    handle.write(pickle.dumps(results))

5

In [12]:
# Unpack the first fetched entry into the daily-quote and daily-basic frames.
# The fetch cells assign `results`; the singular `result` is not defined by
# any cell above this one, so running top to bottom raised NameError.
# Assumes each entry of `results` is a (daily, basic) pair — TODO confirm.
df_daily, df_basic = results[0]

In [15]:
# Display the daily-basic frame (the recorded output holds 20190110 rows).
df_basic

Unnamed: 0,ts_code,trade_date,close,turnover_rate,turnover_rate_f,volume_ratio,pe,pe_ttm,pb,ps,ps_ttm,dv_ratio,dv_ttm,total_share,float_share,free_share,total_mv,circ_mv
0,600230.SH,20190110,16.35,1.2399,2.3066,0.93,5.2484,4.9992,1.8584,1.5245,1.3917,2.1844,2.1844,41186.3502,41186.3502,22139.6078,6.733968e+05,6.733968e+05
1,600237.SH,20190110,3.61,1.7732,2.1301,1.46,145.0977,210.9461,1.6434,2.5049,2.3495,0.0000,,56436.9565,56436.9565,46980.8285,2.037374e+05,2.037374e+05
2,002465.SZ,20190110,8.23,1.0221,1.4284,0.47,64.7363,51.0995,2.3013,5.6640,5.1844,0.9722,0.9722,230694.3384,207050.8115,148158.2821,1.898614e+06,1.704028e+06
3,300732.SZ,20190110,34.85,1.7935,2.3308,1.12,19.6150,15.8734,2.7024,4.8295,4.3494,0.7971,0.7971,12960.0000,8432.6346,6488.6346,4.516560e+05,2.938773e+05
4,600007.SH,20190110,13.20,0.0905,0.4675,0.94,20.9452,17.7866,1.9742,4.8062,4.2742,2.4242,2.4242,100728.2534,100728.2534,19492.2293,1.329613e+06,1.329613e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3554,000505.SZ,20190110,5.78,0.8225,1.1821,0.61,30.5847,33.5823,1.7984,0.5006,0.4991,0.0000,,68579.0364,40612.7806,28256.5843,3.963868e+05,2.347419e+05
3555,000983.SZ,20190110,5.36,0.6117,1.3415,0.94,10.7644,9.8299,0.8557,0.5894,0.5450,0.7463,0.7463,315120.0000,315119.7660,143698.2552,1.689043e+06,1.689042e+06
3556,300359.SZ,20190110,6.21,1.7081,2.3802,1.25,59.2979,52.1937,1.9505,3.8123,4.8209,0.3225,0.3225,63300.3422,50473.4710,36222.0264,3.930951e+05,3.134403e+05
3557,002853.SZ,20190110,17.35,1.5637,1.5637,0.96,26.1709,19.9559,2.7281,3.2611,2.5685,0.9222,0.9222,15534.1500,5803.0745,5803.0745,2.695175e+05,1.006833e+05


In [20]:
# Prototype of the per-date factor loop: fetch the daily quotes and the
# daily-basic indicators for a calendar date.
data = []
for date in df_cal.cal_date:
    df_daily = pro.daily(trade_date=date)
    df_basic = pro.daily_basic(trade_date=date)
    # Stop after the first date: the SMB/HML computation below is still
    # commented out, so `data` stays empty and `df_tfm` is not created here.
    break
    # df = pd.merge(df_daily, df_basic, on='ts_code', how='inner')
    # smb, hml = cal_smb_hml(df)
    # data.append([date, smb, hml])
    # print(date, smb, hml)
# NOTE(review): `cal_smb_hml` is not defined in the cells shown here, and the
# merge cell further down reads `df_tfm`, which only this disabled code builds.
# df_tfm = pd.DataFrame(data, columns=['trade_date', 'SMB', 'HML'])
# df_tfm['trade_date'] = pd.to_datetime(df_tfm.trade_date)
# df_tfm = df_tfm.set_index('trade_date')
# df_tfm.to_csv('df_three_factor_model.csv')
# df_tfm.head()

In [21]:
# Preview the daily quotes fetched by the loop cell.
df_daily.head()

Unnamed: 0,ts_code,trade_date,open,high,low,close,pre_close,change,pct_chg,vol,amount
0,600000.SH,20190110,9.94,10.02,9.92,9.96,9.99,-0.03,-0.3003,159235.66,158718.755
1,600004.SH,20190110,9.87,9.87,9.65,9.77,9.89,-0.12,-1.2133,136903.65,133965.136
2,600006.SH,20190110,3.79,3.8,3.73,3.76,3.82,-0.06,-1.5707,83100.83,31284.461
3,600007.SH,20190110,13.34,13.41,13.1,13.2,13.39,-0.19,-1.419,9112.99,12060.299
4,600008.SH,20190110,3.44,3.52,3.44,3.5,3.44,0.06,1.7442,181633.1,63315.058


In [22]:
# Preview the daily-basic indicators fetched by the loop cell.
df_basic.head()

Unnamed: 0,ts_code,trade_date,close,turnover_rate,turnover_rate_f,volume_ratio,pe,pe_ttm,pb,ps,ps_ttm,dv_ratio,dv_ttm,total_share,float_share,free_share,total_mv,circ_mv
0,600230.SH,20190110,16.35,1.2399,2.3066,0.93,5.2484,4.9992,1.8584,1.5245,1.3917,2.1844,2.1844,41186.3502,41186.3502,22139.6078,673396.8,673396.8
1,600237.SH,20190110,3.61,1.7732,2.1301,1.46,145.0977,210.9461,1.6434,2.5049,2.3495,0.0,,56436.9565,56436.9565,46980.8285,203737.4,203737.4
2,002465.SZ,20190110,8.23,1.0221,1.4284,0.47,64.7363,51.0995,2.3013,5.664,5.1844,0.9722,0.9722,230694.3384,207050.8115,148158.2821,1898614.0,1704028.0
3,300732.SZ,20190110,34.85,1.7935,2.3308,1.12,19.615,15.8734,2.7024,4.8295,4.3494,0.7971,0.7971,12960.0,8432.6346,6488.6346,451656.0,293877.3
4,600007.SH,20190110,13.2,0.0905,0.4675,0.94,20.9452,17.7866,1.9742,4.8062,4.2742,2.4242,2.4242,100728.2534,100728.2534,19492.2293,1329613.0,1329613.0


In [None]:
# Download daily bars for the five stocks and the market index over one
# shared date range.
date_range = {'start_date': '20170101', 'end_date': '20190110'}
wanke = pro.daily(ts_code='000002.SZ', **date_range)
pingan = pro.daily(ts_code='601318.SH', **date_range)
maotai = pro.daily(ts_code='600519.SH', **date_range)
# 600309.SH is Wanhua Chemical. The previous code, 002415.SZ, is Hikvision,
# which did not match the `wanhua` name or its '万华化学' label in the
# regression cell.
wanhua = pro.daily(ts_code='600309.SH', **date_range)
keda = pro.daily(ts_code='002230.SZ', **date_range)
gzA = pro.index_daily(ts_code='399317.SZ', **date_range)

# Keep only the return series, indexed by trade date and sorted ascending.
stock_list = [wanke, pingan, maotai, wanhua, keda, gzA]

for stock in stock_list:
    stock.index = pd.to_datetime(stock.trade_date)

# pct_chg is divided by 100 to turn percentages into fractional returns.
df_stock = pd.concat([stock.pct_chg / 100 for stock in stock_list], axis=1)
df_stock.columns = ['wanke', 'pingan', 'maotai', 'wanhua', 'keda', 'gzA']
df_stock = df_stock.sort_index(ascending=True)
df_stock.head()

# gzA is the CNI A-share index, used here as the proxy for the whole A-share
# market.

In [None]:
# Combine the return series with the SMB/HML factor series on their common
# dates, then take a first look at the data.
df = pd.merge(df_stock, df_tfm, left_index=True, right_index=True, how='inner')
df = df.fillna(0)

# Daily risk-free rate derived from a 3.2% annual rate over 360 days.
rf = 1.032 ** (1/360) - 1

# Only asset and market returns become excess returns. SMB and HML are
# long-short factor returns, so subtracting rf from them would shift the
# regressors by a constant and bias the estimated intercept (alpha).
return_cols = [col for col in df.columns if col not in ('SMB', 'HML')]
df[return_cols] = df[return_cols] - rf

df2 = df.copy()  # full-period copy, kept before the 2018 cut below
df = df['20180101':]
df.head()

In [None]:
# Heatmap of the pairwise correlations between all series in `df`.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(df.corr(), cmap='bwr', ax=ax)

In [None]:
# Time-series plot of the daily returns, one line per column of `df`.
from pandas.plotting import register_matplotlib_converters

# Register pandas' datetime converters with matplotlib before plotting.
register_matplotlib_converters()

plt.figure(figsize=(10, 5))
for name, series in df.items():
    plt.plot(series, label=name)

plt.title('日收益率时序图 (2018至今)', fontsize=20)
plt.legend()

In [None]:
# Time-series plot of the cumulative returns, one line per column of `df`.
plt.figure(figsize=(10, 5))

for col in df.columns:
    # Compound the daily returns: prod(1 + r) - 1.
    plt.plot((df[col] + 1).cumprod() - 1, label=col)

# `df` was cut to start at 2018-01-01, so the title says 2018 rather than
# 2017, matching the daily-return plot.
plt.title('累计收益率时序图 (2018至今)', fontsize=20)
plt.legend()

In [None]:
# statsmodels provides an R-like interface for statistical models.
import statsmodels.api as sm

# Column name in `df` -> display name of the stock.
stock_names = {
    'wanke': '万科A',
    'pingan': '中国平安',
    'maotai': '贵州茅台',
    'wanhua': '万华化学',
    'keda': '科大讯飞'
}

# The regressors are the same for every stock, so the design matrix
# (constant plus the gzA, SMB and HML columns) is built once.
exog = sm.add_constant(df[['gzA', 'SMB', 'HML']].values)

params = pd.DataFrame()

# Fit one OLS regression per stock, collect its coefficients under the
# stock's display name and print the full summary.
for stock, label in stock_names.items():
    model = sm.OLS(df[stock], exog)
    result = model.fit()
    params[label] = result.params
    print(f'{label}\n')
    print(result.summary())

In [None]:
# Label the coefficient rows: the intercept first, then the loadings on the
# gzA, SMB and HML regressors in the order they were passed to the regression.
params.index = ['Alpha', '市场因子', '规模因子', '价值因子']
params

In [None]:
# Multiply the Alpha row by 100.
params.loc['Alpha'] = params.loc['Alpha'].mul(100)
params

In [None]:
# NOTE(review): this import and the `bar.add(...)` call style look like the
# pyecharts 0.5.x API; newer releases expose Bar elsewhere — confirm the
# pinned version.
from pyecharts import Bar

# One bar series per stock, each showing the rounded regression coefficients.
bar = Bar("个股收益归因分析", width=800, height=450, title_text_size=15)
for stock_label in ("万科A", "中国平安", "贵州茅台", "万华化学", "科大讯飞"):
    bar.add(stock_label, params.index, params[stock_label].round(3),
            is_splitline_show=False)
bar

In [11]:
# Sanity check: the string "1" is a member of the two-element collection.
assert "1" in ("1", "2")