# calculate ambiguity by conditional variance

In [1]:
import pandas as pd
import numpy as np

def _load_wide_as_long(csv_path, value_name):
    """Read a wide CSV and reshape it to long form.

    The file is read with ``trade_date`` as its index; every remaining column
    is treated as one code. The result is a single-column DataFrame named
    ``value_name``, indexed by (code, trade_date) and sorted on that index.
    """
    wide = pd.read_csv(csv_path, parse_dates=['trade_date'], index_col=['trade_date'])
    long_values = wide.stack().rename(value_name).rename_axis(['trade_date', 'code'])
    return long_values.to_frame().swaplevel().sort_index()


# Base panel, indexed by (code, trade_date).
panel = pd.read_csv('total_panel.csv', parse_dates=['trade_date'], index_col=['code', 'trade_date'])

# AMB and RV arrive as wide tables and are reshaped to match the panel index.
AMB_df = _load_wide_as_long('AMB_22day.csv', 'AMB')
RV_df = _load_wide_as_long('RV_1day.csv', 'RV')

In [2]:
# Attach the long-form AMB and RV columns to the panel; DataFrame.join aligns
# on the index and keeps every panel row (left join).
panel = panel.join(AMB_df).join(RV_df)

In [3]:
def _interpolate_within_code(frame):
    """Linearly interpolate each column of ``frame`` separately for every code.

    ``frame`` must carry a ``code`` index level. Gaps are filled in both
    directions (``limit_direction='both'``), so leading and trailing NaNs of a
    code are filled as well.

    ``group_keys=False`` keeps the result on the same index as ``frame`` on
    every pandas version. Without it the shape of the result index depends on
    the version's ``group_keys`` default, and stripping "level 0" afterwards
    can remove the real ``code`` level.
    """
    return frame.groupby(level='code', group_keys=False).apply(
        lambda per_code: per_code.interpolate(method='linear', limit_direction='both')
    )


panel[['AMB', 'RV']] = _interpolate_within_code(panel[['AMB', 'RV']])

# Drop B shares (codes starting with 200, 201 or 900).
is_b_share = panel.index.get_level_values(level=0).str.startswith(('200', '201', '900'))
# Explicit copy so the .loc assignment below writes to an independent frame
# rather than to a filtered slice.
panel = panel[~is_b_share].copy()

# Original author's note: replacing three or more columns at once has to go
# through .loc.
panel.loc[(slice(None), slice(None)), ['var', 'skew', 'kurt']] = _interpolate_within_code(
    panel[['var', 'skew', 'kurt']]
)

In [4]:
# Fama five-factor model data, read with a (MarkettypeID, trade_date) index.
#
# Selection applied below:
#   P9714           - Shanghai/Shenzhen A shares plus ChiNext and STAR Market
#   Portfolios == 1 - 2x3 portfolio sorting scheme
#   RiskPremium1    - market risk premium factor (float-market-cap weighted)
#   SMB1            - size factor (float-market-cap weighted)
#   HML1            - book-to-market factor (float-market-cap weighted)
#   RMW1            - profitability factor (float-market-cap weighted)
#   CMA1            - investment factor (float-market-cap weighted)
fama_name = ['RiskPremium1', 'SMB1', 'HML1', 'RMW1', 'CMA1']
fama_raw = pd.read_csv('fama_data.csv', parse_dates=['trade_date'], index_col=['MarkettypeID', 'trade_date'])

# Keep one market type; xs drops the MarkettypeID level, leaving trade_date as index.
market_factors = fama_raw.xs('P9714', level='MarkettypeID')
fama_df = market_factors.loc[market_factors['Portfolios'] == 1, fama_name]

# Broadcast the daily factors to every code through the trade_date index level.
panel = panel.join(fama_df, on='trade_date')

In [5]:
# The panel must not contain any missing values.
if panel.isna().any().any():
    raise ValueError("data have NA")

In [20]:
import pandas as pd
import numpy as np
from tqdm import notebook
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import  LinearRegression

print(panel.shape)
# Use the whole panel; for a quick trial run, slice it instead,
# e.g. panel.iloc[:2000, :].
df_1 = panel.iloc[:, :]

def lin_regress(data, R_wind, e_wind):
    """Forecast AMB ``e_wind`` rows beyond the end of one data window.

    Inside the window, log(AMB) observed ``e_wind`` rows later is regressed by
    OLS on the current log(AMB) and on the principal components (enough to
    explain 95% of the variance) of the standardised predictor columns. The
    fitted model is then evaluated on the window's last row.

    Parameters
    ----------
    data : DataFrame
        One window of rows; must contain ``AMB`` and the predictor columns
        listed in the body.
    R_wind : int
        Minimum number of rows required; shorter windows are not fitted.
    e_wind : int
        Offset, in rows, between the regressors and the target.

    Returns
    -------
    list
        ``[forecast]`` on the original (exponentiated) AMB scale, or
        ``[np.nan]`` when the window has fewer than ``R_wind`` rows.
    """
    # Guard clause: too few rows for a regression.
    if len(data) < R_wind:
        return [np.nan]

    predictor_cols = ['vol', 'turnoverrate', 'skew', 'kurt', 'Institutional_holding', 'Analyst_AVGRate',
                      'RiskPremium1', 'SMB1', 'HML1', 'RMW1', 'CMA1']
    target_cols = ['AMB']
    features = data[predictor_cols].values
    log_target = np.log(data[target_cols].values)

    # Training pairs: regressors at row t, target at row t + e_wind.
    lagged_target = log_target[:-e_wind]
    lagged_features = features[:-e_wind]
    future_target = log_target[e_wind:]

    # The most recent row supplies the regressors for the forecast.
    last_target = log_target[-1].reshape(1, -1)
    last_features = features[-1].reshape(1, -1)

    # Standardise, reduce with PCA, then fit OLS on [log(AMB), components].
    scaler = StandardScaler()
    lagged_scaled = scaler.fit_transform(lagged_features)
    pca = PCA(n_components=0.95)
    lagged_components = pca.fit_transform(lagged_scaled)
    model = LinearRegression()
    model.fit(np.hstack([lagged_target, lagged_components]), future_target)

    # Apply the transformations fitted on the training rows to the last row.
    last_components = pca.transform(scaler.transform(last_features))
    raw_forecast = model.predict(np.hstack([last_target, last_components]))

    # Keep the log forecast within one standard deviation of the observed
    # range: this avoids extreme values and overflow in the exponential.
    spread = log_target.std()
    bounded_forecast = np.clip(raw_forecast, log_target.min() - spread, log_target.max() + spread)
    return [np.exp(bounded_forecast)[0][0]]

R_wind = 66  # rolling window length, in rows
e_wind = 5   # forecast horizon, in rows


def _rolling_windows_by_code(frame, window):
    """Yield the rolling windows of ``frame`` one code at a time.

    Rolling inside each code guarantees that no window mixes rows from two
    different codes, whatever the row order of ``frame``. One window is
    yielded per row of ``frame``; the first ``window - 1`` windows of every
    code are shorter than ``window``.
    """
    for _, one_code in frame.groupby(level='code'):
        yield from one_code.rolling(window)


forecast_rows = []
forecast_labels = []
# One window is produced per panel row, so the row count is the progress total.
for window_df in notebook.tqdm(_rolling_windows_by_code(df_1, R_wind), total=len(df_1)):
    if len(window_df) < R_wind:
        continue  # incomplete window at the start of a code: no valid regression
    forecast_labels.append(window_df.index[-1])  # label of the row the window ends on
    forecast_rows.append(lin_regress(window_df, R_wind, e_wind))

AMB_e_df = pd.DataFrame(
    forecast_rows,
    columns=[f'AMB_{e_wind}e'],
    index=pd.MultiIndex.from_tuples(forecast_labels, names=df_1.index.names),
)
AMB_e_df.to_csv(f'AMB_{e_wind}e.csv')

(2377260, 18)


0it [00:00, ?it/s]