In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

## Import Data

In [2]:
# Read the three input CSVs, each with its first column used as the index,
# and bind them to `df`, `sz`, `cy` in that order.
df, sz, cy = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'data/stock_data/data_sz50_cy_lhb_sen.csv',
        'data/stock_data/emd_涨跌幅_sz.csv',
        'data/stock_data/emd_涨跌幅_cy.csv',
    )
)

# Quick sanity check: the three frames are expected to have the same row count.
print(df.shape, sz.shape, cy.shape)

(394, 15) (394, 8) (394, 7)


In [3]:
# Column-wise merge of the three frames into `df`: first `sz` (sort=True),
# then `cy` (sort=False), aligned on the row index.
# NOTE(review): this rebinds `df`, so re-running the cell without re-running
# the load cell appends the sz/cy columns a second time.
df = pd.concat(
    [
        pd.concat([df, sz], axis=1, sort=True),
        cy,
    ],
    axis=1,
    sort=False,
)

print(df.shape)
df.head()

(394, 30)


Unnamed: 0,收盘_sz,开盘_sz,高_sz,低_sz,交易量_sz,涨跌幅_sz,date,收盘_cy,开盘_cy,高_cy,...,emd_涨跌幅_sz_6,emd_涨跌幅_sz_7,emd_涨跌幅_sz_8,emd_涨跌幅_cy_1,emd_涨跌幅_cy_2,emd_涨跌幅_cy_3,emd_涨跌幅_cy_4,emd_涨跌幅_cy_5,emd_涨跌幅_cy_6,emd_涨跌幅_cy_7
0,2931.88,2915.97,2936.85,2900.87,3.93B,2.09,20180222,1677.76,1661.56,1679.79,...,0.93872,-0.934991,-0.236881,0.298323,-1.972296,1.849848,-1.205973,-1.245964,1.095513,-1.165342
1,2957.17,2941.42,2966.75,2926.36,3.82B,0.86,20180223,1668.83,1674.62,1681.24,...,-0.756677,-0.332565,0.829638,1.369459,-0.260992,-1.732051,-1.172848,1.739993,0.059579,-1.034574
2,2973.79,2973.81,2986.81,2933.52,5.02B,0.56,20180226,1729.15,1683.15,1733.71,...,-1.486195,-0.431272,1.621853,-1.343849,1.204914,-1.036658,0.84923,0.795858,-1.06791,1.77809
3,2927.1,2979.77,2979.77,2918.01,4.62B,-1.57,20180227,1743.54,1729.94,1757.37,...,0.169594,-0.487137,0.72941,-2.689865,3.331181,1.686321,-3.25795,-1.727923,2.332402,0.222133
4,2878.67,2899.0,2905.57,2866.46,3.88B,-1.65,20180228,1753.63,1730.15,1773.73,...,1.116136,1.117061,-1.51849,-0.639171,-1.056843,0.64087,-0.245377,0.056866,-0.259212,0.280557


## Correlation Matrices

In [4]:
# Cross-index correlation matrix: `sen`, the 涨跌幅 column of each index,
# and EMD components 1 and 4 of each index.
temp = df.loc[:, [
    'sen',
    '涨跌幅_sz', '涨跌幅_cy',
    'emd_涨跌幅_sz_1', 'emd_涨跌幅_sz_4',
    'emd_涨跌幅_cy_1', 'emd_涨跌幅_cy_4',
]]
temp.corr()

Unnamed: 0,sen,涨跌幅_sz,涨跌幅_cy,emd_涨跌幅_sz_1,emd_涨跌幅_sz_4,emd_涨跌幅_cy_1,emd_涨跌幅_cy_4
sen,1.0,0.371479,0.551547,0.023434,-0.103434,-0.003524,-0.074396
涨跌幅_sz,0.371479,1.0,0.69044,-0.028648,-0.062449,0.089344,-0.02984
涨跌幅_cy,0.551547,0.69044,1.0,-0.048134,-0.154628,0.032343,-0.158057
emd_涨跌幅_sz_1,0.023434,-0.028648,-0.048134,1.0,0.122608,0.046431,0.058354
emd_涨跌幅_sz_4,-0.103434,-0.062449,-0.154628,0.122608,1.0,0.013843,0.151957
emd_涨跌幅_cy_1,-0.003524,0.089344,0.032343,0.046431,0.013843,1.0,0.184748
emd_涨跌幅_cy_4,-0.074396,-0.02984,-0.158057,0.058354,0.151957,0.184748,1.0


In [5]:
# Correlation matrix restricted to the sz columns: `sen`, 涨跌幅_sz and
# EMD components 1 through 7.
# NOTE(review): the merged frame also carries emd_涨跌幅_sz_8, which is not
# included here — confirm that leaving it out is intentional.
temp = df[['sen', '涨跌幅_sz'] + [f'emd_涨跌幅_sz_{k}' for k in range(1, 8)]]
temp.corr()

Unnamed: 0,sen,涨跌幅_sz,emd_涨跌幅_sz_1,emd_涨跌幅_sz_2,emd_涨跌幅_sz_3,emd_涨跌幅_sz_4,emd_涨跌幅_sz_5,emd_涨跌幅_sz_6,emd_涨跌幅_sz_7
sen,1.0,0.371479,0.023434,-0.003965,-0.018929,-0.103434,0.052558,-0.055899,-0.038483
涨跌幅_sz,0.371479,1.0,-0.028648,-0.047204,-0.029874,-0.062449,-0.045887,-0.03868,0.033326
emd_涨跌幅_sz_1,0.023434,-0.028648,1.0,0.292885,-0.159487,0.122608,-0.12192,-0.0185,0.077992
emd_涨跌幅_sz_2,-0.003965,-0.047204,0.292885,1.0,-0.151685,0.190395,-0.138947,0.046748,0.071578
emd_涨跌幅_sz_3,-0.018929,-0.029874,-0.159487,-0.151685,1.0,-0.030062,0.255393,-0.036489,-0.17004
emd_涨跌幅_sz_4,-0.103434,-0.062449,0.122608,0.190395,-0.030062,1.0,0.163747,0.008874,0.028218
emd_涨跌幅_sz_5,0.052558,-0.045887,-0.12192,-0.138947,0.255393,0.163747,1.0,0.207003,-0.072847
emd_涨跌幅_sz_6,-0.055899,-0.03868,-0.0185,0.046748,-0.036489,0.008874,0.207003,1.0,0.124637
emd_涨跌幅_sz_7,-0.038483,0.033326,0.077992,0.071578,-0.17004,0.028218,-0.072847,0.124637,1.0


In [6]:
# Correlation matrix restricted to the cy columns: `sen`, 涨跌幅_cy and
# EMD components 1 through 7.
temp = df[['sen', '涨跌幅_cy'] + [f'emd_涨跌幅_cy_{k}' for k in range(1, 8)]]
temp.corr()

Unnamed: 0,sen,涨跌幅_cy,emd_涨跌幅_cy_1,emd_涨跌幅_cy_2,emd_涨跌幅_cy_3,emd_涨跌幅_cy_4,emd_涨跌幅_cy_5,emd_涨跌幅_cy_6,emd_涨跌幅_cy_7
sen,1.0,0.551547,-0.003524,-0.037167,-0.037193,-0.074396,-0.065983,-0.05425,0.002418
涨跌幅_cy,0.551547,1.0,0.032343,0.001104,0.03641,-0.158057,0.024804,-0.110379,-0.037275
emd_涨跌幅_cy_1,-0.003524,0.032343,1.0,0.040308,0.193766,0.184748,0.137725,-0.097066,-0.13609
emd_涨跌幅_cy_2,-0.037167,0.001104,0.040308,1.0,0.054846,0.007879,0.03899,0.090079,0.034002
emd_涨跌幅_cy_3,-0.037193,0.03641,0.193766,0.054846,1.0,0.192552,0.118606,0.021209,-0.105766
emd_涨跌幅_cy_4,-0.074396,-0.158057,0.184748,0.007879,0.192552,1.0,0.121116,0.183533,0.172234
emd_涨跌幅_cy_5,-0.065983,0.024804,0.137725,0.03899,0.118606,0.121116,1.0,0.264336,0.058701
emd_涨跌幅_cy_6,-0.05425,-0.110379,-0.097066,0.090079,0.021209,0.183533,0.264336,1.0,0.314011
emd_涨跌幅_cy_7,0.002418,-0.037275,-0.13609,0.034002,-0.105766,0.172234,0.058701,0.314011,1.0


## Difference-in-Differences (DID)

In [7]:
# Load the two event-date lists (day_t_0, day_t_1).
# Both lists are described as days on which the sentiment index moved by
# 1 SD relative to the previous day.
# NOTE(review): the original note read "t_0: ... rises 1sd below than the last
# day" and "t_1: ... drops 1sd above than the last day"; which list holds rises
# and which holds drops is ambiguous from that wording — confirm against the
# script that produced the CSVs.
day_t_0 = pd.read_csv('data/day_t_0.csv', index_col=0)
day_t_1 = pd.read_csv('data/day_t_1.csv', index_col=0)

# Column '0' holds the dates as strings; strip any leading "[" / "'" and any
# trailing "'" / "]" characters from each entry.
# day_t_0 is additionally de-duplicated by passing through a set, so its row
# order is arbitrary.
# NOTE(review): the frame rebuilt from the set presumably gets a default
# column label rather than '0' — verify before indexing day_t_0['0'] later.
day_t_0 = pd.DataFrame(set(day_t_0['0'].str.lstrip("['").str.rstrip("']")))

# day_t_1 keeps its index, its column name '0', and any duplicate dates.
day_t_1 = day_t_1['0'].str.lstrip("['").str.rstrip("']").to_frame()

In [8]:
# Mark the rows of `df` whose date appears in the day_t_1 list.
# `date` is converted to str because the entries of day_t_1['0'] are strings.
# A single vectorized membership test replaces the per-row Python loop that
# rescanned day_t_1['0'].values for every date.
is_t_1_day = df['date'].astype(str).isin(day_t_1['0'])

# Kept under its original name in case later cells read it.
# Note that, despite the name, this list is built from day_t_1, not day_t_0.
t_0 = is_t_1_day.tolist()

# 0/1 indicator column: 1 when the date is in day_t_1, otherwise 0.
df['t'] = is_t_1_day.astype(int)

In [9]:
def did(y1_d1, y0_d0, y, data=None):
    '''Run a difference-in-differences (DID) regression using OLS.

    The treatment and control columns are stacked into one long frame with a
    group dummy `d` (1 = treatment, 0 = control), and the model

        y ~ const + t + d + t*d

    is fitted, where `t` is the indicator column already present in the data.

    Arguments:
        y1_d1: name of the column holding the treatment-group outcome (d = 1)
        y0_d0: name of the column holding the control-group outcome (d = 0)
        y: label given to the stacked dependent variable
        data: DataFrame containing both outcome columns and a 't' column;
            when omitted, the notebook-level `df` is used

    Return:
        Summary of the fitted statsmodels OLS model
    '''
    source = df if data is None else data
    label = str(y)

    def _group(column, flag):
        # One group's outcome plus `t`, renamed to the shared label and tagged
        # with its group dummy. `.copy()` keeps the source frame untouched.
        part = source.loc[:, [str(column), 't']].copy()
        part.columns = [label, 't']
        part['d'] = flag
        return part

    # Control rows first, then treatment rows, with a fresh 0..n-1 index.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df_reg = pd.concat([_group(y0_d0, 0), _group(y1_d1, 1)], ignore_index=True)

    # Interaction term: its coefficient is the DID estimate.
    df_reg['t*d'] = df_reg['t'] * df_reg['d']

    x = sm.add_constant(df_reg[['t', 'd', 't*d']])
    model = sm.OLS(df_reg[label], x).fit()

    return model.summary()

In [10]:
# DID with '涨跌幅_cy' as the treatment-group column (d = 1) and '涨跌幅_sz' as
# the control-group column (d = 0). The third argument is only the label given
# to the stacked dependent variable in the regression summary.
did('涨跌幅_cy', '涨跌幅_sz', '漲跌幅_上證_創業')

0,1,2,3
Dep. Variable:,漲跌幅_上證_創業,R-squared:,0.098
Model:,OLS,Adj. R-squared:,0.095
Method:,Least Squares,F-statistic:,28.52
Date:,"Sat, 06 Mar 2021",Prob (F-statistic):,1.66e-17
Time:,01:55:18,Log-Likelihood:,-1440.5
No. Observations:,788,AIC:,2889.0
Df Residuals:,784,BIC:,2908.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.1520,0.088,-1.721,0.086,-0.325,0.021
t,0.6313,0.174,3.637,0.000,0.291,0.972
d,-0.2173,0.125,-1.739,0.082,-0.462,0.028
t*d,0.8449,0.245,3.442,0.001,0.363,1.327

0,1,2,3
Omnibus:,36.804,Durbin-Watson:,1.928
Prob(Omnibus):,0.0,Jarque-Bera (JB):,113.039
Skew:,0.03,Prob(JB):,2.8399999999999998e-25
Kurtosis:,4.854,Cond. No.,6.46
