# Import the necessary packages 

### Run pip install statsmodels and pip install pandas-datareader if not already installed

In [201]:
import pandas as pd
import numpy as np
from datetime import date, datetime
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols 
from sklearn.linear_model import LogisticRegression
import pandas_datareader.data as web

# Load data sets & process data

### Load S&P 500, Shanghai Composite Index & iShares China large-cap ETF data from 1/2/2018 to 12/31/2019 (trade war period)

In [202]:
start_date = '2018-01-02'
end_date = '2019-12-31'

#S&P is traded in EST time: 9.30 a.m. to 4p.m.
spx = web.DataReader('^GSPC', data_source = 'yahoo', start = start_date, end = end_date)
spx = spx.reset_index()

#Shanghai Composite Index is in China time: GMT+8
sse = web.DataReader('000001.SS', data_source = 'yahoo', start = start_date, end = end_date)
sse = sse.reset_index()

#FXI: iShares China large-cap ETF, traded in EST
fxi = web.DataReader('FXI', data_source = 'yahoo', start = start_date, end = end_date)
fxi = fxi.reset_index()


### Calculate change in prices between Close (previous day) to Open, then Open - Close (same day)

In [203]:
spx['CloseOpen'] = np.log(spx['Open']) - np.log(spx['Close'].shift(1))
spx['OpenClose'] = np.log(spx['Close'])- np.log(spx['Open'])
spx = spx.dropna()
print(spx.head(5))

sse['CloseOpen'] = np.log(sse['Open']) - np.log(sse['Close'].shift(1))
sse['OpenClose'] = np.log(sse['Close'])- np.log(sse['Open'])
sse = sse.dropna()
print(sse.head(5))

fxi['CloseOpen'] = np.log(fxi['Open']) - np.log(fxi['Close'].shift(1))
fxi['OpenClose'] = np.log(fxi['Close'])- np.log(fxi['Open'])
fxi = fxi.dropna()



        Date         High          Low         Open        Close      Volume  \
1 2018-01-03  2714.370117  2697.770020  2697.850098  2713.060059  3538660000   
2 2018-01-04  2729.290039  2719.070068  2719.310059  2723.989990  3695260000   
3 2018-01-05  2743.449951  2727.919922  2731.330078  2743.149902  3236620000   
4 2018-01-08  2748.510010  2737.600098  2742.669922  2747.709961  3242650000   
5 2018-01-09  2759.139893  2747.860107  2751.149902  2751.290039  3453480000   

     Adj Close  CloseOpen  OpenClose  
1  2713.060059   0.000756   0.005622  
2  2723.989990   0.002301   0.001720  
3  2743.149902   0.002691   0.004318  
4  2747.709961  -0.000175   0.001836  
5  2751.290039   0.001251   0.000051  
        Date         High          Low         Open        Close    Volume  \
1 2018-01-03  3379.915039  3345.289062  3347.742920  3369.107910  213800.0   
2 2018-01-04  3392.825928  3365.294922  3371.000000  3385.709961  207000.0   
3 2018-01-05  3402.069092  3380.245117  3386.464111

### Load lstm balanced data set: data set of Trump's Trade war related tweets with sentiment categorized using LSTM algorithm: 1 being positive and -1 being negative sentiment

In [205]:
lstm_bal = pd.read_csv('../output/results/LSTM_balanced_spm_results.csv')    
lstm_bal['Date'] = [datetime.strptime(x,'%Y-%m-%d') for x in lstm_bal['real_Date']]

lstm_sse = pd.read_csv('../output/results/LSTM_balanced_sse_results.csv')    
lstm_sse['Date'] = [datetime.strptime(x,'%Y-%m-%d') for x in lstm_sse['real_Date']]

# A is sentiment aggregated over Open-Close, B is  sentiment aggregated over Close-Open 

### Load textblob balanced data set: data set of Trump's tweets Trade war related tweets with sentiment categorized using textblob algorithm: 1 being positive, -1 being negative, and 0 being neutral sentiment

In [206]:
textblob_spm = pd.read_csv('../output/results/textblob_prediction_data_spm.csv', index_col = 0)
textblob_spm['Date'] = [datetime.strptime(x,'%Y-%m-%d') for x in textblob_spm['real_Date']]

textblob_sse = pd.read_csv('../output/results/textblob_prediction_data_sse.csv', index_col = 0)
textblob_sse['Date'] = [datetime.strptime(x,'%Y-%m-%d') for x in textblob_sse['real_Date']]

textblob_spm.head(5)
# A is sentiment aggregated over Open-Close, B is  sentiment aggregated over Close-Open 
print(textblob_sse.head(5))

    real_Date    A    B       Date
0  2018-01-02  1.0  NaN 2018-01-02
1  2018-01-04  NaN  0.0 2018-01-04
2  2018-01-07  1.0  NaN 2018-01-07
3  2018-01-09  NaN  1.0 2018-01-09
4  2018-01-11  1.0  NaN 2018-01-11


# Merging financial data with sentiment data (neutral sentiments are dropped)

#### Neutral sentiments are dropped, along with periods of Open-Close or Close-Open with no tweets

## 1. Merging S&P 500 with lstm data set

In [207]:
spx_reg = spx.merge(lstm_bal, how = 'outer', on = ['Date'])

#Split data set into Close-Open and Open-Close
spx_reg_CO = spx_reg[['Date','CloseOpen','B']]
spx_reg_CO = spx_reg_CO.dropna()

spx_reg_OC = spx_reg[['Date','OpenClose','A']]
spx_reg_OC = spx_reg_OC.dropna()

#Sort into Positive and Negative Columns
spx_reg_CO['Positive'] = (spx_reg_CO['B'] == 1)
spx_reg_CO['Negative'] = (spx_reg_CO['B'] == -1)
spx_reg_OC['Positive'] = (spx_reg_OC['A'] == 1)
spx_reg_OC['Negative'] = (spx_reg_OC['A'] == -1)
spx_reg_CO['Positive'] = [int(x==True) for x in spx_reg_CO['Positive']]
spx_reg_CO['Negative'] = [int(x==True) for x in spx_reg_CO['Negative']]
spx_reg_OC['Positive'] = [int(x==True) for x in spx_reg_OC['Positive']]
spx_reg_OC['Negative'] = [int(x==True) for x in spx_reg_OC['Negative']]
print(spx_reg_CO.head(5))
print(spx_reg_OC.head(5))

         Date  CloseOpen    B  Positive  Negative
7  2018-01-11   0.001723  1.0         1         0
11 2018-01-18  -0.000057  1.0         1         0
15 2018-01-24   0.002213  1.0         1         0
21 2018-02-01  -0.002610  1.0         1         0
22 2018-02-02  -0.004639  1.0         1         0
        Date  OpenClose    A  Positive  Negative
0 2018-01-03   0.005622  0.0         0         0
1 2018-01-04   0.001720  1.0         1         0
3 2018-01-08   0.001836  1.0         1         0
5 2018-01-10   0.000976  1.0         1         0
6 2018-01-11   0.005286  0.0         0         0


## 2. Merging S&P 500 with textblob data set

In [208]:
spx_textblob = spx.merge(textblob_spm, how = 'outer', on = ['Date'])

#Split data set into Close-Open and Open-Close
spx_textblob_CO = spx_textblob[['Date','CloseOpen','B']]
spx_textblob_CO = spx_textblob_CO.dropna()

spx_textblob_OC = spx_textblob[['Date','OpenClose','A']]
spx_textblob_OC = spx_textblob_OC.dropna()

#Sort into Positive and Negative Columns
spx_textblob_CO['Positive'] = (spx_textblob_CO['B'] == 1)
spx_textblob_CO['Negative'] = (spx_textblob_CO['B'] == -1)
spx_textblob_OC['Positive'] = (spx_textblob_OC['A'] == 1)
spx_textblob_OC['Negative'] = (spx_textblob_OC['A'] == -1)
spx_textblob_CO['Positive'] = [int(x==True) for x in spx_textblob_CO['Positive']]
spx_textblob_CO['Negative'] = [int(x==True) for x in spx_textblob_CO['Negative']]
spx_textblob_OC['Positive'] = [int(x==True) for x in spx_textblob_OC['Positive']]
spx_textblob_OC['Negative'] = [int(x==True) for x in spx_textblob_OC['Negative']]
print(spx_textblob_CO.head(5))
print(spx_textblob_OC.head(5))


         Date  CloseOpen    B  Positive  Negative
7  2018-01-11   0.001723  1.0         1         0
11 2018-01-18  -0.000057 -1.0         0         1
15 2018-01-24   0.002213 -1.0         0         1
21 2018-02-01  -0.002610  1.0         1         0
22 2018-02-02  -0.004639  1.0         1         0
         Date  OpenClose    A  Positive  Negative
0  2018-01-03   0.005622  0.0         0         0
3  2018-01-08   0.001836  1.0         1         0
5  2018-01-10   0.000976  1.0         1         0
6  2018-01-11   0.005286  1.0         1         0
10 2018-01-17   0.006289  1.0         1         0


## 3. Merging Shanghai Composite Index with lstm data set

In [209]:
sse_reg = sse.merge(lstm_sse, how = 'outer', on = ['Date'])
sse_reg_CO = sse_reg[['Date','CloseOpen','B']]
sse_reg_CO = sse_reg_CO.dropna()

sse_reg_OC = sse_reg[['Date','OpenClose','A']]
sse_reg_OC = sse_reg_OC.dropna()

sse_reg_CO['Positive'] = (sse_reg_CO['B'] == 1)
sse_reg_CO['Negative'] = (sse_reg_CO['B'] == -1)
sse_reg_OC['Positive'] = (sse_reg_OC['A'] == 1)
sse_reg_OC['Negative'] = (sse_reg_OC['A'] == -1)
sse_reg_CO['Positive'] = [int(x==True) for x in sse_reg_CO['Positive']]
sse_reg_CO['Negative'] = [int(x==True) for x in sse_reg_CO['Negative']]
sse_reg_OC['Positive'] = [int(x==True) for x in sse_reg_OC['Positive']]
sse_reg_OC['Negative'] = [int(x==True) for x in sse_reg_OC['Negative']]
print(sse_reg_CO.head(5))
print(sse_reg_OC.head(5))

        Date  CloseOpen    B  Positive  Negative
1 2018-01-04   0.000561  0.0         0         0
2 2018-01-05   0.000223  1.0         1         0
4 2018-01-09  -0.000988  1.0         1         0
7 2018-01-11  -0.001828  1.0         1         0
8 2018-01-12  -0.000428  0.0         0         0
         Date  OpenClose    A  Positive  Negative
6  2018-01-11   0.002853  1.0         1         0
12 2018-01-18   0.007184  1.0         1         0
24 2018-02-02   0.012456  1.0         1         0
25 2018-02-05   0.021982 -1.0         0         1
28 2018-02-08  -0.005806  1.0         1         0


## 4. Merging Shanghai Composite Index with textblob data set

In [210]:
sse_textblob = sse.merge(textblob_sse, how = 'outer', on = ['Date'])
sse_textblob_CO = sse_textblob[['Date','CloseOpen','B']]
sse_textblob_CO = sse_textblob_CO.dropna()

sse_textblob_OC = sse_textblob[['Date','OpenClose','A']]
sse_textblob_OC = sse_textblob_OC.dropna()

sse_textblob_CO['Positive'] = (sse_textblob_CO['B'] == 1)
sse_textblob_CO['Negative'] = (sse_textblob_CO['B'] == -1)
sse_textblob_OC['Positive'] = (sse_textblob_OC['A'] == 1)
sse_textblob_OC['Negative'] = (sse_textblob_OC['A'] == -1)
sse_textblob_CO['Positive'] = [int(x==True) for x in sse_textblob_CO['Positive']]
sse_textblob_CO['Negative'] = [int(x==True) for x in sse_textblob_CO['Negative']]
sse_textblob_OC['Positive'] = [int(x==True) for x in sse_textblob_OC['Positive']]
sse_textblob_OC['Negative'] = [int(x==True) for x in sse_textblob_OC['Negative']]
print(sse_textblob_CO.head(5))
print(sse_textblob_OC.head(5))


         Date  CloseOpen    B  Positive  Negative
1  2018-01-04   0.000561  0.0         0         0
4  2018-01-09  -0.000988  1.0         1         0
7  2018-01-11  -0.001828  1.0         1         0
8  2018-01-12  -0.000428  1.0         1         0
12 2018-01-18   0.001511  0.0         0         0
         Date  OpenClose    A  Positive  Negative
6  2018-01-11   0.002853  1.0         1         0
23 2018-02-02   0.012456  1.0         1         0
24 2018-02-05   0.021982 -1.0         0         1
27 2018-02-08  -0.005806  0.0         0         0
37 2018-03-01   0.011881  1.0         1         0


## 5. Merging FXI with lstm data set

In [211]:
fxi_reg = fxi.merge(lstm_bal, how = 'outer', on = ['Date'])
fxi_reg_CO = fxi_reg[['Date','CloseOpen','B']]
fxi_reg_CO = fxi_reg_CO.dropna()

fxi_reg_OC = fxi_reg[['Date','OpenClose','A']]
fxi_reg_OC = fxi_reg_OC.dropna()

fxi_reg_CO['Positive'] = (fxi_reg_CO['B'] == 1)
fxi_reg_CO['Negative'] = (fxi_reg_CO['B'] == -1)
fxi_reg_OC['Positive'] = (fxi_reg_OC['A'] == 1)
fxi_reg_OC['Negative'] = (fxi_reg_OC['A'] == -1)
fxi_reg_CO['Positive'] = [int(x==True) for x in fxi_reg_CO['Positive']]
fxi_reg_CO['Negative'] = [int(x==True) for x in fxi_reg_CO['Negative']]
fxi_reg_OC['Positive'] = [int(x==True) for x in fxi_reg_OC['Positive']]
fxi_reg_OC['Negative'] = [int(x==True) for x in fxi_reg_OC['Negative']]
print(fxi_reg_CO.head(5))
print(fxi_reg_OC.head(5))

         Date  CloseOpen    B  Positive  Negative
7  2018-01-11  -0.000617  1.0         1         0
11 2018-01-18   0.003543  1.0         1         0
15 2018-01-24   0.007140  1.0         1         0
21 2018-02-01  -0.011649  1.0         1         0
22 2018-02-02   0.002309  1.0         1         0
        Date  OpenClose    A  Positive  Negative
0 2018-01-03   0.006873  0.0         0         0
1 2018-01-04  -0.000207  1.0         1         0
3 2018-01-08   0.005146  1.0         1         0
5 2018-01-10   0.000206  1.0         1         0
6 2018-01-11   0.007586  0.0         0         0



## 6. Merging FXI with textblob data set¶

In [180]:
fxi_textblob = fxi.merge(textblob_spm, how = 'outer', on = ['Date'])
fxi_textblob_CO = fxi_textblob[['Date','CloseOpen','B']]
fxi_textblob_CO = fxi_textblob_CO.dropna()

fxi_textblob_OC = fxi_textblob[['Date','OpenClose','A']]
fxi_textblob_OC = fxi_textblob_OC.dropna()

fxi_textblob_CO['Positive'] = (fxi_textblob_CO['B'] == 1)
fxi_textblob_CO['Negative'] = (fxi_textblob_CO['B'] == -1)
fxi_textblob_OC['Positive'] = (fxi_textblob_OC['A'] == 1)
fxi_textblob_OC['Negative'] = (fxi_textblob_OC['A'] == -1)
fxi_textblob_CO['Positive'] = [int(x==True) for x in fxi_textblob_CO['Positive']]
fxi_textblob_CO['Negative'] = [int(x==True) for x in fxi_textblob_CO['Negative']]
fxi_textblob_OC['Positive'] = [int(x==True) for x in fxi_textblob_OC['Positive']]
fxi_textblob_OC['Negative'] = [int(x==True) for x in fxi_textblob_OC['Negative']]
print(fxi_textblob_CO.head(5))
print(fxi_textblob_OC.head(5))

         Date  CloseOpen    B  Positive  Negative
7  2018-01-11  -0.000617  1.0         1         0
11 2018-01-18   0.003543 -1.0         0         1
15 2018-01-24   0.007140 -1.0         0         1
21 2018-02-01  -0.011649  1.0         1         0
22 2018-02-02   0.002309  1.0         1         0
         Date  OpenClose    A  Positive  Negative
0  2018-01-03   0.006873  0.0         0         0
3  2018-01-08   0.005146  1.0         1         0
5  2018-01-10   0.000206  1.0         1         0
6  2018-01-11   0.007586  1.0         1         0
10 2018-01-17   0.007123  1.0         1         0


# Run Regression & Calculate Point-biserial correlation coefficient on merged data set

### Point-biserial correlation coefficient:
The point biserial correlation coefficient (rpb) is a correlation coefficient used when one variable (e.g. Y) is dichotomous. The point-biserial correlation is mathematically equivalent to the Pearson (product moment) correlation, that is, if we have one continuously measured variable X and a dichotomous variable Y, rXY = rpb.

$$ r_{pb} = \frac{M_1-M_0}{S_n}*\sqrt{\frac{n_1*n_0}{n^2}}$$

## 1. S&P 500 vs. lstm sentiment

### a. Regression

Close-Open

In [212]:
fit_spx_CO = ols('CloseOpen ~ C(Positive)', data=spx_reg_CO).fit() 
print(fit_spx_CO.summary())

                            OLS Regression Results                            
Dep. Variable:              CloseOpen   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                 -0.003
Method:                 Least Squares   F-statistic:                    0.2204
Date:                Wed, 16 Dec 2020   Prob (F-statistic):              0.639
Time:                        17:50:35   Log-Likelihood:                 1098.2
No. Observations:                 271   AIC:                            -2192.
Df Residuals:                     269   BIC:                            -2185.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            0.0004      0.000  

Open-Close

In [213]:
fit_spx_OC = ols('OpenClose ~ C(Positive)', data=spx_reg_OC).fit() 
print(fit_spx_OC.summary())

                            OLS Regression Results                            
Dep. Variable:              OpenClose   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.008
Method:                 Least Squares   F-statistic:                   0.05105
Date:                Wed, 16 Dec 2020   Prob (F-statistic):              0.822
Time:                        17:50:39   Log-Likelihood:                 421.03
No. Observations:                 125   AIC:                            -838.1
Df Residuals:                     123   BIC:                            -832.4
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept           -0.0008      0.001  

### b. Calculate Point-biserial correlation coefficient

In [214]:
x = spx_reg_CO['CloseOpen']
y = spx_reg_CO['Positive']
std_x = np.std(x)
M1 = np.mean(spx_reg_CO[spx_reg_CO['Positive']==1]['CloseOpen'])
n1 = spx_reg_CO[spx_reg_CO['Positive']==1].shape[0]
M0 = np.mean(spx_reg_CO[spx_reg_CO['Negative']==1]['CloseOpen'])
n0 = spx_reg_CO[spx_reg_CO['Negative']==1].shape[0]
n = n1+n0
rpb = (M1-M0)*np.sqrt(n1*n0/(n*(n-1)))/std_x
print('Point-biserial correlation coefficient for Close-Open is:', rpb)



Point-biserial correlation coefficient for Close-Open is: -0.004854826222243691


In [215]:
x = spx_reg_OC['OpenClose']
y = spx_reg_OC['Positive']
std_x = np.std(x)
M1 = np.mean(spx_reg_OC[spx_reg_OC['Positive']==1]['OpenClose'])
n1 = spx_reg_OC[spx_reg_OC['Positive']==1].shape[0]
M0 = np.mean(spx_reg_OC[spx_reg_OC['Negative']==1]['OpenClose'])
n0 = spx_reg_OC[spx_reg_OC['Negative']==1].shape[0]
n = n1+n0
rpb = (M1-M0)*np.sqrt(n1*n0/(n*(n-1)))/std_x
print('Point-biserial correlation coefficient for Open-Close is:', rpb)

Point-biserial correlation coefficient for Open-Close is: -0.019320563119210717


## 2. S&P 500 vs. textblob sentiment

### a. Regression

Close-Open

In [216]:
fit_spx_textblob_CO = ols('CloseOpen ~ C(Positive)', data=spx_textblob_CO).fit() 
print(fit_spx_textblob_CO.summary())


                            OLS Regression Results                            
Dep. Variable:              CloseOpen   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.004
Method:                 Least Squares   F-statistic:                   0.06211
Date:                Wed, 16 Dec 2020   Prob (F-statistic):              0.803
Time:                        17:50:51   Log-Likelihood:                 909.04
No. Observations:                 227   AIC:                            -1814.
Df Residuals:                     225   BIC:                            -1807.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            0.0002      0.000  

Open-Close

In [217]:
fit_spx_textblob_OC = ols('OpenClose ~ C(Positive)', data=spx_textblob_OC).fit() 
print(fit_spx_textblob_OC.summary())


                            OLS Regression Results                            
Dep. Variable:              OpenClose   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                 -0.005
Method:                 Least Squares   F-statistic:                    0.5259
Date:                Wed, 16 Dec 2020   Prob (F-statistic):              0.470
Time:                        17:50:52   Log-Likelihood:                 324.17
No. Observations:                  98   AIC:                            -644.3
Df Residuals:                      96   BIC:                            -639.2
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept           -0.0003      0.001  

### b. Calculate Point-biserial correlation coefficient

In [218]:
x = spx_textblob_CO['CloseOpen']
y = spx_textblob_CO['Positive']
std_x = np.std(x)
M1 = np.mean(spx_textblob_CO[spx_textblob_CO['Positive']==1]['CloseOpen'])
n1 = spx_textblob_CO[spx_textblob_CO['Positive']==1].shape[0]
M0 = np.mean(spx_textblob_CO[spx_textblob_CO['Negative']==1]['CloseOpen'])
n0 = spx_textblob_CO[spx_textblob_CO['Negative']==1].shape[0]
n = n1+n0
rpb = (M1-M0)*np.sqrt(n1*n0/(n*(n-1)))/std_x
print('Point-biserial correlation coefficient for Close-Open is:', rpb)



Point-biserial correlation coefficient for Close-Open is: 0.02775719450696116


In [219]:
x = spx_textblob_OC['OpenClose']
y = spx_textblob_OC['Positive']
std_x = np.std(x)
M1 = np.mean(spx_textblob_OC[spx_textblob_OC['Positive']==1]['OpenClose'])
n1 = spx_textblob_OC[spx_textblob_OC['Positive']==1].shape[0]
M0 = np.mean(spx_textblob_OC[spx_textblob_OC['Negative']==1]['OpenClose'])
n0 = spx_textblob_OC[spx_textblob_OC['Negative']==1].shape[0]
n = n1+n0
rpb = (M1-M0)*np.sqrt(n1*n0/(n*(n-1)))/std_x
print('Point-biserial correlation coefficient for Open-Close is:', rpb)

Point-biserial correlation coefficient for Open-Close is: 0.013366760182372414


## 3. Shanghai Composite Index vs. lstm

### a. Regression

Close-Open

In [220]:
fit_sse_CO = ols('CloseOpen ~ C(Positive)', data=sse_reg_CO).fit() 
print(fit_sse_CO.summary())


                            OLS Regression Results                            
Dep. Variable:              CloseOpen   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.006
Method:                 Least Squares   F-statistic:                   0.02554
Date:                Wed, 16 Dec 2020   Prob (F-statistic):              0.873
Time:                        17:50:58   Log-Likelihood:                 626.36
No. Observations:                 170   AIC:                            -1249.
Df Residuals:                     168   BIC:                            -1242.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept           -0.0010      0.001  

Open-Close

In [221]:
fit_sse_OC = ols('OpenClose ~ C(Positive)', data=sse_reg_OC).fit() 
print(fit_sse_OC.summary())


                            OLS Regression Results                            
Dep. Variable:              OpenClose   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                 -0.002
Method:                 Least Squares   F-statistic:                    0.6628
Date:                Wed, 16 Dec 2020   Prob (F-statistic):              0.416
Time:                        17:50:59   Log-Likelihood:                 703.43
No. Observations:                 221   AIC:                            -1403.
Df Residuals:                     219   BIC:                            -1396.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            0.0020      0.001  

### b. Calculate Point-biserial correlation coefficient

In [222]:
x = sse_reg_CO['CloseOpen']
y = sse_reg_CO['Positive']
std_x = np.std(x)
M1 = np.mean(sse_reg_CO[sse_reg_CO['Positive']==1]['CloseOpen'])
n1 = sse_reg_CO[sse_reg_CO['Positive']==1].shape[0]
M0 = np.mean(sse_reg_CO[sse_reg_CO['Negative']==1]['CloseOpen'])
n0 = sse_reg_CO[sse_reg_CO['Negative']==1].shape[0]
n = n1+n0
rpb = (M1-M0)*np.sqrt(n1*n0/(n*(n-1)))/std_x
print('Point-biserial correlation coefficient for Close-Open is:', rpb)


Point-biserial correlation coefficient for Close-Open is: 0.018178598827927814


In [223]:
x = sse_reg_OC['OpenClose']
y = sse_reg_OC['Positive']
std_x = np.std(x)
M1 = np.mean(sse_reg_OC[sse_reg_OC['Positive']==1]['OpenClose'])
n1 = sse_reg_OC[sse_reg_OC['Positive']==1].shape[0]
M0 = np.mean(sse_reg_OC[sse_reg_OC['Negative']==1]['OpenClose'])
n0 = sse_reg_OC[sse_reg_OC['Negative']==1].shape[0]
n = n1+n0
rpb = (M1-M0)*np.sqrt(n1*n0/(n*(n-1)))/std_x
print('Point-biserial correlation coefficient for Open-Close is:', rpb)


Point-biserial correlation coefficient for Open-Close is: -0.07985055685344608


## 4. Shanghai Composite Index vs. textblob sentiment

### a. Regression

Close-Open

In [224]:
fit_sse_textblob_CO = ols('CloseOpen ~ C(Positive)', data=sse_textblob_CO).fit() 
print(fit_sse_textblob_CO.summary())


                            OLS Regression Results                            
Dep. Variable:              CloseOpen   R-squared:                       0.035
Model:                            OLS   Adj. R-squared:                  0.027
Method:                 Least Squares   F-statistic:                     4.659
Date:                Wed, 16 Dec 2020   Prob (F-statistic):             0.0327
Time:                        17:51:02   Log-Likelihood:                 489.99
No. Observations:                 132   AIC:                            -976.0
Df Residuals:                     130   BIC:                            -970.2
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            0.0006      0.001  

Open-Close

In [194]:
fit_sse_textblob_OC = ols('OpenClose ~ C(Positive)', data=sse_textblob_OC).fit() 
print(fit_sse_textblob_OC.summary())

                            OLS Regression Results                            
Dep. Variable:              OpenClose   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.005
Method:                 Least Squares   F-statistic:                   0.02868
Date:                Wed, 16 Dec 2020   Prob (F-statistic):              0.866
Time:                        17:34:47   Log-Likelihood:                 585.38
No. Observations:                 182   AIC:                            -1167.
Df Residuals:                     180   BIC:                            -1160.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            0.0016      0.001  

### b. Calculate Point-biserial correlation coefficient

In [225]:
x = sse_textblob_CO['CloseOpen']
y = sse_textblob_CO['Positive']
std_x = np.std(x)
M1 = np.mean(sse_textblob_CO[sse_textblob_CO['Positive']==1]['CloseOpen'])
n1 = sse_textblob_CO[sse_textblob_CO['Positive']==1].shape[0]
M0 = np.mean(sse_textblob_CO[sse_textblob_CO['Negative']==1]['CloseOpen'])
n0 = sse_textblob_CO[sse_textblob_CO['Negative']==1].shape[0]
n = n1+n0
rpb = (M1-M0)*np.sqrt(n1*n0/(n*(n-1)))/std_x
print('Point-biserial correlation coefficient for Close-Open is:', rpb)


Point-biserial correlation coefficient for Close-Open is: -0.07790471887144576


In [226]:
x = sse_textblob_OC['OpenClose']
y = sse_textblob_OC['Positive']
std_x = np.std(x)
M1 = np.mean(sse_textblob_OC[sse_textblob_OC['Positive']==1]['OpenClose'])
n1 = sse_textblob_OC[sse_textblob_OC['Positive']==1].shape[0]
M0 = np.mean(sse_textblob_OC[sse_textblob_OC['Negative']==1]['OpenClose'])
n0 = sse_textblob_OC[sse_textblob_OC['Negative']==1].shape[0]
n = n1+n0
rpb = (M1-M0)*np.sqrt(n1*n0/(n*(n-1)))/std_x
print('Point-biserial correlation coefficient for Open-Close is:', rpb)

Point-biserial correlation coefficient for Open-Close is: -0.027915412009342024


## 5. FXI vs. lstm

### a. Regression

Close-Open

In [227]:
fit_fxi_CO = ols('CloseOpen ~ C(Positive)', data=fxi_reg_CO).fit() 
print(fit_fxi_CO.summary())


                            OLS Regression Results                            
Dep. Variable:              CloseOpen   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.004
Method:                 Least Squares   F-statistic:                   0.03736
Date:                Wed, 16 Dec 2020   Prob (F-statistic):              0.847
Time:                        17:51:06   Log-Likelihood:                 813.27
No. Observations:                 271   AIC:                            -1623.
Df Residuals:                     269   BIC:                            -1615.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            0.0002      0.001  

Open-Close

In [164]:
fit_fxi_OC = ols('OpenClose ~ C(Positive)', data=fxi_reg_OC).fit() 
print(fit_fxi_OC.summary())


                            OLS Regression Results                            
Dep. Variable:              OpenClose   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                 -0.005
Method:                 Least Squares   F-statistic:                    0.3923
Date:                Wed, 16 Dec 2020   Prob (F-statistic):              0.532
Time:                        17:25:29   Log-Likelihood:                 395.25
No. Observations:                 115   AIC:                            -786.5
Df Residuals:                     113   BIC:                            -781.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept           -0.0017      0.001  

### b. Calculate Point-biserial correlation coefficient

In [228]:
x = fxi_reg_CO['CloseOpen']
y = fxi_reg_CO['Positive']
std_x = np.std(x)
M1 = np.mean(fxi_reg_CO[fxi_reg_CO['Positive']==1]['CloseOpen'])
n1 = fxi_reg_CO[fxi_reg_CO['Positive']==1].shape[0]
M0 = np.mean(fxi_reg_CO[fxi_reg_CO['Negative']==1]['CloseOpen'])
n0 = fxi_reg_CO[fxi_reg_CO['Negative']==1].shape[0]
n = n1+n0
rpb = (M1-M0)*np.sqrt(n1*n0/(n*(n-1)))/std_x
print('Point-biserial correlation coefficient for Close-Open is:', rpb)


Point-biserial correlation coefficient for Close-Open is: 0.02883862030036829


In [229]:
x = sse_reg_OC['OpenClose']
y = sse_reg_OC['Positive']
std_x = np.std(x)
M1 = np.mean(sse_reg_OC[sse_reg_OC['Positive']==1]['OpenClose'])
n1 = sse_reg_OC[sse_reg_OC['Positive']==1].shape[0]
M0 = np.mean(sse_reg_OC[sse_reg_OC['Negative']==1]['OpenClose'])
n0 = sse_reg_OC[sse_reg_OC['Negative']==1].shape[0]
n = n1+n0
rpb = (M1-M0)*np.sqrt(n1*n0/(n*(n-1)))/std_x
print('Point-biserial correlation coefficient for Open-Close is:', rpb)


Point-biserial correlation coefficient for Open-Close is: -0.07985055685344608


## 6. FXI vs. textblob

### a. Regression

Close-Open

In [230]:
fit_fxi_textblob_CO = ols('CloseOpen ~ C(Positive)', data=fxi_textblob_CO).fit() 
print(fit_fxi_textblob_CO.summary())


                            OLS Regression Results                            
Dep. Variable:              CloseOpen   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                 -0.003
Method:                 Least Squares   F-statistic:                    0.3506
Date:                Wed, 16 Dec 2020   Prob (F-statistic):              0.554
Time:                        17:51:12   Log-Likelihood:                 680.48
No. Observations:                 227   AIC:                            -1357.
Df Residuals:                     225   BIC:                            -1350.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            0.0009      0.001  

Open-Close

In [231]:
fit_fxi_textblob_OC = ols('OpenClose ~ C(Positive)', data=fxi_textblob_OC).fit() 
print(fit_fxi_textblob_OC.summary())


                            OLS Regression Results                            
Dep. Variable:              OpenClose   R-squared:                       0.011
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     1.059
Date:                Wed, 16 Dec 2020   Prob (F-statistic):              0.306
Time:                        17:51:13   Log-Likelihood:                 330.08
No. Observations:                  98   AIC:                            -656.2
Df Residuals:                      96   BIC:                            -651.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            0.0004      0.001  

### b. Calculate Point-biserial correlation coefficient

In [232]:
x = fxi_textblob_CO['CloseOpen']
y = fxi_textblob_CO['Positive']
std_x = np.std(x)
M1 = np.mean(fxi_textblob_CO[fxi_textblob_CO['Positive']==1]['CloseOpen'])
n1 = fxi_textblob_CO[fxi_textblob_CO['Positive']==1].shape[0]
M0 = np.mean(fxi_textblob_CO[fxi_textblob_CO['Negative']==1]['CloseOpen'])
n0 = fxi_textblob_CO[fxi_textblob_CO['Negative']==1].shape[0]
n = n1+n0
rpb = (M1-M0)*np.sqrt(n1*n0/(n*(n-1)))/std_x
print('Point-biserial correlation coefficient for Close-Open is:', rpb)


Point-biserial correlation coefficient for Close-Open is: -0.044188713120793345


In [233]:
x = fxi_textblob_OC['OpenClose']
y = fxi_textblob_OC['Positive']
std_x = np.std(x)
M1 = np.mean(fxi_textblob_OC[fxi_textblob_OC['Positive']==1]['OpenClose'])
n1 = fxi_textblob_OC[fxi_textblob_OC['Positive']==1].shape[0]
M0 = np.mean(fxi_textblob_OC[fxi_textblob_OC['Negative']==1]['OpenClose'])
n0 = fxi_textblob_OC[fxi_textblob_OC['Negative']==1].shape[0]
n = n1+n0
rpb = (M1-M0)*np.sqrt(n1*n0/(n*(n-1)))/std_x
print('Point-biserial correlation coefficient for Open-Close is:', rpb)

Point-biserial correlation coefficient for Open-Close is: -0.016016097171625734


# References

https://medium.com/@outside2SDs/an-overview-of-correlation-measures-between-categorical-and-continuous-variables-4c7f85610365

https://en.wikipedia.org/wiki/Point-biserial_correlation_coefficient

https://topforeignstocks.com/foreign-adrs-list/the-full-list-of-chinese-adrs/

https://finance.yahoo.com/

https://www.barchart.com/