In [65]:
import pickle

import pandas as pd

with open('doc_returns.pkl', 'rb') as f:
    doc_returns = pickle.load(f)

In [66]:
with open('../sentiment_analysis/topic_sentiment_analysis.pkl', 'rb') as f:
    topic_sentiment_analysis = pickle.load(f)
topic_sentiment_analysis.dropna(inplace=True, how='all')
topic_sentiment_analysis.head()

Unnamed: 0_level_0,123779,123730,123731,123678,123676,123677,123615,123483,123280,123281,...,66920,84387,45581,83759,69773,100780,43397,86265,100779,98672
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
credit,,0.178673,-0.001035,,,,0.283389,0.033357,0.498359,0.289802,...,-0.498845,,,0.249558,-0.026119,,,,,
dealing terms,,-0.005857,0.214961,,,,,,,,...,,0.000111,0.000326,,-4.1e-05,0.001717,0.00039,0.009697,0.001717,-0.164886
fees,0.097609,0.16625,6.9e-05,,7.4e-05,,2.9e-05,0.001532,0.000131,0.000115,...,0.495658,0.000237,0.038349,,0.046134,0.000257,9.8e-05,,0.000257,0.996133
investment approach,,,,,,,,,,,...,0.934497,,,,0.338085,,,,,
market,-0.14219,-0.001087,6.7e-05,,6e-06,,0.047601,-0.23025,0.123491,0.214307,...,0.201671,,-0.048678,-0.123748,0.041713,3.2e-05,-0.497505,,3.2e-05,0.283777


In [67]:
results = pd.merge(doc_returns, topic_sentiment_analysis.T, left_index=True, right_index=True, how='inner')
factor_cols = [col for col in results.columns if col != 'avg_return']
results.head()

Unnamed: 0,avg_return,credit,dealing terms,fees,investment approach,market,organization,performance,portfolio,stress,team
123779,-0.0009,,,0.097609,,-0.14219,,-0.017482,0.273391,,3.1e-05
123731,2.6349,-0.001035,0.214961,6.9e-05,,6.7e-05,,0.080113,0.246493,,0.000324
123678,0.1,,,,,,-2.8e-05,,,,
123676,0.3317,,,7.4e-05,,6e-06,,-0.083941,-0.103765,,0.002166
123677,1.565,,,,,,-2.8e-05,,,,


In [68]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(-1, 1))
results_cp = results.copy()

results_cp[factor_cols] = scaler.fit_transform(results[factor_cols])
results_cp

Unnamed: 0,avg_return,credit,dealing terms,fees,investment approach,market,organization,performance,portfolio,stress,team
123779,-0.0009,,,0.098464,,-0.141847,,-0.017069,0.274342,,0.000484
123731,2.6349,-0.000540,0.215199,0.000614,,0.000708,,0.080721,0.247390,,0.000778
123678,0.1000,,,,,,0.000497,,,,
123676,0.3317,,,0.000619,,0.000647,,-0.083660,-0.103564,,0.002623
123677,1.5650,,,,,,0.000497,,,,
...,...,...,...,...,...,...,...,...,...,...,...
89742,-0.0700,0.000087,-0.000038,-0.021486,,-0.091618,0.000940,-0.049858,-0.032302,,0.000528
101535,1.7600,,,,,-0.323449,,0.195782,0.000438,,0.000487
101536,3.4600,,,,,-0.323449,,0.195782,0.000438,,0.000487
101539,-5.8500,,,,,-0.323449,,0.195782,0.000438,,0.000487


In [69]:
# Old approach
from scipy.stats import pearsonr

# results.corr()
corr = pd.DataFrame(columns=['p', 'r'])
for col in results.columns:
    if col != 'avg_return':
        factor = results[results[col] > 0]
        print(f'Factor: {col}, N: {len(factor)}')
        p,r = pearsonr(factor['avg_return'], factor[col])
        corr.loc[col] = [p, r]
corr

Factor: credit, N: 186
Factor: dealing terms, N: 196
Factor: fees, N: 299
Factor: investment approach, N: 84
Factor: market, N: 274
Factor: organization, N: 158
Factor: performance, N: 304
Factor: portfolio, N: 347
Factor: stress, N: 35
Factor: team, N: 328


Unnamed: 0,p,r
credit,-0.034087,0.644166
dealing terms,0.05871,0.41371
fees,-0.070377,0.224997
investment approach,-0.029637,0.788993
market,0.047856,0.430121
organization,0.092813,0.246094
performance,0.0234,0.684472
portfolio,0.034314,0.524068
stress,0.230613,0.182589
team,0.007175,0.897008


In [70]:
x = results_cp[factor_cols].fillna(0)
y = results_cp['avg_return']

In [71]:
from sklearn import linear_model
reg = linear_model.LinearRegression()
reg.fit(x, y)
coefficients = pd.DataFrame(reg.coef_, index=results_cp.columns[1:], columns=['coefficient'])
coefficients

Unnamed: 0,coefficient
credit,-0.677262
dealing terms,0.202756
fees,-1.408873
investment approach,-0.137089
market,0.195091
organization,0.633069
performance,0.486382
portfolio,0.401551
stress,0.622487
team,0.571358


In [72]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
lasso = Lasso(alpha=0.01)
lasso.fit(X_train, y_train)
coefficients = pd.DataFrame(lasso.coef_, index=results_cp.columns[1:], columns=['coefficient'])
coefficients

Unnamed: 0,coefficient
credit,-0.427301
dealing terms,-0.0
fees,-1.201518
investment approach,-0.56332
market,0.0
organization,0.720108
performance,0.260454
portfolio,0.669705
stress,0.758092
team,0.437804
