In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import string
import pickle
import joblib
from sklearn.metrics import r2_score
import random

np.random.seed(74)

In [3]:
#import trained models
with open('LinReg III.pkl', 'rb') as f:
    linreg = joblib.load(f)
with open('Lasso III.pkl', 'rb') as f:
    lasso = joblib.load(f)
with open('Ridge III.pkl', 'rb') as f:
    ridge = joblib.load(f)

In [4]:
#import data
df = pd.read_excel("data.xlsx")
X = df[['BETA_SEC1M', 'BETA_MRKT1M', 'ER_MRKT1M-100', 'ER_SEC1M-100', 'RFR1M']]
Y = df[['ACTUALRETURN_1M-100']]

#predict returns
hat_linreg = linreg.predict(X)
hat_lasso  = lasso.predict(X)
hat_lasso = hat_lasso.reshape(len(hat_lasso),1)
hat_ridge  = ridge.predict(X)


Feature names unseen at fit time:
- ER_MRKT1M-100
- ER_SEC1M-100
Feature names seen at fit time, yet now missing:
- ER_MRKT1M
- ER_SEC1M

Feature names unseen at fit time:
- ER_MRKT1M-100
- ER_SEC1M-100
Feature names seen at fit time, yet now missing:
- ER_MRKT1M
- ER_SEC1M

Feature names unseen at fit time:
- ER_MRKT1M-100
- ER_SEC1M-100
Feature names seen at fit time, yet now missing:
- ER_MRKT1M
- ER_SEC1M



In [5]:
#score linreg model by calculating percent difference from actual return
scores_linreg = pd.DataFrame(abs((Y.values.tolist()-hat_linreg)/Y.values.tolist()))
scores_linreg['ticker'] = df[['ticker']]
print(scores_linreg.sort_values(by=0).head(20))

print(r2_score(Y,hat_linreg))

            0 ticker
369  0.012197  ORINY
352  0.030491   ADMA
355  0.032700   AXSM
360  0.037569   SLNX
337  0.051471   FNCH
361  0.054683   MDWD
371  0.054795  SGIPF
348  0.067900    IQV
358  0.073322   EXAI
366  0.073549   SRGA
347  0.077796  VMTHF
354  0.080183  GLAXF
329  0.083330    CVM
351  0.101746   XCUR
349  0.107588  OCTHF
367  0.107798  NEGXF
339  0.120847   CASI
342  0.124913   PLUR
370  0.130763   NVST
340  0.134179  OPHLY
-0.010128075705567063


  


In [6]:
print(r2_score(Y,hat_linreg))
#0.9778089592661071

print(r2_score(Y,hat_lasso))
#0.9778207594891477

print(r2_score(Y,hat_ridge))
#0.977820759491153

-0.010128075705567063
-0.010128075705567063
-0.010128075705544637


In [7]:
#score lasso model by calculating percent difference from actual return
scores_lasso = pd.DataFrame(abs((Y.values.tolist()-hat_lasso)/Y.values.tolist()))
scores_lasso['ticker'] = df[['ticker']]
print(scores_lasso.sort_values(by=0).head(20))

print(r2_score(Y,hat_lasso))

            0 ticker
369  0.012197  ORINY
352  0.030491   ADMA
355  0.032700   AXSM
360  0.037569   SLNX
337  0.051471   FNCH
361  0.054683   MDWD
371  0.054795  SGIPF
348  0.067900    IQV
358  0.073322   EXAI
366  0.073549   SRGA
347  0.077796  VMTHF
354  0.080183  GLAXF
329  0.083330    CVM
351  0.101746   XCUR
349  0.107588  OCTHF
367  0.107798  NEGXF
339  0.120847   CASI
342  0.124913   PLUR
370  0.130763   NVST
340  0.134179  OPHLY
-0.010128075705567063


  


In [8]:
#score ridge model by calculating percent difference from actual return
scores_ridge = pd.DataFrame(abs((Y.values.tolist()-hat_ridge)/Y.values.tolist()))
scores_ridge['ticker'] = df[['ticker']]
print(scores_ridge.sort_values(by=0).head(20))

print(r2_score(Y,hat_ridge))

            0 ticker
369  0.012197  ORINY
352  0.030491   ADMA
355  0.032700   AXSM
360  0.037569   SLNX
337  0.051471   FNCH
361  0.054683   MDWD
371  0.054795  SGIPF
348  0.067900    IQV
358  0.073322   EXAI
366  0.073549   SRGA
347  0.077796  VMTHF
354  0.080183  GLAXF
329  0.083330    CVM
351  0.101746   XCUR
349  0.107588  OCTHF
367  0.107798  NEGXF
339  0.120847   CASI
342  0.124913   PLUR
370  0.130763   NVST
340  0.134179  OPHLY
-0.010128075705544637


  


In [9]:
#get tickers, volumes, and marketcaps
ticker_info = pd.read_excel('lol (1).xlsx')
tickers = np.unique(ticker_info.loc[:,'Ticker'])
volumes = ticker_info.loc[ticker_info.loc[:,'Time'] == '2015-11-25']['vw']
marketcaps = ticker_info.loc[ticker_info.loc[:,'Time'] == '2015-11-25']['Market_Cap']

In [10]:
#linear regression
#get tickers that are in both datasets
sample_sectors = df[df['ticker'].isin(list(tickers))][:]

#get linear regression results from smaller sample
sample_hat = pd.DataFrame(hat_linreg[list(sample_sectors.index)])

#get scores for smaller sample
sample_scores = scores_linreg[scores_linreg['ticker'].isin(list(tickers))][:]

#prep data for allocation model
beta = sample_sectors.loc[:,'BETA_MRKT1M'].tolist()
accuracy = 1 - sample_scores[0]*sample_scores[0].tolist()
ticker = sample_sectors.loc[:,'ticker'].tolist()

In [11]:
#set up data for allocation model
d = {
    'beta':beta,
    'volume':volumes.tolist(),
    'marketcap':marketcaps.tolist(),
    'expected_return':sample_hat[0].tolist(),
    'accuracy':accuracy.tolist(),
    'ticker':ticker
}

stocks_linreg = pd.DataFrame(data=d)
stocks_linreg.set_index('ticker',inplace=True)


In [12]:
#lasso
#get lasso results from smaller sample
sample_lasso = pd.DataFrame(hat_lasso[list(sample_sectors.index)])

#get scores for smaller sample
sample_scores = scores_lasso[scores_lasso['ticker'].isin(list(tickers))][:]

#get accuracy for this model
accuracy = 1 - sample_scores[0]*sample_scores[0].tolist()


d = {
    'beta':beta,
    'volume':volumes.tolist(),
    'marketcap':marketcaps.tolist(),
    'expected_return':sample_lasso[0].tolist(),
    'accuracy':accuracy.tolist(),
    'ticker':ticker
}

stocks_lasso = pd.DataFrame(data=d)
stocks_lasso.set_index('ticker',inplace=True)


In [13]:
sample_ridge = pd.DataFrame(hat_ridge[list(sample_sectors.index)])

sample_scores = scores_ridge[scores_ridge['ticker'].isin(list(tickers))][:]

accuracy = 1 - sample_scores[0]*sample_scores[0].tolist()

d = {
    'beta':beta,
    'volume':volumes.tolist(),
    'marketcap':marketcaps.tolist(),
    'expected_return':sample_ridge[0].tolist(),
    'accuracy':accuracy.tolist(),
    'ticker':ticker
}

stocks_ridge = pd.DataFrame(data=d)
stocks_ridge.set_index('ticker',inplace=True)

In [29]:
beta_weight = -0.2
volume_weight = 0.5
marketcap_weight = 0.5
return_weight = 0.5

priority = dict()
beta_contributions = list()
volume_contributions = list()
marketcap_contributions = list()
return_contributions = list()



for ind, stock in stocks_linreg.iterrows():
    beta_contribution = abs(stock.beta)*beta_weight
    beta_contributions.append(beta_contribution)
    volume_contribution = np.math.log(stock.volume) * volume_weight
    volume_contributions.append(volume_contribution)
    marketcap_contribution = np.math.log(stock.marketcap) * marketcap_weight
    marketcap_contributions.append(marketcap_contribution)
    return_contribution = np.math.log10(stock.expected_return*100)* return_weight
    return_contributions.append(return_contribution)
    stock_priority = stock.accuracy * (beta_contribution + volume_contribution + marketcap_contribution + return_contribution )
    priority[stock.name] = stock_priority



stocks_linreg['beta_contribution'] = beta_contributions
stocks_linreg['volume_contribution'] = volume_contributions
stocks_linreg['marketcap_contribution'] = marketcap_contributions
stocks_linreg['return_contribution'] = return_contributions
print(stocks_linreg)

priority = sorted(priority.items(), key=lambda x:x[1],reverse=True)
print(priority)
top_priorities = priority[0:10]
top_priorities = [item for item in top_priorities if item[1] > 0]
normalized_values = dict()
for i in top_priorities:
    #print(i)
    nv = i[1] / sum(j[1] for j in top_priorities)
    normalized_values[i[0]] = nv
    
print(normalized_values)

portfolio_beta = 0
for i in normalized_values.keys():
    portfolio_beta = portfolio_beta + normalized_values[i]*stocks_linreg.loc[i]['beta']

print(portfolio_beta)

             beta       volume     marketcap  expected_return  accuracy  \
ticker                                                                    
AVGR    70.415961  138506.4000  8.669961e+06         0.067720  0.000075   
CFRX     1.663797    1744.0200  8.392915e+06         0.079198  0.004060   
BIOC     0.886079     523.0175  1.330466e+07         0.079328  0.010656   
TTNP     0.290728    3273.7600  2.123207e+06         0.079428  0.029547   
CFMS     0.265322     508.8720  1.890800e+07         0.079432  0.034790   
CHEK     0.258131     786.6000  1.375146e+07         0.079433  0.034388   

        beta_contribution  volume_contribution  marketcap_contribution  \
ticker                                                                   
AVGR           -14.083192             5.919336                7.987687   
CFRX            -0.332759             3.731974                7.971449   
BIOC            -0.177216             3.129807                8.201812   
TTNP            -0.058146    

In [15]:
d = pd.read_csv("gs://vertexaioutputs/Historical Financial Data.csv")
d.set_index('Ticker',inplace=True)
#print(d)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [31]:
portfolio_return = 0.0
for i in normalized_values.keys():
    temp = d.loc[i]
    start_date = temp.loc[temp.loc[:]['Time'] == '2015-11-25'].iloc[0]
    end_date = temp.loc[temp.loc[:]['Time'] >= '2015-12-25'].iloc[0]
    ret_val = (end_date['c'] - start_date['o'])/start_date['o']
    print(ret_val)
    portfolio_return += ret_val*normalized_values[i]
    
print('Overall return')
print(portfolio_return)

-0.16966824644549763
-0.07962790697674423
0.07092198581560284
-0.22959183673469388
0.2
0.4289838337182448
Overall return
-0.07234123036398132


In [17]:
beta_weight = -0.2
volume_weight = 0.5
marketcap_weight = 0.5
return_weight = 0.5

priority = dict()
beta_contributions = list()
volume_contributions = list()
marketcap_contributions = list()
return_contributions = list()



for ind, stock in stocks_lasso.iterrows():
    beta_contribution = abs(stock.beta)*beta_weight
    beta_contributions.append(beta_contribution)
    volume_contribution = np.math.log(stock.volume) * volume_weight
    volume_contributions.append(volume_contribution)
    marketcap_contribution = np.math.log(stock.marketcap) * marketcap_weight
    marketcap_contributions.append(marketcap_contribution)
    return_contribution = np.math.log10(stock.expected_return)* return_weight
    return_contributions.append(return_contribution)
    stock_priority = stock.accuracy * (beta_contribution + volume_contribution + marketcap_contribution + return_contribution )
    priority[stock.name] = stock_priority



stocks_lasso['beta_contribution'] = beta_contributions
stocks_lasso['volume_contribution'] = volume_contributions
stocks_lasso['marketcap_contribution'] = marketcap_contributions
stocks_lasso['return_contribution'] = return_contributions


priority = sorted(priority.items(), key=lambda x:x[1],reverse=True)
top_priorities = priority[0:10]
top_priorities = [item for item in top_priorities if item[1] > 0]
normalized_values = dict()
for i in top_priorities:
    nv = i[1] / sum(j[1] for j in top_priorities)
    normalized_values[i[0]] = nv
    
print(normalized_values)

portfolio_beta = 0
for i in normalized_values.keys():
    portfolio_beta = portfolio_beta + normalized_values[i]*stocks_lasso.loc[i]['beta']

print(portfolio_beta)

{'CFMS': 0.3082342874775525, 'CHEK': 0.30635036144029937, 'TTNP': 0.2577493973772136, 'BIOC': 0.0919283692482508, 'CFRX': 0.035737584456683694}
0.3767106084393677


In [18]:
portfolio_return = 0.0
for i in normalized_values.keys():
    temp = d.loc[i]
    start_date = temp.loc[temp.loc[:]['Time'] == '2015-11-25'].iloc[0]
    end_date = temp.loc[temp.loc[:]['Time'] >= '2015-12-25'].iloc[0]
    ret_val = (end_date['c'] - start_date['o'])/start_date['o']
    portfolio_return += ret_val*normalized_values[i]
    
print('Overall return')
print(portfolio_return)

Overall return
-0.07236999628138126


In [19]:
beta_weight = -0.2
volume_weight = 0.5
marketcap_weight = 0.5
return_weight = 0.5

priority = dict()
beta_contributions = list()
volume_contributions = list()
marketcap_contributions = list()
return_contributions = list()



for ind, stock in stocks_ridge.iterrows():
    beta_contribution = abs(stock.beta)*beta_weight
    beta_contributions.append(beta_contribution)
    volume_contribution = np.math.log(stock.volume) * volume_weight
    volume_contributions.append(volume_contribution)
    marketcap_contribution = np.math.log(stock.marketcap) * marketcap_weight
    marketcap_contributions.append(marketcap_contribution)
    return_contribution = np.math.log10(stock.expected_return)* return_weight
    return_contributions.append(return_contribution)
    stock_priority = stock.accuracy * (beta_contribution + volume_contribution + marketcap_contribution + return_contribution )
    priority[stock.name] = stock_priority



stocks_ridge['beta_contribution'] = beta_contributions
stocks_ridge['volume_contribution'] = volume_contributions
stocks_ridge['marketcap_contribution'] = marketcap_contributions
stocks_ridge['return_contribution'] = return_contributions


priority = sorted(priority.items(), key=lambda x:x[1],reverse=True)
top_priorities = priority[0:10]
top_priorities = [item for item in top_priorities if item[1] > 0]
normalized_values = dict()
for i in top_priorities:
    nv = i[1] / sum(j[1] for j in top_priorities)
    normalized_values[i[0]] = nv
    
print(normalized_values)

portfolio_beta = 0
for i in normalized_values.keys():
    portfolio_beta = portfolio_beta + normalized_values[i]*stocks_ridge.loc[i]['beta']

print(portfolio_beta)

{'CFMS': 0.3082342874772485, 'CHEK': 0.30635036143999633, 'TTNP': 0.2577493973772602, 'BIOC': 0.09192836924862016, 'CFRX': 0.03573758445687472}
0.37671060843986753


In [20]:
portfolio_return = 0.0
for i in normalized_values.keys():
    temp = d.loc[i]
    start_date = temp.loc[temp.loc[:]['Time'] == '2015-11-25'].iloc[0]
    end_date = temp.loc[temp.loc[:]['Time'] >= '2015-12-25'].iloc[0]
    ret_val = (end_date['c'] - start_date['o'])/start_date['o']
    portfolio_return += ret_val*normalized_values[i]
    
print('Overall return')
print(portfolio_return)

Overall return
-0.07236999628134885


In [21]:
normalized_values.keys()

dict_keys(['CFMS', 'CHEK', 'TTNP', 'BIOC', 'CFRX'])

In [22]:
import numpy as np, numpy.random

for i in range(10): 
    
    portfolio_return = 0.0 
    ad = np.random.dirichlet(np.ones(6),size=1)
    da = ad.tolist()[0]
    randoms = {'CFRX' : da[0], 'CHEK': da[1], 'CFMS': da[2], 'TTNP' : da[3], 'BIOC' : da[4], 'AVGR' : da[5]}


    for i in randoms.keys():
        temp = d.loc[i]
        start_date = temp.loc[temp.loc[:]['Time'] == '2015-11-25'].iloc[0]
        end_date = temp.loc[temp.loc[:]['Time'] >= '2015-12-25'].iloc[0]
        ret_val = (end_date['c'] - start_date['o'])/start_date['o']
        portfolio_return += ret_val*randoms[i]

    print(f'Overall return {portfolio_return}')
    #print(randoms)
    print('-' * 50)

Overall return 0.043509093334462914
--------------------------------------------------
Overall return -0.03587698273302806
--------------------------------------------------
Overall return -0.16371890797318453
--------------------------------------------------
Overall return 0.06298335862624894
--------------------------------------------------
Overall return -0.03581132545975131
--------------------------------------------------
Overall return 0.15084741190783804
--------------------------------------------------
Overall return 0.06565892132756193
--------------------------------------------------
Overall return 0.06598961500585004
--------------------------------------------------
Overall return 0.007782874849552916
--------------------------------------------------
Overall return -0.0327191958317937
--------------------------------------------------


In [181]:
randoms

{'CFRX': 0.08430215061741332,
 'CHEK': 0.19448913451118133,
 'CFMS': 0.04942539857790324,
 'TTNP': 0.08180386826029917,
 'BIOC': 0.22940429636983817,
 'AVGR': 0.3605751516633648}

In [182]:
len(d)

24076111