## Stock Embeddings

In [44]:
import pandas as pd
%config Completer.use_jedi = False

#-- Import all the functions needed
from S2V_functions import *
from S2V_models import *
from visualisation_functions import *

import plotly.graph_objects as go

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from scipy.stats import gaussian_kde

import plotly.io as pio
pio.renderers.default = 'notebook_connected'

#### Read in the data

In [3]:
#-- Read in the processed returns data
returns_df = pd.read_csv('Data/returns_df_611.csv', index_col=0)
returns_df.index = pd.to_datetime(returns_df.index)

#-- Read in data containing sectors etc
stock_df = pd.read_csv('Data/historical_stocks.csv')

#-- Store lists of useful items and create mapping dictionaries
tickers, ticker2idx, idx2ticker, sectors, industries, names = get_extras(returns_df, stock_df, misc_include=False)

In [5]:
display(returns_df.head(1))
display(returns_df.tail(1))

Unnamed: 0,AAPL,AB,ABM,ABT,ABX,ACXM,ADBE,ADI,ADM,ADP,...,WTS,WWW,WY,XCRA,XLNX,XOM,XRAY,XRX,ZION,ZIXI
2000-01-03,0.088754,-0.006263,-0.003067,-0.036145,0.0,0.041667,-0.025093,-0.030242,-0.010309,-0.034803,...,-0.029661,-0.057143,-0.02698,-0.050279,0.02818,-0.027929,-0.005291,0.066116,-0.062302,-0.061514


Unnamed: 0,AAPL,AB,ABM,ABT,ABX,ACXM,ADBE,ADI,ADM,ADP,...,WTS,WWW,WY,XCRA,XLNX,XOM,XRAY,XRX,ZION,ZIXI
2018-08-24,0.003109,0.008347,0.004942,0.004101,0.022483,0.003535,0.017549,0.014273,0.003762,0.001674,...,0.001804,-0.001557,0.008676,0.005579,0.008611,0.006829,0.009969,-0.001822,-0.002986,0.003663


#### Get training and testing split

In [6]:
test_returns_df = returns_df.iloc[int(len(returns_df)*0.7):]
train_returns_df = returns_df.iloc[:int(len(returns_df)*0.7)]

### Get the context sets 
    - Either generate new or load from saved

#### Generate new context sets

In [40]:
idx_combinations = get_context_sets(train_returns_df, tickers, CONTEXT_SIZE=10, lag=0, 
                                    save=True, periods=['daily','weekly', 'monthly'],
                                    IQR_daily=False, hedging=False)

# idx_combinations = get_context_sets(train_returns_df, tickers, CONTEXT_SIZE=3, lag=0, 
#                                     save=True, periods=['daily'],
#                                     IQR_daily=True, hedging=True)


Getting TGT:CONTEXT Sets


100%|██████████| 3283/3283 [06:58<00:00,  7.85it/s]


Daily pairs added


100%|██████████| 656/656 [01:23<00:00,  7.83it/s]


Weekly pairs added


100%|██████████| 156/156 [00:21<00:00,  7.28it/s]


Monthly pairs added
--- DONE ---
Number of context sets: 2467418
Saved to: TrainingData/IJCNN_TP_daily_weekly_monthly_L0_C10_IQRDailyFalse_Training_0to70.json


#### Or load from saved

In [7]:
#-- Load idx_combinations from saved
l = 0
C = 10
IQR = False
p = 'daily_weekly_monthly'
# p='daily'
hedging=False
df = pd.read_json(f'TrainingData/IJCNN_TP_{p}_L{l}_C{C}_IQRDaily{str(IQR)}_Training_0to70{"_HEDGING" if hedging else ""}.json')

idx_combinations = df.values
del df
print("--- Sets Loaded ---")
print(f'Number of context sets: {len(idx_combinations)}')

--- Sets Loaded ---
Number of context sets: 2467418


### Define and train the embeddings model

#### Firstly, get the weights if using the weighted version of the model

In [41]:
#-- Get the co-occurrence data necessary to compute weights
cooccurrence_df = get_cooccurrence_df(tickers, idx_combinations, idx2ticker)

100%|██████████| 2467418/2467418 [00:12<00:00, 202602.45it/s]


#### Define the model and train

In [42]:
""" --- Model with weights --- """
# p='daily'
# model = torch.load(f'Models/model_CBOW_Single_{p}_L{l}_C{C}_IQRDaily{str(IQR)}_70_Weights.pt')
model = CBOW_StockModeller_Single_Weights(len(tickers),20)

print(f'Models/model_CBOW_Single_{p}_L{l}_C{C}_IQRDaily{str(IQR)}_70_Weights{"_HEDGING" if hedging else ""}.pt')



losses = train_model_weights(15, model, cooccurrence_df, idx2ticker, 
                             loss_function=nn.NLLLoss(), 
                             optimizer=optim.Adam(model.parameters(),
                                                   lr=0.0001, 
                                                   #weight_decay=0.01
                                                  ), 
                             train_loader=get_train_loader(idx_combinations), early_stop=True)

#-- Save the model
PATH = f'Models/model_CBOW_Single_{p}_L{l}_C{C}_IQRDaily{str(IQR)}_70_Weights{"_HEDGING" if hedging else ""}.pt'
# Save
torch.save(model, PATH)

Models/model_CBOW_Single_daily_weekly_monthly_L0_C10_IQRDailyTrue_70_Weights.pt


100%|██████████| 15/15 [1:23:16<00:00, 333.09s/it]


In [43]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=1+np.arange(len(losses)),
                         y=losses))
fig.update_layout(  title="Training Loss by Epoch",
                    template='plotly_white',
                    yaxis=dict(title='Loss'),
                    xaxis=dict(title='Epoch'))

In [10]:
""" --- Model without weights --- """
model = CBOW_StockModeller_Single(len(tickers),20)


PATH = f'Models/model_CBOW_Single_{p}_L{l}_C{C}_IQRDaily{str(IQR)}_70{"_HEDGING" if hedging else ""}.pt'
print(PATH)

losses = train_model(30, model, 
                     loss_function=nn.NLLLoss(), 
                     optimizer=optim.Adam(model.parameters(),
                                           lr=0.0001, 
                                           #weight_decay=0.01
                                          ), 
                     train_loader=get_train_loader(idx_combinations), early_stop=True)

#-- Save the model


# Save
torch.save(model, PATH)

Models/model_CBOW_Single_daily_L0_C3_IQRDailyTrue_70_HEDGING.pt


100%|██████████| 30/30 [09:28<00:00, 18.96s/it]


## Evaluation Bits

In [12]:
#-- Load in the desired model
l = 0
C = 3
IQR = True
p='daily_weekly_monthly'
hedging=False
weighted = False
print(f'Models/model_CBOW_Single_{p}_L{l}_C{C}_IQRDaily{str(IQR)}_70{"_HEDGING" if hedging else ""}.pt')
model = torch.load(f'Models/model_CBOW_Single_{p}_L{l}_C{C}_IQRDaily{str(IQR)}_70{"_HEDGING" if hedging else ""}{"_Weights" if weighted else ""}.pt')


Models/model_CBOW_Single_daily_weekly_monthly_L0_C3_IQRDailyTrue_70.pt


### PCA Plot

In [13]:
#-- Visualise embeddings in 3D space
pca_plot_from_embeddings(model.embeddings.weight.detach().numpy(),
                         sectors, tickers, industries, names,  dimensions=3,
                         method='PCA')

### High Similarity Mismatch

In [41]:
threshold=0.924

#-- Get similarity matrix
similarity_matrix = cosine_similarity(model.embeddings.weight.detach().numpy())
#-- Set diagonal entries to 0 similarity so they dont get an edge
np.fill_diagonal(similarity_matrix, 0)
#-- Get a list of pairs which have a similarity over the threshold
edge_pairs = np.transpose((similarity_matrix>threshold).nonzero())

count=0
mismatch_list = []
for i,j in edge_pairs[:int(len(edge_pairs)/2)]:
    if sectors[i]==sectors[j]:
        count+=1
    else:
        mismatch_list.append((i,j))
        
print(f'Pairs of stocks with over {threshold} similarity are in the same sector {round(count/(len(edge_pairs)/2),2)*100}% of the time')

Pairs of stocks with over 0.924 similarity are in the same sector 98.0% of the time


In [42]:
#-- Mismatch examples
print("- "*20)
print("- "*20)
for i,j in mismatch_list:
    print(f'Similarity {similarity_matrix[i,j]:.2f}')
    print(names[i], " -- " ,sectors[i], " -- ",industries[i])
    print(names[j], " -- " ,sectors[j], " -- ",industries[j])
    print("="*20)

- - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - 
Similarity 0.95
KB HOME  --  CAPITAL GOODS  --  HOMEBUILDING
LENNAR CORPORATION  --  BASIC INDUSTRIES  --  HOMEBUILDING


### Sector Classification

In [69]:
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import KFold, StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score, precision_score, recall_score#, top_k_accuracy_score

def get_sector_score(model):
    embedding_matrix = model.embeddings.weight.detach().numpy()


    # X = embedding_matrix
    # y = sectors

    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

    # sm = SMOTE(random_state=42)
    # X_res, y_res = sm.fit_resample(X_train, y_train)

    # classifier = SVC(kernel='rbf', probability=True)

    # classifier.fit(X_res,y_res)

    # print(classification_report(y_test, classifier.predict(X_test)))





    #-- Cross validation approach

    accuracy_list = []
    accuracy_list_top_k = []
    k=3
    f1_list = []
    recall_list = []
    precision_list = []

    X = embedding_matrix
    y = np.expand_dims(np.array(sectors),axis=1)


    kf = StratifiedKFold(n_splits=5, shuffle=True)

    for fold, (train_index, test_index) in enumerate(kf.split(X, y.flatten())):
        X_train = X[train_index]
        y_train = y[train_index]  # Based on your code, you might need a ravel call here, but I would look into how you're generating your y
        X_test = X[test_index]
        y_test = y[test_index]  # See comment on ravel and  y_train
        sm = SMOTE()
        X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)
        classifier = SVC(kernel='rbf', probability=True)
        classifier.fit(X_train_oversampled, y_train_oversampled )  
        y_pred = classifier.predict(X_test)
        # print(f'For fold {fold}:')
        # print(f'Accuracy: {classifier.score(X_test, y_test)}')
        # print(f'f-score: {f1_score(y_test, y_pred, average="weighted")}')

        accuracy_list.append(classifier.score(X_test, y_test))
        # accuracy_list_top_k.append(top_k_accuracy_score(y_test, classifier.predict_proba(X_test),k=k))
        f1_list.append(f1_score(y_test, y_pred, average="weighted"))
        recall_list.append(recall_score(y_test, y_pred, average="weighted"))
        precision_list.append(precision_score(y_test, y_pred, average="weighted"))




    print(f'Precision Score: {np.round(np.mean(precision_list),2)}')
    print(f'Recall Score: {np.round(np.mean(recall_list),2)}')
    print(f'F1 Score: {np.round(np.mean(f1_list),2)}')
    print(f'Accuracy Score: {np.round(np.mean(accuracy_list),2)}')
    # print(f'Accuracy Score Top-{k}: {np.round(np.mean(accuracy_list_top_k),2)}')


l = 0
C = 3
IQR = True
p='daily_weekly_monthly'
weighted = False
model = torch.load(f'Models/model_CBOW_Single_{p}_L{l}_C{C}_IQRDaily{str(IQR)}_70{"_Weights" if weighted else ""}.pt')
print(f'Loaded : Models/model_CBOW_Single_{p}_L{l}_C{C}_IQRDaily{str(IQR)}_70{"_Weights" if weighted else ""}.pt')

get_sector_score(model)

Loaded : Models/model_CBOW_Single_daily_weekly_monthly_L0_C3_IQRDailyTrue_70.pt
Precision Score: 0.6
Recall Score: 0.58
F1 Score: 0.58
Accuracy Score: 0.58


In [70]:
models_dict = {'Embedding': torch.load(f'Models/model_CBOW_Single_{p}_L{0}_C{3}_IQRDaily{str(False)}_70.pt'),
                'embedding_no_IQR_weights': torch.load(f'Models/model_CBOW_Single_{p}_L{0}_C{10}_IQRDaily{str(False)}_70_weights.pt'),
                'Embedding + IQR': torch.load(f'Models/model_CBOW_Single_{p}_L{0}_C{3}_IQRDaily{str(True)}_70.pt'),
                'embedding_IQR_weights': torch.load(f'Models/model_CBOW_Single_{p}_L{0}_C{10}_IQRDaily{str(True)}_70_Weights.pt'),
                'embedding_IQR_no_weights_Hedging': torch.load(f'Models/model_CBOW_Single_{"daily"}_L{0}_C{3}_IQRDaily{str(True)}_70_HEDGING.pt'),
                'embedding_IQR_weights_Hedging': torch.load(f'Models/model_CBOW_Single_{"daily"}_L{0}_C{3}_IQRDaily{str(True)}_70_Weights_HEDGING.pt')
                }

for key,m in models_dict.items():
    print("="*50)
    print(key)
    get_sector_score(m)

### Analogies

In [75]:

embedding_matrix = model.embeddings.weight.detach().numpy()


#-- Get the average vector for various sectors

#-- Create a dictionary with the index position of all tickers of that sector
sector_idx_dict = {}
#-- Create a dictionary to store the sector embeddings
sector_embed_dict = {}
for sector in sectors:
    #-- Add indexes
    sector_idx_dict[sector]=[i for i, x in enumerate(tickers) if sectors[i]==sector]
    #-- Compute and add the average embedding for each sector
    sector_embed_dict[sector] = embedding_matrix[sector_idx_dict[sector]].mean(axis=0)
    
tick = "MSFT"
temp_sector1 = "TECHNOLOGY"
temp_sector2 = "HEALTH CARE"


embed = (embedding_matrix[ticker2idx[tick]] - sector_embed_dict[temp_sector1]) + sector_embed_dict[temp_sector2]


#-- Get cosine similarity of query embedding with each row of embedding matrix
temp_sims = cosine_similarity(np.expand_dims(embed, 0), embedding_matrix)

for idx in np.argsort(-temp_sims).flatten()[:1]:

    print("="*20)
    print(f'{tick} is to {temp_sector1} as {idx2ticker[idx]} is to Tech')
    print(tick, "--", names[ticker2idx[tick]], "--", round(temp_sims.flatten()[ticker2idx[tick]],2), "--", stock_df[stock_df.ticker==idx2ticker[ticker2idx[tick]]].sector.iloc[0], "--", stock_df[stock_df.ticker==idx2ticker[ticker2idx[tick]]].industry.iloc[0])
    print(idx2ticker[idx], "--", names[idx], "--", round(temp_sims.flatten()[idx],2), "--", stock_df[stock_df.ticker==idx2ticker[idx]].sector.iloc[0], "--", stock_df[stock_df.ticker==idx2ticker[idx]].industry.iloc[0])



MSFT is to TECHNOLOGY as MRK is to Tech
MSFT -- MICROSOFT CORPORATION -- 0.79 -- TECHNOLOGY -- COMPUTER SOFTWARE: PREPACKAGED SOFTWARE
MRK -- MERCK & COMPANY, INC. -- 0.82 -- HEALTH CARE -- MAJOR PHARMACEUTICALS


### Portfolio Hedging

In [76]:
def get_vol(tick1, tick2, plot=False):
    """ -- Simulate a portfolio with two stocks and return realised volatility --"""
    #-- Equal weight
    combined_returns = (test_returns_df[tick1].values+test_returns_df[tick2].values)/2
    #-- Simulate accumulated value of portfolio
    portfolio_evolution = np.cumprod(1+combined_returns)
    
    if plot:
        #-- Plot the portfolio evolution
        fig = go.Figure()
        fig.add_trace(go.Scatter(x=test_returns_df.index, 
                                 y=portfolio_evolution,
                                 name='Portfolio'))
        fig.add_trace(go.Scatter(x=test_returns_df.index, 
                                 y=np.cumprod(1+test_returns_df[tick1].values),
                                 name=tick1,
                                 opacity=0.6))
        fig.add_trace(go.Scatter(x=test_returns_df.index, 
                                 y=np.cumprod(1+test_returns_df[tick2].values),
                                 name=tick2,
                                 opacity=0.6))
        fig.update_layout(template='plotly_white',
                          yaxis=dict(title='Cumulative Returns'),
                          title=f'Hedged Portfolio Evolution | {tick1} - {tick2}')
        fig.show()
        
    #-- Compute volatiltiy as standard deviation of log returns
    portfolio_log_returns = [np.log(1 + (portfolio_evolution[i+1]-portfolio_evolution[i])/portfolio_evolution[i] ) for i in range(len(portfolio_evolution)-1)]
    portfolio_vol = np.std(portfolio_log_returns)*np.sqrt(252)
    return round(portfolio_vol,3)

In [77]:

def get_vol_array(similarities):
    #-- Get vol_list for a given similarity matrix

    n=50

    vol_list = []

    #-- Ensure diagonal is nan so a stock will never hedge with itself
    np.fill_diagonal(similarities,np.nan)

    #-- Get the LOWEST similarity stock for each metric
    hedge_idxs = np.argsort(similarities, axis=0)
    #-- Choose hedge stock randomly from top-n least similar stocks
    #- Prevents the same stocks being chosen again and again
    hedge_idxs = [hedge_idxs[np.random.randint(0,n),i] for i in range(len(hedge_idxs))]

    for i in range(len(tickers)):
        tick1 = tickers[i]
        tick2 = tickers[hedge_idxs[i]]


        temp_vol = get_vol(tick1, tick2, plot=False)

        vol_list.append(temp_vol)
        
        
    return np.array(vol_list)

In [78]:

from scipy.stats import spearmanr

p = 'daily_weekly_monthly'

metrics_dict = {'Embedding': cosine_similarity(torch.load(f'Models/model_CBOW_Single_{p}_L{0}_C{3}_IQRDaily{str(False)}_70.pt').embeddings.weight.detach().numpy()),
                'embedding_no_IQR_weights': cosine_similarity(torch.load(f'Models/model_CBOW_Single_{p}_L{0}_C{10}_IQRDaily{str(False)}_70_weights.pt').embeddings.weight.detach().numpy()),
                'Embedding + IQR': cosine_similarity(torch.load(f'Models/model_CBOW_Single_{p}_L{0}_C{3}_IQRDaily{str(True)}_70.pt').embeddings.weight.detach().numpy()),
                'embedding_IQR_weights': cosine_similarity(torch.load(f'Models/model_CBOW_Single_{p}_L{0}_C{10}_IQRDaily{str(True)}_70_Weights.pt').embeddings.weight.detach().numpy()),
                'embedding_IQR_no_weights_Hedging': cosine_similarity(torch.load(f'Models/model_CBOW_Single_{"daily"}_L{0}_C{3}_IQRDaily{str(True)}_70_HEDGING.pt').embeddings.weight.detach().numpy()),
                'embedding_IQR_weights_Hedging': cosine_similarity(torch.load(f'Models/model_CBOW_Single_{"daily"}_L{0}_C{3}_IQRDaily{str(True)}_70_Weights_HEDGING.pt').embeddings.weight.detach().numpy()),
                'Pearson': train_returns_df.corr().values,
                'geometric': np.genfromtxt('geometric_similarities.csv', delimiter=","),
                'spearman': spearmanr(train_returns_df.values)[0],
                #'random': np.random.rand(611,611)
                }



vol_dict = {}
for name, similarities in tqdm(metrics_dict.items()):
    vol_dict[name]=get_vol_array(similarities)



    

100%|██████████| 9/9 [00:14<00:00,  1.56s/it]


In [84]:
fig = go.Figure()
linestyles = ['dash','dot', 'dashdot', 'dash']
count = 0
for name, vols in vol_dict.items():
    if name not in ['Embedding', 'Pearson','Embedding + IQR', 'embedding_no_IQR_weights']:
        continue
    
#     fig.add_trace(go.Histogram(x=get_vol_array(similarities), name=name))
    xvals = np.linspace(0.15,0.5,200)
    kde = gaussian_kde(vols)
    fig.add_trace(go.Scatter(x=xvals, y=kde(xvals), name=name, #marker=dict(color=color),
                             opacity=1,
                             #fill='tozeroy',
                             line=dict(dash=linestyles[count])
                            ))
    count+=1
    
fig.update_layout(template='plotly_white')
fig.update_layout(xaxis=dict(title="Portfolio Volatiltiy"),
                  yaxis = dict(title="Probability Density"))

fig.update_layout(legend=dict(
                    orientation="h",
                    yanchor="bottom",
                    y=1.02,
                    xanchor="right",
                    x=0.75
                ))
fig.update_layout(height=450, width=550)

In [87]:
#-- Run experiment multiple times for HSD test
sims_dict =  {}
for name, vols in vol_dict.items():
    sims_dict[name]=[]

for i in tqdm(range(50)):
    vol_dict = {}
    for name, similarities in metrics_dict.items():
        vol_dict[name]=get_vol_array(similarities)

    for name,vols in vol_dict.items():
        sims_dict[name].append(np.mean(vols))


import statsmodels.stats.multicomp as multi
for name, mean_vols in sims_dict.items():
    print("="*40)
    print("="*40)
    print(f'{name}: {np.mean(mean_vols):.3f}')
    print(f'{"Pearson"}: {np.mean(sims_dict["Pearson"]):.3f}')
    mc = multi.MultiComparison(np.concatenate([sims_dict[name],sims_dict['Pearson']]),
                                    [name]*len(sims_dict[name])+['pears']*len(sims_dict['Pearson']))
    Results = mc.tukeyhsd(alpha=0.01)
    display(pd.DataFrame(data=Results._results_table.data[1:], columns=Results._results_table.data[0]))   


Embedding: 0.229
Pearson: 0.238


Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,Embedding,pears,0.009,0.001,0.0077,0.0102,True


embedding_no_IQR_weights: 0.227
Pearson: 0.238


Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,embedding_no_IQR_weights,pears,0.0109,0.001,0.0097,0.012,True


Embedding + IQR: 0.213
Pearson: 0.238


Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,Embedding + IQR,pears,0.0246,0.001,0.0234,0.0257,True


embedding_IQR_weights: 0.225
Pearson: 0.238


Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,embedding_IQR_weights,pears,0.0126,0.001,0.0114,0.0137,True


embedding_IQR_no_weights_Hedging: 0.195
Pearson: 0.238


Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,embedding_IQR_no_weights_Hedging,pears,0.0428,0.001,0.0417,0.0438,True


embedding_IQR_weights_Hedging: 0.215
Pearson: 0.238


Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,embedding_IQR_weights_Hedging,pears,0.0228,0.001,0.0216,0.024,True


Pearson: 0.238
Pearson: 0.238


Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,Pearson,pears,0.0,0.9,-0.0014,0.0014,False


geometric: 0.239
Pearson: 0.238


Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,geometric,pears,-0.0018,0.0011,-0.0031,-0.0004,True


spearman: 0.240
Pearson: 0.238


Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,pears,spearman,0.0026,0.001,0.0012,0.004,True
