# Word2Vec Embedding Analysis 

In [1]:
from w2v_model_and_trainer_utility import *
from pathlib import Path
import os

In [2]:
# Load the configuration, the trained output embedding and the vocabulary
# lookups of the "sgns_global_replacement" Word2Vec run.
# The run directory is defined once so the config and the embedding file are
# guaranteed to come from the same run.
w2v_run_dir = Path("../../embedding_data/w2v/w2v_runs/sgns_global_replacement/")

# Decode the JSON config explicitly as UTF-8 instead of relying on the
# platform's default locale encoding.
with open(w2v_run_dir / "data_and_model_config.json", "r", encoding="utf-8") as f:
    data_and_model_config = json.load(f)

# Training epoch whose saved output embedding is analysed below.
epoch = 2

trained_w2v_full_embedding = torch.load(
    w2v_run_dir / f"w2v_epoch_{epoch}_output_embedding.pth"
)

# load_resources_sgns() returns an 8-tuple; only the word->index and
# index->word mappings (4th and 5th items) are kept here.
_, _, _, word_to_idx_dict, idx_to_word_dict, _, _, _ = load_resources_sgns()

In [3]:
# Build the similarity matrix that the top-n similar word searches below
# take as their last argument.
# NOTE(review): presumably pairwise cosine similarity between the embedding
# rows (going by the variable name and the table captions) — confirm against
# compute_similarity_matrix in w2v_model_and_trainer_utility.
trained_w2v_full_embedding_cosine_similarity_matrix = compute_similarity_matrix(
    trained_w2v_full_embedding
)

## Currency Top Similar Examples

In [4]:
# Currency codes whose most similar vocabulary words are tabulated.
currency_related_term_to_search = ["aud", "cad", "gbp", "hkd", "inr", "jpy", "cny", "sgd"]

# Look up the top 3 most similar words for every currency code.
w2v_currency_examples_df = search_top_n_similar_words_to_df(
    currency_related_term_to_search,
    3,
    word_to_idx_dict,
    idx_to_word_dict,
    trained_w2v_full_embedding_cosine_similarity_matrix,
)

# Export the result as a LaTeX table for the report.
# Missing cells are rendered as empty strings.
currency_table_styler = w2v_currency_examples_df.style.format(na_rep="")
# The "small" entry is appended to (not replacing) any existing table styles.
# NOTE(review): presumably emitted as a font-size command in the LaTeX table
# environment; an alternative entry that was tried is
#   {'selector': 'fontsize', 'props': ':{11pt}{13pt}\selectfont;'}
currency_table_styler = currency_table_styler.set_table_styles(
    [{"selector": "small", "props": ":;"}],
    overwrite=False,
)
# One left-aligned index column followed by one centred column per currency.
currency_column_count = len(w2v_currency_examples_df.columns)
currency_table_styler.to_latex(
    buf="../../../tex_source/tables/tables_result/w2v_currency_example_table.tex",
    caption="Word2Vec Currency Top Similar Word Examples (Cosine Similarity)",
    label="tab:w2v_currency_example_table",
    encoding="utf-8",
    hrules=True,
    position_float="centering",
    position="H",
    column_format="l" + "c" * currency_column_count,
)

# Show the table as the cell output.
w2v_currency_examples_df

Unnamed: 0,aud,cad,cny,gbp,hkd,inr,jpy,sgd
Top 1,nzd,nzd,inr,jpy,inr,hkd,gbp,nzd
Top 2,hkd,hkd,hkd,aud,nzd,nzd,nzd,inr
Top 3,inr,inr,europ,hkd,cny,cny,aud,aud


## Financial Term Top Similar Examples

In [5]:
# Financial vocabulary whose most similar words are tabulated.
financial_term_to_search = [
    "security", "underwrite", "asset", "liability",
    "inflation", "default", "derivative",
]

# Look up the top 3 most similar words for every financial term.
w2v_financial_terms_examples_df = search_top_n_similar_words_to_df(
    financial_term_to_search,
    3,
    word_to_idx_dict,
    idx_to_word_dict,
    trained_w2v_full_embedding_cosine_similarity_matrix,
)

# Export the result as a LaTeX table for the report.
# Missing cells are rendered as empty strings.
financial_terms_table_styler = w2v_financial_terms_examples_df.style.format(na_rep="")
# The "small" entry is appended to (not replacing) any existing table styles.
# NOTE(review): presumably emitted as a font-size command in the LaTeX table
# environment; an alternative entry that was tried is
#   {'selector': 'fontsize', 'props': ':{11pt}{13pt}\selectfont;'}
financial_terms_table_styler = financial_terms_table_styler.set_table_styles(
    [{"selector": "small", "props": ":;"}],
    overwrite=False,
)
# One left-aligned index column followed by one centred column per term.
financial_terms_column_count = len(w2v_financial_terms_examples_df.columns)
financial_terms_table_styler.to_latex(
    buf="../../../tex_source/tables/tables_result/w2v_financial_terms_example_table.tex",
    caption="Word2Vec Financial Term Top Similar Word Examples (Cosine Similarity)",
    label="tab:w2v_financial_terms_example_table",
    encoding="utf-8",
    hrules=True,
    position_float="centering",
    position="H",
    column_format="l" + "c" * financial_terms_column_count,
)

# Show the table as the cell output.
w2v_financial_terms_examples_df

Unnamed: 0,asset,default,derivative,inflation,liability,security,underwrite
Top 1,investment,collateral,instrument,volatility,loss,asset,underwriter
Top 2,value,counterparty,trading,uncertainty,asset,investment,reinsure
Top 3,credit,exposure,hedge,yield,interest,interest,bookrunner


## Financial Institution Top Similar Examples

In [12]:
# Firm tokens (firm ticker plus the "_token" marker) whose most similar
# vocabulary words are tabulated.
financial_institution_to_search = [
    "jpm_token", "hsbc_token", "mufg_token", "anz_token", "g_token",
    "boc_token", "rbc_token", "db_token", "blk_token", "san_token",
    "aig_token", "bnp_token", "kbfg_token",
]

# Look up the top 3 most similar words for every firm token.
w2v_financial_institution_examples_df = search_top_n_similar_words_to_df(
    financial_institution_to_search,
    3,
    word_to_idx_dict,
    idx_to_word_dict,
    trained_w2v_full_embedding_cosine_similarity_matrix,
)
# Bare expression kept from the original cell; it is not the last statement,
# so it is presumably not rendered as output.
w2v_financial_institution_examples_df


def _strip_token_marker(cell):
    """Return *cell* without the "_token" marker; non-string cells pass through."""
    if isinstance(cell, str):
        return cell.replace("_token", "")
    return cell


# Drop the "_token" marker from the column headers and from every cell so the
# exported table shows plain firm tickers.
w2v_financial_institution_examples_df.columns = [
    column_name.replace("_token", "")
    for column_name in w2v_financial_institution_examples_df.columns
]
w2v_financial_institution_examples_df = w2v_financial_institution_examples_df.map(
    _strip_token_marker
)

# Export the result as a LaTeX table for the report.
# Missing cells are rendered as empty strings.
financial_institution_table_styler = w2v_financial_institution_examples_df.style.format(
    na_rep=""
)
# The "small" entry is appended to (not replacing) any existing table styles.
# NOTE(review): presumably emitted as a font-size command in the LaTeX table
# environment; an alternative entry that was tried is
#   {'selector': 'fontsize', 'props': ':{11pt}{13pt}\selectfont;'}
financial_institution_table_styler = financial_institution_table_styler.set_table_styles(
    [{"selector": "small", "props": ":;"}],
    overwrite=False,
)
# One left-aligned index column followed by one centred column per firm.
financial_institution_column_count = len(w2v_financial_institution_examples_df.columns)
financial_institution_table_styler.to_latex(
    buf="../../../tex_source/tables/tables_result/w2v_financial_institution_example_table.tex",
    caption="Word2Vec Financial Institution Top Similar Word Examples (Cosine Similarity)",
    label="tab:w2v_financial_institution_example_table",
    encoding="utf-8",
    hrules=True,
    position_float="centering",
    position="H",
    column_format="l" + "c" * financial_institution_column_count,
)

# Show the cleaned table as the cell output.
w2v_financial_institution_examples_df

Unnamed: 0,aig,anz,blk,bnp,boc,db,g,hsbc,jpm,kbfg,mufg,rbc,san
Top 1,met,wbc,pnc,ca,icbc,barc,az,barc,boa,wfg,mfg,td,bbva
Top 2,citi,nab,aig,sg,citic,citi,axa,nwrbs,gs,hfg,nmr,cibc,ing
Top 3,bnym,mqg,citi,bpce,bocom,ubs,av,ing,wfc,sfg,smbc,bmo,ca


# Extract Embedding Vectors of Firm Tokens for Clustering

## Firm Name Global Replacement

In [7]:
# Extract the embedding rows of the firm tokens from the trained
# "sgns_global_replacement" Word2Vec run and pickle them as a NumPy array
# with one row per firm.
# The run directory is defined once so the config, the embedding and the
# exported pickle all belong to the same run.
w2v_run_dir = Path("../../embedding_data/w2v/w2v_runs/sgns_global_replacement/")

# Decode the JSON config explicitly as UTF-8 instead of relying on the
# platform's default locale encoding.
with open(w2v_run_dir / "data_and_model_config.json", "r", encoding="utf-8") as f:
    data_and_model_config = json.load(f)

# Training epoch whose saved output embedding is exported.
epoch = 2

trained_w2v_full_embedding = torch.load(
    w2v_run_dir / f"w2v_epoch_{epoch}_output_embedding.pth"
)
# load_resources_sgns() returns an 8-tuple; only the word->index mapping
# (4th item) is needed here.
_, _, _, word_to_idx_dict, _, _, _, _ = load_resources_sgns()

# Sub-vocabulary restricted to entries that carry the "_token" marker.
word_to_idx_dict_firm_name_only = {
    word: idx for word, idx in word_to_idx_dict.items() if "_token" in word
}

# Firm identifiers are taken from the entry names of the report data directory.
firm_list = os.listdir(Path("../../report_data"))
try:
    # Drop the macOS Finder metadata file if it is present.
    firm_list.remove(".DS_Store")
except ValueError:
    # list.remove raises ValueError when the entry is absent; nothing to drop.
    pass
# Sort so that row i of the exported matrix corresponds to firm_list[i] in a
# reproducible order.
firm_list.sort()

# Fail early with an explicit message when a firm has no token in the
# vocabulary, instead of a bare KeyError in the middle of the copy loop.
missing_firm_tokens = [
    firm for firm in firm_list if f"{firm}_token" not in word_to_idx_dict
]
if missing_firm_tokens:
    raise KeyError(
        f"No '<firm>_token' vocabulary entry for: {missing_firm_tokens}"
    )

w2v_embedding_firm_token_only = torch.empty(
    size=(len(firm_list), data_and_model_config["embedding_dim"]), dtype=torch.float32
)

# Copy each firm token's embedding row into the row matching its position
# in the sorted firm list.
for i, firm in enumerate(firm_list):
    w2v_embedding_firm_token_only[i, :] = trained_w2v_full_embedding[
        word_to_idx_dict[f"{firm}_token"], :
    ]
w2v_embedding_firm_token_only = w2v_embedding_firm_token_only.numpy()

with open(w2v_run_dir / "w2v_embedding_firm_token_only.pkl", "wb") as f:
    pickle.dump(w2v_embedding_firm_token_only, f)

## No Firm Name Global Replacement

In [8]:
# with open(Path("../../embedding_data/w2v/w2v_runs/sgns_no_global_replacement/data_and_model_config.json"), "r") as f:
#     data_and_model_config = json.load(f)

# epoch = 2

# trained_w2v_full_embedding =  torch.load(Path("../../embedding_data/w2v/w2v_runs/sgns_no_global_replacement/")/(f"w2v_epoch_{epoch}_output_embedding.pth"))
# _, _, _, word_to_idx_dict, _, _, _, _ = load_resources_sgns()

# word_to_idx_dict_firm_name_only ={}
# for word in word_to_idx_dict.keys():
#     if "_token" in word:
#         word_to_idx_dict_firm_name_only[word] = word_to_idx_dict[word]

# firm_list = os.listdir(Path("../../report_data"))
# try:
#     firm_list.remove(".DS_Store")
# except:
#     pass
# firm_list.sort() # ensure the order of the embedding D0 is the same as the order of the firm_list
# w2v_embedding_firm_token_only = torch.empty(size=(len(firm_list), data_and_model_config["embedding_dim"]), dtype=torch.float32)

# for i, firm in enumerate(firm_list):
#     w2v_embedding_firm_token_only[i,:] = trained_w2v_full_embedding[word_to_idx_dict[f"{firm}_token"],:]
# w2v_embedding_firm_token_only = w2v_embedding_firm_token_only.numpy()

# with open(Path("../../embedding_data/w2v/w2v_runs/sgns_no_global_replacement/w2v_embedding_firm_token_only.pkl"), "wb") as f:
#     pickle.dump(w2v_embedding_firm_token_only, f)