# Generate Skip-gram Negative Sampling (SGNS) training dataset for Word2Vec

In [1]:
from w2v_data_utility import *
import os
import pickle

In [2]:
# Hyperparameters for corpus selection, vocabulary creation and SGNS sampling.
# Everything below is copied into w2v_data_and_vocab_config in the next cell.

# --- corpus selection ---
start_year = 2013
end_year = 2023  # NOTE(review): the loading cell passes this as the stop value of range()
normalisation_level = "docs_level_sentence_list_text_normalised"
min_sentence_len = 10  # forwarded to collect_usable_sentences

# --- firm-name handling (both flags are forwarded to collect_usable_sentences) ---
firm_name_replaced_with_abbrevs = True
firm_name_replaced_with_abbrevs_global = True

# --- vocabulary construction ---
n_gram_range = (1, 1)  # forwarded to generate_ngram_dataset as n_range
vocab_min_token_freq = 500  # forwarded to create_vocab as min_freq
vocab_max_token_freq = None  # forwarded to create_vocab as max_freq
max_n_top_features = None

# --- skip-gram / negative sampling ---
half_context_size = 2  # the recorded create_skipgram_dataset output reports "Window size:  5"
sgns_k = 3  # forwarded as k to the negative-sampling routine

# Candidate values for a negative-sampling method switch (no such variable is set here):
# negative_sampling_method = "selective_uniform", "scaled_unselective_unigram"

In [3]:
# Single source of truth for the run: later cells read their settings from this
# mapping, and the final cell writes it to disk as JSON next to the dataset.
w2v_data_and_vocab_config = dict(
    start_year=start_year,
    end_year=end_year,
    normalisation_level=normalisation_level,
    firm_name_replaced_with_abbrevs=firm_name_replaced_with_abbrevs,
    firm_name_replaced_with_abbrevs_global=firm_name_replaced_with_abbrevs_global,
    min_sentence_len=min_sentence_len,
    n_gram_range=n_gram_range,
    vocab_min_token_freq=vocab_min_token_freq,
    vocab_max_token_freq=vocab_max_token_freq,
    max_n_top_features=max_n_top_features,
    half_context_size=half_context_size,
    sgns_k=sgns_k,
)

In [4]:
# Load the parsed report data: for every firm directory found under
# ../../report_data, read "<firm>_data.json" and keep only the configured
# normalisation level for each year in time_range.

master_data_dict = {}
# NOTE(review): range() excludes its stop value, so the year stored as
# "end_year" is itself NOT loaded (2013..2022 with the current config) —
# confirm this is the intended meaning of end_year.
time_range = list(
    range(
        w2v_data_and_vocab_config["start_year"],
        w2v_data_and_vocab_config["end_year"],
    )
)
# Hoisted so the loop body does not repeat the same config lookup.
normalisation_level_key = w2v_data_and_vocab_config["normalisation_level"]

# Sorted so the firm order (and everything derived from it) is deterministic.
firm_list = sorted(os.listdir(Path("../../report_data")))
for firm in tqdm(firm_list, desc="Loading Parsed PDF"):
    master_data_dict[firm] = {}
    firm_level_retrieval_path = (
        Path("../../preprocessed_and_parsed_report_data") / f"{firm}_data.json"
    )
    # JSON text is UTF-8; pass the encoding explicitly instead of relying on
    # the platform default, which differs between operating systems.
    with open(firm_level_retrieval_path, "r", encoding="utf-8") as file:
        firm_dict = json.load(file)

    for year in time_range:
        year_key = str(year)  # the parsed JSON is indexed by year as a string
        master_data_dict[firm][year_key] = {
            normalisation_level_key: firm_dict[firm][year_key][normalisation_level_key]
        }

Loading Parsed PDF: 100%|██████████| 117/117 [00:22<00:00,  5.27it/s]


In [5]:
# Collect the usable sentences from the parsed data.
# The four config-backed options share their names with the keyword arguments
# of collect_usable_sentences, so they are forwarded straight from the config.
training_data = collect_usable_sentences(
    master_data_dict,
    time_range,
    firm_list,
    verbose=1,
    **{
        option: w2v_data_and_vocab_config[option]
        for option in (
            "normalisation_level",
            "min_sentence_len",
            "firm_name_replaced_with_abbrevs",
            "firm_name_replaced_with_abbrevs_global",
        )
    },
)

Processing Firms: 100%|██████████| 117/117 [00:11<00:00,  9.88it/s]

Number of missing years: 0
Number of usable sentences: 3695273





In [6]:
# Build the n-gram dataset from the collected sentences, using the configured
# n-gram range.
generated_ngram_dataset = generate_ngram_dataset(
    training_data,
    n_range=w2v_data_and_vocab_config["n_gram_range"],
)

In [7]:
# Compute per-token frequency and proportion over the n-gram dataset;
# verbose=1 prints the token totals shown below the cell.
unique_text_list_with_freq_prop_dict = calculate_frequency_and_proportion(
    generated_ngram_dataset,
    verbose=1,
)

Total number of tokens:  76193097
Total number of unique tokens:  154836


In [8]:
# Create the vocabulary from the token statistics, using the frequency bounds
# and feature cap stored in the config. No OOV token is created.
(
    word_to_idx_dict,
    idx_to_word_dict,
    vocab_list,
    vocab_stats_dict,
) = create_vocab(
    unique_text_list_with_freq_prop_dict,
    min_freq=w2v_data_and_vocab_config["vocab_min_token_freq"],
    max_freq=w2v_data_and_vocab_config["vocab_max_token_freq"],
    max_n_top_features=w2v_data_and_vocab_config["max_n_top_features"],
    create_oov_token=False,
    verbose=1,
)

Vocabulary size:  6045


In [9]:
# Strip out-of-vocabulary tokens from the sentences; word_to_idx_dict is the
# vocabulary lookup handed to the helper.
generated_ngram_dataset_no_oov = remove_oov_from_sentences(
    generated_ngram_dataset,
    word_to_idx_dict,
)

In [10]:
# Create the skip-gram dataset from the OOV-free sentences.
# The helper also returns the sentences it actually used, which the next cell
# feeds into calculate_post_vocab_proportion.
(
    input_list_skipgram,
    output_list_skipgram,
    sentence_used_to_create_dataset,
) = create_skipgram_dataset(
    sentence_list=generated_ngram_dataset_no_oov,
    word_to_idx_dict=word_to_idx_dict,
    half_context_size=w2v_data_and_vocab_config["half_context_size"],
    verbose=1,
)

Window size:  5
Number of context sentence for training:  234178456


In [11]:
# Update the vocabulary statistics using only the sentences that were used to
# build the skip-gram dataset.
# NOTE(review): this rebinds vocab_stats_dict from its own previous value —
# confirm the helper is safe to re-run without re-running create_vocab first.
vocab_stats_dict = calculate_post_vocab_proportion(
    vocab_stats_dict,
    sentence_used_to_create_dataset,
)

In [12]:
# Pass the vocabulary statistics through distribution_scaling (a helper from
# w2v_data_utility; its behaviour is not visible in this notebook).
# NOTE(review): the cell rebinds vocab_stats_dict from its own previous value,
# so running it twice may apply the scaling twice — confirm it is idempotent.
vocab_stats_dict = distribution_scaling(vocab_stats_dict)

In [13]:
# Record positive/negative output indices in the vocabulary statistics.
# Arguments are positional, so their order must stay exactly as below
# (outputs first, then inputs).
vocab_stats_dict = find_all_positive_negative_output_idx(
    output_list_skipgram, input_list_skipgram, vocab_stats_dict,
    idx_to_word_dict, word_to_idx_dict,
)

In [14]:
# Prepare the inputs consumed by the numba negative-sampling routine in the
# next cell.
(
    neg_output_indices,
    neg_output_data,
    input_list_skipgram_array,
) = negative_sampling_using_numba_data_prep(
    vocab_stats_dict,
    idx_to_word_dict,
    input_list_skipgram,
)

In [15]:
# Draw the negative samples with the uniform-distribution numba sampler, using
# the configured k.
# NOTE(review): no random seed is set anywhere in this notebook — presumably the
# samples differ between runs; confirm whether reproducibility is required.
negative_samples_list_skipgram = create_skipgram_negative_samples_from_uniform_dist_numba(
    neg_output_indices,
    neg_output_data,
    input_list_skipgram_array,
    k=w2v_data_and_vocab_config["sgns_k"],
)

In [None]:
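# # Disabled spot-check of the negative samples (this cell has no execution count).
# # NOTE(review): before re-enabling, resolve two inconsistencies in the code below:
# #   1. the first print is labelled 'output:' but shows the *input* word of the sample;
# #   2. the second print looks up input_list_skipgram[idx], treating the sampled value
# #      as a position in the training list, whereas the membership tests treat the
# #      same value as a vocabulary index (idx_to_word_dict[idx] looks intended — confirm).
# # check if sampling is done correctly
# for ith_idx in np.random.randint(0, len(input_list_skipgram), 10):
#     for idx in negative_samples_list_skipgram[ith_idx]:
#         print(f'output: {idx_to_word_dict[input_list_skipgram[ith_idx]]}')
#         print(f'Sampled negative output: {idx_to_word_dict[input_list_skipgram[idx]]}')

#         if idx in vocab_stats_dict[idx_to_word_dict[input_list_skipgram[ith_idx]]]["pos_output_set"]:
#             print("-----SAMPLING DONE INCORRECTLY-----")
#         if idx in vocab_stats_dict[idx_to_word_dict[input_list_skipgram[ith_idx]]]["neg_output_set"]:
#             print("-----PASS-----")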

In [17]:
# Attach summary statistics of the generated dataset to the config so they are
# written out together with the hyperparameters.
w2v_data_and_vocab_config["training_data_stats"] = dict(
    n_usable_sentences=len(training_data),
    vocab_size=len(vocab_list),
    n_samples=len(output_list_skipgram),
)

In [18]:
# Persist the vocabulary, the SGNS training data and the run metadata.

# Create the target directory up front: without it the first open() raises
# FileNotFoundError and nothing is saved, after all the expensive steps above
# have already run.
output_dir = Path("../../w2v_sgns_data")
output_dir.mkdir(parents=True, exist_ok=True)

# vocab related data
with open(output_dir / "vocab_list.pkl", "wb") as f:
    pickle.dump(vocab_list, f)
with open(output_dir / "vocab_stats_dict.pkl", "wb") as f:
    pickle.dump(vocab_stats_dict, f)
with open(output_dir / "word_to_idx_dict.json", "w") as f:
    json.dump(word_to_idx_dict, f, indent=4)
# NOTE(review): JSON object keys are always strings, so if idx_to_word_dict is
# keyed by int indices they come back as str after json.load — confirm that
# whatever reads this file converts them.
with open(output_dir / "idx_to_word_dict.json", "w") as f:
    json.dump(idx_to_word_dict, f, indent=4)

# training data
with open(output_dir / "input_list_skipgram.pkl", "wb") as f:
    pickle.dump(input_list_skipgram, f)
with open(output_dir / "output_list_skipgram.pkl", "wb") as f:
    pickle.dump(output_list_skipgram, f)
with open(output_dir / "negative_samples_list_skipgram.pkl", "wb") as f:
    pickle.dump(negative_samples_list_skipgram, f)

# metadata of data vocab and training data
# (tuples such as n_gram_range are written as JSON arrays)
with open(output_dir / "w2v_data_and_vocab_config.json", "w") as f:
    json.dump(w2v_data_and_vocab_config, f, indent=4)