# Approach for Converting Numbers to Continuous Variables

In [1]:
import sys
import os

# Resolve the notebook's folder (relative to the current working directory),
# then derive the project root one level up and the two package folders in it.
notebook_path = os.path.abspath('number_encoder.ipynb')
script_dir = os.path.dirname(notebook_path)
parent_directory = os.path.dirname(script_dir)
module_directory, utils_directory = (
    os.path.join(parent_directory, package) for package in ('module', 'utils')
)

# Append each folder to sys.path at most once, so re-running this cell does
# not pile up duplicate entries.
for search_dir in (parent_directory, module_directory, utils_directory):
    if search_dir not in sys.path:
        sys.path.append(search_dir)

from utils import config
from module.preprocess.bpe import Encoder
from module.preprocess.load_and_batch import DataBatcher
from module.architecture.phase import InputPhase
from module.architecture.comps import MetaDataTokens
from module.architecture.dataclasses import ModelArgs

# Loading dataset and showing examples of tabular rows that are converted to text

In [2]:
def dataset_loader() -> DataBatcher:
    """Return a DataBatcher, restoring saved state when it is available.

    The batcher first tries to load its state from ``config.BASE_LOCATION``.
    If its ``nones`` flag is still truthy afterwards (nothing usable on file),
    the raw CSV files are loaded and processed with the settings in
    ``config.DATASET_CONFIG`` and the resulting state is saved back to
    ``config.BASE_LOCATION``.

    Returns:
        DataBatcher: the loaded (or freshly processed) batcher.
    """
    batcher = DataBatcher()
    batcher.load_state(config.BASE_LOCATION)

    # Saved state was found: nothing left to do.
    if not batcher.nones:
        return batcher

    dataset_cfg = config.DATASET_CONFIG
    majority_file = f"dataset_{dataset_cfg['split_to_load']}.csv"

    batcher.load_and_process(
        base_loc=config.DATA_LOCATION,
        minority_loc=config.SPLIT_DATASETS + 'target.csv',
        majority_loc=config.SPLIT_DATASETS + majority_file,
        train_test_split=dataset_cfg['train_test_split'],
        validation_split=dataset_cfg['validation_split'],
        training=dataset_cfg['training_stage'],
    )

    # Persist the processed data so the next run can skip the step above.
    batcher.save_state(config.BASE_LOCATION)
    return batcher

preprocess = dataset_loader()
train_shape = preprocess.train.shape
print(f"----> Size of training set: {train_shape}")

# Draw one batch from the validation split; "case_id" is passed in the
# ignore list and "WEEK_NUM" / "target" in the output list.
batch = preprocess.get_meta_data(
    batch_size=config.DATASET_CONFIG["batch"],
    data_type="valid",
    ignore_list=["case_id"],
    output_list=["WEEK_NUM", "target"],
)

# Restore the already-trained BPE encoder from the state saved under
# config.BASE_LOCATION.
encoder = Encoder(None)
encoder.load_state(config.BASE_LOCATION)

# Print the first and the last text row of the batch and run each one
# through the encoder.
# NOTE(review): the meaning of the second positional argument (True) is not
# visible here; confirm against Encoder.encode_text.
for sample_idx in (0, -1):
    print(batch.texts[sample_idx])
    aaa = encoder.encode_text(batch.texts[sample_idx], True)

----> Size of training set: (1068660, 223)
Empty instantiation. Ensure to load from pickle file location
actualdpdtolerance_344P is empty, amtinstpaidbefduel24m_4187115A is empty, annuity_780A is <|AMOUNT|>, annuitynextmonth_57A is <|AMOUNT|>, applicationcnt_361L is <|NUM|>, applications30d_658L is <|NUM|>, applicationscnt_1086L is <|NUM|>, applicationscnt_464L is <|NUM|>, applicationscnt_629L is <|NUM|>, applicationscnt_867L is <|NUM|>, avgdbddpdlast24m_3658932P is empty, avgdbddpdlast3m_4187120P is empty, avgdbdtollast24m_4525197P is empty, avgdpdtolclosure24_3658938P is empty, avginstallast24m_3658937A is empty, avglnamtstart24m_4525187A is empty, avgmaxdpdlast9m_3716943P is empty, avgoutstandbalancel6m_4187114A is empty, avgpmtlast12m_4525200A is empty, bankacctype_710L is empty, cardtype_51L is empty, clientscnt12m_3712952L is <|NUM|>, clientscnt3m_3712950L is <|NUM|>, clientscnt6m_3712949L is <|NUM|>, clientscnt_100L is <|NUM|>, clientscnt_1022L is <|NUM|>, clientscnt_1071L is <|

In [3]:
# Model configuration used by the embedding demos in the following cells.
args = ModelArgs(
    # --- sizes ---
    dim=8,
    n_layers=1,
    n_heads=8,
    n_kv_heads=None,
    multiple_of=128,
    ffn_dim_multiplier=None,
    num_target_classes=2,
    # --- values taken from the loaded encoder and the project config ---
    vocab_size=encoder.vocab_size,
    batch_size=config.DATASET_CONFIG["batch"],
    seq_len=config.MODEL_ARGS['max_seq_len'],
    # --- normalisation epsilon and dropout rates ---
    norm_eps=1e-05,
    attn_dropout_rate=0.1,
    proj_dropout_rate=0.5,
    # --- runtime ---
    use_amp=False,
    device=None,
)

## Single Token Embeddings

**Overview**
The neural network is configured to learn embeddings for dates, numeric values, and currencies. These embeddings are crucial for capturing semantic relationships in data.

**Embedding Vectors**
- **Directional Representation**: Embedding vectors are particularly effective at representing directional relationships. To leverage this, special care is taken during the preprocessing stage to enforce scale consistency across different types of data.
- **Fixed Dimensionality**: Each tag or token type (dates, numbers, currencies) is processed at a fixed height, facilitating uniformity that aids convolutional layers in detecting and learning patterns effectively.
- **Data Types**: All numeric data is either floated or follows structured patterns, ensuring consistent handling during the embedding process.

**Training Goals**
- **Semantic Clustering**: The objective is for numbers with similar values to cluster in the same direction within the embedding space. This directional similarity should reflect the actual numerical closeness, enhancing the model's ability to interpret and utilize numerical data effectively.


<figure>
    <img src="images/Encodings.png" alt="Centered Image" style="display: block; margin-left: auto; margin-right: auto; width:85%;" />
    <figcaption style="text-align: center;">Approach for Encoding Numeric Input.</figcaption>
</figure>

In [4]:
# Build the metadata-token module for the "amount" field, using the model's
# embedding width.
modd = MetaDataTokens(embedd_dim=args.dim, focus_data="amount", eps=1e-5, device=None)
print(f"Length to embedd: {len(batch.get_array('amount'))}")
print('---------------------------------------------')
out = modd(batch.get_array("amount"))

# Preview at most the first six entries; pairing with range(6) ends the loop
# without a manual counter check and break.
for idx, (key, val) in zip(range(6), out.items()):
    rounded = [round(num, 3) for num in val.tolist()[0]]
    print(f"Embeddings for {key} is:\n{rounded}")

Length to embedd: 44
---------------------------------------------
Embeddings for 20820.0 is:
[-0.217, -1.004, -2.231, -0.813, 1.106, 2.747, 1.068, 0.486]
Embeddings for 42853.6 is:
[0.013, -0.904, -1.226, 0.016, 0.356, 2.02, -0.548, 0.196]
Embeddings for 23176.0 is:
[-1.177, -0.428, -2.167, -0.152, 0.609, 2.141, 0.789, -0.025]
Embeddings for 54769.824 is:
[-0.668, -1.091, -1.312, 0.299, 0.155, 0.981, -0.57, -0.178]
Embeddings for 93166.0 is:
[-1.08, -1.025, -1.821, -0.062, 0.644, 1.656, 0.841, 0.215]
Embeddings for 13121.601 is:
[-1.221, -1.225, -1.851, 0.225, 0.016, 2.084, 0.897, 0.017]


# Batch Embeddings

In [5]:
# Build the input phase from the model args and the BPE encoder, then run the
# whole batch through it and report the resulting shape.
inp_emb = InputPhase(args, encoder)
out = inp_emb(batch)
converted_shape = out.shape
print(f"Shape of converted input: {converted_shape}")

Shape of converted input: torch.Size([5, 920, 8])
