In [1]:
# Notebook initialization for consistent paths (repo-aware)
import os, sys, pathlib

# Resolve repo root by walking up until we find 'src'
CWD = pathlib.Path.cwd()
ROOT = CWD
for _ in range(6):
    if (ROOT / 'src').exists():
        break
    ROOT = ROOT.parent
# Fallback to current if not found
if not (ROOT / 'src').exists():
    ROOT = CWD

PROJECT_ROOT = ROOT.resolve()
PROJECT_SRC = PROJECT_ROOT / 'src'
DATA_DIR = PROJECT_SRC / 'data'
RAW_DATA_DIR = DATA_DIR / 'raw'
PROCESSED_DATA_DIR = DATA_DIR / 'processed'
RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

if str(PROJECT_SRC) not in sys.path:
    sys.path.insert(0, str(PROJECT_SRC))

print(f'PROJECT_ROOT={PROJECT_ROOT}')
print(f'PROJECT_SRC={PROJECT_SRC}')
print(f'RAW_DATA_DIR={RAW_DATA_DIR}')
print(f'PROCESSED_DATA_DIR={PROCESSED_DATA_DIR}')

PROJECT_ROOT=/Users/pierce.bucknerwolfso/Desktop/embeddings_paper/embeddings-service-exploration
PROJECT_SRC=/Users/pierce.bucknerwolfso/Desktop/embeddings_paper/embeddings-service-exploration/src
RAW_DATA_DIR=/Users/pierce.bucknerwolfso/Desktop/embeddings_paper/embeddings-service-exploration/src/data/raw
PROCESSED_DATA_DIR=/Users/pierce.bucknerwolfso/Desktop/embeddings_paper/embeddings-service-exploration/src/data/processed


In [2]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import torch
import gc
import math

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_csv(f'{PROCESSED_DATA_DIR}/final_features_for_embeddings.csv')
# Show all columns
pd.set_option('display.max_columns', None)

# Show all rows
pd.set_option('display.max_rows', None)

# Set the maximum column width to display full content
pd.set_option('display.max_colwidth', None)

# Set the maximum sequence items to display full list/array content
pd.set_option('display.max_seq_items', None)


  df = pd.read_csv(f'{PROCESSED_DATA_DIR}/final_features_for_embeddings.csv')


In [4]:
print(df.shape)
df.columns

(1369565, 16)


Index(['addr_state', 'earliest_cr_line', 'emp_length', 'emp_title',
       'sub_grade', 'title', 'zip_code', 'avg_cur_bal', 'dti',
       'fico_range_high', 'int_rate', 'loan_amnt', 'mort_acc', 'num_op_rev_tl',
       'revol_util', 'loan_outcome'],
      dtype='object')

In [5]:
# numerical vs catagorical/text columns
numerical_cols = ['loan_amnt', 'int_rate', 'dti', 'avg_cur_bal', 'revol_util', 'num_op_rev_tl', 'mort_acc', 'fico_range_high']
string_cols = ['title', 'emp_title', 'emp_length', 'addr_state', 'sub_grade', 'earliest_cr_line', 'zip_code']

print(numerical_cols)
print(string_cols)

['loan_amnt', 'int_rate', 'dti', 'avg_cur_bal', 'revol_util', 'num_op_rev_tl', 'mort_acc', 'fico_range_high']
['title', 'emp_title', 'emp_length', 'addr_state', 'sub_grade', 'earliest_cr_line', 'zip_code']


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1369565 entries, 0 to 1369564
Data columns (total 16 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   addr_state        1369565 non-null  object 
 1   earliest_cr_line  1369565 non-null  object 
 2   emp_length        1369565 non-null  object 
 3   emp_title         1369565 non-null  object 
 4   sub_grade         1369565 non-null  object 
 5   title             1369565 non-null  object 
 6   zip_code          1369565 non-null  int64  
 7   avg_cur_bal       1369565 non-null  object 
 8   dti               1369565 non-null  object 
 9   fico_range_high   1369565 non-null  float64
 10  int_rate          1369565 non-null  float64
 11  loan_amnt         1369565 non-null  int64  
 12  mort_acc          1369565 non-null  object 
 13  num_op_rev_tl     1369565 non-null  object 
 14  revol_util        1369565 non-null  object 
 15  loan_outcome      1369565 non-null  int64  
dtype

In [7]:
class DICE1:
    '''
    DICE class turns numbers into their respective DICE embeddings
    
    Since the cosine function decreases monotonically between 0 and pi, simply employ a linear mapping
    to map distances s_n \in [0, |a-b|] to angles \theta \in [0, pi]
    '''
    def __init__(self, d=2, min_bound=0, max_bound=100, norm="l2"):
        self.d = d # By default, we build DICE-2
        self.min_bound = min_bound
        self.max_bound = max_bound
        self.norm = norm  # Restrict x and y to be of unit length
        self.M = np.random.normal(0, 1, (self.d, self.d))
        self.Q, self.R = np.linalg.qr(self.M, mode="complete")  # QR decomposition for orthonormal basis, Q
    
    def __linear_mapping(self, num):
        '''Eq. (4) from DICE'''
        norm_diff = num / abs(self.min_bound - self.max_bound)
        theta = norm_diff * math.pi
        return theta
    
    def make_dice(self, num):
        r = 1
        theta = self.__linear_mapping(num)
        if self.d == 2:
            # DICE-2
            polar_coord = np.array([r*math.cos(theta), r*math.sin(theta)])
        elif self.d > 2:
            # DICE-D
            polar_coord = np.array([math.sin(theta)**(dim-1) * math.cos(theta) if dim < self.d else math.sin(theta)**(self.d) for dim in range(1, self.d+1)])
        else:
            raise ValueError("Wrong value for `d`. `d` should be greater than or equal to 2.")
            
        dice = np.dot(self.Q, polar_coord)  # DICE-D embedding for `num`
        
        # return dice.tolist()
        return dice

class DICE:
    '''
    DICE class turns numbers into their respective DICE embeddings
    
    Since the cosine function decreases monotonically between 0 and pi, simply employ a linear mapping
    to map distances s_n \in [0, |a-b|] to angles \theta \in [0, pi]
    '''
    def __init__(self, d=2, min_bound=0, max_bound=100, norm="l2", seed: int = 13):
        # Minimal POC tweaks:
        # - deterministic basis via fixed seed
        # - keep API mostly the same
        if d < 2:
            raise ValueError("Wrong value for `d`. `d` should be greater than or equal to 2.")
        self.d = int(d)  # By default, we build DICE-2
        self.min_bound = float(min_bound)
        self.max_bound = float(max_bound)
        self.norm = norm  # If "l2", return a unit vector
        self.seed = int(seed)

        rng = np.random.default_rng(self.seed)
        self.M = rng.normal(0.0, 1.0, (self.d, self.d))
        self.Q, self.R = np.linalg.qr(self.M, mode="complete")  # Orthonormal basis Q
    
    def __linear_mapping(self, num):
        """Map value linearly from [min_bound, max_bound] to angle in [0, pi].
        Clamps outside values. Guards against zero range.
        """
        span = self.max_bound - self.min_bound
        if span == 0:
            return 0.0
        t = (float(num) - self.min_bound) / span
        # clamp to [0, 1]
        if t < 0.0:
            t = 0.0
        elif t > 1.0:
            t = 1.0
        theta = t * math.pi
        return theta
    
    def make_dice(self, num):
        r = 1.0
        theta = self.__linear_mapping(num)
        if self.d == 2:
            # DICE-2
            polar_coord = np.array([r * math.cos(theta), r * math.sin(theta)], dtype=np.float32)
        elif self.d > 2:
            # DICE-D
            polar_coord = np.array([
                (math.sin(theta) ** (dim - 1)) * math.cos(theta) if dim < self.d else (math.sin(theta) ** (self.d))
                for dim in range(1, self.d + 1)
            ], dtype=np.float32)
        else:
            # Guarded in __init__, but keep for safety
            raise ValueError("Wrong value for `d`. `d` should be greater than or equal to 2.")

        dice = np.dot(self.Q.astype(np.float32, copy=False), polar_coord)  # DICE-D embedding for `num`

        if self.norm == "l2":
            n = float(np.linalg.norm(dice))
            if n > 0:
                dice = dice / n

        return dice.astype(np.float32, copy=False)

In [8]:
# DICE Embeddings examples
d = DICE(d=10, min_bound=0, max_bound=10, norm="l2", seed=42)
d1 = DICE1(d=10, min_bound=0, max_bound=10)

print(d.make_dice(10))
print(d1.make_dice(10))

print(d.make_dice(10))
print(d1.make_dice(10))

print(d.make_dice(10))
print(d1.make_dice(10))

[ 0.09091438  0.26237428 -0.05515493  0.6389749   0.22175486  0.08626071
 -0.5020955  -0.38060957  0.13628195  0.1985221 ]
[ 0.4410594  -0.41056359  0.06392424 -0.41668609 -0.29017277 -0.13924215
  0.39882476 -0.17491829  0.18540026  0.36272728]
[ 0.09091438  0.26237428 -0.05515493  0.6389749   0.22175486  0.08626071
 -0.5020955  -0.38060957  0.13628195  0.1985221 ]
[ 0.4410594  -0.41056359  0.06392424 -0.41668609 -0.29017277 -0.13924215
  0.39882476 -0.17491829  0.18540026  0.36272728]
[ 0.09091438  0.26237428 -0.05515493  0.6389749   0.22175486  0.08626071
 -0.5020955  -0.38060957  0.13628195  0.1985221 ]
[ 0.4410594  -0.41056359  0.06392424 -0.41668609 -0.29017277 -0.13924215
  0.39882476 -0.17491829  0.18540026  0.36272728]


In [9]:
# Generate Embeddings
MODEL_NAME = 'all-MiniLM-L6-v2'
BATCH_SIZE = 64

device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

# Load data
df = df.drop_duplicates()
feature_cols = df.columns.drop('loan_outcome')

In [10]:
X_full = df.drop(columns=['loan_outcome'])
y_full = df['loan_outcome']

In [13]:
class ColumnarHybridEmbedder:
    def __init__(self, text_cols, numerical_cols, text_model_name='all-MiniLM-L6-v2'):
        self.text_cols = sorted(text_cols)
        self.numerical_cols = sorted(numerical_cols)
        self.feature_order = self.text_cols + self.numerical_cols
        
        self.model_name = text_model_name
        device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
        print(f"Using device: {device}")
        self.text_model = SentenceTransformer(self.model_name, device=device)
        self.numerical_embedders = {}
        self.feature_map = {}
        self.numerical_embedding_dim = 0

    def fit(self, df, numerical_embedding_dim=32):
        print("Fitting embedders and building feature map...")
        self.numerical_embedding_dim = numerical_embedding_dim

        for col in self.numerical_cols:
            numeric_col = pd.to_numeric(df[col], errors='coerce')
            min_val = numeric_col.min()
            max_val = numeric_col.max()
            print(f"bounds for {col}: {min_val} to {max_val}")
            self.numerical_embedders[col] = DICE(d=numerical_embedding_dim,
                                     min_bound=min_val,
                                     max_bound=max_val)
        
        current_index = 0
        text_dim = self.text_model.get_sentence_embedding_dimension()
        for col in self.feature_order:
            start_index = current_index
            if col in self.text_cols:
                end_index = start_index + text_dim
                current_index += text_dim
            elif col in self.numerical_cols:
                end_index = start_index + self.numerical_embedding_dim
                current_index += self.numerical_embedding_dim
            self.feature_map[col] = (start_index, end_index)
            
        print("Fit complete.")
        return self

    def transform(self, df, output_name, batch_size=64):
        print(f"Generating embeddings for {len(df)} rows...")
        
        # 1. Calculate the final shape of the array beforehand
        num_rows = len(df)
        text_dim = self.text_model.get_sentence_embedding_dimension()
        total_dims = (len(self.text_cols) * text_dim) + (len(self.numerical_cols) * self.numerical_embedding_dim)
        final_shape = (num_rows, total_dims)
        
        # 2. Create the memory-mapped file on disk
        output_path = f'{PROCESSED_DATA_DIR}/{output_name}.npy'
        print(f"Creating memory-mapped file at: {output_path} with shape {final_shape}")
        final_embeddings = np.memmap(output_path, dtype='float32', mode='w+', shape=final_shape)

        # 3. Fill the memmap array column by column
        for col in tqdm(self.feature_order, desc="Embedding and saving columns"):
            start_idx, end_idx = self.feature_map[col] # Get indices from the map
            
            if col in self.text_cols:
                sentences = df[col].astype(str).tolist()
                embs = self.text_model.encode(
                    sentences, batch_size=batch_size, show_progress_bar=False, normalize_embeddings=True, convert_to_numpy=True
                )
                # Assign directly to the slice in the file on disk
                final_embeddings[:, start_idx:end_idx] = embs
            
            elif col in self.numerical_cols:
                # This part can be optimized significantly by avoiding the Python loop
                embedder = self.numerical_embedders[col]
                
                # Vectorized approach to handle missing values
                vals = pd.to_numeric(df[col], errors='coerce').values
                is_nan = np.isnan(vals)
                
                # Create embeddings for valid numbers
                valid_embs = np.array([embedder.make_dice(v) for v in vals[~is_nan]])
                
                # Create a temporary full array and fill it
                col_embs = np.zeros((num_rows, self.numerical_embedding_dim), dtype='float32')
                col_embs[~is_nan] = valid_embs
                
                # Assign the entire column's embeddings at once
                final_embeddings[:, start_idx:end_idx] = col_embs

            # 4. Flush changes to disk and clean up memory
            final_embeddings.flush()
            gc.collect()

        print("Transformation complete. Embeddings saved.")
        
        # The embeddings are already saved, so we only need to save the labels
        labels = y_full.astype(np.int8).values # Make sure y_full is defined
        np.save(f'{PROCESSED_DATA_DIR}/{output_name}_labels.npy', labels)
        
        return "Embedding generation complete. Files saved."

In [None]:
# Create hybrid embeddings
hybrid_embedder = ColumnarHybridEmbedder(text_cols=string_cols, numerical_cols=numerical_cols)
hybrid_embedder.fit(df, numerical_embedding_dim=32) # Using d=32 for DICE
hybrid_embedder.transform(df, output_name='hybrid_embeddings');

Using device: mps
Fitting embedders and building feature map...
bounds for avg_cur_bal: 0.0 to 958084.0
bounds for dti: -1.0 to 999.0
bounds for fico_range_high: 614.0 to 850.0
bounds for int_rate: 5.31 to 30.99
bounds for loan_amnt: 500 to 40000
bounds for mort_acc: 0.0 to 51.0
bounds for num_op_rev_tl: 0.0 to 83.0
bounds for revol_util: 0.0 to 892.3
Fit complete.
Generating embeddings for 1369565 rows...
Creating memory-mapped file at: /Users/pierce.bucknerwolfso/Desktop/embeddings_paper/embeddings-service-exploration/src/data/processed/hybrid_embeddings.npy with shape (1369565, 2944)


Embedding and saving columns:  27%|██▋       | 4/15 [20:21<57:14, 312.22s/it]  

In [None]:
print("\n--- Feature Map ---")
f_map = {}
for feature, (start, end) in hybrid_embedder.feature_map.items():
    f_map[feature] = (start, end)
print(f_map)


--- Feature Map ---
{'addr_state': (0, 384), 'avg_cur_bal': (384, 768), 'dti': (768, 1152), 'earliest_cr_line': (1152, 1536), 'emp_length': (1536, 1920), 'emp_title': (1920, 2304), 'fico_range_high': (2304, 2688), 'int_rate': (2688, 3072), 'loan_amnt': (3072, 3456), 'mort_acc': (3456, 3840), 'num_op_rev_tl': (3840, 4224), 'revol_util': (4224, 4608), 'sub_grade': (4608, 4992), 'title': (4992, 5376), 'zip_code': (5376, 5760)}


In [None]:
# Create string only embeddings
hybrid_embedder = ColumnarHybridEmbedder(text_cols=string_cols + numerical_cols, numerical_cols=[])
hybrid_embedder.fit(df, numerical_embedding_dim=32) # Using d=32 for DICE
hybrid_embedder.transform(df, output_name='text_only_embeddings');


In [None]:
print("\n--- Feature Map ---")
f_map = {}
for feature, (start, end) in hybrid_embedder.feature_map.items():
    f_map[feature] = (start, end)
print(f_map)