In [2]:
import datasets
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
import pandas as pd
import yfinance as yf
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from datasets import load_dataset, concatenate_datasets
import json
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import psutil
import os
process = psutil.Process(os.getpid())
print(f"Memory usage before: {process.memory_info().rss / 1024 ** 2:.2f} MB")

Memory usage before: 615.29 MB


In [4]:
ds_compinfo = load_dataset("Mateusz1017/annual_reports_tokenized_llama3_logged_returns_no_null_returns_and_incomplete_descriptions_24k")
df_compinfo = ds_compinfo['train'].to_pandas()

In [5]:
df_compinfo = df_compinfo.dropna(subset=['sic_code'])
df_compinfo = df_compinfo.reset_index(drop=True)

In [6]:
# Define a function to classify SIC codes into industries based on the first two digits
def classify_sic(sic_code):
    # Extract the first two digits of the SIC code
    first_two_digits = int(str(sic_code)[:2])
    
    # Map to industry categories
    if 1 <= first_two_digits <= 9:
        return 'Agriculture, Forestry, And Fishing'
    elif 10 <= first_two_digits <= 14:
        return 'Mining'
    elif 15 <= first_two_digits <= 17:
        return 'Construction'
    elif 20 <= first_two_digits <= 39:
        return 'Manufacturing'
    elif 40 <= first_two_digits <= 49:
        return 'Transportation, Communications, Electric, Gas, And Sanitary Services'
    elif 50 <= first_two_digits <= 51:
        return 'Wholesale Trade'
    elif 52 <= first_two_digits <= 59:
        return 'Retail Trade'
    elif 60 <= first_two_digits <= 67:
        return 'Finance, Insurance, And Real Estate'
    elif 70 <= first_two_digits <= 89:
        return 'Services'
    elif 90 <= first_two_digits <= 99:
        return 'Public Administration'
    else:
        return 'Unknown'

# Apply the classification to the SIC codes in the dataset
df_compinfo['industry_classification'] = df_compinfo['sic_code'].apply(classify_sic)
df_compinfo

Unnamed: 0,cik,year,section_1,company_name,sic_code,input_ids,ticker,returns,logged_monthly_returns_matrix,__index_level_0__,input_ids_length,industry_classification
0,75252,1993,"Item 1. Business\nOwens & Minor, Inc. (the ""Co...",OWENS & MINOR INC/VA/,5047,"[128000, 1256, 220, 16, 13, 8184, 198, 46, 86,...",[OMI],0.569779,"[-0.06939241136032089, 0.07496338743638563, 0....",0,2558,Wholesale Trade
1,40533,1993,ITEM 1. BUSINESS\nINTRODUCTION\nGeneral Dynami...,GENERAL DYNAMICS CORP,3730,"[128000, 12236, 220, 16, 13, 27693, 198, 3301,...",[GD],0.272757,"[0.03622371368058769, -0.181048534708107, -0.0...",3,3445,Manufacturing
2,91576,1993,"ITEM 1. BUSINESS\nOVERVIEW\nOn March 1, 1994, ...",KEYCORP /NEW/,6021,"[128000, 12236, 220, 16, 13, 27693, 198, 50205...","[KEY, KEY-PJ, KEY-PK, KEY-PI, KEY-PL]",0.029588,"[0.03969967097211506, 0.06939185891396345, -0....",5,6640,"Finance, Insurance, And Real Estate"
3,7536,1993,"Item 1. Business.\nArrow Electronics, Inc. (th...",ARROW ELECTRONICS INC,5065,"[128000, 1256, 220, 16, 13, 8184, 627, 27003, ...",[ARW],0.518182,"[0.004008021397538868, 0.01587334915629016, 0....",6,2464,Wholesale Trade
4,10456,1993,ITEM 1. BUSINESS.\n(a) GENERAL DEVELOPMENT OF ...,BAXTER INTERNATIONAL INC,3841,"[128000, 12236, 220, 16, 13, 27693, 627, 2948,...",[BAX],-0.206450,"[-0.04613021404865702, 0.029600784670023767, -...",7,4006,Manufacturing
...,...,...,...,...,...,...,...,...,...,...,...,...
27717,74046,2019,ITEM 1 - BUSINESS\nOVERVIEW OF BUSINESS\nIn 19...,Oil-Dri Corp of America,3990,"[128000, 12236, 220, 16, 482, 27693, 198, 5020...",[ODC],0.360402,"[0.09380163057314017, 0.07467147786746844, 0.0...",61772,2419,Manufacturing
27718,1750,2019,ITEM 1. BUSINESS\nGeneral\nAAR CORP. and its s...,AAR CORP,3720,"[128000, 12236, 220, 16, 13, 27693, 198, 15777...",[AIR],0.213843,"[-0.028972527271497482, -0.11658608430267486, ...",61782,3345,Manufacturing
27719,80420,2019,"Item 1. Business\nOverview\nPowell Industries,...",POWELL INDUSTRIES INC,3613,"[128000, 1256, 220, 16, 13, 8184, 198, 42144, ...",[POWL],0.883976,"[0.13955951729443583, -0.1815976884770055, 0.0...",61785,2292,Manufacturing
27720,934796,2019,ITEM 1.BUSINESS\nOverview of Our Business\nOur...,NETWORK CN INC,7310,"[128000, 12236, 220, 16, 1823, 2078, 24221, 19...",[NWCN],-0.775000,"[-0.3930425704489719, 0.0, 0.0, 0.287682028300...",61786,4744,Services


In [7]:
# df_compinfo = df_compinfo[["cik", "year", "sic_code", "ticker", "__index_level_0__"]]

In [8]:
df_compinfo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27722 entries, 0 to 27721
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   cik                            27722 non-null  object 
 1   year                           27722 non-null  object 
 2   section_1                      27722 non-null  object 
 3   company_name                   27722 non-null  object 
 4   sic_code                       27722 non-null  object 
 5   input_ids                      27722 non-null  object 
 6   ticker                         27722 non-null  object 
 7   returns                        27709 non-null  float64
 8   logged_monthly_returns_matrix  27722 non-null  object 
 9   __index_level_0__              27722 non-null  int64  
 10  input_ids_length               27722 non-null  int64  
 11  industry_classification        27722 non-null  object 
dtypes: float64(1), int64(2), object(9)
memory usag

In [18]:
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# Load BERT tokenizer and model
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_model = AutoModel.from_pretrained("bert-base-uncased")

# Function to get embeddings from BERT
def get_bert_embedding(text):
    # Tokenize the text
    tokens = bert_tokenizer(text, truncation=True, padding="max_length", max_length=512, return_tensors="pt")
    
    with torch.no_grad():  # No gradients needed for inference
        outputs = bert_model(**tokens)
        # outputs.last_hidden_state: [batch_size, seq_length, hidden_size]
        
        # Use the CLS token embedding (first token)
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        
        # Alternatively, use the mean of all token embeddings
        # mean_embedding = outputs.last_hidden_state.mean(dim=1)
    
    return cls_embedding.squeeze().tolist()  # Return as a Python list

# Apply the embedding function to the DataFrame
tqdm.pandas(desc="Generating BERT Embeddings")
df_compinfo["BERT-embedding"] = df_compinfo["section_1"].progress_apply(get_bert_embedding)


Generating BERT Embeddings: 100%|██████████| 27722/27722 [47:46<00:00,  9.67it/s]


In [40]:
bert_model.config.max_position_embeddings

512

In [21]:
# df_compinfo.to_pickle("BERT-embedded_csv.pkl")

In [22]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Load SBERT model
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

# Function to get SBERT embeddings
def get_sbert_embedding(text):
    return sbert_model.encode(text, convert_to_tensor=True).tolist()

# Apply the embedding function to the DataFrame
tqdm.pandas(desc="Generating SBERT Embeddings")
df_compinfo["SBERT-embedding"] = df_compinfo["section_1"].progress_apply(get_sbert_embedding)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [51]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Load SBERT model
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")


In [43]:
sbert_model.max_seq_length

256

In [10]:
sbert_model[0].auto_model.config.max_position_embeddings

512

In [58]:
df_compinfo["year"] = df_compinfo["year"].astype(int)
df_compinfo = df_compinfo[~df_compinfo['year'].isin([1993, 1994, 1995])]
df_compinfo.reset_index(drop=True)
df_compinfo = df_compinfo.sort_values(by=['year'], ascending=True)

In [59]:
df_compinfo

Unnamed: 0,cik,year,section_1,company_name,sic_code,input_ids,ticker,returns,logged_monthly_returns_matrix,__index_level_0__,input_ids_length,industry_classification,BERT-embedding,SBERT-embedding
975,862861,1996,ITEM 1. BUSINESS\nGENERAL\nAppliance Recycling...,JanOne Inc.,5700,"[128000, 12236, 220, 16, 13, 27693, 198, 95836...",[ALTS],-0.864865,"[-0.2876819162165623, 0.3184534375856794, 0.08...",1538,4465,Retail Trade,"[-1.0645477771759033, 0.23505006730556488, -0....","[-0.07066744565963745, -0.03502631559967995, 0..."
992,109380,1996,ITEM 1. BUSINESS\nZions Bancorporation (the Pa...,"ZIONS BANCORPORATION, NATIONAL ASSOCIATION /UT/",6021,"[128000, 12236, 220, 16, 13, 27693, 198, 57, 9...","[ZION, ZIONP, ZIONL, ZIONO]",0.333699,"[-0.036610412591151206, -0.041527657379601665,...",1563,7232,"Finance, Insurance, And Real Estate","[-0.6020901203155518, -0.01382492296397686, -0...","[0.055379994213581085, -0.0656338632106781, -0..."
982,102037,1996,ITEM 1. BUSINESS\nA. The Company\nUniversal Co...,UNIVERSAL CORP /VA/,5150,"[128000, 12236, 220, 16, 13, 27693, 198, 32, 1...",[UVV],0.379314,"[0.15015591718026108, -0.06733310337040097, -0...",1547,4012,Wholesale Trade,"[-0.8490127921104431, -0.3367622494697571, -0....","[0.024468760937452316, -0.03050614334642887, -..."
990,710752,1996,ITEM 1. BUSINESS.\nDESCRIPTION OF THE TRUST\nS...,SABINE ROYALTY TRUST,6792,"[128000, 12236, 220, 16, 13, 27693, 627, 46533...",[SBR],0.771184,"[0.01415349005774118, 0.08221469119141876, 0.1...",1561,7794,"Finance, Insurance, And Real Estate","[-0.5184245109558105, -0.05737003684043884, -0...","[-0.04296134039759636, -0.03728189691901207, 0..."
989,72333,1996,Item 1. Business. - ------------------\nNordst...,NORDSTROM INC,5651,"[128000, 1256, 220, 16, 13, 8184, 13, 482, 147...",[JWN],-0.118097,"[0.13948494150378526, 0.07371579519008523, 0.0...",1559,566,Retail Trade,"[-0.7209739089012146, 0.13205184042453766, -0....","[0.04498894512653351, -0.01455921120941639, -0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3676,1310630,2020,ITEM 1. BUSINESS\nChina Foods Holdings Ltd. (t...,China Foods Holdings Ltd.,2833,"[128000, 12236, 220, 16, 13, 27693, 198, 23078...",[CFOO],0.088000,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.08434115895221751,...",6865,1750,Manufacturing,"[-0.5753610134124756, -0.1743033528327942, -0....","[-0.037535540759563446, -0.05656890198588371, ..."
3661,798783,2020,ITEM 1.\nBusiness\nGeneral\nWe are a real esta...,UNIVERSAL HEALTH REALTY INCOME TRUST,6798,"[128000, 12236, 220, 16, 627, 23562, 198, 1577...",[UHT],-0.418372,"[-0.13549047531544484, -0.06629779212256275, 0...",6830,7235,"Finance, Insurance, And Real Estate","[-0.6004636883735657, -0.09270477294921875, 0....","[-0.015440104529261589, -0.05609987676143646, ..."
3646,914122,2020,Item 1. BUSINESS\nPerma-Pipe International Hol...,"Perma-Pipe International Holdings, Inc.",3564,"[128000, 1256, 220, 16, 13, 27693, 198, 3976, ...",[PPIH],-0.339479,"[-0.033787020441788815, -0.3389954443959665, -...",6784,2440,Manufacturing,"[-0.48827219009399414, -0.09819140285253525, -...","[-0.049300942569971085, -0.0038823760114610195..."
3727,887596,2020,ITEM 1. BUSINESS\nGeneral\nThe Cheesecake Fact...,CHEESECAKE FACTORY INC,5812,"[128000, 12236, 220, 16, 13, 27693, 198, 15777...",[CAKE],-0.038025,"[-0.07486943363771757, -0.7352798500683871, 0....",6992,613,Retail Trade,"[-0.5994244813919067, -0.214384064078331, -0.0...","[0.03250890225172043, -0.10284492373466492, 0...."


In [35]:
# df_compinfo.to_pickle("Embedded_part2.pkl")
df_compinfo = pd.read_pickle("Embedded_part2.pkl")

In [39]:
df_compinfo["year"]

1996

In [41]:
from huggingface_hub import login
import pandas as pd
import glob

# Optionally, convert to a Hugging Face Dataset
from datasets import Dataset
final_dataset = Dataset.from_pandas(df_compinfo, preserve_index=False)
final_dataset

Dataset({
    features: ['cik', 'year', 'section_1', 'company_name', 'sic_code', 'input_ids', 'ticker', 'returns', 'logged_monthly_returns_matrix', '__index_level_0__', 'input_ids_length', 'industry_classification', 'BERT-embedding', 'SBERT-embedding'],
    num_rows: 26769
})

In [None]:

# login(token="") 
# final_dataset.push_to_hub("v1ctor10/BERT_SBERT_embeddings_SAE")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /users/shaox7/.cache/huggingface/token
Login successful



[A
[A
[A
[A
[A


[A
[A
[A
[A
[A


[A
[A
[A
[A
[A


[A
[A
[A
[A
[A


[A
[A
[A
[A
[A


[A
[A
[A
[A
[A


ploading the dataset shards: 100%|██████████| 6/6 [00:38<00:00,  6.48s/it]ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/v1ctor10/BERT_SBERT_embeddings_SAE/commit/b99e35296729878ee4b1242692ef94180badf174', commit_message='Upload dataset', commit_description='', oid='b99e35296729878ee4b1242692ef94180badf174', pr_url=None, pr_revision=None, pr_num=None)

In [50]:
np.array(df_compinfo["SBERT-embedding"].iloc[3]).shape

(384,)

# To 1536 tokens (as per blackrock paper https://arxiv.org/pdf/2308.08031)

In [12]:
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import pandas as pd
import numpy as np

# ====================
# 1. Configuration
# ====================

# Define constants
MAX_TOTAL_TOKENS = 1536       # Maximum total tokens per document
CHUNK_SIZE = 512              # Maximum tokens per chunk
MODEL_NAME = "bert-base-uncased"  # Pre-trained BERT model

# ====================
# 2. Setup Device
# ====================

# Utilize GPU if available for faster computations
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# ====================
# 3. Load Tokenizer and Model
# ====================

# Load BERT tokenizer and model
bert_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
bert_model = AutoModel.from_pretrained(MODEL_NAME)
bert_model.to(device)
bert_model.eval()  # Set model to evaluation mode

# ====================
# 4. Define Helper Functions
# ====================

def split_text_into_chunks(text, tokenizer, max_length=CHUNK_SIZE, max_total_tokens=MAX_TOTAL_TOKENS):
    """
    Splits the input text into chunks of tokens with a maximum length, ensuring total tokens do not exceed max_total_tokens.

    Args:
        text (str): The input document as a string.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer to use.
        max_length (int): Maximum number of tokens per chunk.
        max_total_tokens (int): Maximum total tokens per document.

    Returns:
        List[str]: A list of text chunks.
    """
    if not isinstance(text, str):
        return []
    
    # Tokenize the entire text without adding special tokens
    tokens = tokenizer.encode(text, add_special_tokens=False)
    
    # Limit to the first max_total_tokens
    tokens = tokens[:max_total_tokens]
    
    # Split tokens into chunks
    chunks = []
    for i in range(0, len(tokens), max_length):
        chunk_tokens = tokens[i:i + max_length]
        # Decode tokens back to string
        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
        chunks.append(chunk_text)
    
    return chunks

def get_bert_embedding_chunk(chunk_text, tokenizer, model, device):
    """
    Generates the embedding for a single text chunk using BERT.

    Args:
        chunk_text (str): The text chunk.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer to use.
        model (transformers.PreTrainedModel): The BERT model.
        device (torch.device): The device to perform computations on.

    Returns:
        torch.Tensor: The embedding vector for the chunk.
    """
    if not chunk_text:
        # Return a zero vector if chunk_text is empty
        return torch.zeros(model.config.hidden_size).to(device)
    
    # Tokenize the chunk with padding and truncation
    tokens = tokenizer(
        chunk_text,
        truncation=True,
        padding='max_length',
        max_length=CHUNK_SIZE,
        return_tensors="pt"
    )
    
    # Move tensors to the specified device
    tokens = {k: v.to(device) for k, v in tokens.items()}
    
    with torch.no_grad():
        outputs = model(**tokens)
        # Use the CLS token embedding (first token)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # Shape: [1, hidden_size]
    
    return cls_embedding.squeeze()  # Shape: [hidden_size]

def get_document_embedding(text, tokenizer, model, device, max_length=CHUNK_SIZE, max_total_tokens=MAX_TOTAL_TOKENS):
    """
    Generates a single embedding for the entire document by averaging embeddings of its chunks.

    Args:
        text (str): The input document as a string.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer to use.
        model (transformers.PreTrainedModel): The BERT model.
        device (torch.device): The device to perform computations on.
        max_length (int): Maximum number of tokens per chunk.
        max_total_tokens (int): Maximum total tokens per document.

    Returns:
        List[float]: The final document embedding as a list of floats.
    """
    # Split the text into chunks
    chunks = split_text_into_chunks(text, tokenizer, max_length=max_length, max_total_tokens=max_total_tokens)
    
    if not chunks:
        # If the document is empty or cannot be tokenized, return a zero vector
        return [0.0] * model.config.hidden_size
    
    # Initialize a tensor to accumulate embeddings
    embedding_sum = torch.zeros(model.config.hidden_size).to(device)
    
    for chunk in chunks:
        # Get embedding for the chunk
        chunk_embedding = get_bert_embedding_chunk(chunk, tokenizer, model, device)
        embedding_sum += chunk_embedding
    
    # Calculate the average embedding
    final_embedding = embedding_sum / len(chunks)
    
    return final_embedding.cpu().tolist()  # Move to CPU and convert to list

def generate_embeddings(df, text_column, embedding_column, tokenizer, model, device, max_length=CHUNK_SIZE, max_total_tokens=MAX_TOTAL_TOKENS):
    """
    Applies the embedding generation process to each document in the DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame containing documents.
        text_column (str): The name of the column containing text documents.
        embedding_column (str): The name of the column to store embeddings.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer to use.
        model (transformers.PreTrainedModel): The BERT model.
        device (torch.device): The device to perform computations on.
        max_length (int): Maximum number of tokens per chunk.
        max_total_tokens (int): Maximum total tokens per document.

    Returns:
        pd.DataFrame: The DataFrame with added embeddings.
    """
    # Initialize the embedding column with empty lists
    df[embedding_column] = None
    
    # Iterate over the DataFrame with a progress bar
    for idx, row in tqdm(df.iterrows(), total=df.shape[0], desc="Generating BERT Embeddings"):
        text = row[text_column]
        if pd.isna(text) or not isinstance(text, str):
            # Handle missing or non-string entries
            df.at[idx, embedding_column] = None
            continue
        
        # Generate the document embedding
        embedding = get_document_embedding(text, tokenizer, model, device, max_length=max_length, max_total_tokens=max_total_tokens)
        df.at[idx, embedding_column] = embedding
    
    return df

# ====================
# 5.2. Generate Embeddings
# ====================

df_compinfo = generate_embeddings(
    df=df_compinfo,
    text_column="section_1",
    embedding_column="BERT-embedding",
    tokenizer=bert_tokenizer,
    model=bert_model,
    device=device,
    max_length=CHUNK_SIZE,
    max_total_tokens=MAX_TOTAL_TOKENS
)

Using device: cpu


Token indices sequence length is longer than the specified maximum sequence length for this model (2558 > 512). Running this sequence through the model will result in indexing errors

enerating BERT Embeddings: 100%|██████████| 27722/27722 [2:10:08<00:00,  3.55it/s]  

In [13]:
import pickle
df_compinfo.to_pickle('BERT-embedded(1536).pkl')

# SBERT:

In [14]:
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import pandas as pd
import numpy as np

# ====================
# 1. Configuration
# ====================

# Define constants
MAX_TOTAL_TOKENS = 1536       # Maximum total tokens per document
CHUNK_SIZE = 512              # Maximum tokens per chunk
BERT_MODEL_NAME = "bert-base-uncased"      # Pre-trained BERT model
SBERT_MODEL_NAME = "all-MiniLM-L6-v2"      # Pre-trained SBERT model

# ====================
# 2. Setup Device
# ====================

# Utilize GPU if available for faster computations
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# ====================
# 3. Load Tokenizer and Models
# ====================

# Load BERT tokenizer and model
bert_tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)
bert_model = AutoModel.from_pretrained(BERT_MODEL_NAME)
bert_model.to(device)
bert_model.eval()  # Set model to evaluation mode

# Load SBERT model
sbert_model = SentenceTransformer(SBERT_MODEL_NAME)
sbert_model.to(device)

# ====================
# 4. Define Helper Functions
# ====================

def split_text_into_chunks(text, tokenizer, max_length=CHUNK_SIZE, max_total_tokens=MAX_TOTAL_TOKENS):
    """
    Splits the input text into chunks of tokens with a maximum length, ensuring total tokens do not exceed max_total_tokens.

    Args:
        text (str): The input document as a string.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer to use.
        max_length (int): Maximum number of tokens per chunk.
        max_total_tokens (int): Maximum total tokens per document.

    Returns:
        List[str]: A list of text chunks.
    """
    if not isinstance(text, str):
        return []
    
    # Tokenize the entire text without adding special tokens
    tokens = tokenizer.encode(text, add_special_tokens=False)
    
    # Limit to the first max_total_tokens
    tokens = tokens[:max_total_tokens]
    
    # Split tokens into chunks
    chunks = []
    for i in range(0, len(tokens), max_length):
        chunk_tokens = tokens[i:i + max_length]
        # Decode tokens back to string
        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
        chunks.append(chunk_text)
    
    return chunks

def get_bert_embedding_chunk(chunk_text, tokenizer, model, device):
    """
    Generates the embedding for a single text chunk using BERT.

    Args:
        chunk_text (str): The text chunk.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer to use.
        model (transformers.PreTrainedModel): The BERT model.
        device (torch.device): The device to perform computations on.

    Returns:
        torch.Tensor: The embedding vector for the chunk.
    """
    if not chunk_text:
        # Return a zero vector if chunk_text is empty
        return torch.zeros(model.config.hidden_size).to(device)
    
    # Tokenize the chunk with padding and truncation
    tokens = tokenizer(
        chunk_text,
        truncation=True,
        padding='max_length',
        max_length=CHUNK_SIZE,
        return_tensors="pt"
    )
    
    # Move tensors to the specified device
    tokens = {k: v.to(device) for k, v in tokens.items()}
    
    with torch.no_grad():
        outputs = model(**tokens)
        # Use the CLS token embedding (first token)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # Shape: [1, hidden_size]
    
    return cls_embedding.squeeze()  # Shape: [hidden_size]

def get_document_embedding(text, tokenizer, bert_model, device, max_length=CHUNK_SIZE, max_total_tokens=MAX_TOTAL_TOKENS):
    """
    Generates a single embedding for the entire document by averaging embeddings of its BERT chunks.

    Args:
        text (str): The input document as a string.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer to use.
        bert_model (transformers.PreTrainedModel): The BERT model.
        device (torch.device): The device to perform computations on.
        max_length (int): Maximum number of tokens per chunk.
        max_total_tokens (int): Maximum total tokens per document.

    Returns:
        List[float]: The final document embedding as a list of floats.
    """
    # Split the text into chunks
    chunks = split_text_into_chunks(text, tokenizer, max_length=max_length, max_total_tokens=max_total_tokens)
    
    if not chunks:
        # If the document is empty or cannot be tokenized, return a zero vector
        return [0.0] * bert_model.config.hidden_size
    
    # Initialize a tensor to accumulate embeddings
    embedding_sum = torch.zeros(bert_model.config.hidden_size).to(device)
    
    for chunk in chunks:
        # Get embedding for the chunk
        chunk_embedding = get_bert_embedding_chunk(chunk, tokenizer, bert_model, device)
        embedding_sum += chunk_embedding
    
    # Calculate the average embedding
    final_embedding = embedding_sum / len(chunks)
    
    return final_embedding.cpu().tolist()  # Move to CPU and convert to list

def get_sbert_embedding(text, model):
    """
    Generates the SBERT embedding for a given text.

    Args:
        text (str): The input document as a string.
        model (SentenceTransformer): The SBERT model.

    Returns:
        List[float]: The SBERT embedding as a list of floats.
    """
    if not isinstance(text, str) or pd.isna(text):
        return None
    
    # Split the text into chunks
    chunks = split_text_into_chunks(text, bert_tokenizer, max_length=CHUNK_SIZE, max_total_tokens=MAX_TOTAL_TOKENS)
    
    if not chunks:
        return [0.0] * model.get_sentence_embedding_dimension()
    
    # Generate embeddings for all chunks
    embeddings = model.encode(chunks, convert_to_tensor=True)
    
    # Calculate the average embedding
    final_embedding = torch.mean(embeddings, dim=0)
    
    return final_embedding.cpu().tolist()

def generate_bert_embeddings(df, text_column, embedding_column, tokenizer, model, device, max_length=CHUNK_SIZE, max_total_tokens=MAX_TOTAL_TOKENS):
    """
    Applies the BERT embedding generation process to each document in the DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame containing documents.
        text_column (str): The name of the column containing text documents.
        embedding_column (str): The name of the column to store BERT embeddings.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer to use.
        model (transformers.PreTrainedModel): The BERT model.
        device (torch.device): The device to perform computations on.
        max_length (int): Maximum number of tokens per chunk.
        max_total_tokens (int): Maximum total tokens per document.

    Returns:
        pd.DataFrame: The DataFrame with added BERT embeddings.
    """
    # Initialize the embedding column with empty lists
    df[embedding_column] = None
    
    # Iterate over the DataFrame with a progress bar
    for idx, row in tqdm(df.iterrows(), total=df.shape[0], desc="Generating BERT Embeddings"):
        text = row[text_column]
        if pd.isna(text) or not isinstance(text, str):
            # Handle missing or non-string entries
            df.at[idx, embedding_column] = None
            continue
        
        # Generate the document embedding
        embedding = get_document_embedding(text, tokenizer, model, device, max_length=max_length, max_total_tokens=max_total_tokens)
        df.at[idx, embedding_column] = embedding
    
    return df

def generate_sbert_embeddings(df, text_column, embedding_column, model, max_length=CHUNK_SIZE, max_total_tokens=MAX_TOTAL_TOKENS):
    """
    Applies the SBERT embedding generation process to each document in the DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame containing documents.
        text_column (str): The name of the column containing text documents.
        embedding_column (str): The name of the column to store SBERT embeddings.
        model (SentenceTransformer): The SBERT model.
        max_length (int): Maximum number of tokens per chunk.
        max_total_tokens (int): Maximum total tokens per document.

    Returns:
        pd.DataFrame: The DataFrame with added SBERT embeddings.
    """
    # Initialize the embedding column with empty lists
    df[embedding_column] = None
    
    # Iterate over the DataFrame with a progress bar
    for idx, row in tqdm(df.iterrows(), total=df.shape[0], desc="Generating SBERT Embeddings"):
        text = row[text_column]
        if pd.isna(text) or not isinstance(text, str):
            # Handle missing or non-string entries
            df.at[idx, embedding_column] = None
            continue
        
        # Generate the document embedding
        embedding = get_sbert_embedding(text, model)
        df.at[idx, embedding_column] = embedding
    
    return df


# ====================
# 5.3. Generate SBERT Embeddings
# ====================

print("\nStarting SBERT Embedding Generation...")
df_compinfo = generate_sbert_embeddings(
    df=df_compinfo,
    text_column="section_1",
    embedding_column="SBERT-embedding",
    model=sbert_model,
    max_length=CHUNK_SIZE,
    max_total_tokens=MAX_TOTAL_TOKENS
)
print("SBERT Embedding Generation Completed.")

Using device: cpu

Starting SBERT Embedding Generation...


Token indices sequence length is longer than the specified maximum sequence length for this model (2558 > 512). Running this sequence through the model will result in indexing errors
Generating SBERT Embeddings: 100%|██████████| 27722/27722 [20:54<00:00, 22.10it/s]

SBERT Embedding Generation Completed.





In [15]:
import pickle
df_compinfo.to_pickle('SBERT_embedded(1536).pkl')

In [20]:
len(df_compinfo["SBERT-embedding"].iloc[1])

384

# Get PALM-GECKO (below code ran on google cloud)

In [None]:
from __future__ import annotations
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
from tqdm import tqdm


model = TextEmbeddingModel.from_pretrained("textembedding-gecko@003")

# Function to get PALM-GECKO embeddings
def get_gecko(text):
  tasks = "DEFAULT"
  inputs = [TextEmbeddingInput(text, tasks)]

  embeddings = model.get_embeddings(inputs)
  return [embedding.values for embedding in embeddings][0] # should be 1-D list

# # Apply the embedding function to the DataFrame
tqdm.pandas(desc="Generating PALM-gecko Embeddings")
df_compinfo["PALMGECKO-embedding"] = df_compinfo["section_1"].progress_apply(get_gecko)