In [None]:
from transformers import AutoTokenizer, AutoModel
import numpy as np

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModel.from_pretrained("ProsusAI/finbert")

# Example document with more than 512 tokens
document = "Long document with more than 512 tokens..."

# Split the document into chunks of 512 tokens
max_length = 512
document_chunks = [document[i:i+max_length] for i in range(0, len(document), max_length)]

# Create embeddings for each chunk
document_embeddings = []
for chunk in document_chunks:
    encoded_input = tokenizer(chunk, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
        embeddings = model_output.last_hidden_state[:, 0, :].numpy()
        document_embeddings.append(embeddings)

# Aggregate embeddings to create an overall embedding for the document
document_embedding = np.mean(document_embeddings, axis=0)


In [1]:
!pip install transformers
!pip install torch
!pip install annoy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, ht

In [6]:
import torch
from transformers import AutoTokenizer, AutoModel
from annoy import AnnoyIndex

# Load the model and tokenizer
model = AutoModel.from_pretrained("ProsusAI/finbert")
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

# Define the maximum length for document tokens
max_length = 512

# Define the number of trees for the Annoy index
num_trees = 10

# Define the dimensionality of the embeddings
embedding_size = 768

# Define the sliding window size and overlap
window_size = 256
overlap = 128

# Define the list to store the document embeddings
document_embeddings = []

# Define a function to create document embeddings with sliding windows
def create_document_embeddings(document):
    # Tokenize the document
    tokens = tokenizer.tokenize(document)
    # Create sliding windows of the document
    windows = [tokens[i:i+window_size] for i in range(0, len(tokens), window_size-overlap)]
    # Create document embeddings for each window
    embeddings = []
    for window in windows:
        # Encode the window with the tokenizer
        inputs = tokenizer(" ".join(window), max_length=max_length, padding='max_length', truncation=True, return_tensors='pt')
        # Pass the inputs through the model to get the embeddings
        with torch.no_grad():
            output = model(**inputs)
            embeddings.append(output.pooler_output.cpu().numpy()[0])
    # Concatenate the embeddings to get the document embedding
    document_embedding = np.concatenate(embeddings)
    return document_embedding

# Create the document embeddings
documents = ["your client can use the following funding tooltip","schwab offers three main resources"] # list of documents
for document in documents:
    document_embedding = create_document_embeddings(document)
    document_embeddings.append(document_embedding)

# Build the Annoy index
annoy_index = AnnoyIndex(embedding_size, 'angular')
for i in range(len(document_embeddings)):
    annoy_index.add_item(i, document_embeddings[i])
annoy_index.build(num_trees)

# Define a function to perform semantic search
 

Some weights of the model checkpoint at ProsusAI/finbert were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


True

In [7]:
import numpy as np
def semantic_search(query, k):
    # Encode the query with the tokenizer
    inputs = tokenizer(query, max_length=max_length, padding='max_length', truncation=True, return_tensors='pt')
    # Pass the inputs through the model to get the query embedding
    with torch.no_grad():
        query_embedding = model(**inputs).pooler_output.cpu().numpy()[0]
    # Use the Annoy index to retrieve the most similar documents
    indices, distances = annoy_index.get_nns_by_vector(query_embedding, k, include_distances=True)
    # Return the indices and distances
    return indices, distances


In [10]:
semantic_search("cient fund account after opening",2)

([0, 1], [0.31801414489746094, 0.37996208667755127])