In [None]:
##Problem Statement - Most of the RAG workflows sometimes do not provide context based information. They usually go off context while using RAG or if there are not able to retrieve the embeddings they just cipher off and might not give any information at all leaving the user unsatisfied.
## Solution - The inherent problem with this is that the embeddings stored in the vector database do not have context related to the document that is being processed itself. Example apple's filings could be stored in a VDB but the context stored in the vdb could still be "apple" as in fruit. Thus when the user
## retrieves the embedding apple, it might not related to the company leaving the user unsatisfied with the answer. Contextual embeddings and contextual BM25 deals with this.
## Moreover, these embeddings loose context if the chunk does not have a "connection" with the other chunk. Instead of a series of chunks.


##So what basically late chunking does is pdf -> embeddings -> chunks (with contextualized information)

In [9]:
!pip install anthropic
!pip install voyageai
!pip install cohere
!pip install elasticsearch
!pip install pandas
!pip install numpy



In [10]:
from google.colab import userdata
api_key_voyage=userdata.get('VOYAGE_API_KEY')
api_key=userdata.get('ANTHROPIC_API_KEY')
api_key_cohere=userdata.get('COHERE_API_KEY')

In [11]:
!pip install transformers==4.43.4



In [12]:
from transformers import AutoModel
from transformers import AutoTokenizer

# load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-en', trust_remote_code=True)
model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en', trust_remote_code=True)

In [13]:
##once the embeddings are stored, now we will need a tokenizer which will breakdown the entire corpus into chunks
import requests

def chunk_by_tokenizer_api(input_text: str, tokenizer: callable):
    # Define the API endpoint and payload
    url = 'https://tokenize.jina.ai/'
    payload = {
        "content": input_text,
        "return_chunks": "true",
        "max_chunk_length": "1000"
    }

    # Make the API request
    response = requests.post(url, json=payload)
    response_data = response.json()

    # Extract chunks and positions from the response
    chunks = response_data.get("chunks", [])
    chunk_positions = response_data.get("chunk_positions", [])

    # Adjust chunk positions to match the input format
    span_annotations = [(start, end) for start, end in chunk_positions]

    return chunks, span_annotations


In [14]:
# Function to extract text from a PDF file
input_text="In the bustling city of Tokyo, located in Japan, a remarkable event took place that would leave a lasting impression on its residents. The city, known for its vibrant culture and advanced technology, witnessed the opening of a groundbreaking new museum dedicated to the history of robotics. This museum, named The Future of Robotics, was inaugurated by Akira Tanaka, a renowned roboticist and advocate for artificial intelligence. The museum featured an array of exhibits, showcasing the evolution of robots from simple machines to complex systems capable of interacting with humans in meaningful ways. One of the most anticipated exhibits was the Interactive Robot Experience, where visitors could engage with state-of-the-art robots programmed to respond to human emotions. This exhibit drew crowds from all over the city and even attracted international visitors who were eager to witness the latest advancements in robotics. As the opening day approached, excitement grew among the local community. Schools in the area, including Tokyo International School, organized field trips to the museum to inspire students about the future of technology. Parents were thrilled at the opportunity for their children to explore the exhibits and learn from expert presentations."
# determine chunks
chunks, span_annotations = chunk_by_tokenizer_api(input_text, tokenizer)
print('Chunks:\n- "' + '"\n- "'.join(chunks) + '"')

Chunks:
- "In the bustling city of Tokyo, located in Japan, a remarkable event took place that would leave a lasting impression on its residents. The city, known for its vibrant culture and advanced technology, witnessed the opening of a groundbreaking new museum dedicated to the history of robotics. This museum, named The Future of Robotics, was inaugurated by Akira Tanaka, a renowned roboticist and advocate for artificial intelligence. "
- "The museum featured an array of exhibits, showcasing the evolution of robots from simple machines to complex systems capable of interacting with humans in meaningful ways. One of the most anticipated exhibits was the Interactive Robot Experience, where visitors could engage with state-of-the-art robots programmed to respond to human emotions. This exhibit drew crowds from all over the city and even attracted international visitors who were eager to witness the latest advancements in robotics. "
- "As the opening day approached, excitement grew amo

In [15]:
chunks

['In the bustling city of Tokyo, located in Japan, a remarkable event took place that would leave a lasting impression on its residents. The city, known for its vibrant culture and advanced technology, witnessed the opening of a groundbreaking new museum dedicated to the history of robotics. This museum, named The Future of Robotics, was inaugurated by Akira Tanaka, a renowned roboticist and advocate for artificial intelligence. ',
 'The museum featured an array of exhibits, showcasing the evolution of robots from simple machines to complex systems capable of interacting with humans in meaningful ways. One of the most anticipated exhibits was the Interactive Robot Experience, where visitors could engage with state-of-the-art robots programmed to respond to human emotions. This exhibit drew crowds from all over the city and even attracted international visitors who were eager to witness the latest advancements in robotics. ',
 'As the opening day approached, excitement grew among the lo

In [16]:
len(chunks)

3

In [17]:
##the function that applies late chunking to the pdf.
def late_chunking(
    model_output: 'BatchEncoding', span_annotation: list, max_length=None
):
    token_embeddings = model_output[0]
    outputs = []
    for embeddings, annotations in zip(token_embeddings, span_annotation):
        if (
            max_length is not None
        ):  # remove annotations which go bejond the max-length of the model
            annotations = [
                (start, min(end, max_length - 1))
                for (start, end) in annotations
                if start < (max_length - 1)
            ]
        pooled_embeddings = [
            embeddings[start:end].sum(dim=0) / (end - start)
            for start, end in annotations
            if (end - start) >= 1
        ]
        pooled_embeddings = [
            embedding.detach().cpu().numpy() for embedding in pooled_embeddings
        ]
        outputs.append(pooled_embeddings)

    return outputs

In [18]:
# chunk before
embeddings_traditional_chunking = model.encode(chunks)

# chunk afterwards (context-sensitive chunked pooling)
inputs = tokenizer(input_text, return_tensors='pt')
model_output = model(**inputs)
embeddings = late_chunking(model_output, [span_annotations])[0]

In [22]:
embeddings[0].shape

(768,)

In [23]:
import numpy as np

cos_sim = lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

berlin_embedding = model.encode('The museum')

for chunk, new_embedding, trad_embeddings in zip(chunks, embeddings, embeddings_traditional_chunking):
    print(f'similarity_new("Berlin", "{chunk}"):', cos_sim(berlin_embedding, new_embedding))
    print(f'similarity_trad("Berlin", "{chunk}"):', cos_sim(berlin_embedding, trad_embeddings))

similarity_new("Berlin", "In the bustling city of Tokyo, located in Japan, a remarkable event took place that would leave a lasting impression on its residents. The city, known for its vibrant culture and advanced technology, witnessed the opening of a groundbreaking new museum dedicated to the history of robotics. This museum, named The Future of Robotics, was inaugurated by Akira Tanaka, a renowned roboticist and advocate for artificial intelligence. "): 0.7516322
similarity_trad("Berlin", "In the bustling city of Tokyo, located in Japan, a remarkable event took place that would leave a lasting impression on its residents. The city, known for its vibrant culture and advanced technology, witnessed the opening of a groundbreaking new museum dedicated to the history of robotics. This museum, named The Future of Robotics, was inaugurated by Akira Tanaka, a renowned roboticist and advocate for artificial intelligence. "): 0.75738335
similarity_new("Berlin", "The museum featured an array o

  cos_sim = lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
