# Using Embeddings to test Cosine Similarities

## Installation

In [1]:
# Install the third-party packages used by this notebook (-q keeps pip's output quiet).
# NOTE(review): no versions are pinned, so a fresh install may pull different
# releases than the ones this notebook was written against — consider pinning.
%pip install -q gradio pandas numpy langchain sentence-transformers scikit-learn datasets pypdf einops

Note: you may need to restart the kernel to use updated packages.


## Importing the PDF document

In [2]:
from pypdf import PdfReader

# Open the annual report and pull the text of every page.
reader = PdfReader("data/microsoft_annual_report_2022.pdf")

# Strip surrounding whitespace from each page and keep only the pages that
# still contain text afterwards (filter(None, ...) drops empty strings).
pdf_texts = list(
    filter(None, (page.extract_text().strip() for page in reader.pages))
)

# Show the first non-empty page as a sanity check of the extraction.
print(pdf_texts[0])

1 Dear shareholders, colleagues, customers, and partners:  
We are living through a period of historic economic, societal, and geopolitical change. The world in 2022 looks nothing like 
the world in 2019. As I write this, inflation is at a 40 -year high, supply chains are stretched, and the war in Ukraine is 
ongoing. At the same time, we are entering a technological era with the potential to power awesome advancements 
across every sector of our economy and society. As the world’s largest software company, this places us at a historic 
intersection of opportunity and responsibility to the world around us.  
Our mission to empower every person and every organization on the planet to achieve more has never been more 
urgent or more necessary. For all the uncertainty in the world, one thing is clear: People and organizations in every 
industry are increasingly looking to digital technology to overcome today’s challenges and emerge stronger. And no 
company is better positioned to help th

Use the extracted page texts as our dataset.

In [3]:
# `dataset` is an alias for `pdf_texts` (the same list object, not a copy):
# a list with one stripped, non-empty string per PDF page.
dataset = pdf_texts

In [4]:
# Currently active embedding model. It starts out unset; update_output()
# assigns it and the embedding/search helpers read it.
model = None

# Default values shown in the UI number fields.
DEFAULT_CHUNK_SIZE = 250     # chunk length, measured in characters
DEFAULT_CHUNK_OVERLAP = 0    # characters shared between neighbouring chunks
DEFAULT_NUM_CHUNKS = 100     # upper bound on the number of chunks kept

# Embedding models that can be tried with SentenceTransformer(...):
#   - 'all-MiniLM-L6-v2'
#   - 'BAAI/bge-small-en-v1.5'
#   - 'sentence-transformers/all-mpnet-base-v2'
#   - 'nomic-ai/nomic-embed-text-v1'  (pass trust_remote_code=True)

In [5]:
import pandas as pd
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

def tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Split ``text`` into chunks with the selected method and describe each chunk.

    Parameters
    ----------
    method : str
        Name of the splitting method. Only "RecursiveCharacterTextSplitter" is
        implemented; any other value (including None) yields an empty result.
    text : str
        Text to split. None, empty or whitespace-only text yields an empty result.
    chunk_size, chunk_overlap : int or float
        Chunk length and overlap in characters (the splitter measures with
        ``len``). Cast to int, like ``num_chunks``, because numeric UI inputs
        may arrive as floats.
    num_chunks : int or float
        Maximum number of chunks to keep, counted from the start of the text.
        Negative values are treated as 0.

    Returns
    -------
    pandas.DataFrame
        One row per chunk with the columns 'Chunk #', 'Text Chunk',
        'Character Count' and 'Token Count'. The columns are always present,
        even when there are no rows, so callers can select them safely.
        'Token Count' is a whitespace-separated word count, not a model
        tokenizer count.
    """
    columns = ['Chunk #', 'Text Chunk', 'Character Count', 'Token Count']

    # Clamp so a negative value cannot turn the slice below into "drop from the end".
    num_chunks = max(int(num_chunks), 0)

    # Nothing to split: return an empty frame that still has the expected columns.
    if not text or not text.strip():
        return pd.DataFrame(columns=columns)

    output = []
    if method == "RecursiveCharacterTextSplitter":
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=int(chunk_size),
            chunk_overlap=int(chunk_overlap),
            length_function=len,
            is_separator_regex=False,
        )
        tokenized_texts = text_splitter.split_text(text)[:num_chunks]
        output = [
            {
                'Chunk #': i,
                'Text Chunk': chunk,
                'Character Count': len(chunk),
                'Token Count': len(chunk.split()),
            }
            for i, chunk in enumerate(tokenized_texts)
        ]

    # Passing `columns` keeps the schema stable when `output` is empty
    # (unknown method, or num_chunks == 0).
    return pd.DataFrame(output, columns=columns)

def calculate_embeddings(df):
    """
    Return a copy of ``df`` with an 'Embeddings' column added.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Text Chunk' column holding the strings to embed.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left unmodified) with an extra 'Embeddings'
        column containing one list of floats per row. For an empty input the
        column is still added, so the result always has the same schema.

    Raises
    ------
    RuntimeError
        If there is text to embed but the module-level ``model`` is not set.
    """
    result = df.copy()

    if result.empty:
        # Keep the schema consistent so callers can always select 'Embeddings'.
        result['Embeddings'] = pd.Series(index=result.index, dtype=object)
        return result

    if model is None:
        raise RuntimeError(
            "No embedding model is loaded; set the global `model` before calling calculate_embeddings()."
        )

    chunks = result['Text Chunk'].tolist()
    embeddings = model.encode(chunks)
    # Store each embedding as a plain Python list in its row.
    result['Embeddings'] = embeddings.tolist()
    return result

def search_similar_chunks(query, df_with_embeddings):
    """
    Rank the chunks by cosine similarity between their embedding and the query's.

    Parameters
    ----------
    query : str
        Query text; it is embedded with the module-level ``model``.
    df_with_embeddings : pandas.DataFrame
        Frame with an 'Embeddings' column (one vector per row).

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left unmodified) with a 'Similarity' column
        inserted at position 1, sorted by similarity in descending order. An
        existing 'Similarity' column is replaced, so the function can be
        called repeatedly on the same frame. For an empty input the column is
        added and the (still empty) frame is returned without embedding the query.

    Raises
    ------
    RuntimeError
        If there are chunks to compare but the module-level ``model`` is not set.
    """
    # drop() returns a new frame, so the caller's frame is never modified, and
    # a 'Similarity' column left over from a previous search cannot make
    # insert() fail with "already exists".
    result = df_with_embeddings.drop(columns='Similarity', errors='ignore')

    if result.empty:
        # np.vstack cannot stack zero arrays; return the empty frame with the
        # expected column instead of raising.
        position = min(1, result.shape[1])
        result.insert(position, 'Similarity', pd.Series(index=result.index, dtype=float))
        return result

    if model is None:
        raise RuntimeError(
            "No embedding model is loaded; set the global `model` before calling search_similar_chunks()."
        )

    # Compute the query embedding
    query_embedding = model.encode([query])[0]

    # Calculate similarity scores: one query row against all chunk rows
    chunk_embeddings = np.vstack(result['Embeddings'].tolist())
    similarity_scores = cosine_similarity([query_embedding], chunk_embeddings)[0]

    # Insert similarity scores into the dataframe after 'Chunk #'
    result.insert(1, 'Similarity', similarity_scores)

    # Return the dataframe sorted by similarity scores in descending order
    return result.sort_values(by='Similarity', ascending=False)

def process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Chunk the text, then attach an embedding to every chunk.

    The arguments are forwarded unchanged to tokenize_text(); the resulting
    dataframe is passed through calculate_embeddings() and returned.
    """
    return calculate_embeddings(
        tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks)
    )

# Models already loaded in this kernel session, keyed by model name, so that
# repeated submissions with the same model do not reload it each time.
_LOADED_MODELS = {}


def update_output(method, model_name, text, chunk_size, chunk_overlap, num_chunks, query):
    """
    UI callback: chunk the text, embed the chunks and optionally rank them.

    Parameters
    ----------
    method : str
        Splitting method name, forwarded to process_and_embed().
    model_name : str
        Name passed to SentenceTransformer(). The loaded model is stored in
        the module-level ``model`` that the embedding helpers read.
    text : str
        Text to chunk.
    chunk_size, chunk_overlap, num_chunks : number
        Chunking parameters, forwarded to process_and_embed().
    query : str
        Optional query. When non-empty, chunks are ranked by similarity to it
        and a 'Similarity' column is included in the result.

    Returns
    -------
    pandas.DataFrame
        Columns 'Chunk #', ['Similarity',] 'Text Chunk', 'Character Count',
        'Token Count', 'Embeddings'. When there are no chunks the frame is
        empty but still has these columns.

    Raises
    ------
    ValueError
        If no model name is given.
    """
    global model

    if not model_name:
        raise ValueError("No embedding model selected; please choose a model.")

    # Load each model at most once per session.
    if model_name not in _LOADED_MODELS:
        _LOADED_MODELS[model_name] = SentenceTransformer(model_name)
    model = _LOADED_MODELS[model_name]

    df_with_embeddings = process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks)

    columns = ['Chunk #', 'Text Chunk', 'Character Count', 'Token Count', 'Embeddings']
    if query:
        # With no chunks there is nothing to rank; skip the search.
        if not df_with_embeddings.empty:
            df_with_embeddings = search_similar_chunks(query, df_with_embeddings)
        columns.insert(1, 'Similarity')

    # reindex() selects the columns in display order and, unlike plain
    # indexing, creates any that are missing (the empty-result case) instead
    # of raising KeyError.
    return df_with_embeddings.reindex(columns=columns)

  from tqdm.autonotebook import tqdm, trange


## Displaying the different methods

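Note: the following cell might fail if network access is restricted. Launching the interface with a public share link and downloading the selected embedding model on first use both require an outbound network connection.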

In [6]:
import gradio as gr
# Choices offered in the dropdowns. The first entry of each list is
# preselected so that a submit never passes None for the method or the model.
TOKENIZATION_METHODS = ["RecursiveCharacterTextSplitter"]
EMBEDDING_MODELS = [
    "all-MiniLM-L6-v2",
    "BAAI/bge-small-en-v1.5",
    "avsolatorio/GIST-all-MiniLM-L6-v2",
    "sentence-transformers/all-mpnet-base-v2",
]

# `dataset` is a list of page texts, but a Textbox holds a single string:
# join the pages explicitly rather than relying on how a list gets rendered.
default_text = "\n\n".join(dataset) if isinstance(dataset, (list, tuple)) else str(dataset)

iface = gr.Interface(
    fn=update_output,
    inputs=[
        gr.Dropdown(
            label="Select Tokenization Method",
            choices=TOKENIZATION_METHODS,
            value=TOKENIZATION_METHODS[0],
        ),
        gr.Dropdown(
            label="Select Model",
            choices=EMBEDDING_MODELS,
            value=EMBEDDING_MODELS[0],
        ),
        gr.Textbox(label="Enter Text", lines=10, value=default_text),
        # precision=0 makes the number fields deliver integers.
        gr.Number(label="Chunk Size", value=DEFAULT_CHUNK_SIZE, precision=0),
        gr.Number(label="Chunk Overlap", value=DEFAULT_CHUNK_OVERLAP, precision=0),
        gr.Number(label="Number of Chunks to Display", value=DEFAULT_NUM_CHUNKS, precision=0),
        gr.Textbox(label="Enter Query for Similarity Search", lines=2, placeholder="Type your query here.")
    ],
    outputs=gr.Dataframe(height=900),
    title="Text Tokenization and Embedding Tool",
    description="A tool for tokenizing text and calculating embeddings. Now with similarity search feature."
)

# share=True additionally requests a temporary public URL for the app; this
# step needs outbound network access.
iface.launch(share=True)

Running on local URL:  http://127.0.0.1:7862
Running on public URL: https://f3d73b793206e5717d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]