In [2]:
import ollama
from ollama import ChatResponse
from PyPDF2 import PdfReader, PdfWriter
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import cohere
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm





True

In [2]:
model = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-dot-v1')

In [8]:
def get_pdf_text(pdf_path):    
    reader = PdfReader(pdf_path)    
    text_body = []    
    def visitor_body(text, cm, tm, fontDict, fontSize):
        y = tm[5]        
        if y > 20 and y < 720:            
            text_body.append(text)
    for page in reader.pages:        
        page.extract_text(visitor_text=visitor_body)        
    return ' '.join(text_body)    

In [9]:
def clean_text(text):    
    return re.sub(r'(\w+)-\s+(\w+)', r'\1\2', text)

In [10]:
def chunk_by_sentence_groups(text, max_length):
    # Regular expression to match sentence-ending punctuation: . ? !
    sentence_endings = re.compile(r'([.!?])(?=\s)')
    text = re.sub(r'\s+', ' ', text).strip()
    # Split the text into sentences based on the sentence-ending punctuation
    sentences = sentence_endings.split(text)
    
    # Recombine punctuation with the sentences
    sentences = [sentences[i] + (sentences[i+1] if i+1 < len(sentences) else '') for i in range(0, len(sentences), 2)]
    
    chunks = []
    current_chunk = ""
    
    for sentence in sentences:
        # If adding the sentence exceeds max_length, start a new chunk
        if len(current_chunk) + len(sentence) > max_length:
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = sentence.strip()
        else:
            current_chunk += sentence  # Add the sentence to the current chunk

    if current_chunk:  # Append the last chunk
        chunks.append(current_chunk)
    
    return chunks

In [68]:
def chunk_by_sentence(text):
    sentence_endings = re.compile(r'([.!?])(?=\s)')
    sentences = sentence_endings.split(text)
    sentences = [sentences[i] + (sentences[i+1] if i+1 < len(sentences) else '') for i in range(0, len(sentences), 2)]
    sentences = [re.sub(r'\n', ' ', sentence) for sentence in sentences]    
    sentences = [re.sub(r'\s+', ' ', sentence).strip() for sentence in sentences]
    return sentences

In [65]:
def calculate_similarity(a, b):    
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-10)

In [13]:
def embed_chunks_llama(chunks,batch_size=500):
    embeddings = []
    for i in tqdm(range(0,len(chunks),batch_size)):
        batch = chunks[i:i+batch_size]
        embeddings.extend(ollama.embed(model='llama3.2', input=batch).embeddings)
    return embeddings

In [14]:
def embed_chunks_st(chunks,batch_size=500):
    embeddings = []
    for i in tqdm(range(0,len(chunks),batch_size)):
        batch = chunks[i:i+batch_size]
        embeddings.extend(model.encode(batch))
    return embeddings
    

In [15]:
def embed_query_llama(query):
    return ollama.embed(model='llama3.2',input=query).embeddings[0]

In [16]:
def embed_query_st(query):
    return model.encode(query)

In [17]:
COHERE_KEY = os.getenv('COHERE_KEY')
co = cohere.Client(COHERE_KEY)

In [18]:
def embed_chunks_cohere(chunks,batch_size=500):
    model = "embed-english-v3.0"
    input_type = "search_document"

    chunk_embeddings = co.embed(
        texts = chunks,
        model = model,
        input_type = input_type,
        embedding_types=['float']
    )
    return chunk_embeddings.embeddings.float

In [23]:
def embed_query_cohere(query):
    input_type = 'search_query'    
    model = "embed-english-v3.0"
    query_embedding = co.embed(
        texts = [query],
        model = model,
        input_type = input_type,
        embedding_types= ['float']
    )
    return query_embedding.embeddings.float[0]

In [69]:
text = get_pdf_text('sir_doc_file.pdf')
text = clean_text(text)
chunks = chunk_by_sentence(text)

In [70]:
print(chunks)

['HybridRAG: Integrating Knowledge Graphs and Vector Retrieval Augmented Generation for Efficient Information Extraction Bhaskarjit Sarmah bhaskarjit.sarmah@blackrock.com BlackRock, Inc.', 'Gurugram, India Benika Hall bhall@nvidia.com NVIDIA Santa Clara, CA, USA Rohan Rao rohrao@nvidia.com NVIDIA Santa Clara, CA, USA Sunil Patel supatel@nvidia.com NVIDIA Santa Clara, CA, USA Stefano Pasquali stefano.pasquali@blackrock.com BlackRock, Inc.', 'New York, NY, USA Dhagash Mehta dhagash.mehta@blackrock.com BlackRock, Inc.', 'New York, NY, USA ABSTRACT Extraction and interpretation of intricate information from unstructured text data arising in financial applications, such as earnings call transcripts, present substantial challenges to large language models (LLMs) even using the current best practices to use Retrieval Augmented Generation (RAG) (referred to as VectorRAG techniques which utilize vector databases for information retrieval) due to challenges such as domain specific terminology an

In [71]:
embedded_chunks = embed_chunks_cohere(chunks)
df = pd.DataFrame({'text':chunks,'embedding':embedded_chunks})

In [206]:
query = 'What are Mountains?'
embedded_query = embed_query_cohere(query)
df['similarity'] = df.apply(lambda x: calculate_similarity(embedded_query,x[1]),axis=1)
#df_similar_order = df.sort_values(by='similarity', ascending=False)
df_top = df[df['similarity'] > df['similarity'].quantile(0.8)]

  df['similarity'] = df.apply(lambda x: calculate_similarity(embedded_query,x[1]),axis=1)


In [207]:
# Input prompt for the model
context = ''
for txt in df_top['text']:
    context+=txt
    context+='\n'
prompt = f"""
"QUESTION:" {query}\n
"CONTEXT:" {context}
"""

In [208]:
# Generate text
response = co.chat(
    model="command-r-plus-08-2024",
    message= prompt,
    preamble='You are a friendly bot. Read the "QUESTION:" and reply. If the question demands some information, answer the question provided as "QUESTION:" using the context provided as "CONTEXT:" If the answer is not present, say you don\'t know.',
    max_tokens=200,  # Control the length of the response
    temperature=0.3,  # Adjust creativity level
)
print(response.text)

I don't know.


In [None]:
response_generator: ChatResponse = ollama.chat(model='llama3.2', messages=[
    # {
    #     'role': 'system',
    #     'content': """You are an AI assistant tasked with providing short answers based solely on the given context."""
    # },
    {
        'role': 'user',
        'content': prompt,
    },
    ],
    options={
        'temperature':0.2
    })
print(response_generator.message.content)
# Iterate through the generator to get response chunks
# for chunk in response_generator:
#     # Each chunk is a part of the response
#     print(chunk.message.content,end='', flush=True)

In [41]:
df[df['text'].str.contains('VectorRAG Confi')]

Unnamed: 0,text,embedding,similarity
79,LLM GPT-3.5-Turbo LLM Temperature 0 Embedding Model text-embedding-ada-002 Framework LangChain Vector Database Pinecone Chunk Size 1024 Chunk Overlap 0 Maximum Output Tokens 1024 Chunks for Similarity Algorithm 20 Number of Context Retrieved 4 Table 3: VectorRAG Configuration 7 https://www.pinecone.io/ The Q&A pipeline is constructed using the LangChain framework 8 .,"[-0.045715332, -0.019241333, -0.07318115, -0.043762207, -0.036590576, 0.007282257, -0.08300781, 0.040771484, -0.017318726, 0.00093746185, -0.06298828, 0.050323486, -0.052612305, -0.016403198, 0.028427124, -0.039916992, 0.022857666, 0.023590088, 0.018981934, 0.010269165, -0.0064201355, 0.012641907, 0.01979065, 0.015113831, 0.03567505, -0.07110596, 0.061706543, -0.013931274, 0.04321289, 0.0012454987, 0.024749756, -0.022628784, 0.023925781, 0.048095703, -0.03555298, 0.03010559, -0.009849548, 0.03286743, 0.0012702942, 0.015563965, 0.040039062, -0.034118652, 0.032989502, -0.03466797, 0.0026359558, 0.023361206, -0.00818634, -0.04333496, 0.031982422, 0.005596161, 0.0028133392, 0.06518555, -0.015037537, -0.010986328, 0.0007019043, 0.027526855, -0.02432251, 0.0016765594, 0.012077332, 0.04473877, 0.007850647, 0.0007481575, -0.013122559, -0.027816772, -0.015106201, -0.00064086914, -0.015457153, -0.011581421, 0.06021118, -0.0039978027, -0.010749817, 0.016098022, -0.004425049, -0.018981934, -0.018875122, 0.0047340393, -0.00037431717, -0.011184692, 0.018157959, -0.004852295, 0.05105591, 0.014549255, -0.027130127, -0.015113831, 0.029708862, 0.030899048, -0.00031375885, -0.0362854, 0.047058105, -0.00012141466, -0.0070495605, 0.011672974, 0.050323486, 0.03982544, -0.04449463, -0.014083862, -0.013290405, -0.034088135, 0.018341064, 0.018753052, ...]",0.290364


In [217]:
len(df[df['similarity']>0.4].sort_values(by='similarity'))

52

In [36]:
pd.set_option('display.max_colwidth',None)

In [23]:
df['similarity'].quantile(0.8)

0.4479245178916807