In [1]:
import cohere
from pinecone import Pinecone, ServerlessSpec
import hashlib
import os
import time
import sys

In [2]:
os.environ["PINECONE_API_KEY"] = <<PINECONE_API_KEY>>
os.environ["COHERE_API_KEY"] = <<COHERE_API_KEY>>
os.environ["OPENAI_API_KEY"] = <<OPENAI_API_KEY>>

In [82]:
# Define index configuration
INDEX_NAME = 'student'
file_type = 'txt'
filename = "USA.txt"

In [83]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
COHERE_API_KEY = os.getenv("COHERE_API_KEY")

#### Load and text split using langchain frame work

##### <b><font color=orange>The langchain.document_loaders module in the LangChain library provides a variety of tools to load documents from different sources and formats.</font></b>
    Common Document Loaders :
        PyPDFLoader
        TextLoader
        CSVLoader
        JSONLoader etc

In [84]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [85]:
from langchain.document_loaders import PyPDFLoader

In [86]:
def load_text(filename,file_type):
    try:
        if file_type == 'txt':
            loader = TextLoader(filename)
        else:
            loader = PyPDFLoader(filename)
        return loader
    except FileNotFoundError:
        print(f"File not found: {filename}")
        return []
  
    

#### <center><b><font color=pink>Choosing the right text splitter depends on the content and use case:</font></b></center>
<b>RecursiveCharacterTextSplitter:</b> Best for creating chunks that resemble natural language breaks, which is useful for question-answering systems.<br>
<b>CharacterTextSplitter:</b> Simple and works when the text can be split roughly without specific formatting.<br>
<b>TokenTextSplitter:</b> Ideal when working with models that have a token limit, ensuring each chunk is optimized for token-based language models.<br>
<b>MarkdownTextSplitter:</b> Useful for documents written in Markdown format, respecting headers, lists, and code blocks.<br>

#### <font color = 'red'>**TIP**</font>:
**<font color = 'blue'>Chunks of about to 500 to 1000 characters with a 10 - 20 % 
overlap are typically effecive for QA</font>**

##### <font color = maroon >Why Splitting : There is a limitation respect to context size for exach model </font>

In [105]:
def split_text(loader,chunk_size,chunk_overlap,is_separator_regex,strip_whitespace):
    """
    Load and split the text from the file into manageable chunks.
    """
    try:
        text = loader.load()
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            is_separator_regex=is_separator_regex,
            strip_whitespace = strip_whitespace
            
        )
        return splitter.split_text(str(text))   # split_text
    
    except Exception as e:
        print(f"Error loading and splitting text from {filename}: {str(e)}")
        return []

#### <font color= green>Loading the file based on file type  </font>

In [88]:
loader = load_text(filename,file_type)

#### Parameters:

chunk_size (int) – Maximum size of chunks to return

chunk_overlap (int) – Overlap in characters between chunks

length_function (Callable[[str], int]) – Function that measures the length of given chunks

keep_separator (Union[bool, Literal['start', 'end']]) – Whether to keep the separator and where to place it in each corresponding chunk (True=’start’)

add_start_index (bool) – If True, includes chunk’s start index in metadata

strip_whitespace (bool) – If True, strips whitespace from the start and end of every document

In [101]:
chunk_size=100
chunk_overlap=20
is_separator_regex=False
strip_whitespace = True

#### Split the text based on above parameters

In [108]:
text_chunks = split_text(loader,chunk_size,chunk_overlap,is_separator_regex,strip_whitespace)

#### <font color = red> Total number of chunks </font>

In [126]:
len(text_chunks)

16

In [110]:
for i, chunk in enumerate(text_chunks):
    #print(f'{i}>>>>>>>>{chunk}')
    pass

#### Now Start Embedding using cohere model

## Models and dimentions
#####  <b>embed-english-v3.0</b> <font color = green > 1024 </font>

#####  embed-multilingual-v3.0 <font color = green > 1024 </font>

##### embed-english-light-v3.0 <font color = green > 384 </font>

##### embed-multilingual-light-v3.0 <font color = green > 384 </font>

##### embed-english-v2.0 <font color = green > 4096 </font>

##### embed-english-light-v2.0 <font color = green > 1024 </font> 

##### embed-multilingual-v2.0 <font color = green > 768 </font>

###### <center><b><font color= blue> why cohere model only </font></b></center> 
<font color=purple>Cohere models are optimized for "contextual understanding of language" , similar to other transformer-based models like BERT or OpenAI’s models. This makes them highly effective for capturing the semantics of a sentence or document, which is crucial for tasks like semantic search, recommendation, or summarization.</font>

In [112]:
# Initialize Cohere client
def get_cohere_client():
    try:
        co = cohere.Client(COHERE_API_KEY)
        return co
    except Exception as e:
        print(f"Failed to initialize Cohere client: {str(e)}")
        sys.exit(1)

#### Choosing the Right Input Type
For search applications: Use <b>search_query</b> for user input queries and <b>search_document</b> for documents or passages to be searched.<br>
For classification tasks: Use classification to ensure the embedding captures characteristics relevant to specific categories or labels.<br>
For text generation: Use <b>text_generation</b> if you’re working with tasks that involve creating coherent, flowing text.<br>
For summarization: Use summarization if you’re creating a condensed version of a document, aiming to retain key points.<br>

#### <font color=green> Each input_type helps to guide the model in producing embeddings best suited for specific types of downstream tasks.</font>

In [117]:
def generate_embedding(text):
    """
    Generate embedding using Cohere's embed-english-v3.0 model.
    """
    try:
        co = get_cohere_client()
        response = co.embed(
            texts=[text], # accept array 
            model="embed-english-v3.0",
            input_type="search_document" #  focusing on semantic meaning relevant to search contexts, so that they can be compared with "search_query" embeddings for efficient and relevant search results.
        )
        return response.embeddings[0]
    
    except Exception as e:
        print(f"Unexpected error in generating embedding: {str(e)}")
        return None

In [118]:
def create_unique_id(filename):
    """
    Create a unique identifier for the file using the filename.
    """
    try:
        base_filename = os.path.basename(filename)
        filename_hash = hashlib.md5(base_filename.encode()).hexdigest()
        return f"{base_filename}_{filename_hash[:8]}"
    except Exception as e:
        print(f"Failed to create a unique ID for the file: {str(e)}")
        return None

In [131]:
def check_and_upsert_embedding(filename, text_chunk, chunk_index):
    """
    Check if embedding exists, if not, create and upsert it.
    """
    file_id = create_unique_id(filename)
    print(f"file_id: {file_id}")
    if not file_id:
        print(f"Failed to create file ID for {filename}")
        return False
 
    unique_id = f"{file_id}_chunk_{chunk_index}"
 
    try:
        existing_vector = index.fetch(ids=[unique_id]) # check whether text chunk available or not
 
        # If the embedding does not exist, generate and upsert it
        if not existing_vector.get('vectors'):
            embedding = generate_embedding(text_chunk)
            print("dimention of the embedding :",len(embedding))
            if embedding:
                index.upsert(
                    vectors=[(unique_id, embedding, {"source": filename, "text": text_chunk, "file_id": file_id})]
                )
                print(f"Generated and upserted new embedding for chunk {chunk_index} from {filename}")
                return True
            else:
                print(f"Failed to generate embedding for chunk {chunk_index} from {filename}")
                return False
        else:
            print(f"Embedding already exists for chunk {chunk_index} from {filename}")
            return False
    except Exception as e:
        print(f"Error processing chunk {chunk_index} from {filename}: {str(e)}")
        return False
 

In [132]:
def process_text_chunks(filename, splits):
    """
    Process text chunks and upsert embeddings if required.
    """
    new_splits = []
    for i, split in enumerate(splits):
        if check_and_upsert_embedding(filename, split, i):
            new_splits.append(split)
 
    if new_splits:
        print(f"Generated and upserted new embeddings for {len(new_splits)} new text chunks")
    else:
        print("No new text chunks required new embeddings")

#### <font color= pink> <center> Vector data base </ceneter> </font>

In [134]:
def init_pinecone():
    try:
        pc = Pinecone(api_key=PINECONE_API_KEY)
        spec = ServerlessSpec(cloud="aws", region="us-east-1")
    except Exception as e:
        print(f"Failed to initialize Pinecone client: {str(e)}")
        sys.exit(1)
    return pc

In [135]:
def connect_to_index(pinecone, index_name):
    index = pinecone.Index(index_name)
    return index

In [136]:
pinecone = init_pinecone()

In [137]:
index = connect_to_index(pinecone,INDEX_NAME)

In [138]:
index.describe_index_stats()

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 18}},
 'total_vector_count': 18}

In [139]:
process_text_chunks(filename,text_chunks)

file_id: USA.txt_6280614a
Embedding already exists for chunk 0 from USA.txt
file_id: USA.txt_6280614a
Embedding already exists for chunk 1 from USA.txt
file_id: USA.txt_6280614a
Embedding already exists for chunk 2 from USA.txt
file_id: USA.txt_6280614a
Embedding already exists for chunk 3 from USA.txt
file_id: USA.txt_6280614a
Embedding already exists for chunk 4 from USA.txt
file_id: USA.txt_6280614a
Embedding already exists for chunk 5 from USA.txt
file_id: USA.txt_6280614a
Embedding already exists for chunk 6 from USA.txt
file_id: USA.txt_6280614a
Embedding already exists for chunk 7 from USA.txt
file_id: USA.txt_6280614a
Embedding already exists for chunk 8 from USA.txt
file_id: USA.txt_6280614a
Embedding already exists for chunk 9 from USA.txt
file_id: USA.txt_6280614a
Embedding already exists for chunk 10 from USA.txt
file_id: USA.txt_6280614a
Embedding already exists for chunk 11 from USA.txt
file_id: USA.txt_6280614a
Embedding already exists for chunk 12 from USA.txt
file_id: 

##### <center><font color= orange> CHAT MODEL </font> </center>

In [59]:
import json
from langchain.prompts import SystemMessagePromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_cohere import ChatCohere, CohereEmbeddings, CohereRerank
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
import cohere
from langchain.chains import RetrievalQA
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
import os
import time

In [66]:

file_id="dhruv.pdf_1ad54a87"  #which is given while creating and stroing embeddings
 
# Initialize Cohere embeddings
cohere_embeddings = CohereEmbeddings(model="embed-english-v3.0", cohere_api_key=COHERE_API_KEY)
 
# Initialize the language model
model_id = "command-r-plus-08-2024"
llm = ChatCohere(model=model_id, cohere_api_key=COHERE_API_KEY, temperature=0.1)


In [67]:
# Set up the document search with Pinecone vector store
docsearch = PineconeVectorStore.from_existing_index(index_name=INDEX_NAME, embedding=cohere_embeddings, text_key="text")

In [68]:
def create_prompt(question):
    prompt_template = """
Please provide a short and direct answer from the context below, repeating the exact phrasing of the question in your response.
For example, if the question is 'What is the borrower name?', respond with 'Borrower name: [exact answer]'.
If the answer cannot be found, respond with 'Apologies, but I couldn't find the information you requested in the document.'.
Do not make up any answer.
Context:
{context}
"""
 
    messages = [
        SystemMessagePromptTemplate.from_template(prompt_template),
        HumanMessagePromptTemplate.from_template(question)
    ]
    prompt = ChatPromptTemplate.from_messages(messages)
    return prompt


In [69]:
while True:
    user_question = input(
        "Please enter your question (or type 'exit' to quit): "
    ).strip()
 
    if user_question.lower() == "exit":  # Exit condition
 
        break
    # Generate and print the response
    prompt = create_prompt(user_question)
    chain_type_kwargs = {"prompt": prompt}
    t1=time.time()
    top_k=30
    retriever = docsearch.as_retriever(
            include_metadata=True,
            metadata_key='source',
            top_k=30,
            search_type="similarity",
            search_kwargs={"filter": {"file_id": file_id}}
        )
    compressor = CohereRerank(model="rerank-english-v3.0",top_n=8)
    
    compression_retriever = ContextualCompressionRetriever(
        base_compressor=compressor,
        base_retriever=retriever
    )
    # compressed_docs = compression_retriever.get_relevant_documents(user_question)
    # Print the relevant documents from using the embeddings and reranker
    # print(compressed_docs)
    t2=time.time()
    print(t2-t1,"retriever time****")
 
    chain = RetrievalQA.from_chain_type(
        llm=llm,    
        chain_type="stuff", 
        retriever=compression_retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt}
    )
    
    t3=time.time()
    answer=chain.invoke({"query": user_question})
    # print("response",answer)
    t4=time.time()
    print(t4-t3,"time at chain.invoke")
    # print(answer)
    print("Result",answer["result"])
 

Please enter your question (or type 'exit' to quit):  what is the student name?


0.017817258834838867 retriever time****
2.425938367843628 time at chain.invoke
Result Student name: Dhruv Vinjamuri


Please enter your question (or type 'exit' to quit):  what is the school name ?


0.17368602752685547 retriever time****
2.484807014465332 time at chain.invoke
Result Apologies, but I couldn't find the information you requested in the document.


Please enter your question (or type 'exit' to quit):  what is the Enrolled Campus?


0.04800724983215332 retriever time****
2.262035846710205 time at chain.invoke
Result Enrolled Campus: Early Childhood School


Please enter your question (or type 'exit' to quit):  student id ?


0.019999980926513672 retriever time****
2.6309633255004883 time at chain.invoke
Result Student ID#: 303265


Please enter your question (or type 'exit' to quit):  exit


In [24]:
for chunk in text_chunks:
    embedding = generate_embedding(chunk)