In [1]:
# %pip install langchain langchain_community pinecone-client python-dotenv tiktoken
# %pip install -U langchain-ollama
# %pip install pinecone
# %pip install llama-index pypdf
# %pip install einops accelerate sentence-transformers
# %pip install llama-index-llms-langchain
# %pip install chromadb
# %pip install transformers

Loading the document

In [2]:
def load_document(file):
    import os
    name, extension = os.path.splitext(file)

    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        print(f'Loading {file}')
        loader = PyPDFLoader(file)
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        print(f'Loading {file}')
        loader = Docx2txtLoader(file)
    elif extension == '.txt':
        from langchain.document_loaders import TextLoader
        loader = TextLoader(file)
    else:
        print('Document format is not supported!')
        return None

    data = loader.load()
    return data

def load_from_wikipedia(query, lang='en', load_max_docs=2):
    from langchain.document_loaders import WikipediaLoader
    loader = WikipediaLoader(query=query, lang=lang, load_max_docs=load_max_docs)
    data = loader.load()
    return data

In [3]:
# for i, page in enumerate(data):
#             print(f"Page {i + 1}:")
#             print(page.page_content)
#             print("-" * 40)

Cleaning the data


In [4]:
def text_formatter(text):
    clean_text = text.replace("\n", " ").strip()
    return clean_text

def clean(data):
    data_cleaned=[]
    for i,page in enumerate(data):
        data_cleaned.append({
            "Content":text_formatter(page.page_content),
            "PageNumber" : i+1
        }
        )
    return data_cleaned

Splitting them into sentences

In [5]:
from spacy.lang.en import English
def sentencizer(pages_and_texts):
    nlp = English()
    nlp.add_pipe("sentencizer")
    for item in pages_and_texts:
        item["sentences"] = list(nlp(item["Content"]).sents)
        item["sentences"] = [str(sentence) for sentence in item["sentences"]]

Chunking

In [6]:
def split_list(input_list, slice_size):

    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

def chunker(data,num_sentence_chunk_size):
    for item in data:
        item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                            slice_size=num_sentence_chunk_size)
        item["num_chunks"] = len(item["sentence_chunks"])

In [7]:
import re

def join_sentences(data):
    pages_and_chunks = []
    for item in data:
        for sentence_chunk in item["sentence_chunks"]:
            chunk_dict = {}
            chunk_dict["page_number"] = item["PageNumber"]
            
            # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
            joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
            joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo 
            chunk_dict["sentence_chunk"] = joined_sentence_chunk
            pages_and_chunks.append(chunk_dict)
    return pages_and_chunks

Embedding

In [8]:
from sentence_transformers import SentenceTransformer
def embedding(final_chunked_data):
    embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2") 
    for item in final_chunked_data:
        item["embedding"] = embedding_model.encode(item["sentence_chunk"])

  from tqdm.autonotebook import tqdm, trange


Converting them into a list


In [9]:
def list_converter(final_chunked_data):
    documents = [item["sentence_chunk"] for item in final_chunked_data]
    # pageNumbers = [item["page_number"] for item in final_chunked_data]
    embedding = [item["embedding"].tolist() for item in final_chunked_data]

    id = [f"id{x}" for x,item in enumerate(final_chunked_data)]
    return documents, embedding,id

Adding them into a DB

In [10]:
def db(documents,embeddings,id,name):
    import chromadb

    chroma_client = chromadb.Client()

    existing_collections = chroma_client.list_collections()
    print(existing_collections)
    if name in [col.name for col in existing_collections]:
        collection = chroma_client.get_collection(name=name)
        print(f"Using existing collection: {name}")
    else:
        collection = chroma_client.create_collection(name=name)
        print(f"Created new collection: {name}")
    collection.add(documents=documents,embeddings=embeddings,ids=id)
    return collection

Similarity search

In [11]:
def search_result(query,collection,n_result):
    embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2") 
    query_embeddings = embedding_model.encode(query).tolist()
    results = collection.query(
        query_embeddings=query_embeddings, 
        n_results=n_result
    )
    return results

Query Generator


In [12]:
def query_engine(current_query,old_convo=None,context_summary=None):
    from langchain_ollama import OllamaLLM
    llm = OllamaLLM(model="llama3.1",temperature=0.2)
    from langchain import PromptTemplate
    old_convo_str = " ".join([f"Human: {prompt} | AI: {response}" for prompt, response in old_convo]) if old_convo is not None else ""
    if context_summary == None and old_convo == None:
        template = "create a query to a RAG model where the current prompt is {current_query}. Make sure the query only consists of key terms which the user wants to know about in the current prompt."
    elif context_summary == None and old_convo != None:
        template = "create a query to a RAG model where the past three prompts along with responses are {old_convo_str} and the current prompt is {current_query}. Make sure the query only consists of key terms which the user wants to know about in the current prompt which may or may not be related to the previous conversations."
    elif old_convo != None and context_summary != None:
        template ="create a query to a RAG model where the context of conversation so far is : {context_summary} and the past three promps along wiht response is {old_convo_str} and the current prompt is{current_query}. Make sure query only consists of key terms which the user wants to know about in the current prompt which may or may not be related to the previous conversations"
    prompt = PromptTemplate(
        input_variables=['current_query','old_convo_str','context_summary'],
        template=template
    )
    response = llm(prompt.format(current_query = current_query,old_convo_str = old_convo_str,context_summary = context_summary))
    return response

ChatSummaryBufferChain

In [13]:
from collections import deque
class ChatSummaryBufferChain:
    def __init__(self, max_memory=2,summary_fn = None):
        self.max_memory = max_memory
        self.summary_fn = summary_fn
        self.memory = deque(maxlen=max_memory)
        self.summaries = ""

    
    def add_conversation(self, human_message, ai_message):
        self.memory.append((human_message, ai_message))
        print(self.memory[-1])
        if len(self.memory) == self.memory.maxlen and len(self.summaries) > 0:
            self.summarize_conversations()


    def summarize_conversations(self):
        if self.summary_fn and len(self.memory) > 0 and self.summaries=="":
            if len(self.memory)==self.max_memory:
                oldest_convo = self.memory.popleft()
            summary = self.summary_fn(oldest_convo)
            self.summaries = summary

        elif self.summaries != "" and self.summary_fn and len(self.memory) > 0 :
            oldest_convo = self.memory.popleft()
            summary = self.summary_fn(oldest_convo,self.summaries)
            self.summaries = summary


Summary Generator

In [14]:
def summarizer_fn(old_convo,summary_old=""):
    from langchain.schema import AIMessage, HumanMessage, SystemMessage
    from langchain_ollama import OllamaLLM
    from langchain import PromptTemplate
    summarizer = OllamaLLM(model="llama3.1")
    template = ''' Your task is to combine the old summary with the latest conversation. 
        Focus on merging the key points from both the old summary and the new conversation into a single, concise summary that contains all relevant information.
        Ensure that the final summary is brief but informative, covering all the essential details from both sources. Respond with the version which is a concice summary of the below two
        
        Old Summary: {old_summary}

        New Conversation: {new_convo}

        
        '''
    prompt = PromptTemplate(
        input_variables=['old_summary','new_convo'],
        template=template
    )
    
    messages = [
    SystemMessage(content="""
You are an expert summarizer. Respond with only the summary text
"""),
    HumanMessage(prompt.format(old_summary=summary_old,new_convo=old_convo))
]
    summary = summarizer.invoke(messages)
    print("Summary: " , summary , "\n\n")
    return summary

Final Combo

In [15]:
def ask_with_memory(llm,collection):
    buffer = ChatSummaryBufferChain(summary_fn=summarizer_fn)
    query = input("Question:")
    while query != "quit":
        context = search_result(query,collection,3)
        context = context['documents']
        context_str = ''
        for i in context:
            for j in i:
                context_str += str(j)
        old_convo = buffer.memory
        old_convo_str = " ".join([f"Human: {prompt} | AI: {response}" for prompt, response in old_convo])
        # query = query_engine(query,old_convo,buffer.summaries)
        from langchain import PromptTemplate
        template = ''' Answer the current question. You may use the information given in the context if necessary. You may also refer to the summary and the last two conversations that we has (old conversation)

        Current Question: {query}

        Context: {context_str}
        
        Summary : {summary}

        old Conversation:{old_convo_str}

        Respond with only the answer, do not add anything else
        '''
        prompt = PromptTemplate(
            input_variables=['query','summary','old_covo_str','context'],
            template=template
        )
        response = llm(prompt.format(query=query,summary=buffer.summaries,old_convo_str=old_convo_str,context_str = context_str))
        buffer.add_conversation(query,response)
        if  old_convo_str != "":
            buffer.summarize_conversations()
        print("Context: ", context_str + "\n\n")
        print("Old convo: ", old_convo_str + "\n\n")

        print("Response: ", response)
        query=input("Question")

In [16]:
data = load_document("D:\Coding\Python\RAG\VerdictIQ\data\COVID-Handbook-for-journalists.pdf")
final_data = clean(data)
sentencizer(final_data)
chunker(final_data,12)
final_chunked_data = join_sentences(final_data)
embedding(final_chunked_data)
documents, embeddings,id = list_converter(final_chunked_data)
collection = db(documents,embeddings,id,"test2")
print("embeddings and collection creation done")

Loading D:\Coding\Python\RAG\VerdictIQ\data\COVID-Handbook-for-journalists.pdf




[]
Created new collection: test2
embeddings and collection creation done


In [17]:
from langchain_ollama import OllamaLLM
llm = OllamaLLM(model="llama3.1",temperature=0.2)

ask_with_memory(llm,collection)

  warn_deprecated(


('What is the Novel Coronavirus?', 'The Novel Coronavirus (CoV) is a new strain of the Coronavirus. The disease caused by it has been named Coronavirus Disease 2019 (COVID-19).')
Context:  Coronavirus Disease (COVID-19) Pandemic4Chapter 1 Understanding the  Novel Coronavirus To report effectively on any subject, a thorough understanding of the subject is imperative. In this case, at the heart of the matter lies a virus. A virus can generate so much news – community outbreak, cancellation of events, celebrities falling ill, Sensex collapse, and so on and so forth. Journalists are professionally oriented to track such developments and report on them. The steps to take to avoid getting infected by the virus, the symptoms of infection, how to prepare the family to deal with an emergency, whom to contact for services, etc are also good subjects for feature stories. Readers and viewers want to know that what they are getting is immediate, relevant and accurate information, which will help th



('What are the symptoms of COVID-19 and how does the virus spread?', 'The symptoms of COVID-19 are similar to those of the flu or the common cold, which are fever, dry cough, a runny nose, fatigue and difficulty in breathing.')
Summary:  Here's a concise summary combining both sources:

The Novel Coronavirus, also known as COVID-19, is a new strain of the Coronavirus that was first identified in 2019, causing an outbreak of respiratory illness worldwide. 


Context:  Characterising COVID-19 as a pandemic is not an indication that the virus has become deadlier. Rather, it’s an acknowledgement of the geographical spread of the disease. What are the symptoms of COVID-19 and how does the virus spread?Symptoms can include fever, dry cough, a runny nose, fatigue and difficulty in breathing. Some may display very mild symptoms or no symptoms at all. In more severe cases, infection can cause pneumonia, grievous illness, and even death. Older people, and those with underlying medical problems a



('How can the virus be prevented from spreading?', 'Wash your hands frequently using an alcohol-based hand rub or soap and water, cover your mouth and nose when coughing or sneezing, avoid close contact with anyone who has a fever and cough, seek medical care early if you have fever, cough and difficulty breathing.')
Summary:  Here's a concise summary combining both sources:

COVID-19 is a new strain of Coronavirus identified in 2019, causing an outbreak of respiratory illness worldwide. Symptoms include fever, dry cough, runny nose, fatigue, and difficulty breathing, similar to those of the flu or common cold. 


Summary:  Here's a concise summary combining both sources:

COVID-19 is a new strain of Coronavirus causing respiratory illness worldwide. Symptoms include fever, dry cough, runny nose, fatigue, and difficulty breathing. To prevent the virus from spreading, wash hands frequently with soap and water or hand rub, cover your mouth and nose when coughing/sneezing, avoid close con



('When did it start spreading', 'There is no information provided in the context about when COVID-19 started spreading.')
Context:  Characterising COVID-19 as a pandemic is not an indication that the virus has become deadlier. Rather, it’s an acknowledgement of the geographical spread of the disease. What are the symptoms of COVID-19 and how does the virus spread?Symptoms can include fever, dry cough, a runny nose, fatigue and difficulty in breathing. Some may display very mild symptoms or no symptoms at all. In more severe cases, infection can cause pneumonia, grievous illness, and even death. Older people, and those with underlying medical problems are at a higher risk of developing serious illness.1  A ‘pandemic’ generally refers to an epidemic that has spread on a more global scale, affecting large numbers of people. Exactly when enough places have enough infections to declare one isn’t a black-and-white decision. But generally, the WHO is looking for sustained outbreaks in differe



('When did covid 19 happen', 'December 2019.')
Summary:  Here's a concise summary combining both sources:

COVID-19 is a new strain of Coronavirus causing respiratory illness worldwide, with symptoms including fever, dry cough, runny nose, fatigue, and difficulty breathing. To prevent its spread, wash hands frequently, cover your mouth/nose when coughing/sneezing, avoid close contact with those who have symptoms, and seek medical care early if symptoms persist. 


Summary:  Here's a concise summary combining both sources:

COVID-19 is a new strain of Coronavirus causing respiratory illness worldwide, appearing in December 2019 with symptoms including fever, dry cough, runny nose, fatigue, and difficulty breathing. To prevent its spread, practice good hygiene by washing hands frequently, covering your mouth/nose when coughing/sneezing, avoiding close contact with those who have symptoms, and seeking medical care early if symptoms persist. 


Context:  Coronavirus Disease (COVID-19) Pand

KeyboardInterrupt: 

In [None]:
# from langchain_ollama import OllamaLLM

# llm = OllamaLLM(model="llama3.1",temperature=0.2)

# from langchain import PromptTemplate
# template = ''' .

# Context: {context}'''
# prompt = PromptTemplate(
#     input_variables=['person','context'],
#     template=template
# )
# response = llm(prompt.format(person = "JOSEPH B. MARTIN",context = context["documents"]))
# print(response)

In [None]:
# from langchain_ollama import OllamaLLM
# llm = OllamaLLM(model="llama3.1",temperature=0.2)
# print(llm.invoke("JOSEPH B. MARTIN"))