In [None]:
# pip install langchain langchain_community pinecone-client python-dotenv tiktoken
# pip install -U langchain-ollama
# pip install pinecone
# pip install llama-index pypdf
# pip install einops accelerate sentence-transformers
# pip install llama-index-llms-langchain
# pip install chromadb
# pip install transformers

In [None]:
#csv
import csv
# Location of the CSV file that persists the chat summary and recent turns.
# Raw string: the value is unchanged, but it no longer relies on invalid escape
# sequences such as "\C" and "\P" being passed through by the parser.
# NOTE(review): the path ends in ".pdf" yet write_csv_file() truncates and
# rewrites it as CSV — confirm this is the intended file.
file_name = r"D:\Coding\Python\RAG\VerdictIQ\data\COVID-Handbook-for-journalists.pdf"


def read_csv_file():
    """Read ``file_name`` as CSV.

    Returns:
        list[list[str]]: one inner list per CSV row, in file order.
    """
    # newline="" hands line-ending handling to the csv module, so newlines
    # embedded in quoted fields come back exactly as they were written.
    with open(file_name, "r", newline="") as file:
        return list(csv.reader(file))


def write_csv_file(data):
    """Overwrite ``file_name`` with ``data``.

    Args:
        data: iterable of rows; each row is an iterable of cell values.
    """
    # newline="" stops the text layer from translating the writer's "\r\n" row
    # terminator a second time, which produces a blank line after every row
    # on Windows.
    with open(file_name, "w", newline="") as file:
        csv.writer(file).writerows(data)

Loading the document

In [None]:
def load_document(file):
    """Load a document from disk with the LangChain loader matching its extension.

    Supported extensions (matched case-insensitively): ``.pdf``, ``.docx``, ``.txt``.

    Args:
        file: path of the document to load.

    Returns:
        Whatever the selected loader's ``load()`` returns, or ``None`` when the
        extension is not supported (a message is printed in that case).
    """
    import os

    # Lower-case the extension so "Report.PDF" is handled like "report.pdf".
    extension = os.path.splitext(file)[1].lower()

    # Loaders are imported lazily so only the one that is needed is pulled in.
    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        loader_cls = PyPDFLoader
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        loader_cls = Docx2txtLoader
    elif extension == '.txt':
        from langchain.document_loaders import TextLoader
        loader_cls = TextLoader
    else:
        print('Document format is not supported!')
        return None

    # Announce every supported format the same way (the .txt branch used to be silent).
    print(f'Loading {file}')
    loader = loader_cls(file)
    return loader.load()

def load_from_wikipedia(query, lang='en', load_max_docs=2):
    """Load Wikipedia content for ``query`` through LangChain's ``WikipediaLoader``.

    Args:
        query: search string handed to the loader.
        lang: language code handed to the loader (default ``'en'``).
        load_max_docs: document limit handed to the loader (default 2).

    Returns:
        The result of the loader's ``load()`` call.
    """
    from langchain.document_loaders import WikipediaLoader

    wiki_loader = WikipediaLoader(
        query=query,
        lang=lang,
        load_max_docs=load_max_docs,
    )
    return wiki_loader.load()

In [None]:
# for i, page in enumerate(data):
#             print(f"Page {i + 1}:")
#             print(page.page_content)
#             print("-" * 40)

Cleaning the data


In [None]:
def text_formatter(text):
    """Flatten ``text`` onto one line.

    Every newline becomes a single space and leading/trailing whitespace is
    trimmed; runs of spaces inside the text are left as they are.
    """
    single_line = " ".join(text.split("\n"))
    return single_line.strip()

def clean(data):
    """Turn loaded pages into plain dicts with cleaned text.

    Args:
        data: iterable of objects exposing a ``page_content`` attribute.

    Returns:
        list[dict]: one dict per page with keys ``"Content"`` (the page text
        passed through ``text_formatter``) and ``"PageNumber"`` (1-based
        position of the page in ``data``).
    """
    return [
        {"Content": text_formatter(page.page_content), "PageNumber": page_no}
        for page_no, page in enumerate(data, start=1)
    ]

Splitting them into sentences

In [None]:
from spacy.lang.en import English
def sentencizer(pages_and_texts):
    """Add a ``"sentences"`` list of strings to every page dict, in place.

    A blank spaCy ``English`` pipeline with only the ``"sentencizer"``
    component is built on each call and run over ``item["Content"]``.

    Args:
        pages_and_texts: iterable of dicts that each hold a ``"Content"`` string.

    Returns:
        None; the dicts are mutated.
    """
    pipeline = English()
    pipeline.add_pipe("sentencizer")
    for page in pages_and_texts:
        parsed = pipeline(page["Content"])
        # Store plain strings rather than the pipeline's sentence objects.
        page["sentences"] = [str(span) for span in parsed.sents]

Chunking

In [None]:
def split_list(input_list, slice_size):
    """Cut ``input_list`` into consecutive slices of at most ``slice_size`` items.

    The final slice is shorter when the length is not a multiple of
    ``slice_size``; an empty input gives an empty list.
    """
    slices = []
    for start in range(0, len(input_list), slice_size):
        slices.append(input_list[start:start + slice_size])
    return slices

def chunker(data,num_sentence_chunk_size):
    """Group each page's sentences into fixed-size chunks, in place.

    For every dict in ``data`` this stores ``"sentence_chunks"`` (the page's
    ``"sentences"`` split by ``split_list`` into groups of
    ``num_sentence_chunk_size``) and ``"num_chunks"`` (how many groups there are).

    Returns:
        None; the dicts are mutated.
    """
    for page in data:
        groups = split_list(
            input_list=page["sentences"],
            slice_size=num_sentence_chunk_size,
        )
        page["sentence_chunks"] = groups
        page["num_chunks"] = len(groups)

In [None]:
import re

def join_sentences(data):
    """Flatten per-page sentence chunks into one record per chunk.

    Args:
        data: iterable of dicts holding ``"PageNumber"`` and
            ``"sentence_chunks"`` (a list of lists of sentence strings).

    Returns:
        list[dict]: one dict per chunk with ``"page_number"`` and
        ``"sentence_chunk"`` (the chunk's sentences joined into a single string).
    """
    pages_and_chunks = []
    for item in data:
        for sentence_chunk in item["sentence_chunks"]:
            # Join with a space: joining with "" fused neighbouring sentences
            # ("Is it safe?yes") whenever the next one did not start with a
            # capital letter right after a full stop. Any double space this
            # creates is collapsed straight away.
            joined_sentence_chunk = " ".join(sentence_chunk).replace("  ", " ").strip()
            # Still repair ".A" -> ". A" for full-stop/capital pairs that are
            # fused inside a single sentence string.
            joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)
            pages_and_chunks.append({
                "page_number": item["PageNumber"],
                "sentence_chunk": joined_sentence_chunk,
            })
    return pages_and_chunks

Embedding

In [None]:
from sentence_transformers import SentenceTransformer
def embedding(final_chunked_data):
    """Attach an ``"embedding"`` to every chunk dict, in place.

    A ``SentenceTransformer`` for ``"all-mpnet-base-v2"`` is created on each
    call and ``encode`` is invoked once per chunk on its ``"sentence_chunk"`` text.

    Returns:
        None; the dicts are mutated.
    """
    encoder = SentenceTransformer(model_name_or_path="all-mpnet-base-v2")
    for chunk in final_chunked_data:
        chunk_text = chunk["sentence_chunk"]
        chunk["embedding"] = encoder.encode(chunk_text)

Converting them into a list


In [None]:
def list_converter(final_chunked_data):
    """Split chunk records into the three parallel lists passed on to ``db``.

    Args:
        final_chunked_data: sequence of dicts with ``"sentence_chunk"`` and an
            ``"embedding"`` object that provides ``.tolist()``.

    Returns:
        tuple: ``(documents, embeddings, ids)`` where ``ids`` are
        ``"id0"``, ``"id1"``, ... in input order.
    """
    documents, vectors, ids = [], [], []
    for position, chunk in enumerate(final_chunked_data):
        documents.append(chunk["sentence_chunk"])
        vectors.append(chunk["embedding"].tolist())
        ids.append(f"id{position}")
    return documents, vectors, ids

Adding them into a DB

In [None]:
def db(documents,embeddings,id,name):
    """Store documents and their embeddings in a Chroma collection.

    The collection called ``name`` is reused when the client already has it
    and created otherwise; the records are then added to it.

    Args:
        documents: list of document strings.
        embeddings: list of embedding vectors, parallel to ``documents``.
        id: list of record ids, parallel to ``documents``.
        name: collection name.

    Returns:
        The Chroma collection the records were added to.
    """
    import chromadb

    chroma_client = chromadb.Client()

    existing_collections = chroma_client.list_collections()
    print(existing_collections)
    # Depending on the chromadb release, list_collections() yields Collection
    # objects or bare name strings; reading ``.name`` unconditionally raises
    # AttributeError on the latter, so accept both shapes.
    existing_names = {getattr(col, "name", col) for col in existing_collections}
    if name in existing_names:
        collection = chroma_client.get_collection(name=name)
        print(f"Using existing collection: {name}")
    else:
        collection = chroma_client.create_collection(name=name)
        print(f"Created new collection: {name}")
    collection.add(documents=documents,embeddings=embeddings,ids=id)
    return collection

Similarity search

In [None]:
# Loaded query-embedding models, keyed by model name, so that repeated
# searches (one per chat turn) do not reload the model from scratch.
_QUERY_EMBEDDING_MODELS = {}


def search_result(query,collection,n_result):
    """Return the ``n_result`` entries of ``collection`` closest to ``query``.

    The query is embedded with the ``"all-mpnet-base-v2"`` SentenceTransformer
    and handed to ``collection.query``.

    Args:
        query: the search text.
        collection: object exposing ``query(query_embeddings=..., n_results=...)``.
        n_result: number of results to request.

    Returns:
        Whatever ``collection.query`` returns.
    """
    model_name = "all-mpnet-base-v2"
    embedding_model = _QUERY_EMBEDDING_MODELS.get(model_name)
    if embedding_model is None:
        # First use: load once and keep for later calls.
        embedding_model = SentenceTransformer(model_name_or_path=model_name)
        _QUERY_EMBEDDING_MODELS[model_name] = embedding_model

    query_embeddings = embedding_model.encode(query).tolist()
    results = collection.query(
        query_embeddings=query_embeddings,
        n_results=n_result
    )
    return results

Query Generator


In [None]:
def query_engine(current_query,old_convo=None,context_summary=None):
    """Ask the llama3.1 Ollama model to rewrite a user prompt as a retrieval query.

    The instruction sent to the model depends on which optional inputs are
    given: none, only past conversations, only a summary, or both.

    Args:
        current_query: the user's current prompt.
        old_convo: optional iterable of ``(prompt, response)`` pairs.
        context_summary: optional summary of the conversation so far.

    Returns:
        The model's response.
    """
    from langchain_ollama import OllamaLLM
    from langchain import PromptTemplate

    llm = OllamaLLM(model="llama3.1",temperature=0.2)

    has_convo = old_convo is not None
    has_summary = context_summary is not None

    # Collect only the values the selected template actually references, so
    # the prompt never receives unused variables.
    values = {"current_query": current_query}
    if has_convo:
        values["old_convo_str"] = " ".join(
            f"Human: {prompt} | AI: {response}" for prompt, response in old_convo
        )
    if has_summary:
        values["context_summary"] = context_summary

    if has_convo and has_summary:
        # Only change to this text: the missing space before {current_query}.
        template ="create a query to a RAG model where the context of conversation so far is : {context_summary} and the past three promps along wiht response is {old_convo_str} and the current prompt is {current_query}. Make sure query only consists of key terms which the user wants to know about in the current prompt which may or may not be related to the previous conversations"
    elif has_convo:
        template = "create a query to a RAG model where the past three prompts along with responses are {old_convo_str} and the current prompt is {current_query}. Make sure the query only consists of key terms which the user wants to know about in the current prompt which may or may not be related to the previous conversations."
    elif has_summary:
        # Summary without past conversations used to leave `template` unbound.
        template = "create a query to a RAG model where the context of conversation so far is : {context_summary} and the current prompt is {current_query}. Make sure the query only consists of key terms which the user wants to know about in the current prompt which may or may not be related to the previous conversations."
    else:
        template = "create a query to a RAG model where the current prompt is {current_query}. Make sure the query only consists of key terms which the user wants to know about in the current prompt."

    prompt = PromptTemplate(
        input_variables=list(values),
        template=template
    )
    # invoke() matches how summarizer_fn calls the model; calling the LLM
    # object directly is the older, deprecated spelling.
    response = llm.invoke(prompt.format(**values))
    return response

ChatSummaryBufferChain

In [None]:
from collections import deque

class ChatSummaryBufferChainTest:
    """Chat memory made of a short buffer of recent turns plus a running summary.

    State is restored from ``read_csv_file()`` on construction and saved with
    ``write_csv_file()`` after every ``summarize_conversations()`` call.
    Persisted layout: the first row holds the summary in its first cell; each
    following row is one ``[prompt, response]`` turn.

    Attributes:
        max_memory: capacity of the recent-turn buffer.
        summary_fn: callable ``(turn)`` / ``(turn, old_summary)`` returning the
            new summary, or ``None`` to disable summarising.
        memory: ``deque`` of ``(prompt, response)`` tuples, ``maxlen=max_memory``.
        summaries: the running summary string (``""`` when there is none).
    """

    def __init__(self, max_memory=2,summary_fn = None):
        memory = read_csv_file()
        print(memory)
        self.max_memory = max_memory
        self.summary_fn = summary_fn
        # Always create the buffer: it used to be set only when the file had
        # at least two rows, so a fresh or summary-only file left
        # ``self.memory`` undefined.
        self.memory = deque(maxlen=max_memory)
        for row in memory[1:]:
            # Rows that are not a prompt/response pair (e.g. blank lines) are skipped.
            if len(row) == 2:
                self.memory.append(tuple(row))
        # Guard against both an empty file and an empty first row.
        first_row = memory[0] if len(memory) > 0 else []
        self.summaries = first_row[0] if len(first_row) > 0 else ""

    def add_conversation(self, human_message, ai_message):
        """Append one turn; when that fills the buffer, fold the oldest turn into the summary."""
        self.memory.append((human_message, ai_message))
        print(self.memory[-1])
        # A full deque silently drops its oldest entry on the next append, so
        # make room now while that entry can still be summarised.
        if len(self.memory) == self.memory.maxlen:
            print("length",len(self.memory))
            self.summarize_conversations()

    def summarize_conversations(self):
        """Fold the oldest turn into the summary if the buffer is full, then persist.

        Calling this when the buffer is not full only rewrites the file, so a
        second call right after ``add_conversation`` does not drain the buffer.
        """
        buffer_full = len(self.memory) > 0 and len(self.memory) == self.memory.maxlen
        if self.summary_fn and buffer_full:
            oldest_convo = self.memory[0]
            if self.summaries == "":
                summary = self.summary_fn(oldest_convo)
            else:
                summary = self.summary_fn(oldest_convo,self.summaries)
            # Drop the turn only once it has been summarised, so a failing
            # summary_fn does not lose it.
            self.memory.popleft()
            self.summaries = summary
        old_convo_memory = [(prompt, response) for prompt, response in self.memory]
        print("old_convo_memory",old_convo_memory)
        # One CSV row per turn so __init__ can rebuild the (prompt, response) pairs.
        write_csv_file([[self.summaries]] + [list(convo) for convo in old_convo_memory])


Summary Generator

In [None]:
def summarizer_fn(old_convo,summary_old=""):
    """Merge one conversation into the existing summary with the llama3.1 Ollama model.

    Args:
        old_convo: the conversation to fold in; it is interpolated into the
            prompt as-is.
        summary_old: the summary so far (empty string when there is none).

    Returns:
        The model's response, which is also printed.
    """
    from langchain import PromptTemplate
    from langchain.schema import AIMessage, HumanMessage, SystemMessage
    from langchain_ollama import OllamaLLM

    template = ''' Your task is to combine the old summary with the latest conversation. 
        Focus on merging the key points from both the old summary and the new conversation into a single, concise summary that contains all relevant information.
        Ensure that the final summary is brief but informative, covering all the essential details from both sources. Respond with the version which is a concice summary of the below two
        
        Old Summary: {old_summary}

        New Conversation: {new_convo}

        
        '''
    merge_prompt = PromptTemplate(
        template=template,
        input_variables=['old_summary','new_convo'],
    )
    human_text = merge_prompt.format(old_summary=summary_old, new_convo=old_convo)
    system_text = """
You are an expert summarizer. Respond with only the summary text
"""
    # System instruction first, then the filled-in merge request.
    chat = [SystemMessage(content=system_text), HumanMessage(human_text)]

    model = OllamaLLM(model="llama3.1")
    summary = model.invoke(chat)
    print("Summary: " , summary , "\n\n")
    return summary

Final Combo

In [None]:
def ask_with_memory(llm,collection):
    """Run an interactive question loop over ``collection`` with chat memory.

    Each question is answered by ``llm`` from the three closest chunks in
    ``collection``, the running summary, and the buffered recent turns. The
    loop ends when the user types ``quit``.

    Args:
        llm: language model; its ``invoke`` method is used when present,
            otherwise the object itself is called with the prompt text.
        collection: vector collection passed to ``search_result``.
    """
    from langchain import PromptTemplate

    buffer = ChatSummaryBufferChainTest(summary_fn=summarizer_fn)

    # The prompt never changes between questions, so build it once.
    template = ''' Answer the current question. You may use the information given in the context if necessary. You may also refer to the summary and the last two conversations that we has (old conversation)

        Current Question: {query}

        Context: {context_str}
        
        Summary : {summary}

        old Conversation:{old_convo_str}

        Respond with only the answer, do not add anything else
        '''
    prompt = PromptTemplate(
        # These names now match the placeholders used in the template above.
        input_variables=['query','summary','old_convo_str','context_str'],
        template=template
    )
    # Prefer invoke() (calling the LLM object directly is deprecated) while
    # still accepting a plain callable.
    generate = getattr(llm, "invoke", llm)

    query = input("Question:")
    while query != "quit":
        results = search_result(query,collection,3)
        # Separate retrieved chunks with a blank line so the last word of one
        # is not fused with the first word of the next.
        context_str = "\n\n".join(
            str(document) for group in results['documents'] for document in group
        )
        old_convo = buffer.memory
        print("Old convo: ", old_convo)
        old_convo_str = " ".join([f"Human: {prompt_text} | AI: {response_text}" for prompt_text, response_text in old_convo])
        # query = query_engine(query,old_convo,buffer.summaries)  # optional query rewriting, disabled
        response = generate(prompt.format(query=query,summary=buffer.summaries,old_convo_str=old_convo_str,context_str = context_str))
        buffer.add_conversation(query,response)
        if  old_convo_str != "":
            buffer.summarize_conversations()
        print("Context: ", context_str + "\n\n")
        print("Old convo: ", old_convo_str + "\n\n")

        print("Response: ", response)
        query=input("Question")

In [None]:
# End-to-end indexing run: load -> clean -> split into sentences -> chunk ->
# join -> embed -> store in Chroma. Later cells use `collection`.
# NOTE(review): absolute, machine-specific path — consider a configurable
# data directory. load_document() returns None for unsupported extensions,
# which clean() would then fail to iterate.
data = load_document("/Users/sarveshwar/Documents/Semester-3/Projects/LLM/moon.txt")
final_data = clean(data)
# sentencizer() and chunker() mutate the dicts in `final_data` in place.
sentencizer(final_data)
# 12 = number of sentences per chunk.
chunker(final_data,12)
final_chunked_data = join_sentences(final_data)
# embedding() adds an "embedding" entry to every chunk dict in place.
embedding(final_chunked_data)
# NOTE(review): `id` shadows the built-in of the same name at notebook level.
documents, embeddings,id = list_converter(final_chunked_data)
collection = db(documents,embeddings,id,"test2")
print("embeddings and collection creation done")

In [None]:
# Start the interactive chat loop against the collection built in the
# indexing cell; typing "quit" at the prompt ends it.
from langchain_ollama import OllamaLLM
llm = OllamaLLM(model="llama3.1",temperature=0.2)
ask_with_memory(llm,collection)

In [None]:
# Scratch cell: send retrieved context to the model through a one-off prompt.
from langchain_ollama import OllamaLLM

llm = OllamaLLM(model="llama3.1",temperature=0.2)

from langchain import PromptTemplate
# NOTE(review): the instruction part of this template is just " ." — it looks
# like the actual prompt text was removed; confirm what was intended.
template = ''' .

Context: {context}'''
# NOTE(review): 'person' is declared and passed to format() below, but the
# template has no {person} placeholder.
prompt = PromptTemplate(
    input_variables=['person','context'],
    template=template
)
# NOTE(review): `context` is not assigned by any top-level cell visible in this
# notebook (it only exists as a local inside ask_with_memory), so this line
# presumably raises NameError on a fresh kernel — define it first, e.g. from
# search_result(...). TODO confirm.
response = llm(prompt.format(person = "JOSEPH B. MARTIN",context = context["documents"]))
print(response)

In [None]:
# from langchain_ollama import OllamaLLM
# llm = OllamaLLM(model="llama3.1",temperature=0.2)
# print(llm.invoke("JOSEPH B. MARTIN"))