In [29]:
# %pip install langchain langchain-community langchain-openai neo4j
# %pip install openllm
# %pip install -qU langchain-groq
# %pip install -U langchain-huggingface
# %pip install faiss-cpu



In [2]:
book1path= "books\Global Warming.pdf"

from PyPDF2 import PdfReader
reader = PdfReader(book1path)
page = reader.pages[0]

def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, "rb") as file:
        reader = PdfReader(file)
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            text += page.extract_text()
    return text


text=extract_text_from_pdf(book1path)


In [3]:
import langchain

from dotenv import load_dotenv
import os
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings


In [4]:
path="uploads\globalwarming.txt"
with open(path, "r", encoding="UTF-8") as file:
    loaded_text = file.read()

In [5]:
text_split= CharacterTextSplitter(
    separator='\n',
    chunk_size=2000,
    chunk_overlap=200,
    length_function=len,
)
splitted_text= text_split.split_text(loaded_text)

In [6]:
len(splitted_text)

473

In [8]:
embeddings= HuggingFaceEmbeddings()

  warn_deprecated(


In [30]:
embeddings= HuggingFaceEmbeddings()
vectorstore= FAISS.from_texts(splitted_text, embeddings)

In [31]:
vectorstore.save_local("vector_index")

In [9]:
db = FAISS.load_local("vector_index", embeddings,allow_dangerous_deserialization=True)

In [36]:
db.similarity_search("minimum environmental impact")

[Document(page_content='its impact on the construction of this buil ding whose roof we call \x93“emission cuts". \nThe calculation process at a corporate level faces the following obstacles:  \nx Great difficulty reaching scope 3. Collectin g the supplier\x92’s indirect footprint is an \nimpossible mission for many corporations. In  a d d i t i o n  t o  t h e  p r o c e d u r a l  d i f f i c u l t y  \ninvolved in "forcing" providers to do the calculation, it is based on a totally non-\nstandard assembly process in which each pr ovider chooses the method to calculate the \nfootprint of their products. This creates great distortion and the results lack credibility.  \nx Voluntary choice of the calculation method, and the scope and the emission factors as \nlong as they come from \'reliable source s\'. This leaves the spreadsheet open.  \nx Inconsistency with the footprint of products  or services when these are calculated.  \nx Legislation compliance (CO 2 e m i s s i o n  r i g h t s

In [10]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
chat = ChatGroq(
    temperature=0,
    model="llama3-70b-8192",
    api_key="" # Optional if not set as an environment variable
)

In [11]:
from langchain.chains import RetrievalQA


reteiver= db.as_retriever(search_type="similarity",search_kwargs={"k":4})
rqa=RetrievalQA.from_chain_type(llm=chat,
                                chain_type="stuff",
                                retriever=reteiver,
                                return_source_documents=True)

In [12]:
result = rqa.invoke("Carbonfeel")

In [13]:
print(result['result'])

Carbonfeel is a collaborative initiative that provides a universal indicator, the carbon footprint (CF), to help organizations and individuals measure and reduce their environmental impact. It aims to facilitate the expansion of a responsible economy by promoting a credible label supported by various certifiers, consultants, companies, associations, universities, and others. Carbonfeel provides a procedural solution for calculating, verifying, certifying, and labeling the carbon footprint of organizations, products, and services. It seeks to make environmental accounting accessible to all businesses, regardless of size, and to promote transparency, comparability, and continuous improvement.


Gradio interface

In [14]:
import re

def split_into_sections_and_chapters(text):
    # Define regular expressions to identify sections and chapters
    section_pattern = re.compile(r'Section \d+\s+.+')
    chapter_pattern = re.compile(r'Chapter \d+\s+.+')

    sections = section_pattern.split(text)
    sections_titles = section_pattern.findall(text)

    sections_with_chapters = []

    for i, section in enumerate(sections):
        if i == 0:
            continue  # Skip the preface or any text before the first section

        chapters = chapter_pattern.split(section)
        chapter_titles = chapter_pattern.findall(section)
        
        chapters_with_content = []
        for j in range(len(chapter_titles)):
            if j + 1 < len(chapters):
                chapters_with_content.append((chapter_titles[j], chapters[j + 1]))
            else:
                chapters_with_content.append((chapter_titles[j], chapters[j]))

        sections_with_chapters.append((sections_titles[i - 1], chapters_with_content))
    
    return sections_with_chapters


file_path = 'uploads\globalwarming.txt'
text = read_text_file(file_path)

# Split the text into sections and chapters
sections_with_chapters = split_into_sections_and_chapters(text)
 


In [15]:
import gradio as gr
import PyPDF2
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
bi_encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')



# Function to save the uploaded PDF file
def save_pdf(file):
    with open("uploaded_file.pdf", "wb") as f:
        f.write(file.read())
    return "PDF saved successfully!"


# Function to display the PDF file
def display_pdf(file):
    return gr.File.update(value=file.name)


def preprocess_pdf(file):
    text = ""
    with open(file, "rb") as file:
        reader = PdfReader(file)
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            text += page.extract_text()
        sections_with_chapters = split_into_sections_and_chapters(text)  
        
        



        # Encode sections and chapters
        encoded_sections = []
        for section_title, chapters in sections_with_chapters:
            for chapter_title, chapter_content in chapters:
                encoded_content = bi_encoder.encode(chapter_content, convert_to_tensor=True)
                encoded_sections.append((section_title, chapter_title, chapter_content, encoded_content))
    
    with open("preprocessed_text.txt", "w") as f:
        f.write(text)
    return "PDF preprocessed successfully!"


# Function to handle chatbot queries using bi-encoder retrieval
def chatbot_bi_encoder(query):
    with open("preprocessed_text.txt", "r") as f:
        context = f.read()
    context_embeddings = bi_encoder.encode(context, convert_to_tensor=True)
    query_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(query_embedding, context_embeddings)[0]
    best_idx = scores.argmax()
    return context[best_idx:best_idx+200]


def chatbot_qa(query):
    with open("preprocessed_text.txt", "r") as f:
        context = f.read()
    result = rqa.invoke(query)
    return result['answer']





# Encode sections and chapters
encoded_sections = []
for section_title, chapters in sections_with_chapters:
    for chapter_title, chapter_content in chapters:
        encoded_content = bi_encoder.encode(chapter_content, convert_to_tensor=True)
        encoded_sections.append((section_title, chapter_title, chapter_content, encoded_content))



def semantic_search(query, encoded_sections, model, k=10):
    # Encode the query
    query_embedding = model.encode(query, convert_to_tensor=True)
    
    # Collect all embeddings and their associated titles and contents
    all_embeddings = [item[3] for item in encoded_sections]
    all_titles = [(item[0], item[1]) for item in encoded_sections]
    all_contents = [item[2] for item in encoded_sections]
    
    # Perform semantic search
    hits = util.semantic_search(query_embedding, all_embeddings, top_k=k)[0]
    
    # Collect results
    results = [((all_titles[hit['corpus_id']][0], all_titles[hit['corpus_id']][1], all_contents[hit['corpus_id']]), hit['score']) for hit in hits]
    return results

# Perform a semantic search
query = "Carbonfeel"
results = semantic_search(query, encoded_sections, model, k=10)

for (section_title, chapter_title, chapter_content), score in results:
    print(f"Section: {section_title}, Chapter: {chapter_title}, Score: {score}\nContent: {chapter_content[:500]}...\n")  # Print first 500 characters for brevity


NameError: name 'sections_with_chapters' is not defined

In [16]:
def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text


In [17]:
import uuid


class TreeNode:
    def __init__(self, title, content=None, level=0):
        self.id = str(uuid.uuid4())
        self.title = title
        self.content = content if content else []
        self.level = level
        self.children = []
        

    def add_child(self, child_node):
        self.children.append(child_node)

    def __repr__(self):
        return f"TreeNode(id={self.id}, title={self.title}, level={self.level}, children={len(self.children)})"

def parse_book_content(book_content):
    lines = book_content.split('\n')
    root = TreeNode("Textbook")
    current_section = None
    current_chapter = None

    for line in lines:
        line = line.strip()
        if line.startswith("Section") :
            current_section = TreeNode(line, level=1)
            root.add_child(current_section)
            current_chapter = None
            
        elif line.startswith("Chapter") :
            current_chapter = TreeNode(line, level=2)
            if current_section:
                current_section.add_child(current_chapter)
        elif current_chapter and line :
            current_chapter.content.append(line)
        
          
    
    return root

def print_tree(node, indent=0):
    tree_str = "  " * indent + f"{node.title} (ID: {node.id})\n"
    for child in node.children:
        tree_str += print_tree(child, indent + 1)
        
        
    return tree_str





text = read_text_file("preprocessed_text.txt")

treenode = parse_book_content(text)

output=print_tree(treenode)





In [18]:
import re

def split_into_sections_and_chapters(text):
    # Define regular expressions to identify sections and chapters
    section_pattern = re.compile(r'Section \d+\s+.+')
    chapter_pattern = re.compile(r'Chapter \d+\s+.+')

    sections = section_pattern.split(text)
    sections_titles = section_pattern.findall(text)

    sections_with_chapters = []

    for i, section in enumerate(sections):
        if i == 0:
            continue  # Skip the preface or any text before the first section

        chapters = chapter_pattern.split(section)
        chapter_titles = chapter_pattern.findall(section)
        
        chapters_with_content = []
        for j in range(len(chapter_titles)):
            if j + 1 < len(chapters):
                chapters_with_content.append((chapter_titles[j], chapters[j + 1]))
            else:
                chapters_with_content.append((chapter_titles[j], chapters[j]))

        sections_with_chapters.append((sections_titles[i - 1], chapters_with_content))
    
    return sections_with_chapters



 


In [19]:
# Read the text file
file_path = 'uploads\globalwarming.txt'
text = read_text_file(file_path)

# Split the text into sections and chapters
sections_with_chapters = split_into_sections_and_chapters(text)

In [20]:
def semantic_search(query, encoded_sections, model, k=1):
    # Encode the query
    query_embedding = model.encode(query, convert_to_tensor=True)
    
    # Collect all embeddings and their associated titles and contents
    all_embeddings = [item[3] for item in encoded_sections]
    all_titles = [(item[0], item[1]) for item in encoded_sections]
    all_contents = [item[2] for item in encoded_sections]
    
    # Perform semantic search
    hits = util.semantic_search(query_embedding, all_embeddings, top_k=k)[0]
    
    # Collect results
    results = [((all_titles[hit['corpus_id']][0], all_titles[hit['corpus_id']][1], all_contents[hit['corpus_id']]), hit['score']) for hit in hits]
    return results

In [26]:
import gradio as gr
import PyPDF2
from sentence_transformers import SentenceTransformer, util

# Load models for the chatbot
bi_encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Function to save the uploaded PDF file
def save_pdf(file):
    with open("uploaded_file.pdf", "wb") as f:
        f.write(file.read())
    return "PDF saved successfully!"

# Function to display the PDF file
def display_pdf(file):
    return gr.File.update(value=file.name)

# Function to preprocess the PDF file
def preprocess_pdf(file):
    with open(file.name, 'rb') as f:  # Open in binary mode
        reader = PyPDF2.PdfReader(f)
        text = ""
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            text += page.extract_text()
    with open("preprocessed_text.txt", "w", encoding='utf-8') as f:
        f.write(text)
    return "PDF preprocessed successfully!"

# Function to handle chatbot queries using QA pipeline
def chatbot_qa(query):
    result = rqa.invoke(query)
    return result['result']

def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

# Function to handle chatbot queries using bi-encoder retrieval
def chatbot_bi_encoder(query,k_top):
    text = read_text_file("preprocessed_text.txt")
    sections_with_chapters = split_into_sections_and_chapters(text)    
    
    # Encode sections and chapters
    encoded_sections = []
    for section_title, chapters in sections_with_chapters:
        for chapter_title, chapter_content in chapters:
            encoded_content = bi_encoder.encode(chapter_content, convert_to_tensor=True)
            encoded_sections.append((section_title, chapter_title, chapter_content, encoded_content))
    
    results = semantic_search(query, encoded_sections, bi_encoder, k_top)  
    
    return results


def display_tree():
    text = read_text_file("preprocessed_text.txt")
    tree = parse_book_content(text)
    return print_tree(tree)

# Switchable chatbot function
def chatbot(query, mode,slider):
    if mode == "QA_RAG":
        return chatbot_qa(query)
    elif mode == "Bi-Encoder":
        return chatbot_bi_encoder(query,slider)
    elif mode == "Heirarchial Structure":
        text = read_text_file("preprocessed_text.txt")

        treenode = parse_book_content(text)

        output=print_tree(treenode)
        return output

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("""
    <h2 style="color: #FF5033; font-family: Arial, sans-serif; text-align: center;">DOCSENSE</h2>
    """)
    
    # PDF Upload and Display
    pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"])
    save_button = gr.Button("Save PDF")
    
    save_button.click(save_pdf, inputs=pdf_file, outputs=None)
    
    # Preprocess PDF
    preprocess_button = gr.Button("Preprocess PDF")
    preprocess_button.click(preprocess_pdf, inputs=pdf_file, outputs=None)
    
    # Chatbot
    chatbot_input = gr.Textbox(label="Enter your query")
    mode_selector = gr.Radio(["QA_RAG", "Bi-Encoder","Heirarchial Structure"], label="Select Chatbot Mode", value="QA")
    slider_input = gr.Slider(label="K",minimum=1, maximum=3,step=1)
    chatbot_button = gr.Button("Get Response")
    
    chatbot_output = gr.Textbox(label="Response")
    
    
    chatbot_button.click(chatbot, inputs=[chatbot_input, mode_selector,slider_input], outputs=chatbot_output)

# Launch the interface
demo.launch(share=True)


Running on local URL:  http://127.0.0.1:7868

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




Traceback (most recent call last):
  File "f:\projects\RAG_HIndex\.venv\lib\site-packages\gradio\queueing.py", line 536, in process_events
    response = await route_utils.call_process_api(
  File "f:\projects\RAG_HIndex\.venv\lib\site-packages\gradio\route_utils.py", line 276, in call_process_api
    output = await app.get_blocks().process_api(
  File "f:\projects\RAG_HIndex\.venv\lib\site-packages\gradio\blocks.py", line 1897, in process_api
    result = await self.call_function(
  File "f:\projects\RAG_HIndex\.venv\lib\site-packages\gradio\blocks.py", line 1483, in call_function
    prediction = await anyio.to_thread.run_sync(
  File "f:\projects\RAG_HIndex\.venv\lib\site-packages\anyio\to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
  File "f:\projects\RAG_HIndex\.venv\lib\site-packages\anyio\_backends\_asyncio.py", line 2177, in run_sync_in_worker_thread
    return await future
  File "f:\projects\RAG_HIndex\.venv\lib\site-packag