# 1. Importing Required Libraries

Files (os, requests),
Text processing (langchain, PyMuPDFLoader, TextLoader),
Machine learning (ChatOpenAI, HuggingFaceEmbeddings),
Storing and searching embeddings (Chroma, chromadb),
Data processing (pandas).

In [25]:
import os

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader

In [4]:
from langchain_openai import ChatOpenAI

In [5]:
from langchain_huggingface import HuggingFaceEmbeddings

In [6]:
from langchain.prompts import PromptTemplate

In [7]:
from langchain.chains import ConversationalRetrievalChain

In [8]:
from langchain_community.vectorstores import Chroma

In [9]:
import chromadb

In [10]:
import pandas as pd
import uuid

In [11]:
from langchain.agents.agent_types import AgentType

In [12]:
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent

In [13]:
import requests

In [14]:
from utils.file_conversions import ppt_to_pdf, docx_to_pdf

In [26]:
# Set the OPENAI_API_KEY environment variable for this process.
# NOTE(review): "key" looks like a placeholder value — confirm how the real
# credential is supplied before running the cells below.
os.environ["OPENAI_API_KEY"] = "key"

# 2. Defining the Class: Final
This class contains methods for handling document uploads, text processing, embedding creation, and question-answering.

Class Initialization (__init__ method)
Loads HuggingFaceEmbeddings to convert text into numerical vectors.
Defines paths for storing temporary files and embeddings.
Initializes placeholders for different components (retriever, QA model, etc.).

Key Variables:
self.emb: Path where numeric vectors are stored through embedding.

self.temp_pdf_path: Path where temporary PDFs are saved.

self.agent_avail: Tracks if the document is CSV/Excel for data querying.

In [16]:
class Final:
    """State holder for the document question-answering pipeline.

    Keeps the embedding model, the storage locations and the per-document
    objects (retriever, QA chain, dataframe agent) that the methods of this
    class fill in and use.
    """

    def __init__(self, device="cuda", emb_path="/home/ubuntu/Embeddings",
                 temp_pdf_path="/home/ubuntu/temp"):
        """Create the embedding model and reset all per-document state.

        Args:
            device: Device string forwarded to the embedding model through
                ``model_kwargs``. Defaults to ``"cuda"``; pass ``"cpu"`` on a
                machine without a GPU instead of editing this file.
            emb_path: Root directory for stored embeddings and CSV exports.
                The default is the fixed server location; override it for a
                local setup.
            temp_pdf_path: Directory used for temporary files.
        """
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-mpnet-base-v2",
            model_kwargs={"device": device},
            encode_kwargs={'normalize_embeddings': True})
        # Retriever over the vector store; assigned by embed_data / load_embedings.
        self.retriever = None
        # Root directory for embeddings and exported CSV files.
        self.emb = emb_path
        # Directory for temporary files.
        self.temp_pdf_path = temp_pdf_path
        # Conversational retrieval chain; assigned by qas for text documents.
        self.qa = None
        # NOTE(review): `em` and `source` are only initialised here in the
        # visible code — confirm whether they are still needed.
        self.em = None
        self.source = None
        # chromadb client; assigned by embed_data / load_embedings.
        self.persistent_client = None
        # Dataframe agent; assigned by qas when agent_avail is True.
        self.agent = None
        # True once a CSV/XLSX file has been loaded by file_loader.
        self.agent_avail = False
        # DataFrame read from the most recent CSV file.
        self.df = None
        # file_loader sets this to False for doc/docx/ppt/pptx/txt inputs.
        self.page = True

Uses HuggingFaceEmbeddings with the model "sentence-transformers/all-mpnet-base-v2".
Embeddings are normalized (encode_kwargs={'normalize_embeddings': True}).

self.emb = "/home/ubuntu/Embeddings": a fixed path on the server.
self.temp_pdf_path = "/home/ubuntu/temp": temporary storage for PDFs.

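self.retriever: stores the retriever object used for searching the embeddings.

self.qa: the question-answering chain used for answering questions based on the embeddings.

self.agent: the pandas DataFrame agent (an LLM-driven agent) used to answer questions about CSV/Excel data.

self.agent_avail = False: the agent is inactive until a CSV/Excel file is loaded.

self.df: stores the DataFrame read from an uploaded CSV file.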

# 3. check_file_path(self, name)
Simply extracts and returns the file extension without the leading dot (e.g., pdf, txt, csv).

In [17]:
 def check_file_path(self, name):
        """Return the extension of ``name``: the text after its last ``.``.

        The dot itself is not included and the case is left untouched.

        Args:
            name: File name (or path) to inspect.

        Returns:
            The extension, or ``''`` when ``name`` contains no ``.`` or ends
            with one.
        """
        _stem, dot, extension = name.rpartition('.')
        # A plain split('.')[-1] returns the whole name when there is no dot,
        # so "README" (or a file literally called "pdf") would be reported as
        # its own extension. Report "no extension" instead.
        return extension if dot else ''

# 4. file_loader(self, path, name, hash=None, domain=None)
file_loader is designed to load different file types, extract their content, and process it.
Working:
Checks the file extension (pdf, docx, pptx, txt, csv, xlsx).
Uses different loaders depending on the file type:

PDF: Extracts text and images using PyMuPDFLoader.

DOCX/DOC: Converts to PDF first, then extracts text.

PPTX/PPT: Converts slides to PDF, then extracts text.

TXT: Downloads text from a URL and saves it temporarily.

CSV/XLSX: Loads into a Pandas DataFrame for structured querying.

Stores and organizes files in a directory.

If the file is a CSV/XLSX, it sets self.agent_avail = True, allowing data querying instead of text-based retrieval.

In [18]:

    def file_loader(self, path, name, hash=None, domain=None):
        """Load the file at ``path`` according to the extension found in ``name``.

        Document formats are returned as loader output. Tabular formats are
        exported as CSV under ``self.emb`` and flagged through
        ``self.agent_avail`` so they can be queried with the dataframe agent.

        Args:
            path: Location of the file. For ``txt`` it is fetched with
                ``requests.get``; for every other format it is passed straight
                to the loader, converter or pandas reader.
            name: File name whose extension selects the branch
                (case-insensitive).
            hash: Identifier used to name the exported CSV file (``csv``) or
                the directory of per-sheet CSV files (``xlsx``).
            domain: Sub-directory of ``self.emb`` that receives the export.

        Returns:
            The result of ``loader.load()`` for pdf/doc/docx/ppt/pptx/txt,
            ``None`` for csv/xlsx, and ``False`` for any other extension.

        Raises:
            requests.HTTPError: If the ``txt`` download returns an error
                status.
        """
        ext = self.check_file_path(name).lower()  # case-insensitive comparison

        if ext == 'pdf':
            loader = PyMuPDFLoader(path, extract_images=True)
            self.agent_avail = False
            return loader.load()

        if ext in ('docx', 'doc'):
            # Word files are converted to PDF first and then read like a PDF.
            converted_pdf = docx_to_pdf(path)
            loader = PyMuPDFLoader(converted_pdf, extract_images=True)
            self.agent_avail = False
            self.page = False
            return loader.load()

        if ext in ('pptx', 'ppt'):
            # Slides are converted to PDF first and then read like a PDF.
            converted_pdf = ppt_to_pdf(path)
            # NOTE(review): unlike the pdf/docx branches this loader is built
            # without extract_images=True — confirm that is intended.
            loader = PyMuPDFLoader(converted_pdf)
            self.agent_avail = False
            self.page = False
            return loader.load()

        if ext == 'txt':
            self.page = False
            # A timeout keeps a stalled server from blocking the caller
            # forever; raise_for_status stops an HTTP error page from being
            # stored and embedded as if it were the document.
            response = requests.get(path, timeout=30)
            response.raise_for_status()
            os.makedirs(self.temp_pdf_path, exist_ok=True)
            temp_txt = os.path.join(self.temp_pdf_path, "temp.txt")
            # Write and read back with the same explicit encoding so the
            # round trip does not depend on the system locale.
            with open(temp_txt, "w", encoding="utf-8") as file:
                file.write(response.text)
            loader = TextLoader(temp_txt, encoding="utf-8")
            self.agent_avail = False
            return loader.load()

        if ext == 'csv':
            # Read first: if the file cannot be parsed the error surfaces here
            # and a DataFrame left over from an earlier upload is never
            # written under the new hash.
            self.df = pd.read_csv(path)
            domain_dir = os.path.join(self.emb, f"{domain}")
            os.makedirs(domain_dir, exist_ok=True)
            self.df.to_csv(os.path.join(domain_dir, f"{hash}.csv"), index=False)
            self.agent_avail = True
            return None

        if ext == 'xlsx':
            # sheet_name=None yields a {sheet name: DataFrame} mapping, so each
            # sheet is read only once.
            all_sheets = pd.read_excel(path, sheet_name=None)
            sheet_dir = os.path.join(self.emb, f"{domain}", f"{hash}")
            os.makedirs(sheet_dir, exist_ok=True)
            for sheet_name, sheet_df in all_sheets.items():
                sheet_df.to_csv(os.path.join(sheet_dir, f"{sheet_name}.csv"), index=False)
            self.agent_avail = True
            return None

        # Unsupported extension.
        return False


# 5. text_splitter(self, texts)
Splits long texts into smaller chunks (3,000 characters with 1,000-character overlap).


Creates a RecursiveCharacterTextSplitter object. It splits long text into smaller parts for easier processing.

chunk_size=3000 → Each text chunk will be at most 3000 characters long.
chunk_overlap=1000 → Each chunk will overlap the next one by 1000 characters.

This preserves context when splitting text, preventing loss of meaning.



In [27]:
def text_splitter(self, texts):
        """Split ``texts`` into overlapping chunks and return the chunks.

        Chunks are at most 3000 characters long and consecutive chunks share
        a 1000-character overlap.
        """
        splitter_options = {"chunk_size": 3000, "chunk_overlap": 1000}
        splitter = RecursiveCharacterTextSplitter(**splitter_options)
        chunks = splitter.split_documents(texts)
        return chunks

# 6. embed_data(self, texts, domain, index)
Converts text into numerical vectors using HuggingFaceEmbeddings.
Stores these vectors in Chroma for retrieval.

Connects to the ChromaDB database. ChromaDB is a vector database used for storing and retrieving embeddings.

Creates a collection for storing embeddings.

Converts each text chunk into a vector and stores it.
Sets up a retriever to fetch similar text chunks when queried.

In [20]:
 def embed_data(self, texts, domain, index):
        """Embed every chunk in ``texts`` into a new collection and set the retriever.

        Args:
            texts: Iterable of chunks exposing ``page_content`` and ``metadata``.
            domain: Sub-directory of ``self.emb`` holding the persistent store.
            index: Name of the collection to create.

        Side effects:
            Assigns ``self.persistent_client`` and ``self.retriever`` (k=5).
        """
        store_path = self.emb + f"/{domain}/"
        self.persistent_client = chromadb.PersistentClient(store_path)

        # Create the collection with cosine distance, then fetch a handle to it.
        self.persistent_client.create_collection(name=index, metadata={"hnsw:space": "cosine"})
        collection = self.persistent_client.get_collection(index)

        for chunk in texts:
            # Round-trip through UTF-8 with 'replace' so characters that cannot
            # be encoded do not break the insert.
            safe_text = str(chunk.page_content).encode('utf-8', 'replace').decode('utf-8')
            vector = self.embeddings.embed_query(chunk.page_content)
            collection.upsert(
                documents=safe_text,
                embeddings=vector,
                metadatas=chunk.metadata,
                ids=[str(uuid.uuid1())],  # fresh id per chunk
            )

        vector_store = Chroma(
            client=self.persistent_client,
            collection_name=index,
            embedding_function=self.embeddings,
        )
        self.retriever = vector_store.as_retriever(search_kwargs={"k": 5})

 Connects to ChromaDB and creates/retrieves a collection.
 Loops through text chunks and converts them into embeddings.
 Stores embeddings in ChromaDB along with metadata.
 Sets up a retriever to find the top 5 most similar documents when searching.



# 7. load_embedings(self, domain, index)
Loads pre-stored embeddings from the database and prepares a retriever for searching.
This avoids re-processing the document every time.

In [21]:

    def load_embedings(self, domain, index):
        """Open the stored collection ``index`` for ``domain`` and set the retriever.

        Assigns ``self.persistent_client`` and ``self.retriever`` (k=5); nothing
        is returned.
        """
        client = chromadb.PersistentClient(self.emb + f"/{domain}/")
        self.persistent_client = client
        # The handle is not used further; presumably this call is kept so a
        # missing collection fails here — verify against chromadb behaviour.
        client.get_collection(index)
        vector_store = Chroma(
            client=client,
            collection_name=index,
            embedding_function=self.embeddings,
        )
        self.retriever = vector_store.as_retriever(search_kwargs={"k": 5})


# 8. qas(self, domain=None, hash=None)
This method initializes one of two things:

1. A structured data agent (for CSV/XLSX files):

   - Loads all stored sheets of the file into DataFrames.
   - Uses a pandas DataFrame agent to answer queries using ChatOpenAI.

2. A text-based question-answering system:

   - Uses the LLM together with the retriever to answer from the embedded document.
   - The prompt ensures responses are professional and stay within the document’s context.
   - If a user asks for PII (Personally Identifiable Information) like names, emails, phone numbers, etc., the prompt instructs the model to reply with @#$%^ instead, to protect privacy.

In [22]:
 def qas(self, domain=None, hash=None):
        """Build the answering component for the currently loaded document.

        When ``self.agent_avail`` is true, the CSV data exported by
        ``file_loader`` is read back and ``self.agent`` becomes a pandas
        dataframe agent. Otherwise ``self.qa`` becomes a conversational
        retrieval chain over ``self.retriever`` using the prompt below.

        Args:
            domain: Sub-directory of ``self.emb`` holding the exported CSV data
                (agent mode only).
            hash: Identifier of the export: either the directory
                ``{domain}/{hash}/`` (one CSV per Excel sheet) or the single
                file ``{domain}/{hash}.csv`` (agent mode only).

        Raises:
            FileNotFoundError: In agent mode, if neither export exists.
        """
        print(self.agent_avail)
        if self.agent_avail:
            # Read from the location file_loader writes to (self.emb) rather
            # than a path relative to the current working directory.
            domain_dir = os.path.join(self.emb, f"{domain}")
            sheet_dir = os.path.join(domain_dir, f"{hash}")
            if os.path.isdir(sheet_dir):
                # Excel upload: one CSV per sheet. Sorted so the order of the
                # frames handed to the agent is deterministic.
                csv_paths = [os.path.join(sheet_dir, entry)
                             for entry in sorted(os.listdir(sheet_dir))]
            else:
                # CSV upload: file_loader stores it as a single {hash}.csv.
                csv_paths = [os.path.join(domain_dir, f"{hash}.csv")]
            temp_list = [pd.read_csv(csv_path) for csv_path in csv_paths]
            self.agent = create_pandas_dataframe_agent(ChatOpenAI(temperature=0, ), temp_list, verbose=True,
                                                       agent_type=AgentType.OPENAI_FUNCTIONS, allow_dangerous_code=True)
            print("code in gent")
        else:
            prompt_template = """You are a helpful and professional AI assistant called Veda. 
            You have access to a single document that has been provided to you. 
            If a user only greets you only then answer Hello! I am veda How i can assist you. 
            If there is a question after the greeting do the remaining task.
            Your goal is to engage in a conversation with the user and answer their questions to the best of your ability using only the information contained within the given document. 
            If the user asks a question that cannot be answered based on the contents of the document, politely inform them that you do not have enough information to provide an answer.
            If a user asks for your opinion politely decline.
            If the question {question} asked by the user is not professional rephrase the question in a professional manner and then answer the rephrased question do not tell the user that the question has been rephrased.
            Also Do not tell the user to be professional
            If the rephrased question can only be answered by going out of context politely decline.
            You should keep the conversation professional and not stray from the topic.
            All the conversation should be witthin the provided {context}

            if the user asks for the name of the document. Use {Doc_name} as the name of the document.

            When a user requests any kind of PII (Personally Identifiable Information) data, such as names, addresses, phone numbers, email addresses, social security numbers, or any other information that could be used to identify a specific individual, just respond with @#$%^ without any modifications or additional information. 
            Do not attempt to generate, predict, or make up any PII data in your responses. 
            Strictly use the provided message @#$%^ whenever PII data is requested or discussed by the user.



            The document you have access to is the following:
            {context}
            You should aim to provide concise and relevant answers, extracting key details from the document as needed to address the user's questions.
            Feel free to quote short excerpts from the document to support your responses. 
            Remember to stay on topic and avoid making claims that are not backed up by the document. 
            If you are uncertain about something, it's better to say you're not sure rather than speculating.
            Your conversation should be professional and natural, as if you are an intelligent and knowledgeable assistant. 
            Greet the user to begin the conversation. Let me know if you have any other questions!
            You are also provided with a chat history {chat_history} use that history to exxpand your knowledge about the current conversation that is going on and also use that to keep the conversation helpful and professional
            QUESTION:```{question}```
            ANSWER:"""
            PROMPT = PromptTemplate(
                template=prompt_template, input_variables=["context", "question", "chat_history", "Doc_name"]
            )
            self.qa = ConversationalRetrievalChain.from_llm(llm=ChatOpenAI(
                temperature=0.5,
                verbose=True,
            ),
                retriever=self.retriever,
                chain_type="stuff",
                combine_docs_chain_kwargs={"prompt": PROMPT},
                return_source_documents=True,
                verbose=True, )

# 9. agent_output(self, query)
If the document is a structured file (CSV/Excel), this method runs queries on the data.

In [23]:

    def agent_output(self, query):
        """Pass ``query`` to ``self.agent`` and return the agent's result unchanged.

        ``self.agent`` must already be set; ``qas`` assigns it when
        ``self.agent_avail`` is true.
        """
        agent = self.agent
        result = agent.invoke(query)
        return result


# 10. output(self, query, history, name)
Answers user queries based on the uploaded document.
Returns a structured response with the answer.
This function takes a user’s question and the chat history, gets an answer, and then returns the question and answer as a JSON-style dictionary.
JSON (JavaScript Object Notation) is a lightweight format for storing and exchanging data. It is used in APIs, web applications, and AI systems because it's easy to read and write.

In [24]:

    def output(self, query, history, name):
        """Ask the QA chain ``query`` and return the question with its answer.

        Args:
            query: The user's question.
            history: Chat history forwarded to the chain as ``chat_history``.
            name: Document name forwarded to the chain as ``Doc_name``.

        Returns:
            A dict with the keys ``"question"`` and ``"answer"``.
        """
        chain_inputs = {"question": query, "chat_history": history, "Doc_name": name}
        chain_result = self.qa.invoke(chain_inputs)
        answer_text = chain_result['answer']
        return {"question": query, "answer": answer_text}

# Summary:
Uploads a file (PDF, DOCX, PPT, TXT, CSV, XLSX).
Extracts text.
Embeds the text into a database for fast retrieval.
Allows querying of either text-based documents or structured files.
Provides professional AI-generated responses.
Responses stay within the document’s context.
Example Use Case:
User uploads a PDF (e.g., a research paper).
The system extracts text and creates embeddings.
The user asks: "What is the key conclusion of this paper?"
The llm retrieves relevant sections and provides an answer.


Similarly, if a CSV file is uploaded:
The user can ask: "What is the average sales revenue in Q1?"
The llm analyzes the data and responds.