# 1. Install Required Libraries:
    
langchain: Framework for AI-powered applications.

openai: Access to GPT models.

chromadb: Vector database to store embeddings.

pandas: Handles CSV/Excel files.

pypdf: Reads and extracts text from PDFs.

python-docx: Reads DOCX files.



In [13]:
pip install langchain openai chromadb pandas pypdf python-docx


Note: you may need to restart the kernel to use updated packages.


# 2. Load Required Libraries:
os → Handles file paths.

pandas → Loads CSV/Excel files for structured data.

chromadb → Stores vector embeddings for fast text retrieval.

PyPDFLoader → Extracts text from PDFs.

TextLoader → Reads text files.

HuggingFaceEmbeddings → Converts text into numerical vector embeddings.

Chroma → Stores these embeddings in a vector database.

OpenAI → Calls the GPT model to generate responses.

ConversationalRetrievalChain → Uses OpenAI and embeddings to answer queries.

ConversationBufferMemory → Stores conversation history for context-aware answers.



In [None]:
# Standard library
import os

# Third-party
import pandas as pd
import chromadb
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory



# 3. Define Chatbot Class
class Chatbot: → Defines a chatbot class.

def __init__(self): → Constructor method that initializes the chatbot.

self.embeddings = HuggingFaceEmbeddings() → Loads the Hugging Face model to create embeddings.

self.vector_db = None → Placeholder for the Chroma vector database.

self.memory = ConversationBufferMemory(memory_key="chat_history") → Stores past conversations.

In [None]:
class Chatbot:
    """Chatbot that answers questions about a loaded document.

    Instance attributes set up by ``__init__``:
        embeddings: ``HuggingFaceEmbeddings`` instance created with its
            default arguments.
        vector_db: ``None`` at construction time.
        memory: ``ConversationBufferMemory`` stored under the
            ``"chat_history"`` key.
    """

    def __init__(self):
        """Create the embedding model, an empty vector-store slot and the chat memory."""
        self.embeddings = HuggingFaceEmbeddings()
        # No vector store exists until a document has been indexed.
        self.vector_db = None
        # Hand the history back as message objects rather than one
        # concatenated string: ConversationalRetrievalChain formats the
        # history turn by turn and rejects a plain string once the history
        # is non-empty (i.e. from the second question onward).
        self.memory = ConversationBufferMemory(
            memory_key="chat_history",
            return_messages=True,
        )


# 4. Load & Process Documents:
def load_document(self, file_path): → Function to load documents.

ext = os.path.splitext(file_path)[1] → Extracts the file extension.

if ext == ".pdf": → Checks if the file is a PDF.

loader = PyPDFLoader(file_path) → Uses PyPDFLoader for PDFs.

elif ext == ".txt": → Checks if the file is a text file.

loader = TextLoader(file_path) → Uses TextLoader for text files.

else: return "Unsupported file format" → Rejects unsupported formats.

docs = loader.load() → Loads and extracts text from the document.

self.store_embeddings(docs) → Calls the store_embeddings function to save embeddings.

In [None]:
 def load_document(self, file_path):
        ext = os.path.splitext(file_path)[1]
        if ext == ".pdf":
            loader = PyPDFLoader(file_path)
        elif ext == ".txt":
            loader = TextLoader(file_path)
        else:
            return "Unsupported file format"

        docs = loader.load()
        self.store_embeddings(docs)


# 5. Store Embeddings in ChromaDB:
def store_embeddings(self, docs): → Function to create and store text embeddings.

self.vector_db = Chroma.from_documents(docs, self.embeddings) → Converts documents into vector embeddings and stores them in ChromaDB.


In [None]:
def store_embeddings(self, docs):
        self.vector_db = Chroma.from_documents(docs, self.embeddings)

# 6. Answer User Queries:
def answer_query(self, query): → Function to process user queries.

retriever = self.vector_db.as_retriever() → Retrieves relevant document parts from ChromaDB.

qa_chain = ConversationalRetrievalChain.from_llm(...) → Creates a question-answering chain.

llm=OpenAI() → Uses OpenAI’s GPT model (the OPENAI_API_KEY environment variable must be set).

retriever=retriever → Uses the retriever to find relevant document parts.

memory=self.memory → Stores conversation history.

return qa_chain.run(query) → Runs the query and returns the response.

In [None]:
    def answer_query(self, query):
        """Answer *query* from the indexed document, using the stored chat memory.

        Args:
            query: The user's question.

        Returns:
            Whatever the retrieval chain's ``run`` call returns for the
            question.

        Raises:
            RuntimeError: If ``self.vector_db`` is still ``None``, i.e. no
                document has been indexed yet.
        """
        if self.vector_db is None:
            # Without this guard the failure is an opaque AttributeError on None.
            raise RuntimeError(
                "No document loaded; call load_document() before answer_query()."
            )

        retriever = self.vector_db.as_retriever()
        qa_chain = ConversationalRetrievalChain.from_llm(
            llm=OpenAI(),
            retriever=retriever,
            memory=self.memory,
        )
        return qa_chain.run(query)


# 7. Running the Chatbot:
bot = Chatbot() → Creates a chatbot instance.

bot.load_document("example.pdf") → Loads and processes a sample PDF.

response = bot.answer_query("What is the main topic of the document?") → Asks a question about the document.

print(response) → Prints the chatbot’s response.

In [None]:

bot = Chatbot()
# load_document() signals an unsupported extension by returning a message
# rather than raising, so check it before querying an index that was never built.
load_status = bot.load_document("example.pdf")  # Load a document
if load_status is not None:
    print(load_status)
else:
    response = bot.answer_query("What is the main topic of the document?")
    print(response)



# How It Works
User provides a document (PDF or TXT; other formats are rejected as unsupported).

Chatbot extracts text and converts it into vector embeddings.

Embeddings are stored in ChromaDB for retrieval.

User asks a question, and the chatbot retrieves relevant information.

GPT generates a response based on the document content.