In [1]:
# The requirement is quoted because an unquoted ">" is shell output redirection:
# pip would install an unconstrained langgraph and write its log to a file named "0.2.27".
! pip install langchain-core "langgraph>0.2.27"

In [2]:
# Install langchain-community (the document loaders imported further down come from it)
# together with langchain-core.
! pip install langchain-community langchain-core

Collecting langchain-community
  Downloading langchain_community-0.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting SQLAlchemy<2.0.36,>=1.4 (from langchain-community)
  Downloading SQLAlchemy-2.0.35-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.6.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.23.1-py3-none-any.whl.metadata (7.5 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_do

In [14]:
# Install the OpenAI and Pinecone LangChain integration packages plus docx2txt
# (presumably the backend used by Docx2txtLoader -- confirm).
! pip install langchain_openai langchain_pinecone docx2txt

Collecting docx2txt
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: docx2txt
  Building wheel for docx2txt (setup.py) ... done
  Created wheel for docx2txt: filename=docx2txt-0.8-py3-none-any.whl size=3960 sha256=8af0b7061f8667cc0fe53efd39ba4ef5ca6697474c998d7534e09b93e8431d7b
  Stored in directory: /root/.cache/pip/wheels/22/58/cf/093d0a6c3ecfdfc5f6ddd5524043b88e59a9a199cb02352966
Successfully built docx2txt
Installing collected packages: docx2txt
Successfully installed docx2txt-0.8


In [17]:
from langchain_community.document_loaders import PyPDFLoader, CSVLoader, Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from typing import List
from pinecone import Pinecone
import os

class RAGChatbot:
    """Retrieval-augmented chatbot backed by a Pinecone index and OpenAI models.

    Typical flow: ``load_documents`` -> ``split_documents`` ->
    ``create_embeddings_and_store`` -> ``setup_rag_chain`` -> ``query``.
    ``query`` builds the chain on demand, so calling ``setup_rag_chain``
    explicitly is optional.
    """

    # Vector size used when the index has to be created (OpenAI embedding dimension).
    EMBEDDING_DIMENSION = 1536

    def __init__(self, openai_api_key: str, pinecone_api_key: str, pinecone_region: str, index_name: str,
                 pinecone_cloud: str = "aws"):
        """Initialize the RAG chatbot with necessary API keys and configurations.

        Args:
            openai_api_key: Key passed to the OpenAI embeddings and chat model.
            pinecone_api_key: Key passed to the Pinecone client.
            pinecone_region: Region in which the index is created if it is missing.
            index_name: Name of the Pinecone index to use (created when absent).
            pinecone_cloud: Cloud provider for a newly created serverless index.
                Defaults to "aws"; ignored when the index already exists.
        """
        self.openai_api_key = openai_api_key
        self.index_name = index_name
        self.pinecone_region = pinecone_region
        self.pinecone_cloud = pinecone_cloud
        # Built lazily by setup_rag_chain(); None means "not built yet".
        self.chain = None

        # Initialize Pinecone
        self.pc = Pinecone(api_key=pinecone_api_key)

        # Create index if it doesn't exist
        if index_name not in self.pc.list_indexes().names():
            # Imported here because it is only needed on the creation path.
            from pinecone import ServerlessSpec

            # create_index needs a deployment spec; this is where the
            # caller-supplied region is actually consumed.
            self.pc.create_index(
                name=index_name,
                dimension=self.EMBEDDING_DIMENSION,
                metric='cosine',
                spec=ServerlessSpec(cloud=pinecone_cloud, region=pinecone_region),
            )

        # Initialize embeddings
        self.embeddings = OpenAIEmbeddings(api_key=openai_api_key)

        # Initialize vector store
        self.vectorstore = PineconeVectorStore(
            index=self.pc.Index(index_name),
            embedding=self.embeddings,
            text_key="text"
        )

        # Initialize LLM
        self.llm = ChatOpenAI(
            api_key=openai_api_key,
            temperature=0.7,
            model="gpt-3.5-turbo"
        )

    @staticmethod
    def _format_docs(docs) -> str:
        """Join the ``page_content`` of retrieved documents into one context string."""
        return "\n\n".join(doc.page_content for doc in docs)

    def load_documents(self, file_path: str) -> List:
        """Load documents based on file type.

        Args:
            file_path: Path to a .pdf, .csv, .docx or .doc file.

        Returns:
            Whatever the selected loader's ``load()`` returns.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the file extension is not one of the supported types.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        # splitext only inspects the final path component, so a dot in a
        # directory name is never mistaken for the file's extension.
        file_extension = os.path.splitext(file_path)[1].lstrip('.').lower()

        if file_extension == 'pdf':
            loader = PyPDFLoader(file_path)
        elif file_extension == 'csv':
            loader = CSVLoader(file_path)
        elif file_extension in ['docx', 'doc']:
            # NOTE(review): legacy binary .doc files may not be readable by
            # Docx2txtLoader -- confirm before relying on this branch for .doc.
            loader = Docx2txtLoader(file_path)
        else:
            raise ValueError(f"Unsupported file type: {file_extension}")

        return loader.load()

    def split_documents(self, documents: List, chunk_size: int = 1000, chunk_overlap: int = 200) -> List:
        """Split documents into smaller chunks.

        Args:
            documents: Documents as returned by ``load_documents``.
            chunk_size: Maximum chunk length, measured with ``len``.
            chunk_overlap: Overlap between consecutive chunks, measured with ``len``.

        Returns:
            The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len
        )
        return text_splitter.split_documents(documents)

    def create_embeddings_and_store(self, chunks: List):
        """Create embeddings and store them in Pinecone.

        Does nothing when ``chunks`` is empty, avoiding a pointless remote call.
        """
        if not chunks:
            return
        self.vectorstore.add_documents(chunks)

    def setup_rag_chain(self):
        """Set up the RAG chain using LCEL and store it on ``self.chain``."""
        # Create retriever (asks the vector store for 3 results per question)
        retriever = self.vectorstore.as_retriever(search_kwargs={"k": 3})

        # Define the prompt template
        template = """You are a helpful AI assistant. Use the following context to answer the user's question.
        If you don't know the answer, just say you don't know. Don't try to make up an answer.

        Context: {context}

        Question: {question}

        Answer: """

        prompt = ChatPromptTemplate.from_template(template)

        # Create the RAG chain. The retriever output is flattened to plain text
        # so the prompt receives the document contents rather than the repr of
        # a list of Document objects.
        self.chain = (
            {"context": retriever | self._format_docs, "question": RunnablePassthrough()}
            | prompt
            | self.llm
            | StrOutputParser()
        )

    def query(self, question: str) -> str:
        """Query the RAG chain with a question.

        Builds the chain first if ``setup_rag_chain`` has not been called yet.
        """
        if getattr(self, "chain", None) is None:
            self.setup_rag_chain()
        return self.chain.invoke(question)

def main():
    """Run the interactive demo: index one document, then answer questions in a loop.

    Configuration is read from environment variables so that no credentials
    live in the notebook:

    * ``OPENAI_API_KEY`` / ``PINECONE_API_KEY`` -- prompted for (hidden input)
      when unset.
    * ``PINECONE_REGION`` (default ``us-east-1``)
    * ``PINECONE_INDEX_NAME`` (default ``testindex``)
    * ``RAG_DOCUMENT_PATH`` (default ``/content/sample_data.docx``)

    Type ``quit`` (or send EOF / Ctrl-C at the prompt) to leave the loop.
    Any exception raised during setup or querying is reported on stdout.
    """
    import getpass  # local import: only this entry point prompts for secrets

    # Never hardcode keys; fall back to a hidden prompt when the env var is missing.
    openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")
    pinecone_api_key = os.environ.get("PINECONE_API_KEY") or getpass.getpass("Pinecone API key: ")
    pinecone_region = os.environ.get("PINECONE_REGION", "us-east-1")
    index_name = os.environ.get("PINECONE_INDEX_NAME", "testindex")
    document_path = os.environ.get("RAG_DOCUMENT_PATH", "/content/sample_data.docx")

    try:
        # Initialize chatbot
        print("Initializing chatbot...")
        chatbot = RAGChatbot(
            openai_api_key=openai_api_key,
            pinecone_api_key=pinecone_api_key,
            pinecone_region=pinecone_region,
            index_name=index_name
        )

        # 1. Load documents
        print("Loading documents...")
        documents = chatbot.load_documents(document_path)

        # 2. Split documents
        print("Splitting documents into chunks...")
        chunks = chatbot.split_documents(documents)

        # 3. Create embeddings and store in Pinecone
        print("Creating embeddings and storing in Pinecone...")
        chatbot.create_embeddings_and_store(chunks)

        # 4. Set up the RAG chain
        print("Setting up the RAG chain...")
        chatbot.setup_rag_chain()

        # 5. Query the chatbot
        print("\nChatbot is ready! Type 'quit' to exit.")
        while True:
            try:
                question = input("\nQuestion: ").strip()
            except (EOFError, KeyboardInterrupt):
                # Closed stdin or an interrupted kernel ends the session quietly.
                break

            if not question:
                # Skip blank input instead of sending an empty query to the model.
                continue
            if question.lower() == 'quit':
                break

            response = chatbot.query(question)
            print(f"\nAnswer: {response}")

    except Exception as e:
        print(f"An error occurred: {str(e)}")

# Entry point: start the interactive demo when this code is executed directly
# (as opposed to being imported as a module).
if __name__ == "__main__":
    main()

Initializing chatbot...
Loading documents...
Splitting documents into chunks...
Creating embeddings and storing in Pinecone...
Setting up the RAG chain...

Chatbot is ready! Type 'quit' to exit.

Question: Natural Language Processing

Answer: Natural Language Processing (NLP) is a branch of AI that helps computers understand, interpret, and manipulate human language. Major applications include text classification, sentiment analysis, machine translation, and chatbots and virtual assistants.

Question: quit
