In [4]:
# PDF ingestion helper from the langchain_community document loaders.
from langchain_community.document_loaders import PyPDFLoader

# Relative path of the PDF this notebook works on.
pdf_path = "flowers.pdf"

# Loader bound to that single PDF file.
loader = PyPDFLoader(pdf_path)

# Read the PDF and keep the result in `pages`; a later cell passes it to the
# text splitter via split_documents().
# NOTE(review): presumably a list of Document objects rather than raw page
# strings — confirm against the PyPDFLoader documentation.
pages = loader.load_and_split()


In [1]:
# Weaviate client library, used to talk to the vector database.
import weaviate

# LangChain pieces used further down: the chunking splitter and the
# Weaviate-backed vector store wrapper.
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_weaviate.vectorstores import WeaviateVectorStore

# Open a connection to the Weaviate instance running on this machine.
client = weaviate.connect_to_local()

# Sanity check: show whether the instance reports itself as ready.
print(client.is_ready())

True


In [2]:
# Ollama LLM wrapper from langchain_community.
from langchain_community.llms import Ollama

# Name of the Ollama model that answers the questions.
model_name = "phi3"

# Language model handle used later by the question-answering chain.
llm = Ollama(model=model_name)

In [7]:
# Ollama-backed embedding wrapper from langchain_community.
from langchain_community.embeddings import OllamaEmbeddings

# Embedding model handle configured with the "phi3" model; it is passed to the
# Weaviate vector store in later cells.
embeddings = OllamaEmbeddings(
    model="phi3",
)


In [None]:
# Splitter class used to break the loaded documents into smaller chunks.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking configuration actually used here: chunk_size=500, chunk_overlap=100.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
)

# Chunk the documents held in `pages`; `splits` is what gets indexed in the
# vector store.
splits = text_splitter.split_documents(pages)

In [None]:
# Build a WeaviateVectorStore by indexing the chunked documents.
# Arguments:
# - documents:  the chunks produced by the text splitter (`splits`)
# - embedding:  the embedding model used to vectorise each chunk
# - client:     the Weaviate client used to reach the database
# - index_name: name of the Weaviate collection that stores the vectors
# - text_key:   property name under which the chunk text is stored
#
# The keyword must be `embedding` (singular): from_documents() declares it as a
# required parameter, so passing `embeddings=` fails with a TypeError.
# NOTE(review): re-running this cell presumably inserts the same chunks into
# the collection again — confirm before executing it more than once.

db = WeaviateVectorStore.from_documents(
    documents=splits,           # The text chunks to be indexed
    embedding=embeddings,       # The model used to generate embeddings
    client=client,              # Weaviate client for database interaction
    index_name='MyCollection',  # Name of the Weaviate collection
    text_key='text'             # Key to access the text content
)

In [8]:
# Attach to the 'MyCollection' collection through the existing client, using
# the same embedding model and text key as the indexing cell.
db = WeaviateVectorStore(
    client,
    index_name='MyCollection',
    embedding=embeddings,
    text_key='text',
)

In [9]:
# Chain class and prompt hub access from langchain.
from langchain.chains import RetrievalQA
from langchain import hub

# Fetch the shared "rlm/rag-prompt" prompt from the LangChain hub.
prompt = hub.pull("rlm/rag-prompt")

# Retriever view over the Weaviate vector store.
retriever = db.as_retriever()

# Wire the language model, the retriever and the hub prompt into one
# question-answering chain.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt},
)


In [10]:
# Standard-library pretty printer, used to wrap the long answer text.
import pprint

# Question to ask the retrieval-augmented QA chain.
question = "tell me about plants in the family Asparagaceae"

# Run the chain through invoke(). Calling the chain object directly
# (qa_chain({...})) is deprecated and emits a deprecation warning.
# The returned dict still exposes the answer under "result".
result = qa_chain.invoke({"query": question})

# Pretty printer with a 4-space indent for readability.
pp = pprint.PrettyPrinter(indent=4)

# Show only the generated answer.
pp.pprint(result["result"])

  warn_deprecated(


(' Bluebells are perennial plants within the Asparagaceae family, indigenous '
 "to Atlantic Europe's regions. Unlike other families mentioned such as "
 'Caryophyllaceae or Asteraceae which have different plant types (herbaceous '
 'and flowering), bluebells stand out for their specific habitat preference in '
 'the Asparagaceae lineage.')
