In [1]:
!pip install langchain transformers pypdf faiss-cpu sentence-transformers
!pip install langchain_community
!pip install langchain_huggingface

Collecting pypdf
  Downloading pypdf-5.1.0-py3-none-any.whl (297 kB)
                                              0.0/298.0 kB ? eta -:--:--
     --------------------------             204.8/298.0 kB 4.1 MB/s eta 0:00:01
     -------------------------------------- 298.0/298.0 kB 3.7 MB/s eta 0:00:00
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp311-cp311-win_amd64.whl (13.8 MB)
                                              0.0/13.8 MB ? eta -:--:--
                                              0.2/13.8 MB 6.3 MB/s eta 0:00:03
     -                                        0.6/13.8 MB 6.1 MB/s eta 0:00:03
     --                                       0.8/13.8 MB 5.4 MB/s eta 0:00:03
     ---                                      1.1/13.8 MB 5.6 MB/s eta 0:00:03
     ---                                      1.3/13.8 MB 6.0 MB/s eta 0:00:03
     ----                                     1.6/13.8 MB 5.7 MB/s eta 0:00:03
     -----                                    1.9/13.8 MB 5


[notice] A new release of pip is available: 23.1.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.1.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.1.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# Step 1: Load PDF as LangChain Documents
def load_pdf_as_documents(pdf_path):
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    return documents

# Step 2: Split Documents into Chunks
def split_documents_into_chunks(documents, chunk_size=1000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    split_docs = text_splitter.split_documents(documents)
    return split_docs

# Step 3: Create FAISS Vector Store
def create_faiss_index_from_documents(documents):
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_store = FAISS.from_documents(documents, embeddings)
    return vector_store

# Step 4: Set Up HuggingFace Question Generation
def setup_qg_pipeline():
    model_name = "google/flan-t5-small"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return pipeline("text2text-generation", model=model, tokenizer=tokenizer)

# Step 5: RAG Pipeline Setup
def setup_rag_pipeline_with_documents(pdf_path):
    # Load PDF and split into chunks
    documents = load_pdf_as_documents(pdf_path)
    split_docs = split_documents_into_chunks(documents)

    # Create vector store
    vector_store = create_faiss_index_from_documents(split_docs)

    # Set up QA pipeline
    retriever = vector_store.as_retriever()
    qg_pipeline = setup_qg_pipeline()
    llm = HuggingFacePipeline(pipeline=qg_pipeline)
    qa_chain = RetrievalQA(llm=llm, retriever=retriever)

    return qa_chain



**Step 1** - Loading PDFs as LangChain Documents
Explanation:

This step uses PyPDFLoader to load a PDF and convert it into a list of Document objects.
Each Document contains text and optional metadata, such as page numbers.

In [13]:
from langchain.document_loaders import PyPDFLoader

# Step 1: Load PDF as LangChain Documents
def load_pdf_as_documents(pdf_path):
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    return documents

# Example: Load a PDF
pdf_path = "example.pdf"  # Replace with your PDF file
documents = load_pdf_as_documents(pdf_path)

# Display the first few documents
print("Number of documents loaded:", len(documents))
print("First document content:", documents[0].page_content[:500])  # Print first 500 characters


Number of documents loaded: 61
First document content: natural-resources.canada.ca /our-natural-resources/energy-sources-distribution/electricity-infrastru…
Powering Canada’s Future: A Clean Electricity
Strategy
155-197 minutes
Table of Contents
Foreword – Clean Electricity Strategy
1.0 The Case for Clean Electricity
1.1 Laying Out a Clean Electricity Strategy for Canada
1.2 A Strategy informed by extensive engagement, electricity sector experts,
and Indigenous energy leaders
1.3 Key Guiding Principles
2.0 Toward the Grid of the Future
2.1 Global Co


**Step 2** - Splitting Documents into Chunks
Explanation:

Large documents need to be divided into smaller chunks to improve retrieval performance.
RecursiveCharacterTextSplitter ensures that the text is split into manageable sizes while maintaining some overlap for context continuity.

In [14]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Step 2: Split Documents into Chunks
def split_documents_into_chunks(documents, chunk_size=1000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    split_docs = text_splitter.split_documents(documents)
    return split_docs

# Example: Split the loaded documents
chunked_documents = split_documents_into_chunks(documents)
print("Number of chunks created:", len(chunked_documents))
print("First chunk content:", chunked_documents[0].page_content[:500])  # Print first 500 characters


Number of chunks created: 198
First chunk content: natural-resources.canada.ca /our-natural-resources/energy-sources-distribution/electricity-infrastru…
Powering Canada’s Future: A Clean Electricity
Strategy
155-197 minutes
Table of Contents
Foreword – Clean Electricity Strategy
1.0 The Case for Clean Electricity
1.1 Laying Out a Clean Electricity Strategy for Canada
1.2 A Strategy informed by extensive engagement, electricity sector experts,
and Indigenous energy leaders
1.3 Key Guiding Principles
2.0 Toward the Grid of the Future
2.1 Global Co


**Step 3** - Creating a FAISS Vector Store
Explanation:

Text chunks are embedded into numerical vectors using sentence-transformers.
FAISS (Facebook AI Similarity Search) is used to index these vectors for fast retrieval during queries.

In [15]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Step 3: Create FAISS Vector Store
def create_faiss_index_from_documents(documents):
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_store = FAISS.from_documents(documents, embeddings)
    vector_store.save_local("faiss_store")
    return vector_store

# Example: Create a FAISS index
vector_store = create_faiss_index_from_documents(chunked_documents)
print("FAISS vector store created!")


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

FAISS vector store created!


In [23]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# Step 4: Set Up HuggingFace Question Generation
def setup_qg_pipeline():
    model_name = "google/flan-t5-small"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return pipeline("text2text-generation", model=model, tokenizer=tokenizer)

def setup_rag_pipeline_with_documents(pdf_path):
    # Load and process documents
    documents = load_pdf_as_documents(pdf_path)
    split_docs = split_documents_into_chunks(documents)
    vector_store = create_faiss_index_from_documents(split_docs)

    # Set up retriever
    retriever = vector_store.as_retriever()

    # Set up HuggingFacePipeline
    qg_pipeline = setup_qg_pipeline()
    llm = HuggingFacePipeline(pipeline=qg_pipeline)

    # Define a prompt template for QA
    prompt_template = PromptTemplate(
        input_variables=["context", "question"],
        template="Given the context: {context}, answer the question: {question}",
    )

    # Create RetrievalQA chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",  # Default chain type for combining documents
        retriever=retriever,
        return_source_documents=True,  # To get the source of the answer
        chain_type_kwargs={"prompt": prompt_template},
    )

    # Return the qa_chain
    return qa_chain  # Added this return statement

# Example: Set up the RAG pipeline
pdf_path = "example.pdf"  # Replace with your PDF file
qa_chain = setup_rag_pipeline_with_documents(pdf_path)

# Query the pipeline
query = "What is the document about?"  # Modify this query as needed
response = qa_chain.invoke(query)

print("\n--- Query Response ---")
print(response)


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu
Token indices sequence length is longer than the specified maximum sequence length for this model (601 > 512). Running this sequence through the model will result in indexing errors



--- Query Response ---
{'query': 'What is the document about?', 'result': 'The Kinship and Prosperity Report', 'source_documents': [Document(id='187c47b1-47aa-4808-83f9-5215649a6922', metadata={'source': 'example.pdf', 'page': 8, 'page_label': '9'}, page_content='advisory council, which first met in December 2022.\nThe Kinship and Prosperity Report focuses on six key themes: easing access to\nfunding; developing consistent project eligibility criteria that prioritize Indigenous\ncommunity benefits; advancing inclusive opportunities and a Just Transition;\naccelerating Indigenous leadership in the energy transition; respecting self-\ndetermination by prioritizing Indigenous-led decisions; and sustainably funding\nIndigenous participation.\n1.3 Key Guiding Principles\nSix key principles underpin the Clean Electricity Strategy and will guide federal\naction to support electricity grid decarbonization and expansion.\nPrinciple 1: Provincial and Territorial Jurisdiction Must be Respected a