## Install Required Packages
First, install the required packages: the OpenAI Python client, LangChain (plus `langchain-community`), LanceDB, and the helper libraries used for downloading and parsing PDFs, BM25 ranking, and tokenization.

In [48]:
# Uncomment and run these lines once per environment (for example, a fresh
# Colab runtime) to install the packages this notebook relies on.
# ! pip install langchain lancedb openai
# ! pip install langchain-community
# !pip install requests pypdf
# ! pip install PyPDF2
# ! pip install rank_bm25
# ! pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/1.1 MB[0m [31m15.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.7.0


 ## Set Up Your API Key
In Google Colab, you can set your API key by directly assigning it in the notebook or using environment variables. For security, it's best practice to avoid hardcoding sensitive information in your code.
Set the API Key Using Environment Variables in Colab


In [50]:
import getpass
import os

import openai
from langchain.embeddings import OpenAIEmbeddings

# Never hardcode the key in the notebook. Take it from the OPENAI_API_KEY
# environment variable and, if it is missing (e.g. a fresh Colab runtime),
# prompt for it without echoing the input.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

# Keep the module-level key in sync for code that uses the `openai` client directly.
openai.api_key = os.environ["OPENAI_API_KEY"]

## Access the Environment Variable in Your Code:


In [51]:
from langchain.vectorstores import LanceDB
import lancedb
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.schema import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader


# Initialize embeddings for semantic search.
# `embedding` is reused below to embed the seed row and the PDF pages.
# NOTE(review): the constructor is called with no arguments, so the API key
# must come from its defaults — presumably the OPENAI_API_KEY environment
# variable; confirm it is set before running this cell.
embedding = OpenAIEmbeddings()


## Download the PDF
Before we start, let's download the required PDF.

In [38]:
import requests
import time

def download_pdf(url, save_path, retries=3, timeout=30, retry_delay=5):
    """Download ``url`` to ``save_path``, retrying on request failures.

    Args:
        url: Address of the PDF to fetch.
        save_path: Local file path the response body is written to.
        retries: Maximum number of download attempts.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot hang the notebook indefinitely.
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        True once the file has been written, False if every attempt raised a
        ``requests.exceptions.RequestException``.
    """
    for attempt in range(1, retries + 1):
        try:
            # The context manager releases the streamed connection even when
            # the download fails part-way through.
            with requests.get(url, stream=True, timeout=timeout) as response:
                response.raise_for_status()  # Turn HTTP error statuses into exceptions
                with open(save_path, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        file.write(chunk)
            print(f"Downloaded PDF from {url} to {save_path}")
            return True
        except requests.exceptions.RequestException as e:
            print(f"Error downloading PDF (attempt {attempt} of {retries}): {e}")
            if attempt < retries:
                time.sleep(retry_delay)  # Wait before retrying
    return False

# Source document and the location it is saved to
pdf_url = "https://pdf.usaid.gov/pdf_docs/PA00TBCT.pdf"
pdf_path = "/content/Food_and_Nutrition.pdf"

# Fetch the file; stop the notebook run if every attempt failed
download_succeeded = download_pdf(pdf_url, pdf_path)
if not download_succeeded:
    raise Exception("Failed to download PDF after multiple attempts")


Downloaded PDF from https://pdf.usaid.gov/pdf_docs/PA00TBCT.pdf to /content/Food_and_Nutrition.pdf


## Load and Split the PDF
Use PyPDFLoader to load and split the PDF into pages.

In [43]:
from langchain.document_loaders import PyPDFLoader

# Load documents from the same path the download step wrote to (`pdf_path`),
# instead of a separate relative filename that only resolves when the working
# directory happens to be the download directory.
loader = PyPDFLoader(pdf_path)
pages = loader.load_and_split()


## Initialize the BM25 Retriever
Set up the BM25 retriever to fetch top results.

In [46]:
from langchain.retrievers import BM25Retriever

# Keyword (BM25) retriever built over the loaded pages
BM25_TOP_K = 2  # how many top results the BM25 retriever returns
bm25_retriever = BM25Retriever.from_documents(pages)
bm25_retriever.k = BM25_TOP_K


## Create LanceDB Vector Store for Semantic Search
Connect to LanceDB and create a table for storing embeddings.

In [None]:
import lancedb

# Connect to the local LanceDB database and create the table used for
# semantic search, seeded with a single embedded placeholder row.
db = lancedb.connect('/tmp/lancedb')
seed_text = "Hello World"
seed_row = {"vector": embedding.embed_query(seed_text), "text": seed_text, "id": "1"}
table = db.create_table("pandas_docs", data=[seed_row], mode="overwrite")


## Initialize LanceDB Retriever
Set up the LanceDB retriever for semantic search.

In [None]:
from langchain.vectorstores import LanceDB

# Build the LangChain LanceDB vector store from the pages and expose it as a
# retriever that returns the top 2 matches.
# NOTE(review): `connection` is given the table object created earlier; confirm
# this matches what the installed LangChain LanceDB wrapper expects.
semantic_search_kwargs = {"k": 2}
docsearch = LanceDB.from_documents(pages, embedding, connection=table)
retriever_lancedb = docsearch.as_retriever(search_kwargs=semantic_search_kwargs)


## Initialize the Ensemble Retriever
Combine the BM25 and LanceDB retrievers with specified weights.

In [None]:
from langchain.retrievers import EnsembleRetriever

# Combine keyword and semantic retrieval: 0.4 is listed for the BM25 retriever
# and 0.6 for the LanceDB retriever (weights presumably pair up by position).
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, retriever_lancedb],
    weights=[0.4, 0.6],
)


## Retrieve Relevant Documents
Perform a query and retrieve relevant documents using the ensemble retriever.

In [None]:
# Example query
query = "Lorem ipsum dolor sit amet"

# Retrieve relevant documents
docs = ensemble_retriever.get_relevant_documents(query)

# Print retrieved documents.
# LangChain `Document` objects hold their text in `page_content` (alongside a
# `metadata` dict); they have no `text` attribute, so `doc.text` would raise
# AttributeError.
for doc in docs:
    print(doc.page_content)
