In [1]:
# Import libraries

import pandas as pd
import os
import requests
import dlt
import lancedb
from dlt.destinations.adapters import lancedb_adapter
from rest_api import RESTAPIConfig, rest_api_source
from dlt.sources.helpers.rest_client.paginators import BasePaginator, JSONResponsePaginator
from dlt.sources.helpers.requests import Response, Request
from datetime import datetime, timezone
import ollama

In [2]:
# Run `dlt init` for the `rest_api` source with the `lancedb` destination; `yes` feeds "y" to any prompt.
!yes | dlt init rest_api lancedb

Looking up the init scripts in https://github.com/dlt-hub/verified-sources.git...
No files to update, exiting
yes: standard output: Broken pipe


In [3]:
# Configure the LanceDB destination via environment variables:
# embedding provider, embedding model, and the database URI.
os.environ.update(
    {
        "DESTINATION__LANCEDB__EMBEDDING_MODEL_PROVIDER": "sentence-transformers",
        "DESTINATION__LANCEDB__EMBEDDING_MODEL": "all-MiniLM-L6-v2",
        "DESTINATION__LANCEDB__CREDENTIALS__URI": ".lancedb",
    }
)

In [4]:
# Fail fast if the Notion API key has not been exported to the environment.
api_key = os.environ.get("SOURCES__REST_API__NOTION__API_KEY")
if api_key is None or api_key == "":
    raise EnvironmentError("The Notion API key is not set in the environment variables.")

In [5]:
class PostBodyPaginator(BasePaginator):
    """Cursor paginator that carries the cursor in the JSON body of the request.

    After each response, ``next_cursor`` is read from the response body; on the
    following request it is written into the request body as ``start_cursor``.
    Pagination stops when the body is empty or has no ``next_cursor``.
    """

    def __init__(self):
        super().__init__()
        # Cursor taken from the most recent response; None until a page has been read.
        self.cursor = None

    def update_state(self, response: Response) -> None:
        """Store the next cursor from ``response`` or mark pagination as finished."""
        # Decode the body once rather than once per access.
        payload = response.json()

        if not payload:
            # An empty body means there is nothing more to fetch.
            self._has_next_page = False
            return

        self.cursor = payload.get("next_cursor")
        if self.cursor is None:
            self._has_next_page = False

    def update_request(self, request: Request) -> None:
        """Write the stored cursor into the JSON body of the next request."""
        if request.json is None:
            request.json = {}

        # Add the cursor to the request body
        request.json["start_cursor"] = self.cursor

In [6]:
# API Configuration
@dlt.resource(name="homework")
def rest_api_notion_resource():
    """Yield everything produced by the Notion REST API source.

    Two endpoints are configured: a POST ``search`` for "homework", and
    ``blocks/{page_id}/children``, whose ``page_id`` is resolved from the
    ``id`` field of each ``search`` result.
    """
    # Shared HTTP client settings: base URL, auth token and Notion headers.
    client_settings = {
        "base_url": "https://api.notion.com/v1/",
        "auth": {"token": api_key},
        "headers": {
            "Content-Type": "application/json",
            "Notion-Version": "2022-06-28",
        },
    }

    # POST search, paginated through the request body.
    search_resource = {
        "name": "search",
        "endpoint": {
            "path": "search",
            "method": "POST",
            "paginator": PostBodyPaginator(),
            "json": {
                "query": "homework",
                "sort": {"direction": "ascending", "timestamp": "last_edited_time"},
            },
            "data_selector": "results",
        },
    }

    # Child blocks of every page returned by the search resource.
    # NOTE(review): confirm JSONResponsePaginator follows Notion's cursor-based paging here.
    page_content_resource = {
        "name": "page_content",
        "endpoint": {
            "path": "blocks/{page_id}/children",
            "paginator": JSONResponsePaginator(),
            "params": {
                "page_id": {"type": "resolve", "resource": "search", "field": "id"},
            },
        },
    }

    notion_config: RESTAPIConfig = {
        "client": client_settings,
        "resources": [search_resource, page_content_resource],
    }

    yield from rest_api_source(notion_config, name="homework")

# Function that describes how to extract the data and the structure of the output
def extract_page_content(response):
    """Flatten one Notion API item into the row shape stored in the table.

    Args:
        response: A dict with at least ``id`` and ``last_edited_time``. When
            its ``type`` is ``"paragraph"``, text is read from
            ``response["paragraph"]["rich_text"]``.

    Returns:
        A dict with ``block_id``, ``block_type``, ``content``,
        ``last_edited_time`` and ``inserted_at_time`` (current UTC time).
        ``content`` is the concatenation of every rich-text segment's
        ``plain_text`` for paragraphs, and ``""`` for anything else.

    Raises:
        KeyError: If ``id`` or ``last_edited_time`` is missing.
    """
    block_id = response["id"]
    last_edited_time = response["last_edited_time"]
    block_type = response.get("type", "Not paragraph")

    content = ""
    if block_type == "paragraph":
        # A paragraph can hold several rich-text segments; join them all
        # instead of keeping only the first one.
        segments = (response.get("paragraph") or {}).get("rich_text") or []
        content = "".join(segment.get("plain_text", "") for segment in segments)

    return {
        "block_id": block_id,
        "block_type": block_type,
        "content": content,
        "last_edited_time": last_edited_time,
        "inserted_at_time": datetime.now(timezone.utc)
    }

@dlt.resource(
    name="homework",
    write_disposition="merge",  # merges on the primary key rather than appending
    primary_key="block_id",
    columns={"last_edited_time": {"dedup_sort": "desc"}}  # in case of duplicate data, takes the latest version
)
def rest_api_notion_incremental(
    last_edited_time=dlt.sources.incremental("last_edited_time", initial_value="2024-06-26T08:16:00.000Z", primary_key="block_id")
):
    """Yield flattened Notion blocks that have non-empty text content.

    Each item from ``rest_api_notion_resource`` is passed through
    ``extract_page_content``; items whose ``content`` is empty are dropped.

    Args:
        last_edited_time: Incremental cursor on ``last_edited_time``. It is
            not read in this body — presumably dlt applies it to the yielded
            items; confirm against the dlt incremental-loading docs.
    """
    # Map inside the loop instead of calling add_map on the shared resource:
    # add_map would attach another mapping step on every run of this
    # generator, so re-running the pipeline in one kernel would map items twice.
    for raw_block in rest_api_notion_resource:
        block = extract_page_content(raw_block)
        if not block["content"]:
            continue
        yield block

# Pipeline that extracts and loads data to lancedb
def load_notion() -> None:
    """Run the ``notion_homework`` pipeline into LanceDB and print the load info.

    The incremental resource is wrapped with ``lancedb_adapter`` with
    ``embed="content"`` and loaded into the ``homework`` table of the
    ``notion_pages`` dataset using the ``merge`` write disposition.
    """
    notion_pipeline = dlt.pipeline(
        pipeline_name="notion_homework",
        destination="lancedb",
        dataset_name="notion_pages",
    )

    # Wrap the resource so its ``content`` column is marked for embedding.
    embedded_homework = lancedb_adapter(rest_api_notion_incremental, embed="content")

    print(
        notion_pipeline.run(
            embedded_homework,
            table_name="homework",
            write_disposition="merge",
        )
    )

# Run the extract-and-load pipeline when this cell executes.
load_notion()

Pipeline notion_homework load step completed in ---
0 load package(s) were loaded to destination LanceDB and into dataset None
The LanceDB destination used <dlt.destinations.impl.lancedb.configuration.LanceDBCredentials object at 0x7f892cb4c4f0> location to store data


In [8]:
# Connect to the LanceDB database at ".lancedb" and list the tables it contains.
db = lancedb.connect(".lancedb")
print(db.table_names())

['notion_pages____dlt_loads', 'notion_pages____dlt_pipeline_state', 'notion_pages____dlt_version', 'notion_pages___dltSentinelTable', 'notion_pages___homework']


In [9]:
# Open the homework table and display its contents as a pandas DataFrame.
dbtable2 = db.open_table("notion_pages___homework")
dbtable2.to_pandas()

Unnamed: 0,id__,vector__,block_id,block_type,content,last_edited_time,inserted_at_time,_dlt_load_id,_dlt_id
0,c69f1ecf-7b02-5810-8286-3f42659ae9d4,"[-0.024265597, 0.047460817, -0.011796484, 0.06...",a8196881-ae94-4767-8767-92fe1a327d24,paragraph,We owe our success to our employees. To show o...,2024-07-05 22:34:00+00:00,2024-07-22 08:00:48.838756+00:00,1721635247.431267,aDXKfl1SP56jlA
1,f2c18ac0-50f5-5b72-a871-dc5a46780353,"[-0.049661595, 0.10853507, -0.009762611, -0.03...",31fcbf26-2ca5-468a-8af8-d7eb4c2db8c8,paragraph,We want to ensure that private information abo...,2024-07-05 22:38:00+00:00,2024-07-22 08:00:48.842587+00:00,1721635247.431267,F04+H0tw6dbILg
2,4553193e-c655-54df-9a33-cfc570bf34d0,"[-0.0631632, 0.17331508, 0.02535169, -0.019146...",da7721fd-3d0f-4c04-bc5e-825ad60bed1c,paragraph,Employee health is important to us. We don’t d...,2024-07-05 22:52:00+00:00,2024-07-22 08:00:48.842767+00:00,1721635247.431267,WYx+I1scqeJp2g
3,791be1a1-6c67-530d-87ab-bd9912500ea5,"[-0.10974316, 0.10586073, 0.003290643, -0.0213...",ff36dcf3-5faa-40b4-ad8e-92fdc952201e,paragraph,Our company is dedicated to maintaining a safe...,2024-07-05 22:52:00+00:00,2024-07-22 08:00:48.842896+00:00,1721635247.431267,zk0xun8fzsQFTQ
4,a83497f4-922c-5d62-bab1-53804e93c811,"[0.052423313, -0.06457593, 0.065863006, 0.0145...",a1ff9697-4bb6-4f1e-b464-dda296dbd307,paragraph,If your job doesn’t require you to be present ...,2024-07-05 22:52:00+00:00,2024-07-22 08:00:48.843026+00:00,1721635247.431267,odR3efPGZIwf7w
5,434b71e9-a11a-519d-a9fe-e3ade78d47d6,"[0.0005233272, -0.054883476, 0.043573424, -0.0...",e4ec9f4d-b687-4c28-a80d-985bfabcc2ba,paragraph,Remote working refers to working from a non-of...,2024-07-05 22:52:00+00:00,2024-07-22 08:00:48.843159+00:00,1721635247.431267,autltZRRQCpZxA
6,17816363-54b7-5ba7-b8d5-06d871a25414,"[0.03802626, -0.021509666, 0.04752782, 0.06470...",e6e550dc-b59e-4928-abd7-07eace948681,paragraph,There are some expenses that we will pay direc...,2024-07-05 22:52:00+00:00,2024-07-22 08:00:48.843295+00:00,1721635247.431267,RyywN8NPW4p+pA
7,2a434cf9-09d9-5514-a88b-02977f2f953e,"[-0.058588073, -0.075404495, 0.03377525, 0.009...",a269d0ca-ce14-481b-a5f4-9192d6840d6e,paragraph,Our company operates between 9 a.m. to 7 p.m. ...,2024-07-05 22:52:00+00:00,2024-07-22 08:00:48.843433+00:00,1721635247.431267,T1x50B7mHIhe2w
8,5f9384fa-7f98-5f52-a06e-05b05f42f69a,"[-0.013599278, 0.0047529866, 0.024835171, 0.01...",5b65f3e7-0a37-429a-818d-f99b53755ebd,paragraph,"In this section, we are going to be covering i...",2024-07-05 23:33:00+00:00,2024-07-22 08:00:48.843574+00:00,1721635247.431267,+sBfU6VnoU/n4A
9,42af72f6-9db7-54a2-87b2-d466169078ff,"[0.03206092, 0.024244629, 0.00847134, 0.031790...",b27f7d80-f2f1-460e-aa0c-b8e770cf050a,paragraph,Our company observes the following holidays: N...,2024-07-05 22:52:00+00:00,2024-07-22 08:00:48.843726+00:00,1721635247.431267,orNu8QkSAjGesw


In [10]:
# Report the newest `last_edited_time` stored in the table, if it has any rows.
df = dbtable2.to_pandas()

if df.empty:
    print("No data found.")
else:
    df["last_edited_time"] = pd.to_datetime(df["last_edited_time"])
    max_last_edited_time = df["last_edited_time"].max()
    print(f"The most recent last_edited_time is: {max_last_edited_time}")

The most recent last_edited_time is: 2024-07-05 23:33:00+00:00


In [11]:
# Download the Ollama install script and pipe it straight into `sh`.
# NOTE(review): this executes a remote script without inspection — confirm that is acceptable here.
!curl -fsSL https://ollama.com/install.sh | sh

>>> Downloading ollama...
######################################################################## 100.0%
>>> Installing ollama to /usr/local/bin...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [12]:
%%capture
# Pull the model that main() passes to ollama.chat; %%capture keeps the command output out of the notebook.
!ollama pull llama2-uncensored

In [13]:
def retrieve_context_from_lancedb(dbtable2, question, top_k=2):
    """Return the ``content`` of the best matches for ``question``, one per line.

    Args:
        dbtable2: Table object exposing ``search(query=...)``; the returned
            query must support ``limit(n)`` and ``to_list()``.
        question: Text to search for.
        top_k: Maximum number of matches to include.

    Returns:
        The ``content`` values of up to ``top_k`` matches joined with
        newlines, or ``""`` when ``top_k`` is not positive or nothing matches.
    """
    if top_k <= 0:
        return ""

    # Ask the query for exactly top_k rows so that values above the query's
    # default result count are honoured.
    query_results = dbtable2.search(query=question).limit(top_k).to_list()

    # Slice defensively in case the backend returns more rows than requested.
    return "\n".join(result["content"] for result in query_results[:top_k])

In [14]:
def main():
    """Run an interactive question-answering loop over the handbook table.

    For every question typed by the user, the two closest paragraphs are
    retrieved from the LanceDB table and sent together with the question to
    the ``llama2-uncensored`` model through ``ollama.chat``. The whole
    conversation is kept in ``messages`` so later answers see earlier turns.

    The loop ends when the user types ``exit`` or ``quit``, or when input is
    closed or interrupted (EOF / Ctrl-C at the prompt). Blank input is ignored.
    """
    # Connect to the lancedb table
    db = lancedb.connect(".lancedb")
    dbtable2 = db.open_table("notion_pages___homework")

    # A system prompt telling ollama to accept input in the form of "Question: ... ; Context: ..."
    messages = [
        {"role": "system", "content": "You are a helpful assistant that helps users understand policies inside a company's employee handbook. The user will first ask you a question and then provide you relevant paragraphs from the handbook as context. Please answer the question based on the provided context. For any details missing in the paragraph, encourage the employee to contact the HR for that information. Please keep the responses conversational."}
    ]

    while True:
        # Accept user question; leave the loop cleanly if input is closed or interrupted
        try:
            question = input("You: ")
        except (EOFError, KeyboardInterrupt):
            break

        command = question.strip().lower()
        if command in {"exit", "quit"}:
            break
        if not command:
            # Nothing to search for; prompt again.
            continue

        # Retrieve the relevant paragraphs on the question
        context = retrieve_context_from_lancedb(dbtable2, question, top_k=2)

        # Create a user prompt using the question and retrieved context
        messages.append(
            {"role": "user", "content": f"Question: '{question}'; Context:'{context}'"}
        )

        # Get the response from the LLM
        response = ollama.chat(
            model="llama2-uncensored",
            messages=messages
        )
        response_content = response['message']['content']
        print(f"Assistant: {response_content}")

        # Add the response into the context window
        messages.append(
            {"role": "assistant", "content": response_content}
        )

In [None]:
# Start the interactive question-answering loop.
main()

You:  how many PTO days do you get?


Assistant: Thank you for asking about your PTO days. You get 30 days of Paid Time Off (PTO) per year, which is one extra day each year after your first year with our company up to a maximum of 25 days. To use your PTO, you can send a request through HRIS. Once approved by your manager or HR, you are permitted to take your leave. There is no need to specify a reason for requesting PTO, and you cannot transfer any remaining PTO to the next year.
We encourage you to use your time off throughout the year, but if you have questions about how many days you've taken or accrued, please contact HR. If you leave our company, we may compensate accrued PTO with your final paycheck according to local law. When the law doesn't have provisions, we will compensate accrued leave to employees who were not terminated for cause.
These holidays are considered "off-days" for most employees and you can work on these holidays if needed. If you are a non-exempt employee, you will receive your regular hourly ra

In [None]:
# Start the interactive question-answering loop again.
main()