In [31]:
import os
import requests
import json
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()

# Get the Notion credentials: the integration token and the id of the database to query
NOTION_TOKEN = os.getenv('NOTION_TOKEN')
DATABASE_ID = os.getenv('DATABASE_ID')

# Fail fast when a value is missing. Empty strings are rejected as well, since an
# empty token would only produce a malformed "Bearer " header further down.
if not NOTION_TOKEN:
    # The message names the variable that is actually read above.
    raise ValueError("NOTION_TOKEN not found in .env file")

if not DATABASE_ID:
    raise ValueError("DATABASE_ID not found in .env file")

# Headers sent with every Notion API request made in this notebook.
headers = {
    "Authorization": "Bearer " + NOTION_TOKEN,
    "Content-Type": "application/json",
    "Notion-Version": "2022-06-28",
}

def _notion_property_value(prop_value):
    """Flatten one Notion property object into a plain Python value."""
    prop_type = prop_value.get("type")

    if prop_type in ("rich_text", "title"):
        # Both hold a list of text fragments stored under the type name;
        # the fragments' plain text is concatenated.
        fragments = prop_value.get(prop_type) or []
        return "".join(fragment.get("plain_text", "") for fragment in fragments)

    if prop_type in ("number", "url", "date", "checkbox", "email", "phone_number"):
        # These are returned as stored, under a key equal to the type name.
        return prop_value.get(prop_type)

    if prop_type == "select":
        # A `{}` default for .get() is not enough: when the key is present with a
        # null value, .get() returns None and chaining .get("name") would raise.
        return (prop_value.get("select") or {}).get("name")

    if prop_type == "multi_select":
        return [item.get("name") for item in prop_value.get("multi_select") or []]

    if prop_type in ("people", "files"):
        # Both are reduced to the list of their entries' "name" fields.
        return [item.get("name", "") for item in prop_value.get(prop_type) or []]

    return f"Unsupported type: {prop_type}"


def extract_notion_rows(notion_response):
    """
    Convert a Notion database query response into a list of flat row dicts.

    Args:
        notion_response: decoded JSON body of a database query; only its
            "results" list is read. A missing "results" key yields no rows.

    Returns:
        A list with one dict per page: {"id": <page id>, "properties": {...}},
        where each property is reduced to a string, number, bool, list or None.
        A select property with a null value maps to None. Property types that
        are not handled map to the string "Unsupported type: <type>".
    """
    results = []

    for page in notion_response.get("results", []):
        properties = page.get("properties", {})
        results.append({
            "id": page.get("id"),
            "properties": {
                prop_name: _notion_property_value(prop_value)
                for prop_name, prop_value in properties.items()
            },
        })

    return results

def extract_pages(num_pages=None):
    """
    Query the Notion database and dump the flattened rows to notion_database.json.

    If num_pages is None, get all pages, otherwise just the defined number.

    The query endpoint is paginated: each request asks for at most 100 rows
    (Notion's documented maximum page_size) and the loop follows
    `next_cursor` until `has_more` is false or enough rows were collected.

    Args:
        num_pages: maximum number of rows to fetch, or None for every row.

    Raises:
        Exception: if Notion answers with a non-200 status, so that an error
            response is not silently written out as an empty result.
    """
    url = f"https://api.notion.com/v1/databases/{DATABASE_ID}/query"
    max_page_size = 100  # upper bound Notion documents for page_size

    parsed_data = []
    cursor = None
    while num_pages is None or len(parsed_data) < num_pages:
        if num_pages is None:
            page_size = max_page_size
        else:
            # Never request more rows than are still needed.
            page_size = min(max_page_size, num_pages - len(parsed_data))

        payload = {"page_size": page_size}
        if cursor is not None:
            payload["start_cursor"] = cursor

        response = requests.post(url, json=payload, headers=headers)
        if response.status_code != 200:
            raise Exception(f"Failed to query Notion database: {response.text}")

        notion_data = response.json()
        parsed_data.extend(extract_notion_rows(notion_data))

        cursor = notion_data.get("next_cursor")
        if not notion_data.get("has_more") or cursor is None:
            break

    # Dump all data to a file (comment this out to skip writing the file)
    with open('notion_database.json', 'w', encoding='utf8') as f:
        json.dump(parsed_data, f, ensure_ascii=False, indent=4)

In [32]:
# Run the export with the default num_pages=None. extract_pages writes its result
# to notion_database.json and has no return value, so this cell displays nothing.
extract_pages()

In [28]:
# Remove the Notion credentials from this process's environment. The None default
# makes pop() return None instead of raising KeyError when a variable is not set.
# NOTE(review): presumably this exists so a later load_dotenv() re-reads fresh
# values from .env — confirm the intent and the expected cell run order.
# NOTE(review): the second pop is the cell's last expression, so the removed
# DATABASE_ID value is echoed as the cell output; review outputs before sharing.
os.environ.pop("NOTION_TOKEN", None)
os.environ.pop("DATABASE_ID", None)

'1f088b77abf280608750fa0fd2366465'

In [None]:
import os
import json
import requests
from dotenv import load_dotenv
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Make the values from .env visible through the process environment
load_dotenv()

# Upstash Redis REST endpoint and its bearer token
UPSTASH_REDIS_REST_URL = os.getenv("UPSTASH_REDIS_REST_URL")
UPSTASH_REDIS_REST_TOKEN = os.getenv("UPSTASH_REDIS_REST_TOKEN")

# Both values are required; a missing or empty one aborts the cell
if not (UPSTASH_REDIS_REST_URL and UPSTASH_REDIS_REST_TOKEN):
    raise ValueError("Missing Upstash credentials in environment")

# Authorization header shared by the Upstash helpers below
HEADERS = {"Authorization": f"Bearer {UPSTASH_REDIS_REST_TOKEN}"}

def list_upstash_keys(prefix="notion_"):
    """
    Return the keys matching ``<prefix>*`` from Upstash Redis.

    The KEYS command is posted as a JSON array to the base REST URL
    (no /keys route is used).

    Raises:
        Exception: if the response status is not 200.
    """
    command = ["KEYS", f"{prefix}*"]
    request_headers = dict(
        [
            ("Authorization", f"Bearer {UPSTASH_REDIS_REST_TOKEN}"),
            ("Content-Type", "application/json"),
        ]
    )

    reply = requests.post(
        f"{UPSTASH_REDIS_REST_URL}",
        headers=request_headers,
        json=command,
    )

    if reply.status_code == 200:
        # A body without "result" is treated as "no keys".
        return reply.json().get("result", [])

    raise Exception(f"Failed to list keys: {reply.text}")

def get_upstash_json_by_key(key):
    """
    Get and parse JSON value for a specific key.

    The GET command is sent as a JSON array in the POST body (the same form
    list_upstash_keys uses) rather than spliced into the URL path, so keys
    containing "/", "?", "#", spaces or other URL-significant characters
    are looked up verbatim.

    Args:
        key: Redis key to read.

    Returns:
        The decoded JSON value, or an empty list when the stored result is
        missing or empty.

    Raises:
        Exception: if the response status is not 200.
    """
    url = f"{UPSTASH_REDIS_REST_URL}"
    response = requests.post(url, headers=HEADERS, json=["GET", key])
    if response.status_code != 200:
        raise Exception(f"Failed to fetch key {key}: {response.text}")
    raw_value = response.json().get("result")
    # A null/empty result is reported as "no entries" instead of failing in json.loads.
    return json.loads(raw_value) if raw_value else []

def load_dataset_from_upstash(prefix="notion_"):
    """
    Loads and combines documents from multiple JSON values stored in Upstash Redis.

    Every key matching the prefix must hold a JSON list of entries; each entry's
    "properties" mapping is serialized to pretty-printed JSON and used as the
    document text.

    Args:
        prefix: key prefix passed to list_upstash_keys.

    Returns:
        A list of Document objects whose metadata carries the entry "id"
        ("" when absent) and the "source_key" it was read from. The list is
        empty when no key matches the prefix.

    Raises:
        ValueError: if the value stored under a key is not a list.
    """
    all_documents = []
    keys = list_upstash_keys(prefix=prefix)

    # The former debug `print(keys[0])` was removed: it raised IndexError
    # whenever no key matched the prefix.
    for key in keys:
        data = get_upstash_json_by_key(key)
        if not isinstance(data, list):
            raise ValueError(f"Expected a list from key {key}, got {type(data)}")

        for entry in data:
            text = json.dumps(entry['properties'], ensure_ascii=False, indent=2)
            doc = Document(
                page_content=text,
                metadata={
                    "id": entry.get("id", ""),
                    "source_key": key
                }
            )
            all_documents.append(doc)

    return all_documents

def chunk_documents(documents, chunk_size=1000, chunk_overlap=50):
    """
    Splits documents into smaller chunks to improve retrieval performance.

    Args:
        documents: iterable of Document objects to split.
        chunk_size: value passed to RecursiveCharacterTextSplitter as chunk_size.
        chunk_overlap: value passed to RecursiveCharacterTextSplitter as chunk_overlap.

    Returns:
        A flat list of new Document objects, one per chunk, in input order.
        Each chunk gets its own copy of the source document's metadata.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunked_docs = []
    for doc in documents:
        for chunk in text_splitter.split_text(doc.page_content):
            # Copy the metadata so that mutating one chunk's metadata later does not
            # silently change its sibling chunks or the source document.
            chunked_docs.append(Document(page_content=chunk, metadata=dict(doc.metadata)))
    return chunked_docs


def create_vectorstore(documents, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """
    Creates a vectorstore by embedding document chunks using a local sentence-transformers model.

    Args:
        documents: Document objects to embed and index.
        model_name: model identifier handed to HuggingFaceEmbeddings. The default
            is the model this function previously hard-coded, so existing calls
            behave as before.

    Returns:
        The FAISS vectorstore built from the documents.
    """
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    vectorstore = FAISS.from_documents(documents, embeddings)
    return vectorstore