# Notion Reader
Read Notion data and insert it into Pinecone.

In [None]:
!pip install llama_index
!pip install pinecone_client
!pip install langchain
!pip install sentence-transformers

In [8]:
import os
import logging
import sys
import requests
import pinecone
import torch
from pinecone import Pinecone
from IPython.display import Markdown, display
from langchain.llms.openai import OpenAIChat
from langchain_community.chat_models import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index.legacy import VectorStoreIndex
from llama_index.legacy.indices import SummaryIndex, GPTVectorStoreIndex
from llama_index.legacy.llms.openai import OpenAI
from llama_index.legacy.readers.notion import NotionPageReader
from llama_index.legacy.service_context import ServiceContext
from llama_index.legacy.storage.storage_context import StorageContext
from llama_index.legacy.vector_stores.pinecone import PineconeVectorStore

In [3]:
# Route log output to stdout so it shows up in the notebook cell output.
# basicConfig sets the root level to INFO (it is a no-op if the root logger
# already has handlers); the explicit handler below is attached regardless.
# NOTE(review): when basicConfig does install its own stdout handler, each
# record is emitted twice -- confirm whether that is intended.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
stdout_handler = logging.StreamHandler(stream=sys.stdout)
logging.getLogger().addHandler(stdout_handler)

In [4]:
# Copy the credentials stored as Colab secrets into environment variables.
# Each secret is looked up under the same name as the variable it populates.
from google.colab import userdata

for secret_name in ("OPENAI_API_KEY", "NOTION_INTEGRATION_TOKEN", "PINECONE_API_KEY"):
    os.environ[secret_name] = userdata.get(secret_name)

In [5]:
# Notebook-wide configuration values.

# Pinecone environment name (not referenced by the cells visible here).
PINECONE_ENVIRONMENT = "us-west1-gcp-free"

# Name of the Pinecone index that receives the document vectors.
PINECONE_INDEX_NAME = "notion-based-gpt"

# OpenAI model name consumed by the setup cell.
EMBEDDING_MODEL = "text-embedding-ada-002"

# ID of the Notion database that is queried for the pages to ingest.
NOTION_DATABASE_ID = "0bcee2b8bfa64ef9a6e2c93a890feeaf"

In [None]:
# Connect to the existing Pinecone index and expose it to llama_index as the
# vector store that backs the storage context.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index(PINECONE_INDEX_NAME)
vector_store = PineconeVectorStore(pinecone_index=index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# EMBEDDING_MODEL names an embeddings model, so it configures the embedder.
# It was previously handed to the chat LLM as its `model`, which would make any
# completion request target an embeddings-only model. The LLM now uses a chat
# model (the OpenAIChat library default, spelled out explicitly).
from llama_index.legacy.embeddings.openai import OpenAIEmbedding

llm = OpenAIChat(temperature=0, model_name='gpt-3.5-turbo')
embed_model = OpenAIEmbedding(model=EMBEDDING_MODEL)
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)

In [None]:
def create_document_data(id, data_type):
    """Load a Notion page or database and return the first resulting document.

    Args:
        id: Notion page ID when ``data_type`` is ``'text'``, or Notion
            database ID when it is ``'database'``.
        data_type: ``'text'`` or ``'database'``.

    Returns:
        The first document produced by ``NotionPageReader.load_data`` (any
        further documents are discarded), or ``None`` when ``data_type`` is
        neither ``'text'`` nor ``'database'``.

    Raises:
        IndexError: If the reader returns no documents.
    """
    # Translate the data type into the matching load_data keyword argument.
    if data_type == 'text':
        query = {'page_ids': [id]}
    elif data_type == 'database':
        query = {'database_id': id}
    else:
        # Unsupported type: bail out before touching the reader or the token.
        return None

    reader = NotionPageReader(integration_token=os.environ["NOTION_INTEGRATION_TOKEN"])
    return reader.load_data(**query)[0]

In [None]:
# Fetch Notion page IDs from a Notion database. This assumes a Notion database
# whose title column ("ID") holds the Notion page ID for each record.
notion_integration_token = os.environ["NOTION_INTEGRATION_TOKEN"]

headers = {
    "Authorization": f"Bearer {notion_integration_token}",
    "Content-Type": "application/json",
    "Notion-Version": "2022-06-28"
}

read_url = f"https://api.notion.com/v1/databases/{NOTION_DATABASE_ID}/query"


def _query_all_rows(url, request_headers, timeout=30):
    """Return every row of the database, following Notion's cursor pagination.

    Raises:
        RuntimeError: If any request returns a non-200 status. Failing here
            prevents the loop below from running on missing or stale data.
    """
    rows = []
    payload = {}
    while True:
        response = requests.post(url, headers=request_headers, json=payload, timeout=timeout)
        if response.status_code != 200:
            raise RuntimeError(
                f"Notion database query failed: {response.status_code} {response.text}"
            )
        page = response.json()
        rows.extend(page['results'])
        # A single response holds one page of results; keep going while the
        # API reports that more are available.
        if not page.get('has_more'):
            return rows
        payload = {'start_cursor': page['next_cursor']}


def _select_name(row, property_name):
    """Return the chosen option name of a select property, or None if unset."""
    selected = row['properties'][property_name]['select']
    return selected['name'] if selected else None


# Kept in the {'results': [...]} shape of a single Notion response so that any
# code reading data['results'] keeps working; it now holds all pages of rows.
data = {'results': _query_all_rows(read_url, headers)}

documents_list = []

for metadata in data['results']:
    # Only rows explicitly flagged for insertion are ingested.
    if _select_name(metadata, 'Insert Flag') != 'TRUE':
        continue

    title_parts = metadata['properties']['ID']['title']
    if not title_parts:
        logging.warning("Skipping Notion row %s: empty ID title", metadata['id'])
        continue
    page_id = title_parts[0]['text']['content']
    category = _select_name(metadata, 'Data Category')

    document_data = create_document_data(page_id, category)
    if document_data is None:
        # create_document_data returns None for categories it does not support.
        logging.warning(
            "Skipping Notion page %s: unsupported data category %r", page_id, category
        )
        continue

    if _select_name(metadata, 'Alternative Text') == 'TRUE':
        # The row's own page supplies the replacement text. Use the row's ID
        # from the API response: stripping the prefix off metadata['url']
        # leaves the title slug in front of the ID.
        # NOTE(review): the row is read with the same category as the target
        # page; confirm this is intended when the category is 'database'.
        alt_document_data = create_document_data(metadata['id'], category)
        vars(document_data)['text'] = vars(alt_document_data)['text']

    documents_list.append(document_data)

In [None]:
# Build a vector index from the collected documents; the storage context
# points at the Pinecone vector store, so the vectors are written there.
VectorStoreIndex.from_documents(
    documents=documents_list,
    storage_context=storage_context,
    service_context=service_context,
)