<img width="10%" alt="Naas" src="https://landen.imgix.net/jtci2pxwjczr/assets/5ice39g4.png?w=160"/>

# Notion - Get page
<a href="https://app.naas.ai/user-redirect/naas/downloader?url=https://raw.githubusercontent.com/jupyter-naas/awesome-notebooks/master/Notion/get_page.ipynb" target="_parent"><img src="https://img.shields.io/badge/-Open%20in%20Naas-success?labelColor=000000&logo="/></a>

# Input

### Import library

In [None]:
import requests
import pandas as pd
import json
from pprint import pprint

### Variables

In [None]:
# Notion integration token, sent as the Bearer token of the Authorization header
TOKEN_API = 'YOUR_TOKEN_API'

# URL of the Notion page to read (the page ID is parsed out of it)
PAGE_URL = 'YOUR_PAGE_URL'

# Value sent in the Notion-Version request header
_VERSION = '2021-08-16'

# Model

In [None]:
def create_headers(token_api, version):
    '''
    Build the HTTP headers sent with a Notion API request.

    input: the integration token and the Notion API version
    output: a dict holding the 'Authorization' (Bearer token) and
            'Notion-Version' headers, in that order
    '''
    headers = {}
    headers['Authorization'] = f'Bearer {token_api}'
    headers['Notion-Version'] = f'{version}'
    return headers

# Preview the headers built from the configured token and API version
create_headers(TOKEN_API, _VERSION)

In [None]:
def get_id_from_url(database_url):
    '''
    Extract the Notion object ID from a page or database URL.

    input: a Notion URL such as 'https://www.notion.so/My-Page-<id>?pvs=4',
           or a bare ID
    output: the text after the last '-' of the last path segment
            (the whole segment when it contains no '-')

    The query string, the fragment and a trailing '/' are dropped first so
    they cannot end up inside the ID.
    '''
    # Cut '?query' and '#fragment' before looking for the last hyphen:
    # they may themselves contain '-' or would otherwise stick to the ID.
    path = database_url.split('?', 1)[0].split('#', 1)[0].rstrip('/')
    # Only the last path segment carries the '<title>-<id>' slug.
    last_segment = path.rsplit('/', 1)[-1]
    return last_segment.rsplit('-', 1)[-1]

# Preview the ID parsed from the configured page URL
get_id_from_url(PAGE_URL)

### Get properties

In [None]:
# make a request to Notion API and receive a Python dictionary
def fetch_raw_properties(token_api, page_url):
    '''
    Request a page object from the Notion API.

    input: the integration token and the URL of the page
    output: the decoded JSON body of the response; when the response carries
            an HTTP error status, the requests.HTTPError is returned
            (not raised) instead
    '''
    endpoint = f'https://api.notion.com/v1/pages/{get_id_from_url(page_url)}'
    response = requests.get(endpoint, headers=create_headers(token_api, _VERSION))
    try:
        response.raise_for_status()
    except requests.HTTPError as http_error:
        # Hand the error back to the caller so it can be displayed
        outcome = http_error
    else:
        outcome = response.json()
    return outcome

# Fetch the page once and pretty-print the raw response.
# NOTE: on an HTTP error, `page` holds the HTTPError returned by fetch_raw_properties, not a dict.
page = fetch_raw_properties(TOKEN_API, PAGE_URL)
pprint(page)

In [None]:
def extract_text(dictionnary):
    '''
    Return the display text of a Notion object.

    The 'name' entry takes precedence over 'plain_text'; an empty string is
    returned when neither key is present.
    '''
    for text_key in ('name', 'plain_text'):
        if text_key in dictionnary:
            return dictionnary[text_key]
    return ''

def extract_date(dictionnary):
    '''
    Return the start of a Notion date value.

    Only the 'start' entry is read; an 'end' entry, if any, is ignored.
    Example: {'start': '2018-03-21', 'end': None} gives '2018-03-21'
    '''
    start = dictionnary['start']
    return start
    
def extract_data(element):
    '''
    Flatten one Notion property into a plain value.

    input: a dictionary of a Notion property
    Example: {'id': 'W#4k', 'type': 'select', 'select': {'id': 'b305bd26-****-****-****-c78e2034db8f', 'name': 'Client', 'color': 'green'}}
    output: the information held by the property ('Client' in the example)

    - anything that is not a dict, or a dict without a usable 'type' entry, gives ''
    - a dict payload gives its start date ('date' type) or its name / plain text
    - a list payload gives the texts of its items: rich text fragments
      ('title', 'rich_text') are concatenated as-is, other lists are
      comma-separated
    - any other payload (number, boolean, string, None, ...) is returned unchanged
    '''
    if not isinstance(element, dict):
        return ''

    dict_type = element.get('type')
    # The payload lives under the key named by 'type'; without it there is
    # nothing to extract.
    if dict_type not in element:
        return ''
    informations = element[dict_type]

    if isinstance(informations, dict):
        if dict_type == 'date':
            return extract_date(informations)
        return extract_text(informations)

    if isinstance(informations, list):
        texts = [extract_text(elm) for elm in informations]
        # Rich text is split into one fragment per formatting run, so the
        # fragments of a text must be glued back together without separator;
        # the other lists (multi-select options, people, ...) hold distinct
        # items.
        separator = '' if dict_type in ('title', 'rich_text') else ','
        return separator.join(texts)

    return informations


def extract_properties(dictionary):
    '''
    Flatten every property of a Notion page object.

    input: a page object with a 'properties' mapping
    output: a dict mapping each property name to the value extract_data
            returns for it
    '''
    flattened = {}
    for name, notion_property in dictionary['properties'].items():
        flattened[name] = extract_data(notion_property)
    return flattened
 
# Preview the flattened properties of the page fetched above
extract_properties(page)

In [None]:
def clean_meta_data(dictionary):
    '''
    Keep the metadata of a Notion page object and flatten its parent.

    input: a page object; it must contain a 'parent' dict whose 'type' entry
           names the key holding the parent identifier
    output: a new dict without the 'url', 'object', 'parent' and 'properties'
            entries, completed with 'PARENT_TYPE' and 'PARENT_ID'

    The input is not modified, and entries to drop that are already absent
    are simply ignored.
    '''
    useless_meta = ('url', 'object', 'parent', 'properties')
    meta_data = {
        key: value for key, value in dictionary.items() if key not in useless_meta
    }

    parent = dictionary['parent']
    meta_data['PARENT_TYPE'] = parent['type']
    meta_data['PARENT_ID'] = parent[parent['type']]
    return meta_data

# Preview the metadata kept for the page fetched above
clean_meta_data(page)

In [None]:
def convert_keys_to_upper(dictionary):
    '''
    Return a copy of a dict whose keys are upper-cased.

    Values are kept as they are; when two keys collide once upper-cased,
    the value of the later one wins.
    '''
    upper_cased = {}
    for name, content in dictionary.items():
        upper_cased[name.upper()] = content
    return upper_cased

In [None]:
def get_page_properties(token_api, page_url):
    '''
    Fetch a Notion page and return its properties as a table.

    input: the integration token and the URL of the page
    output: a single-row DataFrame whose upper-cased columns are the page
            properties followed by the page metadata; on a name clash the
            metadata value replaces the property value
    '''
    page = fetch_raw_properties(token_api, page_url)
    merged = {**extract_properties(page), **clean_meta_data(page)}
    return pd.DataFrame([convert_keys_to_upper(merged)])

# Preview the properties table of the configured page
get_page_properties(TOKEN_API, PAGE_URL)

### Get content
👉  The content of a page is returned as an array of blocks by the Notion API
```json
{
  "object": "block",
  "id": "9bc30ad4-9373-46a5-84ab-0a7845ee52e6",
  "created_time": "2021-03-16T16:31:00.000Z",
  "last_edited_time": "2021-03-16T16:32:00.000Z",
  "has_children": false,
  "type": "to_do",
  "to_do": {
    "text": [
      {
        "type": "text",
        "text": {
          "content": "Lacinato kale",
          "link": null
        },
        "annotations": {
          "bold": false,
          "italic": false,
          "strikethrough": false,
          "underline": false,
          "code": false,
          "color": "default"
        },
        "plain_text": "Lacinato kale",
        "href": null
      }
    ],
    "checked": false
  }
}
```

Each block is a dictionary with different keys:
- id *(str)*
- has_children *(bool)*
- created_time *(str)*
- last_edited_time *(str)*
- type *(str)*
- {type} *(dict)*



{type} is an object with type-specific block information<br>
List of block types:
- paragraph
- heading(1,2,3)
- bullet list item
- numbered list item
- to_do_blocks
- toggle block
- child page block

More info here: https://developers.notion.com/reference/block 

🚨 BE AWARE OF:
- I can't retrieve the children of a block: the behaviour is not the same as the one described for the Block object, which looks like a bug in the API
- Some information is lost. Example: the color of the text and the links
- Blank blocks are counted as paragraphs: we may need to create a new category for them or remove them from the result

In [None]:
def fetch_raw_blocks(token_api, page_url):
    '''
    Request the child blocks of a page from the Notion API.

    input: the integration token and the URL of the page
    output: the list of block objects of the page; when a response carries
            an HTTP error status, the requests.HTTPError is returned
            (not raised) instead

    The endpoint is paginated: result pages are requested one after the
    other, following 'next_cursor' for as long as the API reports 'has_more'.
    '''
    page_id = get_id_from_url(page_url)
    url = f'https://api.notion.com/v1/blocks/{page_id}/children'
    headers = create_headers(token_api, _VERSION)

    blocks = []
    params = {}
    while True:
        res = requests.get(url, headers=headers, params=params)
        try:
            res.raise_for_status()
        except requests.HTTPError as e:
            return e

        payload = res.json()
        blocks.extend(payload['results'])

        next_cursor = payload.get('next_cursor')
        # Stop on the last result page; a missing cursor also stops the loop
        # so that an inconsistent response cannot make it spin forever.
        if not payload.get('has_more') or not next_cursor:
            return blocks
        params = {'start_cursor': next_cursor}

# Fetch the blocks of the page and pretty-print the first one (assumes the page has at least one block)
blocks = fetch_raw_blocks(TOKEN_API, PAGE_URL)
pprint(blocks[0])

In [None]:
def extract_text_from_rich_text(rich_text):
    '''Return the 'plain_text' entry of one rich text object.'''
    plain_text = rich_text['plain_text']
    return plain_text

def extract_text_from_array_of_rich_text(array):
    '''
    Return the text carried by a list of rich text objects.

    input: a list of rich text objects, each with a 'plain_text' entry
    output: the 'plain_text' of all the items, concatenated in order
            ('' for an empty list)

    A text is split into one rich text object per formatting run, so the
    pieces are glued back together without any separator: adding one would
    insert spaces that are not in the original text.
    '''
    return ''.join(rich_text['plain_text'] for rich_text in array)

def extract_block_content(block):
    '''
    Return the text content of a Notion block.

    input: a block object; its 'type' entry names the key holding the
           type-specific data
    output: - headings, paragraphs, list items, to-dos and toggles: the text
              of their 'text' list
            - child pages: their 'title'
            - any other block type: None
    '''
    kind = block['type']

    holds_rich_text = (
        kind.startswith('heading')
        or kind in ('paragraph', 'to_do', 'toggle')
        or kind.endswith('list_item')
    )
    if holds_rich_text:
        return extract_text_from_array_of_rich_text(block[kind]['text'])

    if kind == 'child_page':
        return block[kind]['title']

    return None
        

# Try the extraction on the first block fetched above
first_block = blocks[0]
extract_block_content(first_block)

In [None]:
def get_page_content(TOKEN_API, PAGE_URL):
    '''
    Fetch the blocks of a Notion page and return them as a table.

    input: the integration token and the URL of the page
    output: a DataFrame with one row per block; each row keeps the block
            entries except 'object' and the type-specific one, adds the
            extracted 'content', and has its keys upper-cased
    '''
    rows = []
    for raw_block in fetch_raw_blocks(TOKEN_API, PAGE_URL):
        # The content must be read before the type-specific entry is removed
        raw_block['content'] = extract_block_content(raw_block)
        del raw_block[raw_block['type']]
        del raw_block['object']
        rows.append(convert_keys_to_upper(raw_block))

    return pd.DataFrame(rows)

# Preview the content table of the configured page
get_page_content(TOKEN_API, PAGE_URL)

---
# Output

### 1. Get properties : Table format
- PROPERTIES (uppercase + unstacked)
- ID 
- PARENT_TYPE
- PARENT_ID
- CREATED_TIME
- LAST_EDITED_TIME
- ARCHIVED

In [None]:
# Page properties as a single-row DataFrame
get_page_properties(TOKEN_API, PAGE_URL)

### 2. Get content : Table format

- TYPE
- CONTENT (the "plain_text" of the block; when the "text" list holds several items, their "plain_text" values are joined)
- ID
- HAS_CHILDREN
- CREATED_TIME
- LAST_EDITED_TIME

In [None]:
# Page content as a DataFrame with one row per block
get_page_content(TOKEN_API, PAGE_URL)