/
baseplate.py
50 lines (38 loc) · 1.71 KB
/
baseplate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
from typing import Dict, List, TypedDict
from unstructured.documents.elements import Text
from unstructured.staging.base import flatten_dict
class BaseplateRow(TypedDict):
    """Shape of a single row in a Baseplate upload payload.

    The JSON layout Baseplate expects for a row is described in its docs:
    https://docs.baseplate.ai/api-reference/documents/overview
    """

    # Values for the row's dataset columns, keyed by column name.
    # NOTE(review): values are annotated as str; confirm the flattened element
    # dictionaries placed here never carry non-string values.
    data: Dict[str, str]
    # Values for the row's metadata columns, keyed by metadata column name.
    metadata: Dict[str, str]
class BaseplateRows(TypedDict):
    """Shape of the full payload submitted to the Baseplate API to upload data.

    It wraps a collection of individual rows under a single ``rows`` key. The
    expected JSON layout is described in the Baseplate docs:
    https://docs.baseplate.ai/api-reference/documents/overview
    """

    # All rows included in the upload, in the order they were staged.
    rows: List[BaseplateRow]
def stage_for_baseplate(elements: List[Text]) -> BaseplateRows:
    """Build a Baseplate upload payload from a list of unstructured elements.

    Each element is serialized with ``to_dict()``. Its ``"metadata"`` entry is
    split off and flattened into the row's ``metadata`` object; the remaining
    keys are flattened into the row's ``data`` object. The rows are returned
    under a single ``"rows"`` key, in the same order as ``elements``.

    Parameters
    ----------
    elements
        The unstructured elements to convert, one row per element.

    Returns
    -------
    BaseplateRows
        A dictionary of the form ``{"rows": [...]}`` ready to be uploaded into
        Baseplate via the API.

    References
    ----------
    https://docs.baseplate.ai/api-reference/documents/overview
    https://docs.baseplate.ai/api-reference/documents/upsert-data-rows
    """

    def _as_row(element: Text) -> BaseplateRow:
        # One element becomes one row: metadata is separated from the rest of
        # the serialized element before either half is flattened.
        payload = element.to_dict()
        element_metadata = payload.pop("metadata")
        # Baseplate maps each key of a row's data object to a dataset column and
        # each key of its metadata object to a metadata column. We infer that it
        # cannot map a nested object to a column, hence the flattening.
        return {
            "data": flatten_dict(payload),
            "metadata": flatten_dict(element_metadata),
        }

    return {"rows": [_as_row(element) for element in elements]}