/
weaviate.py
88 lines (69 loc) · 2.41 KB
/
weaviate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import re
from typing import Any, Dict, List, TypedDict

from unstructured.documents.elements import ElementMetadata, Text
class Properties(TypedDict):
    """Shape of a single property entry in a Weaviate schema class definition."""

    # Name of the property as it is written into the schema.
    name: str
    # Weaviate data type names for the property, e.g. ["text"].
    dataType: List[str]
# Metadata keys that are dropped from the staged objects and skipped when the
# schema properties are generated from ElementMetadata.
exclude_metadata_keys = (
    "data_source",
    "coordinates",
)
def stage_for_weaviate(elements: List[Text]) -> List[Dict[str, Any]]:
    """Turn elements into plain dictionaries ready to be uploaded to Weaviate.

    Each dictionary starts from the element's metadata (minus the keys listed in
    exclude_metadata_keys) and gains the element's "text" and "category". The
    result matches the schema built by create_unstructured_weaviate_class.

    Parameters
    ----------
    elements: List[Text]
        The elements to convert.

    Returns
    -------
    List[Dict[str, Any]]
        One dictionary per element, in the same order as the input.

    References
    ----------
    https://weaviate.io/developers/weaviate/tutorials/import#batch-import-process
    """
    staged: List[Dict[str, Any]] = []
    for element in elements:
        record = element.metadata.to_dict()
        # Drop the metadata keys that are not part of the Weaviate schema.
        for excluded_key in exclude_metadata_keys:
            record.pop(excluded_key, None)
        record.update(text=element.text, category=element.category)
        staged.append(record)
    return staged
def create_unstructured_weaviate_class(class_name: str = "UnstructuredDocument"):
    """Build a Weaviate schema class describing Unstructured documents.

    The class always has "text" and "category" properties, followed by one
    property per annotated ElementMetadata field that is not listed in
    exclude_metadata_keys.

    Parameters
    ----------
    class_name: str
        The name to use for the Unstructured class in the schema.
        Defaults to "UnstructuredDocument".

    Returns
    -------
    dict
        A dictionary with the keys "class" (the class name) and "properties"
        (the list of property definitions).

    Raises
    ------
    ValueError
        Propagated from _annotation_to_weaviate_data_type when a metadata
        annotation has no Weaviate equivalent.

    References
    ----------
    https://weaviate.io/developers/weaviate/client-libraries/python#manual-batching
    """
    # Properties present on every staged element regardless of its metadata.
    base_properties: List[Properties] = [
        {"name": "text", "dataType": ["text"]},
        {"name": "category", "dataType": ["text"]},
    ]
    # One property per metadata field, typed from the field's annotation.
    metadata_properties: List[Properties] = [
        {
            "name": field_name,
            "dataType": _annotation_to_weaviate_data_type(field_annotation),
        }
        for field_name, field_annotation in ElementMetadata.__annotations__.items()
        if field_name not in exclude_metadata_keys
    ]
    return {
        "class": class_name,
        "properties": base_properties + metadata_properties,
    }
def _annotation_to_weaviate_data_type(annotation: str) -> List[str]:
    """Map the text of a type annotation to a Weaviate ``dataType`` list.

    The annotation is split into identifier tokens and matched on whole names.
    ``"Optional[int]"`` therefore maps to ``["int"]``, while a name that merely
    contains a type name as a substring (``"Optional[Point]"``,
    ``"Optional[Instrument]"``) is rejected instead of being silently mis-typed.

    Parameters
    ----------
    annotation: str
        The annotation source text, e.g. ``"Optional[str]"``.

    Returns
    -------
    List[str]
        A one-item list holding the Weaviate type name: "text", "int" or "date".

    Raises
    ------
    ValueError
        If the annotation mentions none of the supported type names.
    """
    # (identifiers that select the type, Weaviate type name). Order matters:
    # the first entry with a match wins, so str takes precedence over int,
    # which takes precedence over date.
    type_markers = (
        ({"str"}, "text"),
        ({"int"}, "int"),
        ({"date", "datetime"}, "date"),
    )
    # Whole identifiers only; dotted names such as "datetime.date" yield both parts.
    identifiers = set(re.findall(r"[A-Za-z_]\w*", annotation))
    for markers, weaviate_type in type_markers:
        if identifiers & markers:
            return [weaviate_type]
    raise ValueError(f"Annotation {annotation} does not map to a Weaviate dataType.")