Skip to content

Commit

Permalink
Move all management of dependencies into setup utils
Browse files Browse the repository at this point in the history
  • Loading branch information
rbiseck3 committed Apr 5, 2024
1 parent bdade98 commit f5e1cb9
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 116 deletions.
110 changes: 3 additions & 107 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,46 +20,9 @@

from setuptools import find_packages, setup

from setup_utils import load_requirements
from setup_utils import get_base_reqs, get_extras
from unstructured.__version__ import __version__

csv_reqs = load_requirements("requirements/extra-csv.in")
doc_reqs = load_requirements("requirements/extra-docx.in")
docx_reqs = load_requirements("requirements/extra-docx.in")
epub_reqs = load_requirements("requirements/extra-epub.in")
image_reqs = load_requirements("requirements/extra-pdf-image.in")
markdown_reqs = load_requirements("requirements/extra-markdown.in")
msg_reqs = load_requirements("requirements/extra-msg.in")
odt_reqs = load_requirements("requirements/extra-odt.in")
org_reqs = load_requirements("requirements/extra-pandoc.in")
pdf_reqs = load_requirements("requirements/extra-pdf-image.in")
ppt_reqs = load_requirements("requirements/extra-pptx.in")
pptx_reqs = load_requirements("requirements/extra-pptx.in")
rtf_reqs = load_requirements("requirements/extra-pandoc.in")
rst_reqs = load_requirements("requirements/extra-pandoc.in")
tsv_reqs = load_requirements("requirements/extra-csv.in")
xlsx_reqs = load_requirements("requirements/extra-xlsx.in")

all_doc_reqs = list(
set(
csv_reqs
+ docx_reqs
+ epub_reqs
+ image_reqs
+ markdown_reqs
+ msg_reqs
+ odt_reqs
+ org_reqs
+ pdf_reqs
+ pptx_reqs
+ rtf_reqs
+ rst_reqs
+ tsv_reqs
+ xlsx_reqs,
),
)


setup(
name="unstructured",
description="A library that prepares raw documents for downstream ML tasks.",
Expand Down Expand Up @@ -89,75 +52,8 @@
entry_points={
"console_scripts": ["unstructured-ingest=unstructured.ingest.main:main"],
},
install_requires=load_requirements(),
extras_require={
# Document specific extra requirements
"all-docs": all_doc_reqs,
"csv": csv_reqs,
"doc": doc_reqs,
"docx": docx_reqs,
"epub": epub_reqs,
"image": image_reqs,
"md": markdown_reqs,
"msg": msg_reqs,
"odt": odt_reqs,
"org": org_reqs,
"pdf": pdf_reqs,
"ppt": ppt_reqs,
"pptx": pptx_reqs,
"rtf": rtf_reqs,
"rst": rst_reqs,
"tsv": tsv_reqs,
"xlsx": xlsx_reqs,
# Extra requirements for data connectors
"airtable": load_requirements("requirements/ingest/airtable.in"),
"astra": load_requirements("requirements/ingest/astra.in"),
"azure": load_requirements("requirements/ingest/azure.in"),
"azure-cognitive-search": load_requirements(
"requirements/ingest/azure-cognitive-search.in",
),
"biomed": load_requirements("requirements/ingest/biomed.in"),
"box": load_requirements("requirements/ingest/box.in"),
"chroma": load_requirements("requirements/ingest/chroma.in"),
"clarifai": load_requirements("requirements/ingest/clarifai.in"),
"confluence": load_requirements("requirements/ingest/confluence.in"),
"delta-table": load_requirements("requirements/ingest/delta-table.in"),
"discord": load_requirements("requirements/ingest/discord.in"),
"dropbox": load_requirements("requirements/ingest/dropbox.in"),
"elasticsearch": load_requirements("requirements/ingest/elasticsearch.in"),
"gcs": load_requirements("requirements/ingest/gcs.in"),
"github": load_requirements("requirements/ingest/github.in"),
"gitlab": load_requirements("requirements/ingest/gitlab.in"),
"google-drive": load_requirements("requirements/ingest/google-drive.in"),
"hubspot": load_requirements("requirements/ingest/hubspot.in"),
"jira": load_requirements("requirements/ingest/jira.in"),
"mongodb": load_requirements("requirements/ingest/mongodb.in"),
"notion": load_requirements("requirements/ingest/notion.in"),
"onedrive": load_requirements("requirements/ingest/onedrive.in"),
"opensearch": load_requirements("requirements/ingest/opensearch.in"),
"outlook": load_requirements("requirements/ingest/outlook.in"),
"pinecone": load_requirements("requirements/ingest/pinecone.in"),
"postgres": load_requirements("requirements/ingest/postgres.in"),
"qdrant": load_requirements("requirements/ingest/qdrant.in"),
"reddit": load_requirements("requirements/ingest/reddit.in"),
"s3": load_requirements("requirements/ingest/s3.in"),
"sharepoint": load_requirements("requirements/ingest/sharepoint.in"),
"salesforce": load_requirements("requirements/ingest/salesforce.in"),
"sftp": load_requirements("requirements/ingest/sftp.in"),
"slack": load_requirements("requirements/ingest/slack.in"),
"wikipedia": load_requirements("requirements/ingest/wikipedia.in"),
"weaviate": load_requirements("requirements/ingest/weaviate.in"),
# Legacy extra requirements
"huggingface": load_requirements("requirements/huggingface.in"),
"local-inference": all_doc_reqs,
"paddleocr": load_requirements("requirements/extra-paddleocr.in"),
"embed-huggingface": load_requirements("requirements/ingest/embed-huggingface.in"),
"embed-octoai": load_requirements("requirements/ingest/embed-octoai.in"),
"embed-vertexai": load_requirements("requirements/ingest/embed-vertexai.in"),
"openai": load_requirements("requirements/ingest/embed-openai.in"),
"bedrock": load_requirements("requirements/ingest/embed-aws-bedrock.in"),
"databricks-volumes": load_requirements("requirements/ingest/databricks-volumes.in"),
},
install_requires=get_base_reqs(),
extras_require=get_extras(),
package_dir={"unstructured": "unstructured"},
package_data={"unstructured": ["nlp/*.txt"]},
)
103 changes: 96 additions & 7 deletions setup_utils.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
from pathlib import Path
from typing import List, Optional, Union
from typing import List, Union


def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List[str]:
if file_list is None:
file_list = ["requirements/base.in"]
if isinstance(file_list, str):
file_list = [file_list]
def load_requirements(file_list: Union[str, List[str]]) -> List[str]:
file_list = list(file_list)
requirements: List[str] = []
for file in file_list:
path = Path(file)
Expand All @@ -22,4 +19,96 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List
file_path = Path(file_dir) / file_spec
filenames.append(str(file_path.resolve()))
requirements.extend(load_requirements(file_list=filenames))
return list(set(requirements))
return list({r for r in requirements if r})


def get_base_reqs() -> List[str]:
    """Return the core dependencies that are always installed (requirements/base.in)."""
    return load_requirements("requirements/base.in")


def get_doc_reqs() -> dict[str, List[str]]:
    """Map each document-format extra to its list of requirements.

    Several extras intentionally share one requirements file (doc/docx,
    image/pdf, ppt/pptx, org/rtf/rst via pandoc, csv/tsv), so each shared
    file is parsed once and the resulting list reused, instead of re-reading
    the same file for every alias.

    Returns:
        Mapping of extra name (as used in ``pip install unstructured[<extra>]``)
        to its requirement specifier strings.
    """
    csv_reqs = load_requirements("requirements/extra-csv.in")
    docx_reqs = load_requirements("requirements/extra-docx.in")
    markdown_reqs = load_requirements("requirements/extra-markdown.in")
    pandoc_reqs = load_requirements("requirements/extra-pandoc.in")
    pdf_image_reqs = load_requirements("requirements/extra-pdf-image.in")
    pptx_reqs = load_requirements("requirements/extra-pptx.in")
    return {
        "csv": csv_reqs,
        "doc": docx_reqs,
        "docx": docx_reqs,
        "epub": load_requirements("requirements/extra-epub.in"),
        "image": pdf_image_reqs,
        "markdown": markdown_reqs,
        # Backward-compatible alias: the markdown extra was previously
        # published as "md" (see the prior setup.py extras_require), so keep
        # `pip install unstructured[md]` working.
        "md": markdown_reqs,
        "msg": load_requirements("requirements/extra-msg.in"),
        "odt": load_requirements("requirements/extra-odt.in"),
        "org": pandoc_reqs,
        "pdf": pdf_image_reqs,
        "ppt": pptx_reqs,
        "pptx": pptx_reqs,
        "rtf": pandoc_reqs,
        "rst": pandoc_reqs,
        "tsv": csv_reqs,
        "xlsx": load_requirements("requirements/extra-xlsx.in"),
    }


def get_all_doc_reqs() -> List[str]:
    """Return the deduplicated union of every document-format extra's requirements."""
    return list({req for extra_reqs in get_doc_reqs().values() for req in extra_reqs})


def get_connector_reqs() -> dict[str, List[str]]:
    """Map each data-connector extra (plus legacy extras) to its requirements."""

    def _ingest(stem: str) -> List[str]:
        # Connector requirement files live under requirements/ingest/.
        return load_requirements(f"requirements/ingest/{stem}.in")

    # Extras whose requirements file is requirements/ingest/<extra>.in.
    pattern_extras = [
        "airtable",
        "astra",
        "azure",
        "azure-cognitive-search",
        "biomed",
        "box",
        "chroma",
        "clarifai",
        "confluence",
        "delta-table",
        "discord",
        "dropbox",
        "elasticsearch",
        "gcs",
        "github",
        "gitlab",
        "google-drive",
        "hubspot",
        "jira",
        "mongodb",
        "notion",
        "onedrive",
        "opensearch",
        "outlook",
        "pinecone",
        "postgres",
        "qdrant",
        "reddit",
        "s3",
        "sharepoint",
        "salesforce",
        "sftp",
        "slack",
        "wikipedia",
        "weaviate",
        "embed-huggingface",
        "embed-octoai",
        "embed-vertexai",
        "databricks-volumes",
    ]
    reqs = {name: _ingest(name) for name in pattern_extras}
    # Legacy extras whose extra name does not match the requirements file stem.
    reqs["huggingface"] = load_requirements("requirements/huggingface.in")
    reqs["local-inference"] = get_all_doc_reqs()
    reqs["paddleocr"] = load_requirements("requirements/extra-paddleocr.in")
    reqs["openai"] = _ingest("embed-openai")
    reqs["bedrock"] = _ingest("embed-aws-bedrock")
    return reqs


def get_extras() -> dict[str, List[str]]:
    """Assemble the complete ``extras_require`` mapping for ``setup()``.

    Combines the per-document-format extras, the aggregate "all-docs" extra,
    and the data-connector extras into one mapping (key sets are disjoint).
    """
    return {
        **get_doc_reqs(),
        "all-docs": get_all_doc_reqs(),
        **get_connector_reqs(),
    }
2 changes: 1 addition & 1 deletion test_unstructured/files/other_reqs.in
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# This is a child dependency file in the same directory
sphinx
sphinx<4.3.2
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@
def test_load_requirements():
    """load_requirements resolves a file plus its referenced child files."""
    file = "./files/example.in"
    reqs = load_requirements(file_list=[file])
    # The child file pins sphinx, so the full specifier string ("sphinx<4.3.2")
    # must survive the round trip — not just the bare package name.
    desired_deps = ["torch", "httpx", "requests", "sphinx<4.3.2", "pandas"]
    assert len(reqs) == len(desired_deps)
    assert sorted(reqs) == sorted(desired_deps)

0 comments on commit f5e1cb9

Please sign in to comment.