diff --git a/snippets/general-shared-text/sharepoint-api-placeholders.mdx b/snippets/general-shared-text/sharepoint-api-placeholders.mdx index 659996a8..f23b93ac 100644 --- a/snippets/general-shared-text/sharepoint-api-placeholders.mdx +++ b/snippets/general-shared-text/sharepoint-api-placeholders.mdx @@ -1,6 +1,9 @@ - `` (_required_) - A unique name for this connector. - `` (_required_) - The client ID provided by SharePoint for the app registration. - `` (_required_) - The base URL of the SharePoint site to connect to. -- `` (_required_) - The client secret associated with the client ID. +- `` (_required_) - The **Directory (tenant) ID** for the Microsoft Entra ID app registration with the correct set of Microsoft Graph access permissions. +- `` - The authentication token provider URL for the Entra ID app registration. The default is `https://login.microsoftonline.com`. +- `` (_required_) - The UPN for the OneDrive account in the Entra ID tenant. +- `` (_required_) - The **Client secret** for the Entra ID app registration. - `` - The path from which to start parsing files. The default is `Shared Documents` if not otherwise specified. -- For `recursive` (source connector only), set to `true` to recursively process data from subfolders within the specified path. The default is `false` if not otherwise specified. \ No newline at end of file +- For `recursive`, set to `true` to recursively process data from subfolders within the specified path. The default is `false` if not otherwise specified.
\ No newline at end of file diff --git a/snippets/general-shared-text/sharepoint-cli-api.mdx b/snippets/general-shared-text/sharepoint-cli-api.mdx index 262ce09e..54178fd0 100644 --- a/snippets/general-shared-text/sharepoint-cli-api.mdx +++ b/snippets/general-shared-text/sharepoint-cli-api.mdx @@ -10,13 +10,10 @@ import AdditionalIngestDependencies from '/snippets/general-shared-text/ingest-d The following environment variables: -- `SHAREPOINT_APP_CLIENT_ID` - The application (client) ID for the SharePoint app principal, represented by `--client-id` (CLI) or `client_id` (Python). -- `SHAREPOINT_APP_CLIENT_SECRET` - The client secret for the SharePoint app principal, represented by `--client-cred` (CLI) or `client_cred` (Python). -- `SHAREPOINT_SITE` - The SharePoint site URL, represented by `--site` (CLI) or `site` (Python). -- `SHAREPOINT_PATH` - The path in the SharePoint site from which to start parsing files, represented by `--path` (CLI) or `path` (Python). - -{/* -- `SHAREPOINT_APP_PERMISSIONS_CLIENT_ID` - The associated Azure application (client) ID, represented by `--permissions-application-id` (CLI) or `permissions_application_id` (Python). -- `SHAREPOINT_APP_PERMISSIONS_CLIENT_SECRET` - The client secret for the Azure application, represented by `--permissions-client-cred` (CLI) or `permissions_client_cred` (Python). -- `SHAREPOINT_APP_PERMISSIONS_TENANT` - The domian name of the tenant for the Azure application, which is typically `.onmicrosoft.com`, and which is represented by `--permissions-tenant` (CLI) or `permissions_tenant` (Python). - */} +- `ENTRA_ID_USER_PRINCIPAL_NAME` - The User Principal Name (UPN) for the target OneDrive account in the Microsoft Entra ID tenant, represented by `--user-pname` (CLI) or `user_pname` (Python). +- `SHAREPOINT_SITE_URL` - The SharePoint site URL, represented by `--site` (CLI) or `site` (Python). +- `SHAREPOINT_SITE_PATH` - The path in the SharePoint site from which to start parsing files, represented by `--path` (CLI) or `path` (Python).
+- `ENTRA_ID_APP_CLIENT_ID` - The **Application (client) ID** value for the Microsoft Entra ID app registration, represented by `--client-id` (CLI) or `client_id` (Python). +- `ENTRA_ID_APP_TENANT_ID` - The **Directory (tenant) ID** value for the Entra ID app registration, represented by `--tenant` (CLI) or `tenant` (Python). +- `ENTRA_ID_APP_CLIENT_SECRET` - The **Client secret** value for the Entra ID app registration, represented by `--client-cred` (CLI) or `client_cred` (Python). +- `ENTRA_ID_TOKEN_AUTHORITY_URL` - The token authority URL for the Entra ID app registration (which is typically `https://login.microsoftonline.com`), represented by `--authority-url` (CLI) or `authority_url` (Python). \ No newline at end of file diff --git a/snippets/general-shared-text/sharepoint-platform.mdx b/snippets/general-shared-text/sharepoint-platform.mdx index d0181e09..f65c8c61 100644 --- a/snippets/general-shared-text/sharepoint-platform.mdx +++ b/snippets/general-shared-text/sharepoint-platform.mdx @@ -3,6 +3,9 @@ Fill in the following fields: - **Name** (_required_): A unique name for this connector. - **Site URL** (_required_): The base URL of the SharePoint site to connect to. - **Path** (_required_): The path from which to start parsing files, for example `Shared Documents`. -- **Recursive** (source connector only): Check this box to recursively process data from subfolders within the specified path. -- **Client ID** (_required_): The client ID provided by SharePoint for the app principal. -- **Client Credentials** (_required_): The client secret associated with the client ID. \ No newline at end of file +- **Recursive**: Check this box to recursively process data from subfolders within the specified path. +- **Client ID** (_required_): The **Application (client) ID** for the Microsoft Entra ID app registration with the correct set of Microsoft Graph access permissions.
+- **Tenant ID** (_required_): The **Directory (tenant) ID** for the Entra ID app registration. +- **User Principal Name (UPN)** (_required_): The UPN for the OneDrive account in the Entra ID tenant. +- **Client Credentials** (_required_): The **Client secret** for the Entra ID app registration. +- **Authority URL** (_required_): The authentication token provider URL for the Entra ID app registration. The default is `https://login.microsoftonline.com`. diff --git a/snippets/general-shared-text/sharepoint.mdx b/snippets/general-shared-text/sharepoint.mdx index debd7be9..a5c874cd 100644 --- a/snippets/general-shared-text/sharepoint.mdx +++ b/snippets/general-shared-text/sharepoint.mdx @@ -1,117 +1,109 @@ - - -- The SharePoint site URL. + + If you are setting up the SharePoint connector for the first time, you can skip past this note. + + Previous versions of the SharePoint connector relied on SharePoint app principals for authentication. Current versions of the + SharePoint connector no longer support these SharePoint app principals. Microsoft deprecated support for SharePoint app principals on November 27, 2023. + SharePoint app principals will no longer work for SharePoint tenants that were created on or after November 1, 2024, and they will stop working + for all SharePoint tenants as of April 2, 2026. [Learn more](https://learn.microsoft.com/sharepoint/dev/sp-add-ins/retirement-announcement-for-azure-acs). + + Current versions of the SharePoint connector now rely on Microsoft Entra ID app registrations for authentication. + + To migrate from SharePoint app principals to Entra ID app registrations, replace the following settings in your existing SharePoint connector, + as listed in the requirements following this note: + + - Replace the deprecated SharePoint app principal's application client ID value with your replacement Entra ID app registration's **Application (client) ID** value.
+ - Replace the deprecated SharePoint app principal's client secret value with your replacement Entra ID app registration's **Client secret** value. + - Add your replacement Entra ID app registration's **Directory (tenant) ID** value, token authority URL value, and the correct set of Microsoft Graph access permissions for SharePoint Online. + + If you need migration help, get assistance from our [Slack community](https://short.unstructured.io/pzw05l7) or [contact us](https://unstructured.io/contact) directly. + + +- A SharePoint Online plan, or a Microsoft 365 or Office 365 Business or enterprise plan that includes SharePoint Online. + [Learn more](https://www.microsoft.com/en-us/microsoft-365/SharePoint/compare-SharePoint-plans). + [Shop for business plans](https://www.microsoft.com/microsoft-365/business/compare-all-microsoft-365-business-products). + [Shop for enterprise plans](https://www.microsoft.com/microsoft-365/enterprise/microsoft365-plans-and-pricing). +- A OneDrive for business plan, or a Microsoft 365 or Office 365 Business or enterprise plan that includes OneDrive. + (Even if you only plan to use SharePoint Online, you still need a plan that includes OneDrive, because the SharePoint connector is built on OneDrive technology.) + [Learn more](https://www.microsoft.com/microsoft-365/onedrive/compare-onedrive-plans). + [Shop for business plans](https://www.microsoft.com/microsoft-365/business/compare-all-microsoft-365-business-products). + [Shop for enterprise plans](https://www.microsoft.com/microsoft-365/enterprise/microsoft365-plans-and-pricing). + OneDrive personal accounts, and Microsoft 365 Free, Basic, Personal, and Family plans are not supported. +- The SharePoint Online and OneDrive plans must share the same Microsoft Entra ID tenant. + [Learn more](https://learn.microsoft.com/microsoft-365/enterprise/subscriptions-licenses-accounts-and-tenants-for-microsoft-cloud-offerings?view=o365-worldwide). 
+- The User Principal Name (UPN) for the OneDrive account in the Microsoft Entra ID tenant. This is typically the OneDrive account user's email address. To find a UPN: + + 1. Depending on your plan, sign in to your Microsoft 365 admin center (typically [https://admin.microsoft.com](https://admin.microsoft.com)) using your administrator credentials, + or sign in to your Office 365 portal (typically [https://portal.office.com](https://portal.office.com)) using your credentials. + 2. In the **Users** section, click **Active users**. + 3. Locate the user account in the list of active users. + 4. The UPN is displayed in the **Username** column. + + The following video shows how to get a UPN: + + + +- The SharePoint Online site URL. - Site collection-level URLs typically have the format `https://.sharepoint.com/sites/`. - Root site collection-level URLs typically have the format `https://.sharepoint.com`. - - To process all sites within a tenant, use a site URL of `https://-admin.sharepoint.com`. + - To process all sites within a SharePoint tenant, use a site URL of `https://-admin.sharepoint.com`. [Learn more](https://learn.microsoft.com/microsoft-365/community/query-string-url-tricks-sharepoint-m365). -- The path in the SharePoint site from which to start parsing files, for example `"Shared Documents"`. If the connector is to process all sites within the tenant, this filter will be applied to all site document libraries. -- A SharePoint app principal with its application (client) ID, client secret, and the appropriate access permissions. - - Complete the steps in the following sections, depending on whether you want to access sites at the site collection level, the - root site collection level, or all sites within a tenant. - - - Two of the main factors in the following sections are the scope of access - and the level of administrative permissions required to create the app principal. 
Tenant-wide app principals offer the broadest access - but require the highest level of administrative rights, while site collection app principals are more restricted but can be created by users - with lower-level permissions. - - -## Tenant-wide SharePoint app principals - -Create a tenant-wide SharePoint app principal when you want the power and flexibility of a principal that can process all sites within a tenant. - -SharePoint app principals that are created in the SharePoint admin center have tenant-wide scope and can potentially access all sites within the tenant. -Only global or SharePoint administrators typically have access to the following URLs. - -1. To create a tenant-wide SharePoint app principal and then get its client ID and client secret, go to the following URL: - - `https://-admin.sharepoint.com/_layouts/15/appregnew.aspx` - -2. To add access permissions to a tenant-wide SharePoint app principal and then get its client ID and client secret, go to the following URL: - - `https://.sharepoint.com/_layouts/15/appinv.aspx` - -3. Apply the following permissions XML to the tenant-wide SharePoint app principal: - - ```xml - - - - ``` - Available `Right` settings include `Read`, `Write`, `Manage`, and `FullControl`. To learn more, see - [Add-in permissions in SharePoint](https://learn.microsoft.com/sharepoint/dev/sp-add-ins/add-in-permissions-in-sharepoint). - -[Learn how to complete these preceding steps](https://github.com/vgrem/Office365-REST-Python-Client/wiki/How-to-connect-to-SharePoint-Online-and-and-SharePoint-2013-2016-2019-on-premises--with-app-principal). -Be sure to substitute the URLs and XML in the linked article with the ones in these preceding steps accordingly. - -## Root site collection-level SharePoint app principals - -Create a root site collection-level SharePoint app principal when you want a principal that can only access a root site collection, for example with a URL -that has the format `https://.sharepoint.com`. 
- -SharePoint app principals that are created at the root site collection level have a scope limited to the root site collection. Site collection administrators can usually access the following URLs. - -1. To create a root site collection-level SharePoint app principal and then get its client ID and client secret, go to the following URL: - - `https://.sharepoint.com/_layouts/15/appregnew.aspx` - -2. To add access permissions to a root site collection-level SharePoint app principal, go to the following URL: - - `https://.sharepoint.com/_layouts/15/appinv.aspx` - -3. Apply the following permissions XML to the root site collection-level SharePoint app principal: - - ```xml - - - - ``` - - Available `Right` settings include `Read`, `Write`, `Manage`, and `FullControl`. To learn more, see - [Add-in permissions in SharePoint](https://learn.microsoft.com/sharepoint/dev/sp-add-ins/add-in-permissions-in-sharepoint). - -[Learn how to complete these preceding steps](https://github.com/vgrem/Office365-REST-Python-Client/wiki/How-to-connect-to-SharePoint-Online-and-and-SharePoint-2013-2016-2019-on-premises--with-app-principal). -Be sure to substitute the URLs and XML in the linked article with the ones in these preceding steps accordingly. - -## Site collection-level SharePoint app principals - -Create a site collection-level SharePoint app principal when you want a principal that can only access a specific site collection, for example with a URL -that has or starts with the format `https://.sharepoint.com/sites/`. - -SharePoint app principals that are created at the site collection level have the most limited scope, restricted to the specific subsite and its subsites. -Site owners or those with appropriate permissions on the subsite can access the following URLs. - -1. To create a site collection-level SharePoint app principal, go to the following URL: - - `https://.sharepoint.com/sites//_layouts/15/appregnew.aspx` - -2. 
To add access permissions to a site collection-level SharePoint app principal, go to the following URL: - - `https://.sharepoint.com/sites//_layouts/15/appinv.aspx` - -3. Apply the following permissions XML to the site collection-level SharePoint app principal: - - ```xml - - - - ``` - - Available `Right` settings include `Read`, `Write`, `Manage`, and `FullControl`. To learn more, see - [Add-in permissions in SharePoint](https://learn.microsoft.com/sharepoint/dev/sp-add-ins/add-in-permissions-in-sharepoint). - -[Learn how to complete these preceding steps](https://github.com/vgrem/Office365-REST-Python-Client/wiki/How-to-connect-to-SharePoint-Online-and-and-SharePoint-2013-2016-2019-on-premises--with-app-principal). -Be sure to substitute the URLs and XML in the linked article with the ones in these preceding steps accordingly. +- The path in the SharePoint Online site from which to start parsing files, for example `"Shared Documents"`. If the SharePoint connector is to process all sites within the tenant, this filter will be applied to all site document libraries. + + The following video shows how to get the site URL and a path within the site: + + + +- The **Application (client) ID**, **Directory (tenant) ID**, and **Client secret** for the Microsoft Entra ID app registration with + the correct set of Microsoft Graph access permissions. These permissions include: + + - `Sites.ReadWrite.All` (if both reading and writing are needed) + - `User.Read.All` + [Learn more](https://learn.microsoft.com/answers/questions/2116616/service-principal-access-to-sharepoint-online). + 1. [Create an Entra ID app registration](https://learn.microsoft.com/entra/identity-platform/quickstart-register-app?pivots=portal). + 2. [Add Graph access permissions to an app registration](https://learn.microsoft.com/entra/identity-platform/howto-update-permissions?pivots=portal#add-permissions-to-an-application). + 3. 
[Grant consent for the added Graph permissions](https://learn.microsoft.com/entra/identity-platform/howto-update-permissions?pivots=portal#grant-consent-for-the-added-permissions-for-the-enterprise-application). + + The following video shows how to create an Entra ID app registration: + + + + The following video shows how to add the correct set of Graph access permissions to the Entra ID app registration: + + + +- The token authority URL for your Microsoft Entra ID app registration. This is typically `https://login.microsoftonline.com`. \ No newline at end of file diff --git a/snippets/source_connectors/sharepoint.sh.mdx b/snippets/source_connectors/sharepoint.sh.mdx index 687ffd69..10116867 100644 --- a/snippets/source_connectors/sharepoint.sh.mdx +++ b/snippets/source_connectors/sharepoint.sh.mdx @@ -3,19 +3,19 @@ unstructured-ingest \ sharepoint \ - --client-id $SHAREPOINT_APP_CLIENT_ID \ - --client-cred $SHAREPOINT_APP_CLIENT_SECRET \ - --site $SHAREPOINT_SITE \ - --path $SHAREPOINT_PATH \ - --no-omit-files \ - --omit-pages \ - --omit-lists \ - --output-dir $LOCAL_FILE_OUTPUT_DIR \ - --num-processes 2 \ - --verbose \ + --client-cred $ENTRA_ID_APP_CLIENT_SECRET \ + --client-id $ENTRA_ID_APP_CLIENT_ID \ + --user-pname $ENTRA_ID_USER_PRINCIPAL_NAME \ + --tenant $ENTRA_ID_APP_TENANT_ID \ + --authority-url $ENTRA_ID_TOKEN_AUTHORITY_URL \ + --site $SHAREPOINT_SITE_URL \ + --path $SHAREPOINT_SITE_PATH \ + --recursive \ + --download-dir $LOCAL_FILE_DOWNLOAD_DIR\ --partition-by-api \ --api-key $UNSTRUCTURED_API_KEY \ --partition-endpoint $UNSTRUCTURED_API_URL \ --strategy hi_res \ + --output-dir $LOCAL_FILE_OUTPUT_DIR \ --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" ``` diff --git a/snippets/source_connectors/sharepoint.v2.py.mdx b/snippets/source_connectors/sharepoint.v2.py.mdx index e75d8346..b4f925b0 100644 --- a/snippets/source_connectors/sharepoint.v2.py.mdx +++
b/snippets/source_connectors/sharepoint.v2.py.mdx @@ -3,18 +3,15 @@ import os from unstructured_ingest.v2.pipeline.pipeline import Pipeline from unstructured_ingest.v2.interfaces import ProcessorConfig -from pydantic import Secret from unstructured_ingest.v2.processes.connectors.sharepoint import ( SharepointIndexerConfig, SharepointDownloaderConfig, SharepointConnectionConfig, - SharepointAccessConfig, - SharepointPermissionsConfig + SharepointAccessConfig ) from unstructured_ingest.v2.processes.connectors.local import ( - LocalConnectionConfig, LocalUploaderConfig ) @@ -28,17 +25,19 @@ if __name__ == "__main__": Pipeline.from_configs( context=ProcessorConfig(), indexer_config=SharepointIndexerConfig( - path=os.getenv("SHAREPOINT_PATH"), - recursive=True, # Recursively process files in their respective folders. (False (default) for no recursion.) - omit_lists=True, # Do not process lists. (False (default) to process lists.) - omit_pages=True, # Do not process site pages. (False (default) to process site pages.) - omit_files=False # Process files (default). (True to not process files.) + path=os.getenv("SHAREPOINT_SITE_PATH"), + recursive=True # True to recursively download files in their respective folders. 
), downloader_config=SharepointDownloaderConfig(download_dir=os.getenv("LOCAL_FILE_DOWNLOAD_DIR")), source_connection_config=SharepointConnectionConfig( - access_config=SharepointAccessConfig(client_cred=os.getenv("SHAREPOINT_APP_CLIENT_SECRET")), - client_id=os.getenv("SHAREPOINT_APP_CLIENT_ID"), - site=os.getenv("SHAREPOINT_SITE"), + access_config=SharepointAccessConfig( + client_cred=os.getenv("ENTRA_ID_APP_CLIENT_SECRET") + ), + client_id=os.getenv("ENTRA_ID_APP_CLIENT_ID"), + user_pname=os.getenv("ENTRA_ID_USER_PRINCIPAL_NAME"), + tenant=os.getenv("ENTRA_ID_APP_TENANT_ID"), + authority_url=os.getenv("ENTRA_ID_TOKEN_AUTHORITY_URL"), + site=os.getenv("SHAREPOINT_SITE_URL") ), partitioner_config=PartitionerConfig( partition_by_api=True, diff --git a/snippets/source_connectors/sharepoint_rest_change.mdx b/snippets/source_connectors/sharepoint_rest_change.mdx index 265f979b..1e667f01 100644 --- a/snippets/source_connectors/sharepoint_rest_change.mdx +++ b/snippets/source_connectors/sharepoint_rest_change.mdx @@ -7,9 +7,12 @@ curl --request 'PUT' --location \ --data \ '{ "config": { - "client_id": "", + "client_id": "", "site": "", - "client_cred": "" + "tenant": "", + "authority_url": "", + "user_pname": "", + "client_cred": "", "path": "", "recursive": } diff --git a/snippets/source_connectors/sharepoint_rest_create.mdx b/snippets/source_connectors/sharepoint_rest_create.mdx index 5b5e28f6..005fc2a0 100644 --- a/snippets/source_connectors/sharepoint_rest_create.mdx +++ b/snippets/source_connectors/sharepoint_rest_create.mdx @@ -9,9 +9,12 @@ curl --request 'POST' --location \ "name": "", "type": "sharepoint", "config": { - "client_id": "", + "client_id": "", "site": "", - "client_cred": "" + "tenant": "", + "authority_url": "", + "user_pname": "", + "client_cred": "", "path": "", "recursive": }