From 946692420c459d8a6d6a463656b2176adfa56db0 Mon Sep 17 00:00:00 2001 From: Paul Cornell Date: Fri, 21 Mar 2025 15:06:15 -0700 Subject: [PATCH 1/3] Supported languages for Fast, High-Res, and VLM partitioning --- ui/partitioning.mdx | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/ui/partitioning.mdx b/ui/partitioning.mdx index 8f8447e3..cc5f5d1b 100644 --- a/ui/partitioning.mdx +++ b/ui/partitioning.mdx @@ -23,3 +23,33 @@ import PlatformPartitioningStrategies from '/snippets/general-shared-text/platfo +## Supported languages + +**Fast** and **High Res** use Tesseract OCR. For the list of languages that Tesseract supports, see +[Languages/Scripts supported in different versions of Tesseract](https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html). + +Language support for **VLM** depends on the model used. The list of supported languages for a particular model is maintained by +that model's provider. For the list of languages that each model supports, see the following, where provided: + +- Anthropic + + - Claude 3.5 Sonnet: Arabic, Bengali, Chinese (Simplified), English, French, German, Hindi, Indonesian, Italian, Japanese, Korean, + Portuguese (Brazil), Spanish, Swahili, and Yoruba are mentioned. ([Source](https://docs.anthropic.com/en/docs/build-with-claude/multilingual-support)) + +- OpenAI + + - GPT-4o: Arabic, Chinese, English, French, German, Gujarati, Hindi, Italian, Japanese, Korean, Marathi, Persian, + Portuguese, Russian, Spanish, Tamil, Telugu, Turkish, Urdu, and Vietnamese are mentioned. 
([Source](https://openai.com/index/hello-gpt-4o/)) + +- Amazon Bedrock + + - Claude 3.5 Sonnet: "English, Spanish, Japanese, and multiple other languages" ([Source](https://aws.amazon.com/bedrock/claude/)) + - Claude 3 Opus: "English, Spanish, Japanese, and multiple other languages" ([Source](https://aws.amazon.com/bedrock/claude/)) + - Claude 3 Haiku: "English, Spanish, Japanese, and multiple other languages" ([Source](https://aws.amazon.com/bedrock/claude/)) + - Claude 3 Sonnet: "English, Spanish, Japanese, and multiple other languages" ([Source](https://aws.amazon.com/bedrock/claude/)) + - Amazon Nova Pro: "200+ languages" ([Source](https://aws.amazon.com/ai/generative-ai/nova/)) + - Amazon Nova Lite: "200+ languages" ([Source](https://aws.amazon.com/ai/generative-ai/nova/)) + - Meta Llama 3.2 90B Instruct: "English, German, French, Italian, Portuguese, Hindi, Spanish, and Thai" ([Source](https://aws.amazon.com/bedrock/llama/)) + - Meta Llama 3.2 11B Instruct: "English, German, French, Italian, Portuguese, Hindi, Spanish, and Thai" ([Source](https://aws.amazon.com/bedrock/llama/)) + + From 78871c3746c5bcdc26c4eb03788a458750406f46 Mon Sep 17 00:00:00 2001 From: Paul-Cornell Date: Fri, 21 Mar 2025 15:33:53 -0700 Subject: [PATCH 2/3] Update ui/partitioning.mdx Co-authored-by: cragwolfe --- ui/partitioning.mdx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ui/partitioning.mdx b/ui/partitioning.mdx index cc5f5d1b..46f8fc50 100644 --- a/ui/partitioning.mdx +++ b/ui/partitioning.mdx @@ -25,7 +25,9 @@ import PlatformPartitioningStrategies from '/snippets/general-shared-text/platfo ## Supported languages -**Fast** and **High Res** use Tesseract OCR. For the list of languages that Tesseract supports, see +**Fast** partitioning accepts any text inputs, though automatic language detection of those inputs is restricted to [langdetect](https://pypi.org/project/langdetect/) + +**High Res** partitioning leverages Tesseract OCR. 
For the list of languages that Tesseract supports, see: [Languages/Scripts supported in different versions of Tesseract](https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html). Language support for **VLM** depends on the model used. The list of supported languages for a particular model is maintained by From 6fbe23ce631f038927fa104927db54d41e3e6d65 Mon Sep 17 00:00:00 2001 From: Paul-Cornell Date: Fri, 21 Mar 2025 15:34:20 -0700 Subject: [PATCH 3/3] Apply suggestions from code review --- ui/partitioning.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ui/partitioning.mdx b/ui/partitioning.mdx index 46f8fc50..728d0557 100644 --- a/ui/partitioning.mdx +++ b/ui/partitioning.mdx @@ -25,7 +25,7 @@ import PlatformPartitioningStrategies from '/snippets/general-shared-text/platfo ## Supported languages -**Fast** partitioning accepts any text inputs, though automatic language detection of those inputs is restricted to [langdetect](https://pypi.org/project/langdetect/) +**Fast** partitioning accepts any text input, though automatic language detection of that input is limited to the languages that [langdetect](https://pypi.org/project/langdetect/) supports. **High Res** partitioning leverages Tesseract OCR. For the list of languages that Tesseract supports, see: [Languages/Scripts supported in different versions of Tesseract](https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html).