Added documentation and readme

Sicos1977 · May 4, 2022 · 76de6ff · 76de6ff
1 parent cfa1abc
commit 76de6ff
Show file tree

Hide file tree

Showing 2 changed files with 183 additions and 3 deletions.
diff --git a/TesseractOCR/README.md b/TesseractOCR/README.md
@@ -0,0 +1,174 @@
+![image](https://user-images.githubusercontent.com/6692947/150184680-1ae82d62-891e-4dbd-b52b-e975c57f9761.png)
+
+
+What is TesseractOCR
+=========
+
+It is a .NET wrapper for Tesseract 5.1.0 that was originally copied from Charles Weld (https://github.com/charlesw/tesseract) and modified for my own needs.
+
+How to use
+============
+
+You need trained data files for each language you want to recognize, placed in a `tessdata` folder.
+You can get them at https://github.com/tesseract-ocr/tessdata or https://github.com/tesseract-ocr/tessdata_fast
+
+## Microsoft Visual C++ runtimes
+
+The DLLs Tesseract51.dll (and exe) and leptonica-1.82.0.dll are compiled with Visual Studio 2022, so you need the matching Visual C++ runtimes installed on your computer:
+
+- X86: https://aka.ms/vs/17/release/vc_redist.x86.exe
+- X64: https://aka.ms/vs/17/release/vc_redist.x64.exe
+
+## OCR a page
+
+```c#
+using var engine = new Engine(@"./tessdata", Language.English, EngineMode.Default);
+using var img = TesseractOCR.Pix.Image.LoadFromFile(testImagePath);
+using var page = engine.Process(img);
+Console.WriteLine("Mean confidence: {0}", page.MeanConfidence);
+Console.WriteLine("Text: \r\n{0}", page.Text);
+```
+
+## Iterate through the layout of a page
+
+```c#
+using var engine = new Engine(@"./tessdata", Language.English, EngineMode.Default);
+using var img = Pix.Image.LoadFromFile(testImagePath);
+using var page = engine.Process(img);
+
+foreach (var block in page.Layout)
+{
+    result.AppendLine($"Block confidence: {block.Confidence}");
+    if (block.BoundingBox != null)
+    {
+        var boundingBox = block.BoundingBox.Value;
+        result.AppendLine($"Block bounding box X1 '{boundingBox.X1}', Y1 '{boundingBox.Y1}', X2 " +
+                          $"'{boundingBox.X2}', Y2 '{boundingBox.Y2}', width '{boundingBox.Width}', height '{boundingBox.Height}'");
+    }
+    result.AppendLine($"Block text: {block.Text}");
+
+    foreach (var paragraph in block.Paragraphs)
+    {
+        result.AppendLine($"Paragraph confidence: {paragraph.Confidence}");
+        if (paragraph.BoundingBox != null)
+        {
+            var boundingBox = paragraph.BoundingBox.Value;
+            result.AppendLine($"Paragraph bounding box X1 '{boundingBox.X1}', Y1 '{boundingBox.Y1}', X2 " +
+                              $"'{boundingBox.X2}', Y2 '{boundingBox.Y2}', width '{boundingBox.Width}', height '{boundingBox.Height}'");
+        }
+        var info = paragraph.Info;
+        result.AppendLine($"Paragraph info justification: {info.Justification}");
+        result.AppendLine($"Paragraph info is list item: {info.IsListItem}");
+        result.AppendLine($"Paragraph info is crown: {info.IsCrown}");
+        result.AppendLine($"Paragraph info first line ident: {info.FirstLineIdent}");
+        result.AppendLine($"Paragraph text: {paragraph.Text}");
+
+        foreach (var textLine in paragraph.TextLines)
+        {
+            if (textLine.BoundingBox != null)
+            {
+                var boundingBox = textLine.BoundingBox.Value;
+                result.AppendLine($"Text line bounding box X1 '{boundingBox.X1}', Y1 '{boundingBox.Y1}', X2 " +
+                                  $"'{boundingBox.X2}', Y2 '{boundingBox.Y2}', width '{boundingBox.Width}', height '{boundingBox.Height}'");
+            }
+            result.AppendLine($"Text line confidence: {textLine.Confidence}");
+            result.AppendLine($"Text line text: {textLine.Text}");
+
+            foreach (var word in textLine.Words)
+            {
+                result.AppendLine($"Word confidence: {word.Confidence}");
+                if (word.BoundingBox != null)
+                {
+                    var boundingBox = word.BoundingBox.Value;
+                    result.AppendLine($"Word bounding box X1 '{boundingBox.X1}', Y1 '{boundingBox.Y1}', X2 " +
+                                      $"'{boundingBox.X2}', Y2 '{boundingBox.Y2}', width '{boundingBox.Width}', height '{boundingBox.Height}'");
+                }
+                result.AppendLine($"Word is from dictionary: {word.IsFromDictionary}");
+                result.AppendLine($"Word is numeric: {word.IsNumeric}");
+                result.AppendLine($"Word language: {word.Language}");
+                result.AppendLine($"Word text: {word.Text}");
+
+                foreach (var symbol in word.Symbols)
+                {
+                    result.AppendLine($"Symbol confidence: {symbol.Confidence}");
+                    if (symbol.BoundingBox != null)
+                    {
+                        var boundingBox = symbol.BoundingBox.Value;
+                        result.AppendLine($"Symbol bounding box X1 '{boundingBox.X1}', Y1 '{boundingBox.Y1}', X2 " +
+                                          $"'{boundingBox.X2}', Y2 '{boundingBox.Y2}', width '{boundingBox.Width}', height '{boundingBox.Height}'");
+                    }
+                    result.AppendLine($"Symbol is superscript: {symbol.IsSuperscript}");
+                    result.AppendLine($"Symbol is dropcap: {symbol.IsDropcap}");
+                    result.AppendLine($"Symbol text: {symbol.Text}");
+                }
+            }
+        }
+    }
+}
+```
+
+For more examples see https://github.com/Sicos1977/TesseractOCR/wiki/examples.md
+
+Supported input formats
+=======================
+
+Tesseract uses the Leptonica library to read images with one of these formats:
+
+- PNG - requires libpng, libz
+- JPEG - requires libjpeg / libjpeg-turbo
+- TIFF - requires libtiff, libz
+- JPEG 2000 - requires libopenjp2
+- GIF - requires libgif (giflib)
+- WebP (including animated WebP) - requires libwebp
+- BMP - no library required*
+- PNM - no library required*
+* Except Leptonica
+
+**I have dropped support for System.Drawing.Image since it only works well on Windows and not on other systems. You should be fine with Leptonica.**
+
+Logging
+=======
+
+TesseractOCR uses the Microsoft ILogger interface (https://docs.microsoft.com/en-us/dotnet/api/microsoft.extensions.logging.ilogger?view=dotnet-plat-ext-5.0). You can use any logging library that uses this interface.
+
+TesseractOCR has some built-in loggers that can be found in the ```TesseractOCR.Loggers``` namespace.
+
+For example
+
+```csharp
+var logger = !string.IsNullOrWhiteSpace(<some logfile>)
+                ? new TesseractOCR.Loggers.Stream(File.OpenWrite(<some logfile>))
+                : new TesseractOCR.Loggers.Console();
+```
+
+Installing via NuGet
+====================
+
+The easiest way to install TesseractOCR is via NuGet.
+
+In Visual Studio's Package Manager Console, simply enter the following command:
+
+    Install-Package TesseractOCR
+
+
+## License Information
+
+* Copyright 2012-2019 Charles Weld (https://github.com/charlesw)
+* Copyright 2021-2022 Kees van Spelde
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Core Team
+=========
+* [Sicos1977](https://github.com/sicos1977) (Kees van Spelde)
+* [charlesw](https://github.com/charlesw) (Charles Weld) - Copied repo from him
diff --git a/TesseractOCR/TesseractOCR.csproj b/TesseractOCR/TesseractOCR.csproj
@@ -14,13 +14,15 @@
 		<SignAssembly>false</SignAssembly>
 		<TargetFrameworks>netstandard21;net461;net48;net5.0;net6.0</TargetFrameworks>
 		<GeneratePackageOnBuild>True</GeneratePackageOnBuild>
-		<FileVersion>5.1.7.0</FileVersion>
-		<AssemblyVersion>5.1.7.0</AssemblyVersion>
+		<FileVersion>5.1.8.0</FileVersion>
+		<AssemblyVersion>5.1.8.0</AssemblyVersion>
 		<PackageRequireLicenseAcceptance>False</PackageRequireLicenseAcceptance>
-		<Version>5.1.7</Version>
+		<Version>5.1.8</Version>
 		<PackageIcon>ocr.png</PackageIcon>
     <GenerateDocumentationFile>true</GenerateDocumentationFile>
     <DocumentationFile>bin\$(Configuration)\$(TargetFramework)\$(AssemblyName).xml</DocumentationFile>
+    <RepositoryType>git</RepositoryType>
+    <PackageReadmeFile>README.md</PackageReadmeFile>
 	</PropertyGroup>
 	<ItemGroup>
 		<Folder Include="x86\" />
@@ -44,6 +46,10 @@
 	</ItemGroup>
 
 	<ItemGroup>
+		<None Include="..\README.md">
+		  <Pack>True</Pack>
+		  <PackagePath>\</PackagePath>
+		</None>
 		<None Include="ocr.png">
 			<Pack>True</Pack>
 			<PackagePath>\</PackagePath>