Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
Kees committed May 4, 2022
2 parents be6e86d + 7dd6c14 commit cfa1abc
Show file tree
Hide file tree
Showing 14 changed files with 375 additions and 158 deletions.
19 changes: 17 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ You can get them at https://github.com/tesseract-ocr/tessdata or https://github.

## Microsoft Visual C++ runtimes

The DLL's Tesseract51.dll (and exe) and leptonica-1.82.0.dll are compiled with Visual Studio 2022 you need these C++ runtimes for it on your server
The DLLs Tesseract51.dll (and exe) and leptonica-1.82.0.dll are compiled with Visual Studio 2022, so you need these C++ runtimes installed on your computer

- X86: https://aka.ms/vs/17/release/vc_redist.x86.exe
- X64: https://aka.ms/vs/17/release/vc_redist.x64.exe
Expand All @@ -32,7 +32,7 @@ Console.WriteLine("Text: \r\n{0}", page.Text);
## Iterate through the layout of a page

```c#
using var engine = CreateEngine();
using var engine = new Engine(@"./tessdata", Language.English, EngineMode.Default);
using var img = Pix.Image.LoadFromFile(testImagePath);
using var page = engine.Process(img);

Expand Down Expand Up @@ -126,6 +126,21 @@ Tesseract uses the Leptonica library to read images with one of these formats:

**I have dropped support for the Windows.Drawing.Image namespace since this only works well on Windows and not on other systems. You should be fine with Leptonica**

Logging
=======

TesseractOCR uses the Microsoft ILogger interface (https://docs.microsoft.com/en-us/dotnet/api/microsoft.extensions.logging.ilogger?view=dotnet-plat-ext-5.0). You can use any logging library that uses this interface.

TesseractOCR has some built-in loggers that can be found in the ```TesseractOCR.Loggers``` namespace.

For example

```csharp
var logger = !string.IsNullOrWhiteSpace(<some logfile>)
? new TesseractOCR.Loggers.Stream(File.OpenWrite(<some logfile>))
: new TesseractOCR.Loggers.Console();
```

Installing via NuGet
====================

Expand Down
46 changes: 0 additions & 46 deletions Tesseract.Drawing/Tesseract.Drawing.csproj

This file was deleted.

3 changes: 3 additions & 0 deletions Tesseract.sln.DotSettings
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@
<s:Boolean x:Key="/Default/UserDictionary/Words/=pageseg/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=pangle/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=Panjabi/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=paraids/@EntryIndexedValue">True</s:Boolean>

<s:Boolean x:Key="/Default/UserDictionary/Words/=Pashto/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=pconf/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=phototest/@EntryIndexedValue">True</s:Boolean>
Expand Down Expand Up @@ -148,6 +150,7 @@
<s:Boolean x:Key="/Default/UserDictionary/Words/=Tesseract_0027s/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=testregion/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=TEXTLINE/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=Textlines/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=textonly/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=textord/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=Thaana/@EntryIndexedValue">True</s:Boolean>
Expand Down
4 changes: 2 additions & 2 deletions TesseractOCR.Net45Tests/TesseractOCR.Net48Tests.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@
<ItemGroup>
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="6.0.1" />
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.1.0" />
<PackageReference Include="MSTest.TestAdapter" Version="2.2.8" />
<PackageReference Include="MSTest.TestFramework" Version="2.2.8" />
<PackageReference Include="MSTest.TestAdapter" Version="2.2.10" />
<PackageReference Include="MSTest.TestFramework" Version="2.2.10" />
</ItemGroup>

<ItemGroup>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@
<ItemGroup>
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="6.0.1" />
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.1.0" />
<PackageReference Include="MSTest.TestAdapter" Version="2.2.8" />
<PackageReference Include="MSTest.TestFramework" Version="2.2.8" />
<PackageReference Include="MSTest.TestAdapter" Version="2.2.10" />
<PackageReference Include="MSTest.TestFramework" Version="2.2.10" />
</ItemGroup>

<ItemGroup>
Expand Down
16 changes: 14 additions & 2 deletions TesseractOCR.Tests/BaseApiTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,23 @@ namespace Tesseract.Tests
public class BaseApiTests : TesseractTestBase
{
[TestMethod]
public void GetVersion_Is500()
public void GetVersion_Is510()
{
using var engine = CreateEngine();
var version = engine.Version;
Assert.IsTrue(version.StartsWith("5.0.0"));
Assert.IsTrue(version.StartsWith("5.1.0"));
}

/// <summary>
/// Smoke test that reads the engine's data path, clears the adaptive classifier and the
/// persistent cache, and then queries the available languages
/// </summary>
[TestMethod]
public void LoadedLanguages()
{
    using var engine = CreateEngine();

    var dataPath = engine.DataPath;
    Assert.IsFalse(string.IsNullOrWhiteSpace(dataPath));

    // The test passes only when both clear calls complete without throwing
    engine.ClearAdaptiveClassifier();
    engine.ClearPersistentCache();

    var languages = engine.AvailableLanguages;
    Assert.IsNotNull(languages);
}

}
}
22 changes: 1 addition & 21 deletions TesseractOCR.Tests/EngineTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -141,27 +141,6 @@ public void CanProcessDifferentRegionsInSameImage()
Assert.AreEqual(region2Text, expectedTextRegion2);
}

/// <summary>
/// Processes the test image and checks that segmenting the page at text-line level
/// returns the expected number of regions
/// </summary>
[TestMethod]
public void CanGetSegmentedRegions()
{
    const int expectedCount = 8; // number of text lines in test image

    using var engine = CreateEngine();
    var imgPath = TestFilePath(TestImagePath);
    using var img = TesseractOCR.Pix.Image.LoadFromFile(imgPath);
    using var page = engine.Process(img);
    var boxes = page.GetSegmentedRegions(PageIteratorLevel.TextLine);

    for (var i = 0; i < boxes.Count; i++)
    {
        var box = boxes[i];
        Console.WriteLine($"Box[{i}]: x={box.X}, y={box.Y}, w={box.Width}, h={box.Height}");
    }

    // Assert.AreEqual takes (expected, actual); this order keeps the failure message accurate
    Assert.AreEqual(expectedCount, boxes.Count);
}
}

[TestMethod]
public void CanProcessEmptyPixUsingResultIterator()
{
Expand Down Expand Up @@ -265,6 +244,7 @@ public void CanProcessPixUsingResultIterator()

foreach (var paragraph in block.Paragraphs)
{
var regions = block.SegmentedRegions;
result.AppendLine($"Paragraph confidence: {paragraph.Confidence}");
if (paragraph.BoundingBox != null)
{
Expand Down
55 changes: 54 additions & 1 deletion TesseractOCR/Engine.cs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,31 @@ public class Engine : DisposableBase
/// Gets or sets default <see cref="PageSegMode" /> mode used by one of the Process methods
/// </summary>
public PageSegMode DefaultPageSegMode { get; set; }

/// <summary>
/// Gets the engine mode that the native API reports for this engine's handle
/// </summary>
public EngineMode CurrentEngineMode
{
    get { return TessApi.Native.BaseAPIOem(_handle); }
}

/// <summary>
/// Gets the <see cref="Language"/> used in the last valid initialization
/// </summary>
/// <remarks>
/// NOTE(review): the value is parsed from the string returned by <c>BaseApiGetDatapath</c>, the same
/// native call that <see cref="DataPath"/> uses. Presumably an "init languages" native call was
/// intended here - confirm against the native bindings.
/// </remarks>
public Language InitLanguage
{
    get
    {
        var nativeString = TessApi.Native.BaseApiGetDatapath(_handle);
        var text = MarshalHelper.PtrToString(nativeString);
        return LanguageHelper.StringToEnum(text);
    }
}

/// <summary>
/// Gets the data path reported by the native API, with any trailing '/' characters removed
/// </summary>
public string DataPath
{
    get
    {
        var pathPointer = TessApi.Native.BaseApiGetDatapath(_handle);
        return MarshalHelper.PtrToString(pathPointer).TrimEnd('/');
    }
}

/// <summary>
/// Gets the list of <see cref="Language"/> values that are currently loaded for this engine
/// </summary>
public List<Language> LoadedLanguages
{
    get { return TessApi.BaseApiLoadedLanguages(_handle); }
}

/// <summary>
/// Gets the list of <see cref="Language"/> values that are available to this engine
/// </summary>
public List<Language> AvailableLanguages
{
    get { return TessApi.BaseAPIGetAvailableLanguagesAsVector(_handle); }
}
#endregion

#region Constructors
Expand Down Expand Up @@ -630,7 +655,35 @@ private void Initialize(string dataPath, Language language, EngineMode engineMod
}
#endregion

#region BaseApiSetDebugVariable
#region ClearAdaptiveClassifier
/// <summary>
/// Frees up memory and forgets adaptive data; call this between pages or documents
/// </summary>
public void ClearAdaptiveClassifier() => TessApi.Native.BaseAPIClearAdaptiveClassifier(_handle);
#endregion

#region ClearPersistentCache
/// <summary>
/// Clears the library-level memory caches. Several expensive-to-load constant data structures
/// (mostly language dictionaries) are cached globally and survive the Init() and End() of
/// individual TessBaseAPI's; this method clears those caches
/// </summary>
public void ClearPersistentCache() => TessApi.Native.BaseAPIClearPersistentCache(_handle);
#endregion

#region SetDebugVariable
/// <summary>
/// Sets a debug variable.
/// </summary>
/// <param name="name"></param>
/// <param name="value"></param>
/// <returns></returns>
public bool SetDebugVariable(string name, string value)
{
return TessApi.BaseApiSetDebugVariable(_handle, name, value) != 0;
Expand Down
6 changes: 6 additions & 0 deletions TesseractOCR/Enums/Language.cs
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,12 @@ public enum Language
/// </summary>
[String("jpn")]
Japanese,

/// <summary>
/// Japanese (vertical)
/// </summary>
[String("jpn_vert")]
JapaneseVertical,

/// <summary>
/// Kannada
Expand Down
Loading

0 comments on commit cfa1abc

Please sign in to comment.