Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions OCR/.NET/OCR-with-Tesseract-in-Docker-on-Linux/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
**/.classpath
**/.dockerignore
**/.env
**/.git
**/.gitignore
**/.project
**/.settings
**/.toolstarget
**/.vs
**/.vscode
**/*.*proj.user
**/*.dbmdl
**/*.jfm
**/azds.yaml
**/bin
**/charts
**/docker-compose*
**/Dockerfile*
**/node_modules
**/npm-debug.log
**/obj
**/secrets.dev.yaml
**/values.dev.yaml
LICENSE
README.md
!**/.gitignore
!.git/HEAD
!.git/config
!.git/packed-refs
!.git/refs/heads/**
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 17
VisualStudioVersion = 17.14.36408.4 d17.14
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "OCR-with-Tesseract-in-Docker-on-Linux", "OCR-with-Tesseract-in-Docker-on-Linux\OCR-with-Tesseract-in-Docker-on-Linux.csproj", "{92C3B623-ED53-4127-8161-975BCD7AA532}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{92C3B623-ED53-4127-8161-975BCD7AA532}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{92C3B623-ED53-4127-8161-975BCD7AA532}.Debug|Any CPU.Build.0 = Debug|Any CPU
{92C3B623-ED53-4127-8161-975BCD7AA532}.Release|Any CPU.ActiveCfg = Release|Any CPU
{92C3B623-ED53-4127-8161-975BCD7AA532}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {BEF3B3F0-759C-4D53-BF94-8EB1E0E7D2FE}
EndGlobalSection
EndGlobal
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# See https://aka.ms/customizecontainer to learn how to customize your debug container and how Visual Studio uses this Dockerfile to build your images for faster debugging.

# This stage is used when running from VS in fast mode (Default for Debug configuration)
# The stage must be named "base": the final stage below starts with "FROM base".
FROM mcr.microsoft.com/dotnet/runtime:8.0 AS base
# Install the Tesseract command-line tool (invoked by the app via Process.Start) together with
# the English language data required by "-l eng". This runs before USER so it still has root
# privileges; the apt package index is removed afterwards to keep the layer small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends tesseract-ocr tesseract-ocr-eng \
    && rm -rf /var/lib/apt/lists/*
USER $APP_UID
WORKDIR /app


# This stage is used to build the service project
FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build
ARG BUILD_CONFIGURATION=Release
WORKDIR /src
# Copy only the project file first so the restore layer is cached until dependencies change.
COPY ["OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux.csproj", "OCR-with-Tesseract-in-Docker-on-Linux/"]
RUN dotnet restore "./OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux.csproj"
COPY . .
WORKDIR "/src/OCR-with-Tesseract-in-Docker-on-Linux"
RUN dotnet build "./OCR-with-Tesseract-in-Docker-on-Linux.csproj" -c $BUILD_CONFIGURATION -o /app/build

# This stage is used to publish the service project to be copied to the final stage
FROM build AS publish
ARG BUILD_CONFIGURATION=Release
RUN dotnet publish "./OCR-with-Tesseract-in-Docker-on-Linux.csproj" -c $BUILD_CONFIGURATION -o /app/publish /p:UseAppHost=false

# This stage is used in production or when running from VS in regular mode (Default when not using the Debug configuration)
FROM base AS final
WORKDIR /app
COPY --from=publish /app/publish .
ENTRYPOINT ["dotnet", "OCR-with-Tesseract-in-Docker-on-Linux.dll"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net8.0</TargetFramework>
<RootNamespace>OCR-with-Tesseract-in-Docker-on-Linux</RootNamespace>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<DockerDefaultTargetOS>Linux</DockerDefaultTargetOS>
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Microsoft.VisualStudio.Azure.Containers.Tools.Targets" Version="1.22.1" />
<PackageReference Include="SkiaSharp.NativeAssets.Linux.NoDependencies" Version="*" />
<PackageReference Include="Syncfusion.PDF.OCR.Net.Core" Version="*" />
</ItemGroup>

<ItemGroup>
<None Update="Data\Input.pdf">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</None>
</ItemGroup>

</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<ActiveDebugProfile>Container (Dockerfile)</ActiveDebugProfile>
</PropertyGroup>
</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
using Syncfusion.Drawing;
using Syncfusion.OCRProcessor;
using Syncfusion.Pdf;
using Syncfusion.Pdf.Graphics;
using Syncfusion.Pdf.Parsing;
using System.Diagnostics;
using System.Xml.Linq;

// Main application logic
/// <summary>
/// Console entry point: loads <c>Data/Input.pdf</c>, runs OCR over it through the
/// Tesseract-backed external engine, and saves the processed document to a memory stream.
/// </summary>
class Program
{
    /// <summary>
    /// Runs OCR on the bundled sample PDF.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        string docPath = Path.GetFullPath(@"Data/Input.pdf");

        // Initialize the OCR processor and open the input file; both are disposed
        // when the block exits, including when an exception is thrown.
        using (OCRProcessor processor = new OCRProcessor())
        using (FileStream fileStream = new FileStream(docPath, FileMode.Open, FileAccess.Read))
        {
            // Load a PDF document
            PdfLoadedDocument lDoc = new PdfLoadedDocument(fileStream);
            try
            {
                // Set OCR language to process
                processor.Settings.Language = Languages.English;

                // Route recognition through the external Tesseract command-line engine
                IOcrEngine tesseractEngine = new Tesseract5OCREngine();
                processor.ExternalEngine = tesseractEngine;

                // Process OCR by providing the PDF document.
                processor.PerformOCR(lDoc);

                // Create memory stream
                using (MemoryStream stream = new MemoryStream())
                {
                    // Save the document to memory stream
                    // NOTE(review): the stream is discarded right after saving; write it to a
                    // file (or return it) if the OCR'd PDF needs to be kept.
                    lDoc.Save(stream);
                }
            }
            finally
            {
                // Always release the document, even if OCR or saving failed.
                lDoc.Close(true);
            }
        }
    }
}

// Tesseract5OcrEngine implementation
/// <summary>
/// <see cref="IOcrEngine"/> implementation that shells out to the <c>tesseract</c>
/// command-line tool, asks it for hOCR output, and converts that output into an
/// <see cref="OCRLayoutResult"/>.
/// </summary>
class Tesseract5OCREngine : IOcrEngine
{
    private float imageHeight;
    private float imageWidth;

    /// <summary>
    /// Runs Tesseract on the image contained in <paramref name="stream"/>.
    /// </summary>
    /// <param name="stream">Readable, seekable stream holding the page image.</param>
    /// <returns>The recognized pages, lines and words with their bounding boxes.</returns>
    /// <exception cref="ArgumentException">The stream is null or not readable.</exception>
    /// <exception cref="InvalidOperationException">
    /// Tesseract exited with a non-zero code, produced no hOCR file, or produced empty output.
    /// </exception>
    public OCRLayoutResult PerformOCR(Stream stream)
    {
        if (stream == null || !stream.CanRead)
            throw new ArgumentException("Input stream is null or not readable for OCR.", nameof(stream));

        stream.Position = 0;

        // Read the image dimensions from an in-memory copy so the caller's stream stays usable.
        using (var tempMemStream = new MemoryStream())
        {
            stream.CopyTo(tempMemStream);
            tempMemStream.Position = 0;
            var pdfTiffImage = new PdfTiffImage(tempMemStream);
            imageHeight = pdfTiffImage.Height;
            imageWidth = pdfTiffImage.Width;
        }

        string tempImageFile = Path.GetTempFileName();
        // Tesseract appends ".hocr" to the output base name it is given.
        string tempHocrFile = tempImageFile + ".hocr";
        string hocrText;

        try
        {
            // Write stream to temp image file
            using (var tempFileStream = new FileStream(tempImageFile, FileMode.Create, FileAccess.Write))
            {
                stream.Position = 0;
                stream.CopyTo(tempFileStream);
            }

            hocrText = RunTesseract(tempImageFile, tempHocrFile);
        }
        finally
        {
            // Clean up temp files on every path, including failures.
            TryDelete(tempImageFile);
            TryDelete(tempHocrFile);
        }

        if (string.IsNullOrEmpty(hocrText))
            throw new InvalidOperationException("HOCR text could not be generated or was empty.");

        var ocrLayoutResult = new OCRLayoutResult();
        BuildOCRLayoutResult(ocrLayoutResult, hocrText, imageWidth, imageHeight);
        ocrLayoutResult.ImageWidth = imageWidth;
        ocrLayoutResult.ImageHeight = imageHeight;

        return ocrLayoutResult;
    }

    /// <summary>
    /// Populates <paramref name="ocr"/> with the pages, lines and words found in the hOCR markup.
    /// </summary>
    /// <param name="ocr">Result object that receives the parsed pages.</param>
    /// <param name="hOcrText">hOCR (XHTML) document text produced by Tesseract.</param>
    /// <param name="imageWidth">Width of the source image (not used by the parser itself).</param>
    /// <param name="imageHeight">Height of the source image (not used by the parser itself).</param>
    void BuildOCRLayoutResult(OCRLayoutResult ocr, string hOcrText, float imageWidth, float imageHeight)
    {
        var doc = XDocument.Parse(hOcrText, LoadOptions.None);

        // Must be an XNamespace, not a string: "namespace + localName" only yields a
        // namespace-qualified XName through XNamespace's addition operator. With a plain
        // string the two parts are concatenated into a single invalid element name.
        XNamespace ns = "http://www.w3.org/1999/xhtml";

        foreach (var pageElement in doc.Descendants(ns + "div").Where(d => HasClass(d, "ocr_page")))
        {
            Page ocrPage = new Page();

            foreach (var lineElement in pageElement.Descendants(ns + "span")
                .Where(s => HasClass(s, "ocr_line") || HasClass(s, "ocr_header")))
            {
                Line ocrLine = new Line();

                foreach (var wordElement in lineElement.Descendants(ns + "span")
                    .Where(s => HasClass(s, "ocrx_word")))
                {
                    Word ocrWord = new Word { Text = wordElement.Value };
                    string? title = wordElement.Attribute("title")?.Value;

                    // The title attribute looks like "bbox x0 y0 x1 y1; x_wconf NN".
                    if (title != null && TryParseBoundingBox(title, out int[] coords))
                    {
                        float x = coords[0];
                        float y = coords[1];
                        float width = coords[2] - coords[0];
                        float height = coords[3] - coords[1];
                        ocrWord.Rectangle = new RectangleF(x, y, width, height);
                    }

                    ocrLine.Add(ocrWord);
                }

                ocrPage.Add(ocrLine);
            }

            ocr.Add(ocrPage);
        }
    }

    // Invokes the tesseract CLI on the image file and returns the text of the hOCR file it writes.
    private static string RunTesseract(string imageFile, string hocrFile)
    {
        var startInfo = new ProcessStartInfo
        {
            FileName = "tesseract",
            Arguments = $"\"{imageFile}\" \"{imageFile}\" -l eng hocr",
            RedirectStandardError = true,
            UseShellExecute = false,
            CreateNoWindow = true
        };

        using (var process = new Process { StartInfo = startInfo })
        {
            process.Start();
            // Drain stderr before waiting so a full pipe buffer cannot block the child process.
            string errorOutput = process.StandardError.ReadToEnd();
            process.WaitForExit();

            if (process.ExitCode != 0)
                throw new InvalidOperationException($"Tesseract process failed with exit code {process.ExitCode}. Error: {errorOutput}");

            if (!File.Exists(hocrFile))
                throw new InvalidOperationException("HOCR output file not found. Tesseract might have failed or not produced output.");

            return File.ReadAllText(hocrFile);
        }
    }

    // Returns true when the element's class attribute equals the given hOCR class name.
    private static bool HasClass(XElement element, string className)
    {
        return element.Attribute("class")?.Value == className;
    }

    // Extracts the four integer bbox coordinates (x0 y0 x1 y1) from an hOCR title attribute.
    // Returns false instead of throwing when the bbox property is missing or malformed.
    private static bool TryParseBoundingBox(string title, out int[] coords)
    {
        coords = new int[4];

        foreach (string property in title.Split(';'))
        {
            string trimmed = property.Trim();
            if (!trimmed.StartsWith("bbox", StringComparison.Ordinal))
                continue;

            string[] parts = trimmed.Substring("bbox".Length)
                .Split(' ', StringSplitOptions.RemoveEmptyEntries);
            if (parts.Length != 4)
                return false;

            for (int i = 0; i < parts.Length; i++)
            {
                // hOCR coordinates are machine-written integers, so parse them culture-invariantly.
                if (!int.TryParse(parts[i], System.Globalization.NumberStyles.Integer,
                        System.Globalization.CultureInfo.InvariantCulture, out coords[i]))
                    return false;
            }

            return true;
        }

        return false;
    }

    // Best-effort temp file removal; a clean-up failure must not mask the original error.
    private static void TryDelete(string path)
    {
        try
        {
            if (File.Exists(path)) File.Delete(path);
        }
        catch (IOException)
        {
            // File is still locked or otherwise unavailable; leave it for the OS temp clean-up.
        }
        catch (UnauthorizedAccessException)
        {
            // No permission to delete; nothing more can be done here.
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"profiles": {
"OCR-with-Tesseract-in-Docker-on-Linux": {
"commandName": "Project"
},
"Container (Dockerfile)": {
"commandName": "Docker"
}
}
}
Loading