diff --git a/OCR/.NET/OCR-with-Tesseract-in-Docker-on-Linux/.dockerignore b/OCR/.NET/OCR-with-Tesseract-in-Docker-on-Linux/.dockerignore new file mode 100644 index 00000000..fe1152bd --- /dev/null +++ b/OCR/.NET/OCR-with-Tesseract-in-Docker-on-Linux/.dockerignore @@ -0,0 +1,30 @@ +**/.classpath +**/.dockerignore +**/.env +**/.git +**/.gitignore +**/.project +**/.settings +**/.toolstarget +**/.vs +**/.vscode +**/*.*proj.user +**/*.dbmdl +**/*.jfm +**/azds.yaml +**/bin +**/charts +**/docker-compose* +**/Dockerfile* +**/node_modules +**/npm-debug.log +**/obj +**/secrets.dev.yaml +**/values.dev.yaml +LICENSE +README.md +!**/.gitignore +!.git/HEAD +!.git/config +!.git/packed-refs +!.git/refs/heads/** \ No newline at end of file diff --git a/OCR/.NET/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux.sln b/OCR/.NET/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux.sln new file mode 100644 index 00000000..cf12e2d2 --- /dev/null +++ b/OCR/.NET/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.14.36408.4 d17.14 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "OCR-with-Tesseract-in-Docker-on-Linux", "OCR-with-Tesseract-in-Docker-on-Linux\OCR-with-Tesseract-in-Docker-on-Linux.csproj", "{92C3B623-ED53-4127-8161-975BCD7AA532}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {92C3B623-ED53-4127-8161-975BCD7AA532}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {92C3B623-ED53-4127-8161-975BCD7AA532}.Debug|Any CPU.Build.0 = Debug|Any CPU + {92C3B623-ED53-4127-8161-975BCD7AA532}.Release|Any CPU.ActiveCfg = Release|Any CPU + 
{92C3B623-ED53-4127-8161-975BCD7AA532}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {BEF3B3F0-759C-4D53-BF94-8EB1E0E7D2FE} + EndGlobalSection +EndGlobal diff --git a/OCR/.NET/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux/Data/Input.pdf b/OCR/.NET/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux/Data/Input.pdf new file mode 100644 index 00000000..d9675bdf Binary files /dev/null and b/OCR/.NET/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux/Data/Input.pdf differ diff --git a/OCR/.NET/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux/Dockerfile b/OCR/.NET/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux/Dockerfile new file mode 100644 index 00000000..ad9f1651 --- /dev/null +++ b/OCR/.NET/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux/Dockerfile @@ -0,0 +1,29 @@ +# See https://aka.ms/customizecontainer to learn how to customize your debug container and how Visual Studio uses this Dockerfile to build your images for faster debugging. + +# This stage is used when running from VS in fast mode (Default for Debug configuration) +FROM mcr.microsoft.com/dotnet/runtime:8.0 AS base +RUN apt-get update && apt-get install -y tesseract-ocr +USER $APP_UID +WORKDIR /app + + +# This stage is used to build the service project +FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build +ARG BUILD_CONFIGURATION=Release +WORKDIR /src +COPY ["OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux.csproj", "OCR-with-Tesseract-in-Docker-on-Linux/"] +RUN dotnet restore "./OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux.csproj" +COPY . .
+WORKDIR "/src/OCR-with-Tesseract-in-Docker-on-Linux" +RUN dotnet build "./OCR-with-Tesseract-in-Docker-on-Linux.csproj" -c $BUILD_CONFIGURATION -o /app/build + +# This stage is used to publish the service project to be copied to the final stage +FROM build AS publish +ARG BUILD_CONFIGURATION=Release +RUN dotnet publish "./OCR-with-Tesseract-in-Docker-on-Linux.csproj" -c $BUILD_CONFIGURATION -o /app/publish /p:UseAppHost=false + +# This stage is used in production or when running from VS in regular mode (Default when not using the Debug configuration) +FROM base AS final +WORKDIR /app +COPY --from=publish /app/publish . +ENTRYPOINT ["dotnet", "OCR-with-Tesseract-in-Docker-on-Linux.dll"] \ No newline at end of file diff --git a/OCR/.NET/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux.csproj b/OCR/.NET/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux.csproj new file mode 100644 index 00000000..5555d195 --- /dev/null +++ b/OCR/.NET/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux.csproj @@ -0,0 +1,24 @@ + + + + Exe + net8.0 + OCR-with-Tesseract-in-Docker-on-Linux + enable + enable + Linux + + + + + + + + + + + Always + + + + diff --git a/OCR/.NET/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux.user b/OCR/.NET/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux.user new file mode 100644 index 00000000..dd2d54cf --- /dev/null +++ b/OCR/.NET/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux.user @@ -0,0 +1,6 @@ + + + + Container (Dockerfile) + + \ No newline at end of file diff --git a/OCR/.NET/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux/Output/.gitkeep 
using System.Diagnostics;
using System.Globalization;
using System.Xml.Linq;
using Syncfusion.Drawing;
using Syncfusion.OCRProcessor;
using Syncfusion.Pdf;
using Syncfusion.Pdf.Graphics;
using Syncfusion.Pdf.Parsing;

/// <summary>
/// Console entry point: loads <c>Data/Input.pdf</c>, runs OCR over it through the
/// external Tesseract engine and saves the processed document to an in-memory stream.
/// </summary>
class Program
{
    /// <summary>
    /// Runs the OCR sample.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
        string docPath = Path.GetFullPath(@"Data/Input.pdf");

        // Using declarations are disposed in reverse order at the end of the method:
        // the input file stream first, then the OCR processor.
        using OCRProcessor processor = new OCRProcessor();
        using FileStream fileStream = new FileStream(docPath, FileMode.Open, FileAccess.Read);

        // Load the PDF document.
        PdfLoadedDocument lDoc = new PdfLoadedDocument(fileStream);
        try
        {
            // Set the OCR language and plug in the Tesseract command-line engine.
            processor.Settings.Language = Languages.English;
            IOcrEngine tesseractEngine = new Tesseract5OCREngine();
            processor.ExternalEngine = tesseractEngine;

            // Process OCR by providing the PDF document.
            processor.PerformOCR(lDoc);

            // Save the document to a memory stream.
            // NOTE(review): the saved bytes are discarded when the stream is disposed;
            // write them to a file here if the output needs to be kept.
            using MemoryStream stream = new MemoryStream();
            lDoc.Save(stream);
        }
        finally
        {
            // Close the document even when OCR or saving throws.
            lDoc.Close();
        }
    }
}

/// <summary>
/// <see cref="IOcrEngine"/> implementation that shells out to the <c>tesseract</c>
/// command-line tool, asks it for hOCR output and converts that output into an
/// <see cref="OCRLayoutResult"/>.
/// </summary>
class Tesseract5OCREngine : IOcrEngine
{
    // XML namespace of the XHTML elements in the hOCR document.
    private const string XhtmlNamespace = "http://www.w3.org/1999/xhtml";

    // Value passed to tesseract's -l option.
    private readonly string _language;

    /// <summary>
    /// Creates the engine.
    /// </summary>
    /// <param name="language">Language code passed to tesseract's <c>-l</c> option; defaults to <c>eng</c>.</param>
    /// <exception cref="ArgumentException"><paramref name="language"/> is null, empty or whitespace.</exception>
    public Tesseract5OCREngine(string language = "eng")
    {
        if (string.IsNullOrWhiteSpace(language))
        {
            throw new ArgumentException("OCR language must not be null or empty.", nameof(language));
        }

        _language = language;
    }

    /// <summary>
    /// Runs tesseract over the image contained in <paramref name="stream"/>.
    /// </summary>
    /// <param name="stream">Readable stream holding the image. It is rewound first when it is seekable.</param>
    /// <returns>The recognized pages, lines and words together with the image dimensions.</returns>
    /// <exception cref="ArgumentException"><paramref name="stream"/> is null or not readable.</exception>
    /// <exception cref="InvalidOperationException">
    /// tesseract exited with a non-zero code, or produced no (or empty) hOCR output.
    /// </exception>
    public OCRLayoutResult PerformOCR(Stream stream)
    {
        if (stream == null || !stream.CanRead)
        {
            throw new ArgumentException("Input stream is null or not readable for OCR.", nameof(stream));
        }

        byte[] imageBytes;
        float imageWidth;
        float imageHeight;

        // Buffer the input once so that it is never read twice; this also lets
        // non-seekable streams work.
        using (var imageBuffer = new MemoryStream())
        {
            if (stream.CanSeek)
            {
                stream.Position = 0;
            }

            stream.CopyTo(imageBuffer);
            imageBytes = imageBuffer.ToArray();

            imageBuffer.Position = 0;
            var pdfTiffImage = new PdfTiffImage(imageBuffer);
            imageHeight = pdfTiffImage.Height;
            imageWidth = pdfTiffImage.Width;
        }

        string hocrText = RunTesseract(imageBytes);

        var ocrLayoutResult = new OCRLayoutResult();
        BuildOCRLayoutResult(ocrLayoutResult, hocrText);
        ocrLayoutResult.ImageWidth = imageWidth;
        ocrLayoutResult.ImageHeight = imageHeight;

        return ocrLayoutResult;
    }

    // Writes the image to a temp file, runs tesseract on it and returns the hOCR text.
    // Both temp files are removed whether or not the run succeeds.
    private string RunTesseract(byte[] imageBytes)
    {
        string tempImageFile = Path.GetTempFileName();
        // tesseract appends ".hocr" to the output base name it is given.
        string tempHocrFile = tempImageFile + ".hocr";

        try
        {
            File.WriteAllBytes(tempImageFile, imageBytes);

            var startInfo = new ProcessStartInfo
            {
                FileName = "tesseract",
                RedirectStandardError = true,
                UseShellExecute = false,
                CreateNoWindow = true
            };

            // ArgumentList quotes each argument itself, so paths need no manual escaping.
            startInfo.ArgumentList.Add(tempImageFile); // input image
            startInfo.ArgumentList.Add(tempImageFile); // output base name
            startInfo.ArgumentList.Add("-l");
            startInfo.ArgumentList.Add(_language);
            startInfo.ArgumentList.Add("hocr");

            using (var process = new Process { StartInfo = startInfo })
            {
                process.Start();
                // Drain stderr before waiting so a full pipe cannot block the child process.
                string errorOutput = process.StandardError.ReadToEnd();
                process.WaitForExit();

                if (process.ExitCode != 0)
                {
                    throw new InvalidOperationException($"Tesseract process failed with exit code {process.ExitCode}. Error: {errorOutput}");
                }
            }

            if (!File.Exists(tempHocrFile))
            {
                throw new InvalidOperationException("HOCR output file not found. Tesseract might have failed or not produced output.");
            }

            string hocrText = File.ReadAllText(tempHocrFile);
            if (string.IsNullOrEmpty(hocrText))
            {
                throw new InvalidOperationException("HOCR text could not be generated or was empty.");
            }

            return hocrText;
        }
        finally
        {
            TryDelete(tempImageFile);
            TryDelete(tempHocrFile);
        }
    }

    // Walks the hOCR document (ocr_page > ocr_line / ocr_header > ocrx_word) and adds
    // one Page / Line / Word per matching element to the result.
    private static void BuildOCRLayoutResult(OCRLayoutResult ocr, string hOcrText)
    {
        XDocument doc = XDocument.Parse(hOcrText, LoadOptions.None);

        // Must be an XNamespace: with a plain string, `ns + "div"` is string
        // concatenation and does not yield a namespace-qualified element name.
        XNamespace ns = XhtmlNamespace;

        foreach (XElement pageElement in doc.Descendants(ns + "div").Where(e => HasClass(e, "ocr_page")))
        {
            Page ocrPage = new Page();

            foreach (XElement lineElement in pageElement.Descendants(ns + "span")
                         .Where(e => HasClass(e, "ocr_line") || HasClass(e, "ocr_header")))
            {
                Line ocrLine = new Line();

                foreach (XElement wordElement in lineElement.Descendants(ns + "span")
                             .Where(e => HasClass(e, "ocrx_word")))
                {
                    Word ocrWord = new Word { Text = wordElement.Value };

                    // A word without a usable bbox keeps its text but gets no rectangle.
                    if (TryParseBoundingBox(wordElement.Attribute("title")?.Value, out int[] box))
                    {
                        float x = box[0];
                        float y = box[1];
                        float width = box[2] - box[0];
                        float height = box[3] - box[1];
                        ocrWord.Rectangle = new RectangleF(x, y, width, height);
                    }

                    ocrLine.Add(ocrWord);
                }

                ocrPage.Add(ocrLine);
            }

            ocr.Add(ocrPage);
        }
    }

    // True when the element's class attribute is exactly className.
    private static bool HasClass(XElement element, string className)
    {
        return string.Equals(element.Attribute("class")?.Value, className, StringComparison.Ordinal);
    }

    // Extracts "bbox x0 y0 x1 y1" from a title attribute such as
    // "bbox 36 92 96 116; x_wconf 95". The bbox property may appear at any position.
    // Returns false when it is missing or malformed.
    private static bool TryParseBoundingBox(string? title, out int[] coordinates)
    {
        coordinates = Array.Empty<int>();
        if (string.IsNullOrWhiteSpace(title))
        {
            return false;
        }

        foreach (string property in title.Split(';', StringSplitOptions.RemoveEmptyEntries))
        {
            string[] tokens = property.Split(' ', StringSplitOptions.RemoveEmptyEntries);
            if (tokens.Length != 5 || !string.Equals(tokens[0], "bbox", StringComparison.Ordinal))
            {
                continue;
            }

            var parsed = new int[4];
            for (int i = 0; i < parsed.Length; i++)
            {
                if (!int.TryParse(tokens[i + 1], NumberStyles.Integer, CultureInfo.InvariantCulture, out parsed[i]))
                {
                    return false;
                }
            }

            coordinates = parsed;
            return true;
        }

        return false;
    }

    // Best-effort temp-file removal: a failure to delete must not hide the OCR
    // result or the exception that is already propagating.
    private static void TryDelete(string path)
    {
        try
        {
            File.Delete(path);
        }
        catch (IOException)
        {
            // Deliberately ignored: the file is a leftover in the temp directory.
        }
        catch (UnauthorizedAccessException)
        {
            // Deliberately ignored: the file is a leftover in the temp directory.
        }
    }
}
a/OCR/.NET/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux/Properties/launchSettings.json b/OCR/.NET/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux/Properties/launchSettings.json new file mode 100644 index 00000000..d5874bb2 --- /dev/null +++ b/OCR/.NET/OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux/Properties/launchSettings.json @@ -0,0 +1,10 @@ +{ + "profiles": { + "OCR-with-Tesseract-in-Docker-on-Linux": { + "commandName": "Project" + }, + "Container (Dockerfile)": { + "commandName": "Docker" + } + } +} \ No newline at end of file