Merge pull request #146 from Frooodle/cleanups

Latest release
Stirling-Tools · May 13, 2023 · af54018 · af54018
2 parents 93edc56 + 6581bb4
commit af54018
Show file tree

Hide file tree

Showing 82 changed files with 3,398 additions and 845 deletions.
diff --git a/.gitignore b/.gitignore
@@ -109,4 +109,6 @@ local.properties
 *.tar.gz
 *.rar
 
-/build
+/build
+
+/.vscode
diff --git a/HowToAddNewLanguage.md b/HowToAddNewLanguage.md
@@ -8,11 +8,18 @@ Fork Stirling-PDF and make a new branch out of Main
 
 Then add reference to the language in the navbar by adding a new language entry to the dropdown
 
-https://github.com/Frooodle/Stirling-PDF/blob/main/src/main/resources/templates/fragments/navbar.html#L80
+https://github.com/Frooodle/Stirling-PDF/blob/main/src/main/resources/templates/fragments/navbar.html#L306
+and add a flag svg file to 
+https://github.com/Frooodle/Stirling-PDF/tree/main/src/main/resources/static/images/flags
+Any SVG flags are fine; I got most of mine from [here](https://flagicons.lipis.dev/).
+If your language isn't represented by a flag, just pick whichever flag most closely matches it; for example, for Arabic I chose Saudi Arabia.
+
 
 For example to add Polish you would add 
 ```
-<a class="dropdown-item lang_dropdown-item" href="" data-language-code="pl_PL">Polish</a>
+<a class="dropdown-item lang_dropdown-item" href="" data-language-code="pl_PL">
+    <img src="images/flags/pl.svg" alt="icon" width="20" height="15"> Polski
+</a>
 ```
 The data-language-code is the code used to reference the file in the next step.
 

diff --git a/HowToUseOCR.md b/HowToUseOCR.md
@@ -34,14 +34,14 @@ services:
   your_service_name:
     image: your_docker_image_name
     volumes:
-      - /usr/share/tesseract-ocr/4.00/tessdata:/location/of/trainingData
+      - /location/of/trainingData:/usr/share/tesseract-ocr/4.00/tessdata
 ```
 
 
 #### Docker run
 Add the following to your existing docker run command
 ```bash
--v /usr/share/tesseract-ocr/4.00/tessdata:/location/of/trainingData
+-v /location/of/trainingData:/usr/share/tesseract-ocr/4.00/tessdata
 ```
 
 #### Non-Docker

diff --git a/LocalRunGuide.md b/LocalRunGuide.md
@@ -0,0 +1,137 @@
+
+To run the application without Docker, you will need to manually install all dependencies and build the necessary components.
+
+Note that some dependencies might not be available in the standard repositories of all Linux distributions, and may require additional steps to install.
+
+The following guide assumes you have a basic understanding of using a command line interface in your operating system.
+
+It should work on most Linux distributions and MacOS. For Windows, you might need to use Windows Subsystem for Linux (WSL) for certain steps.
+The number of dependencies is large in order to reduce the overall install size, i.e. installing individual LibreOffice sub-components rather than the full LibreOffice package.
+
+### Step 1: Prerequisites
+
+Install the following software, if not already installed:
+
+- Java 17 or later
+
+- Gradle 7.0 or later (included within repo so not needed on server)
+
+- Git
+
+- Python 3 (with pip)
+
+- Make
+
+- GCC/G++
+
+- Automake
+
+- Autoconf
+
+- libtool
+
+- pkg-config
+
+- zlib1g-dev
+
+- libleptonica-dev
+
+For Debian-based systems, you can use the following command:
+
+```bash
+sudo apt-get update
+sudo apt-get install -y git  automake  autoconf  libtool  libleptonica-dev  pkg-config zlib1g-dev make g++ openjdk-17-jdk python3 python3-pip
+```
+
+### Step 2: Clone and Build jbig2enc (Only required for certain OCR functionality)
+
+```bash
+git clone https://github.com/agl/jbig2enc
+cd jbig2enc
+./autogen.sh
+./configure
+make
+sudo make install
+```
+
+### Step 3: Install Additional Software
+Next we need to install LibreOffice for conversions, ocrmypdf for OCR, and opencv for pattern recognition functionality.
+
+Install the following software:
+
+- libreoffice-core
+
+- libreoffice-common
+
+- libreoffice-writer
+
+- libreoffice-calc
+
+- libreoffice-impress
+
+- python3-uno
+
+- unoconv
+
+- pngquant
+
+- unpaper
+
+- ocrmypdf
+
+- opencv-python-headless
+
+For Debian-based systems, you can use the following command:
+
+```bash
+sudo apt-get install -y libreoffice-core libreoffice-common libreoffice-writer libreoffice-calc  libreoffice-impress python3-uno  unoconv  pngquant  unpaper  ocrmypdf
+pip3 install opencv-python-headless
+```
+
+### Step 4: Clone and Build Stirling-PDF
+
+```bash
+git clone https://github.com/Frooodle/Stirling-PDF.git
+cd Stirling-PDF
+./gradlew build
+```
+
+
+### Step 5: Move jar to desired location
+
+After the build process, a `.jar` file will be generated in the `build/libs` directory.
+You can move this file to a desired location, for example, `/opt/Stirling-PDF/`.
+You must also move the `scripts` folder from the Stirling-PDF repo that you have downloaded to this directory.
+This folder is required for the Python scripts that use OpenCV.
+
+### Step 6: Other files
+#### OCR
+If you plan to use the OCR (Optical Character Recognition) functionality, you might need to install language packs for Tesseract when scanning non-English documents.
+
+##### Installing Language Packs
+
+1. Download the desired language pack(s) by selecting the `.traineddata` file(s) for the language(s) you need.
+2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tesseract-ocr/4.00/tessdata`
+Please view the [OCRmyPDF install guide](https://ocrmypdf.readthedocs.io/en/latest/installation.html) for more info.
+**IMPORTANT:** DO NOT REMOVE EXISTING `eng.traineddata`, IT'S REQUIRED.
+
+
+
+### Step 7: Run Stirling-PDF
+
+```bash
+./gradlew bootRun
+or
+java -jar build/libs/app.jar
+```
+
+Remember to set the necessary environment variables before running the project if you want to customize the application; the full list can be seen in the main README.
+
+You can do this in the terminal by using the `export` command or by passing -D arguments to the java -jar command:
+
+```bash
+export APP_HOME_NAME="Stirling PDF"
+or
+-DAPP_HOME_NAME="Stirling PDF" 
+```
+
diff --git a/README.md b/README.md
@@ -21,7 +21,12 @@ Feel free to request any features or bug fixes either in github issues or our [D
 - Merge multiple PDFs together into a single resultant file
 - Convert PDFs to and from images
 - Reorganize PDF pages into different orders.
-- Add images to PDFs at specified locations. (WIP)
+- Add/Generate signatures
+- Flatten PDFs
+- Repair PDFs
+- Detect and remove blank pages
+- Compare 2 PDFs and show differences in text
+- Add images to PDFs
 - Rotating PDFs in 90 degree increments.
 - Compressing PDFs to decrease their filesize. (Using OCRMyPDF)
 - Add and remove passwords
@@ -35,6 +40,9 @@ Feel free to request any features or bug fixes either in github issues or our [D
 - Dark mode support.
 - Custom download options (see [here](https://github.com/Frooodle/Stirling-PDF/blob/main/images/settings.png) for example)
 - Parallel file processing and downloads
+- API for integration with external scripts 
+
+A hosted instance/demo of the app can be seen [here](https://pdf.adminforge.de/), hosted by the team at adminforge.de.
 
 ## Technologies used
 - Spring Boot + Thymeleaf
@@ -49,38 +57,62 @@ Feel free to request any features or bug fixes either in github issues or our [D
 ## How to use
 
 ### Locally
-
-Prerequisites
-- Java 17 or later
-- Gradle 7.0 or later
-
-1. Clone or download the repository.
-2. Build the project using Gradle by running `./gradlew build`
-3. Start the application by running `./gradlew bootRun` or by calling the build jar in build/libs with java -jar jarName.jar
-
+Please view https://github.com/Frooodle/Stirling-PDF/blob/main/LocalRunGuide.md
 
 ### Docker
 https://hub.docker.com/r/frooodle/s-pdf
 
 Docker Run
 ```
-docker run -p 8080:8080 frooodle/s-pdf
+docker run -d \
+  -p 8080:8080 \
+  -v /location/of/trainingData:/usr/share/tesseract-ocr/4.00/tessdata \
+  --name stirling-pdf \
+  frooodle/s-pdf
+  
+  
+  Can also add these for customisation but are not required
+  -e APP_HOME_NAME="Stirling PDF" \
+  -e APP_HOME_DESCRIPTION="Your locally hosted one-stop-shop for all your PDF needs." \
+  -e APP_NAVBAR_NAME="Stirling PDF" \
+  -e ALLOW_GOOGLE_VISABILITY="true" \
+  -e APP_ROOT_PATH="/" \
+  -e APP_LOCALE="en_GB" \
 ```
 Docker Compose
 ```
 version: '3.3'
 services:
-    s-pdf:
-        ports:
-            - '8080:8080'
-        image: frooodle/s-pdf
+  stirling-pdf:
+    image: frooodle/s-pdf
+    ports:
+      - '8080:8080'
+    volumes:
+      - /location/of/trainingData:/usr/share/tesseract-ocr/4.00/tessdata #Required for extra OCR languages
+#      - /location/of/extraConfigs:/configs
+#    environment:
+#      APP_LOCALE: en_GB
+#      APP_HOME_NAME: Stirling PDF
+#      APP_HOME_DESCRIPTION: Your locally hosted one-stop-shop for all your PDF needs.
+#      APP_NAVBAR_NAME: Stirling PDF
+#      APP_ROOT_PATH: /
+#      ALLOW_GOOGLE_VISABILITY: true
+
 ```
 
 
 ## Enable OCR/Compression feature
 Please view https://github.com/Frooodle/Stirling-PDF/blob/main/HowToUseOCR.md
 
 ## Want to add your own language?
+Stirling PDF currently supports
+- English
+- Arabic (العربية)
+- German (Deutsch)
+- French (Français)
+- Spanish (Español)
+- Chinese (简体中文)
+
 If you want to add your own language to Stirling-PDF please refer
 https://github.com/Frooodle/Stirling-PDF/blob/main/HowToAddNewLanguage.md
 
@@ -98,7 +130,10 @@ Stirling PDF allows easy customization of the visible application name.
 Simply use environment variables APP_HOME_NAME, APP_HOME_DESCRIPTION and APP_NAVBAR_NAME with Docker or Java. 
 If running Java directly, you can also pass these as properties using -D arguments.
 
-Using the same method you can also change the default language by providing APP_LOCALE with values like de-DE fr-FR or ar-AR to select your default language (Will always default to English on invalid locale)
+Using the same method you can also change 
+- The default language by providing APP_LOCALE with values like de-DE fr-FR or ar-AR to select your default language (Will always default to English on invalid locale)
+- Enable/disable search engine visibility using ALLOW_GOOGLE_VISABILITY with true / false values. Visibility is disabled by default.
+- Change the root URI for Stirling-PDF, e.g. change server.com/ to server.com/pdf-app by setting APP_ROOT_PATH to pdf-app
 
 ## API
 For those wanting to use Stirling-PDFs backend API to link with their own custom scripting to edit PDFs you can view all existing API documentation

diff --git a/build.gradle b/build.gradle
@@ -5,7 +5,7 @@ plugins {
 }
 
 group = 'stirling.software'
-version = '0.7.0'
+version = '0.8.0'
 sourceCompatibility = '17'
 
 repositories {

diff --git a/scripts/detect-blank-pages.py b/scripts/detect-blank-pages.py
@@ -0,0 +1,40 @@
+import cv2
+import numpy as np
+import sys
+import argparse
+
+def is_blank_image(image_path, threshold=10, white_percent=99, white_value=255, blur_size=5):
+    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
+
+    if image is None:
+        print(f"Error: Unable to read the image file: {image_path}")
+        return False
+
+    # Apply Gaussian blur to reduce noise
+    blurred_image = cv2.GaussianBlur(image, (blur_size, blur_size), 0)
+
+    _, thresholded_image = cv2.threshold(blurred_image, white_value - threshold, white_value, cv2.THRESH_BINARY)
+
+    # Calculate the percentage of white pixels in the thresholded image
+    white_pixels = np.sum(thresholded_image == white_value)
+    total_pixels = thresholded_image.size
+    white_pixel_percentage = (white_pixels / total_pixels) * 100
+    print(f"Page has white pixel percent of {white_pixel_percentage}")
+    return white_pixel_percentage > white_percent
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Detect if an image is considered blank or not.')
+    parser.add_argument('image_path', help='The path to the image file.')
+    parser.add_argument('-t', '--threshold', type=int, default=10, help='Threshold for determining white pixels. The default value is 10.')
+    parser.add_argument('-w', '--white_percent', type=float, default=99, help='The percentage of white pixels for an image to be considered blank. The default value is 99.')
+    args = parser.parse_args()
+
+    blank = is_blank_image(args.image_path, args.threshold, args.white_percent)
+
+    if blank:
+        # Return code 1: The image is considered blank.
+        sys.exit(1)
+    else:
+        # Return code 0: The image is not considered blank.
+        sys.exit(0)
diff --git a/src/main/java/stirling/software/SPDF/config/OpenApiConfig.java b/src/main/java/stirling/software/SPDF/config/OpenApiConfig.java
@@ -6,15 +6,17 @@
 import io.swagger.v3.oas.models.Components;
 import io.swagger.v3.oas.models.OpenAPI;
 import io.swagger.v3.oas.models.info.Info;
-import io.swagger.v3.oas.models.info.License;
 
 @Configuration
 public class OpenApiConfig {
 
     @Bean
     public OpenAPI customOpenAPI() {
+        String version = getClass().getPackage().getImplementationVersion();
+        version =  (version != null) ? version : "1.0.0";
+
         return new OpenAPI().components(new Components()).info(
-                new Info().title("Your API Title").version("1.0.0").description("Your API Description").license(new License().name("Your License Name").url("Your License URL")));
+                new Info().title("Stirling PDF API").version(version).description("API documentation for all Server-Side processing.\nPlease note some functionality might be UI only and missing from here."));
     }
 
 }
diff --git a/src/main/java/stirling/software/SPDF/controller/api/MergeController.java b/src/main/java/stirling/software/SPDF/controller/api/MergeController.java
@@ -15,6 +15,8 @@
 import org.springframework.web.bind.annotation.RestController;
 import org.springframework.web.multipart.MultipartFile;
 
+import io.swagger.v3.oas.annotations.Operation;
+import io.swagger.v3.oas.annotations.Parameter;
 import stirling.software.SPDF.utils.PdfUtils;
 
 @RestController
@@ -43,8 +45,15 @@ private PDDocument mergeDocuments(List<PDDocument> documents) throws IOException
     }
 
     @PostMapping(consumes = "multipart/form-data", value = "/merge-pdfs")
-    public ResponseEntity<byte[]> mergePdfs(@RequestPart(required = true, value = "fileInput") MultipartFile[] files) throws IOException {
-        // Read the input PDF files into PDDocument objects
+    @Operation(
+        summary = "Merge multiple PDF files into one",
+        description = "This endpoint merges multiple PDF files into a single PDF file. The merged file will contain all pages from the input files in the order they were provided."
+    )
+    public ResponseEntity<byte[]> mergePdfs(
+        @RequestPart(required = true, value = "fileInput")
+        @Parameter(description = "The input PDF files to be merged into a single file", required = true)
+            MultipartFile[] files) throws IOException {
+    	// Read the input PDF files into PDDocument objects
         List<PDDocument> documents = new ArrayList<>();
 
         // Loop through the files array and read each file into a PDDocument