Skip to content

Commit

Permalink
utf8 bug fix and scan pages (#113)
Browse files Browse the repository at this point in the history
  • Loading branch information
Frooodle committed May 1, 2023
1 parent 2d4aff3 commit 5bee714
Show file tree
Hide file tree
Showing 52 changed files with 2,495 additions and 1,343 deletions.
9 changes: 7 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Build jbig2enc in a separate stage
FROM frooodle/stirling-pdf-base:latest
FROM frooodle/stirling-pdf-base:beta2

# Create scripts folder and copy local scripts
RUN mkdir /scripts
COPY ./scripts/* /scripts/

# Copy the application JAR file
COPY build/libs/*.jar app.jar
Expand All @@ -13,7 +17,8 @@ ENV APP_HOME_NAME="Stirling PDF"
#ENV APP_NAVBAR_NAME="Stirling PDF"

# Run the application
ENTRYPOINT java -jar /app.jar
ENTRYPOINT ["/scripts/init.sh"]
CMD ["java", "-jar", "/app.jar"]



34 changes: 26 additions & 8 deletions DockerfileBase
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,9 @@ RUN git clone https://github.com/agl/jbig2enc && \
make && \
make install

# Main stage
FROM openjdk:17-jdk-slim

# Install necessary dependencies
# Main stage
FROM openjdk:17-jdk-slim AS base
RUN apt-get update && \
apt-get install -y --no-install-recommends \
libreoffice-core \
Expand All @@ -33,12 +32,31 @@ RUN apt-get update && \
libreoffice-calc \
libreoffice-impress \
python3-uno \
python3-pip \
python3-pip \
unoconv \
pngquant \
unpaper \
pngquant \
unpaper \
ocrmypdf && \
pip install --user --upgrade ocrmypdf
rm -rf /var/lib/apt/lists/* && \
mkdir /usr/share/tesseract-ocr-original && \
cp -r /usr/share/tesseract-ocr/* /usr/share/tesseract-ocr-original && \
rm -rf /usr/share/tesseract-ocr

# Python packages stage
FROM base AS python-packages
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
libffi-dev \
libssl-dev \
zlib1g-dev \
libjpeg-dev && \
pip install --upgrade pip && \
pip install --no-cache-dir \
opencv-python-headless && \
rm -rf /var/lib/apt/lists/*

# Copy the jbig2enc binary from the builder stage
# Final stage: Copy necessary files from the previous stage
FROM base
COPY --from=python-packages /usr/local /usr/local
COPY --from=jbig2enc_builder /usr/local/bin/jbig2 /usr/local/bin/jbig2
2 changes: 2 additions & 0 deletions HowToUseOCR.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ Depending on your requirements, you can choose the appropriate language pack for
1. Download the desired language pack(s) by selecting the `.traineddata` file(s) for the language(s) you need.
2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tesseract-ocr/4.00/tessdata`

# DO NOT REMOVE THE EXISTING ENG.TRAINEDDATA, IT'S REQUIRED.

#### Docker

If you are using Docker, you need to expose the Tesseract tessdata directory as a volume in order to use the additional language packs.
Expand Down
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,3 +99,7 @@ Simply use environment variables APP_HOME_NAME, APP_HOME_DESCRIPTION and APP_NAV
If running Java directly, you can also pass these as properties using -D arguments.

Using the same method you can also change the default language by providing APP_LOCALE with values like de-DE fr-FR or ar-AR to select your default language (Will always default to English on invalid locale)

## API
For those wanting to use Stirling-PDF's backend API with their own custom scripts to edit PDFs, all existing API documentation is available
[here](https://app.swaggerhub.com/apis-docs/Frooodle/Stirling-PDF/1.0.0). Alternatively, navigate to /swagger-ui/index.html on your Stirling-PDF instance to see the documentation for your version.
8 changes: 4 additions & 4 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ plugins {
}

group = 'stirling.software'
version = '0.6.0'
version = '0.7.0'
sourceCompatibility = '17'

repositories {
Expand All @@ -19,11 +19,11 @@ dependencies {
// https://mvnrepository.com/artifact/org.apache.pdfbox/jbig2-imageio
implementation group: 'org.apache.pdfbox', name: 'jbig2-imageio', version: '3.0.4'
implementation 'commons-io:commons-io:2.11.0'


implementation 'org.springdoc:springdoc-openapi-starter-webmvc-ui:2.1.0'

//general PDF
implementation 'org.apache.pdfbox:pdfbox:2.0.28'

implementation 'com.itextpdf:itextpdf:5.5.13.3'
developmentOnly("org.springframework.boot:spring-boot-devtools")

}
Expand Down
9 changes: 9 additions & 0 deletions scripts/init.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash

# Container entrypoint: seed the tesseract-ocr directory (which may be a
# mounted volume) from the pristine copy kept in the image, then hand
# control to the command passed as arguments.
ORIGINAL_DIR=/usr/share/tesseract-ocr-original
TARGET_DIR=/usr/share/tesseract-ocr

echo "Copying original files without overwriting existing files"
mkdir -p "${TARGET_DIR}"
# -n: never overwrite files that already exist in the target directory
cp -rn "${ORIGINAL_DIR}"/* "${TARGET_DIR}"

# Replace this shell with the main command
exec "$@"
134 changes: 134 additions & 0 deletions scripts/split_photos.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import sys
import cv2
import numpy as np
import os

def find_photo_boundaries(image, background_color, tolerance=30, min_area=10000, min_contour_area=500):
    """Return bounding boxes of the non-background regions of *image*.

    Args:
        image: Image array as accepted by ``cv2.inRange``.
        background_color: Colour regarded as background; pixels within
            ``+/- tolerance`` of it on every channel are masked out.
        tolerance: Half-width of the background colour range.
        min_area: Minimum bounding-box area (``w * h``) for a region to be kept.
        min_contour_area: Minimum ``cv2.contourArea`` for a region to be kept.

    Returns:
        A list of ``(x, y, w, h)`` tuples, one per accepted external contour.
    """
    # Everything inside the colour range is background; invert to get foreground.
    background_mask = cv2.inRange(image, background_color - tolerance, background_color + tolerance)
    foreground_mask = cv2.bitwise_not(background_mask)

    # Dilation (5x5 kernel, two passes) grows the foreground regions.
    structuring_element = np.ones((5, 5), np.uint8)
    foreground_mask = cv2.dilate(foreground_mask, structuring_element, iterations=2)

    outlines, _ = cv2.findContours(foreground_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    boxes = []
    for outline in outlines:
        left, top, width, height = cv2.boundingRect(outline)
        outline_area = cv2.contourArea(outline)
        # Reject regions that are too small by either measure.
        if width * height < min_area or outline_area < min_contour_area:
            continue
        boxes.append((left, top, width, height))

    return boxes

def estimate_background_color(image, sample_points=5):
    """Estimate the background colour from the four corners and the centre.

    Args:
        image: Array indexed as ``image[row, col]``. Both multi-channel
            ``(h, w, c)`` and single-channel ``(h, w)`` arrays are accepted.
        sample_points: Currently unused; the five sample positions are fixed.
            Kept so existing callers that pass it keep working.

    Returns:
        The median of the five sampled pixels along axis 0: a per-channel
        array for multi-channel input, a scalar for single-channel input.
    """
    # shape[:2] instead of a 3-way unpack so 2-D (grayscale) arrays work too.
    height, width = image.shape[:2]

    # (x, y) positions: the four corners followed by the centre pixel.
    sample_coords = (
        (0, 0),
        (width - 1, 0),
        (width - 1, height - 1),
        (0, height - 1),
        (width // 2, height // 2),
    )

    samples = [image[y, x] for x, y in sample_coords]
    return np.median(samples, axis=0)

def auto_rotate(image, angle_threshold=10):
    """Rotate *image* about its centre by an angle estimated from its content.

    The image is binarised (inverse Otsu threshold), the centroid of the
    largest external contour is computed, and an angle is derived from an SVD
    of the foreground pixel coordinates centred on that centroid.

    Args:
        image: BGR image (it is converted with ``cv2.COLOR_BGR2GRAY``).
        angle_threshold: The image is only rotated when the absolute value of
            the estimated angle (degrees) is at least this large.

    Returns:
        The input image unchanged when no contour is found, the largest
        contour has zero area, or the angle is below the threshold; otherwise
        a rotated image of the same width and height.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    outlines, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if len(outlines) == 0:
        return image

    moments = cv2.moments(max(outlines, key=cv2.contourArea))
    mass = moments["m00"]
    if mass == 0:
        # Degenerate contour: the centroid would be undefined.
        return image

    centroid_col = int(moments["m10"] / mass)
    centroid_row = int(moments["m01"] / mass)

    # np.where yields (row, col) pairs, hence the (row, col) centroid order.
    foreground = np.column_stack(np.where(binary > 0))
    centered = foreground - np.array([[centroid_row, centroid_col]])
    left_vectors, _, _ = np.linalg.svd(centered, full_matrices=False)

    # NOTE(review): the angle is taken from the left singular vectors (u),
    # not from vt, which would hold the principal directions -- confirm
    # this is intended.
    raw_angle = np.arctan2(left_vectors[1, 0], left_vectors[0, 0]) * 180 / np.pi
    angle = -(90 + raw_angle) if raw_angle < -45 else -raw_angle

    if abs(angle) < angle_threshold:
        return image

    height, width = image.shape[:2]
    rotation = cv2.getRotationMatrix2D((width // 2, height // 2), angle, 1.0)
    return cv2.warpAffine(
        image,
        rotation,
        (width, height),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )



def crop_borders(image, border_color, tolerance=30):
    """Crop *image* to the bounding box of its largest border-coloured region.

    Args:
        image: Image array as accepted by ``cv2.inRange``.
        border_color: Colour to look for; pixels within ``+/- tolerance`` of
            it on every channel are selected.
        tolerance: Half-width of the accepted colour range.

    Returns:
        The input image unchanged when no matching region exists, otherwise
        the sub-image covering the bounding rectangle of the largest one.
    """
    # NOTE(review): the mask selects pixels that MATCH the border colour, so
    # the crop keeps the border-coloured area -- confirm this is intended.
    border_mask = cv2.inRange(image, border_color - tolerance, border_color + tolerance)
    outlines, _ = cv2.findContours(border_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if len(outlines) != 0:
        left, top, width, height = cv2.boundingRect(max(outlines, key=cv2.contourArea))
        return image[top:top + height, left:left + width]

    return image

def split_photos(input_file, output_directory, tolerance=30, min_area=10000, min_contour_area=500, angle_threshold=10, border_size=0):
    """Split a scanned image into one PNG per detected photo.

    Args:
        input_file: Path of the scanned image to read.
        output_directory: Directory for the results; created if missing.
        tolerance: Colour tolerance around the estimated background colour.
        min_area: Minimum bounding-box area for a region to count as a photo.
        min_contour_area: Minimum contour area for a region to count as a photo.
        angle_threshold: Minimum absolute angle (degrees) before a crop is rotated.
        border_size: Width in pixels of a background-coloured border added
            around the scan before detection and trimmed from each crop.

    Raises:
        ValueError: If ``input_file`` cannot be read as an image.

    Output files are named ``<input basename>_<n>.png`` with ``n`` starting at 1.
    """
    image = cv2.imread(input_file)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise ValueError(f"Could not read image file: {input_file}")

    background_color = estimate_background_color(image)

    # Add a constant border around the image
    image = cv2.copyMakeBorder(image, border_size, border_size, border_size, border_size, cv2.BORDER_CONSTANT, value=background_color)

    # Forward the size thresholds as well; previously they were accepted by
    # this function but silently ignored.
    photo_boundaries = find_photo_boundaries(
        image,
        background_color,
        tolerance,
        min_area=min_area,
        min_contour_area=min_contour_area,
    )

    os.makedirs(output_directory, exist_ok=True)

    # Get the input file's base name without the extension
    input_file_basename = os.path.splitext(os.path.basename(input_file))[0]

    for idx, (x, y, w, h) in enumerate(photo_boundaries, start=1):
        cropped_image = image[y:y + h, x:x + w]
        cropped_image = auto_rotate(cropped_image, angle_threshold)

        # Remove the added border. Guarded because with border_size == 0 the
        # slice [0:-0] is [0:0], which would discard the whole crop.
        if border_size > 0:
            cropped_image = cropped_image[border_size:-border_size, border_size:-border_size]

        output_path = os.path.join(output_directory, f"{input_file_basename}_{idx}.png")
        if cropped_image.size == 0:
            # Trimming consumed the entire crop; cv2.imwrite rejects empty images.
            print(f"Skipped {output_path}: nothing left after removing the border")
            continue

        cv2.imwrite(output_path, cropped_image)
        print(f"Saved {output_path}")

if __name__ == "__main__":
    # Both <input_file> and <output_directory> are required, so at least
    # three argv entries (script name + 2) must be present.
    if len(sys.argv) < 3:
        # The defaults quoted here are the ones applied below when an
        # optional argument is omitted.
        print("Usage: python3 split_photos.py <input_file> <output_directory> [tolerance] [min_area] [min_contour_area] [angle_threshold] [border_size]")
        print("\nParameters:")
        print(" <input_file> - The input scanned image containing multiple photos.")
        print(" <output_directory> - The directory where the result images should be placed.")
        print(" [tolerance] - Optional. Determines the range of color variation around the estimated background color (default: 20).")
        print(" [min_area] - Optional. Sets the minimum area threshold for a photo (default: 8000).")
        print(" [min_contour_area] - Optional. Sets the minimum contour area threshold for a photo (default: 500).")
        print(" [angle_threshold] - Optional. Sets the minimum absolute angle required for the image to be rotated (default: 60).")
        print(" [border_size] - Optional. Sets the size of the border added and removed to prevent white borders in the output (default: 0).")
        sys.exit(1)

    input_file = sys.argv[1]
    output_directory = sys.argv[2]
    tolerance = int(sys.argv[3]) if len(sys.argv) > 3 else 20
    min_area = int(sys.argv[4]) if len(sys.argv) > 4 else 8000
    min_contour_area = int(sys.argv[5]) if len(sys.argv) > 5 else 500
    angle_threshold = int(sys.argv[6]) if len(sys.argv) > 6 else 60
    border_size = int(sys.argv[7]) if len(sys.argv) > 7 else 0
    split_photos(input_file, output_directory, tolerance=tolerance, min_area=min_area, min_contour_area=min_contour_area, angle_threshold=angle_threshold, border_size=border_size)
1 change: 1 addition & 0 deletions src/main/java/stirling/software/SPDF/config/Beans.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ public class Beans implements WebMvcConfigurer {
/**
 * Registers this application's MVC interceptors on the given registry:
 * first the interceptor returned by localeChangeInterceptor(), then a
 * new CleanUrlInterceptor.
 *
 * @param registry the registry the interceptors are added to
 */
@Override
public void addInterceptors(InterceptorRegistry registry) {
    registry.addInterceptor(localeChangeInterceptor());
    registry.addInterceptor(new CleanUrlInterceptor());
}

@Bean
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package stirling.software.SPDF.config;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.springframework.web.servlet.HandlerInterceptor;
import org.springframework.web.servlet.ModelAndView;

import jakarta.servlet.http.HttpServletRequest;
import jakarta.servlet.http.HttpServletResponse;

/**
 * Interceptor that strips every query parameter except {@code lang} from
 * incoming requests by redirecting to the cleaned URL.
 *
 * <p>Requests with no query string, or whose only parameter is {@code lang},
 * are passed through untouched.
 */
public class CleanUrlInterceptor implements HandlerInterceptor {

    /**
     * Matches a complete {@code lang=value} parameter. The {@code (?:^|&)}
     * prefix anchors the match to a parameter boundary so that names merely
     * ending in "lang" (for example {@code slang=x}) are not mistaken for it.
     */
    private static final Pattern LANG_PATTERN = Pattern.compile("(?:^|&)lang=([^&]+)");

    /**
     * Redirects to the request URI with only the {@code lang} parameter kept
     * whenever the query string contains any other parameter.
     *
     * @return {@code false} when a redirect was sent (handling stops),
     *         {@code true} when the request should proceed normally
     */
    @Override
    public boolean preHandle(HttpServletRequest request, HttpServletResponse response, Object handler) throws Exception {
        String queryString = request.getQueryString();
        if (queryString == null || queryString.isEmpty()) {
            return true;
        }

        // Keep the lang parameter if it exists
        Matcher langMatcher = LANG_PATTERN.matcher(queryString);
        String langQueryString = langMatcher.find() ? "lang=" + langMatcher.group(1) : "";

        // Whatever is left once lang is removed (and separators are tidied)
        // is the set of parameters that must be stripped.
        String remainingQueryString = LANG_PATTERN.matcher(queryString).replaceAll("")
                .replaceAll("&+", "&")
                .replaceAll("^&|&$", "");

        if (remainingQueryString.isEmpty()) {
            return true;
        }

        // Redirect to the URL without other query parameters
        String requestURI = request.getRequestURI();
        String redirectUrl = requestURI + (langQueryString.isEmpty() ? "" : "?" + langQueryString);
        response.sendRedirect(redirectUrl);
        return false;
    }

    /** No post-processing is required. */
    @Override
    public void postHandle(HttpServletRequest request, HttpServletResponse response, Object handler, ModelAndView modelAndView) {
    }

    /** No completion handling is required. */
    @Override
    public void afterCompletion(HttpServletRequest request, HttpServletResponse response, Object handler, Exception ex) {
    }
}
20 changes: 20 additions & 0 deletions src/main/java/stirling/software/SPDF/config/OpenApiConfig.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package stirling.software.SPDF.config;

import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

import io.swagger.v3.oas.models.Components;
import io.swagger.v3.oas.models.OpenAPI;
import io.swagger.v3.oas.models.info.Info;
import io.swagger.v3.oas.models.info.License;

@Configuration
public class OpenApiConfig {

    /**
     * Builds the OpenAPI descriptor bean: an empty component set plus the
     * API info block (title, version, description and license).
     *
     * NOTE(review): the title, description and license strings look like
     * unreplaced template placeholders -- confirm the intended values.
     */
    @Bean
    public OpenAPI customOpenAPI() {
        License license = new License()
                .name("Your License Name")
                .url("Your License URL");

        Info info = new Info()
                .title("Your API Title")
                .version("1.0.0")
                .description("Your API Description")
                .license(license);

        return new OpenAPI()
                .components(new Components())
                .info(info);
    }

}

This file was deleted.

25 changes: 0 additions & 25 deletions src/main/java/stirling/software/SPDF/controller/PdfController.java

This file was deleted.

0 comments on commit 5bee714

Please sign in to comment.