Add build-docker-containers.sh script and Dockerfiles for CPU and GPU… #102

Open · wants to merge 1 commit into master
51 changes: 51 additions & 0 deletions .github/workflows/build-push-cpu.yml
@@ -0,0 +1,51 @@
name: Create and publish CPU Docker image

# Configures this workflow to run every time a change is pushed to the branch called `release`, and allows it to be triggered manually.
on:
  push:
    branches: ['release']
  workflow_dispatch:

# Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds.
env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

# There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
jobs:
  build-and-push-image:
    runs-on: ubuntu-latest
    # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
    permissions:
      contents: read
      packages: write
    steps:
      - name: Delete huge unnecessary tools folder
        run: rm -rf /opt/hostedtoolcache
      - name: Checkout repository
        uses: actions/checkout@v4
      # Uses the `docker/login-action` action to log in to the Container registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here.
      - name: Log in to the Container registry
        uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      # This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about) to extract tags and labels that will be applied to the specified image. The `id` "meta" allows the output of this step to be referenced in a subsequent step. The `images` value provides the base name for the tags and labels.
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
      # This step uses the `docker/build-push-action` action to build the image, based on the repository's `cpu.Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages.
      # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see "[Usage](https://github.com/docker/build-push-action#usage)" in the README of the `docker/build-push-action` repository.
      # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step.
      - name: Build and push CPU Docker image
        uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
        with:
          context: .
          file: ./cpu.Dockerfile
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
51 changes: 51 additions & 0 deletions .github/workflows/build-push-gpu.yml
@@ -0,0 +1,51 @@
name: Create and publish GPU Docker image

# Configures this workflow to run every time a change is pushed to the branch called `release`, and allows it to be triggered manually.
on:
  push:
    branches: ['release']
  workflow_dispatch:

# Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds.
env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

# There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
jobs:
  build-and-push-image:
    runs-on: ubuntu-latest
    # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
    permissions:
      contents: read
      packages: write
    steps:
      - name: Delete huge unnecessary tools folder
        run: rm -rf /opt/hostedtoolcache
      - name: Checkout repository
        uses: actions/checkout@v4
      # Uses the `docker/login-action` action to log in to the Container registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here.
      - name: Log in to the Container registry
        uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      # This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about) to extract tags and labels that will be applied to the specified image. The `id` "meta" allows the output of this step to be referenced in a subsequent step. The `images` value provides the base name for the tags and labels.
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
      # This step uses the `docker/build-push-action` action to build the image, based on the repository's `gpu.Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages.
      # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see "[Usage](https://github.com/docker/build-push-action#usage)" in the README of the `docker/build-push-action` repository.
      # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step.
      - name: Build and push GPU Docker image
        uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
        with:
          context: .
          file: ./gpu.Dockerfile
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
48 changes: 48 additions & 0 deletions README.md
@@ -40,6 +40,48 @@ The above results are with marker and nougat setup so they each take ~3GB of VRAM

See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instructions on how to run your own benchmarks.

# Quickstart with Docker

The easiest way to get started with Marker is to build the Docker images. Two images are available, `marker-cpu` and `marker-gpu`; the following command builds both:

```bash
./build-docker-containers.sh --build
```
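
To build only one of the images, pass a target to the script:

```bash
# Build just the GPU image (use `cpu` for the CPU image instead)
./build-docker-containers.sh --build gpu
```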

## Prerequisites

- Docker installed on your system.


## Running Marker with Docker

### Convert a single file

To convert a single PDF file to Markdown using Marker with Docker, run the following command:

```bash
docker run -v /path/to/input:/input -v /path/to/output:/output -v /path/to/cache:/app/.cache speeddemonau/marker-gpu single /input/file.pdf /output/file.md [--parallel_factor N] [--max_pages N]
```

- Replace `/path/to/input` with the path to the directory containing your input PDF file.
- Replace `/path/to/output` with the path to the directory where you want the output Markdown file to be saved.
- Replace `/path/to/cache` with the path to a directory for caching.
- Adjust the `--parallel_factor` and `--max_pages` options as needed (see [Convert a single file](#convert-a-single-file) section for details).
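
For example, a single-file run with the GPU image might look like this (paths and option values are illustrative, and `--gpus all` assumes the NVIDIA runtime is available; see the notes below):

```bash
docker run --gpus all \
  -v "$(pwd)/pdfs:/input" \
  -v "$(pwd)/out:/output" \
  -v "$(pwd)/.marker-cache:/app/.cache" \
  speeddemonau/marker-gpu single /input/paper.pdf /output/paper.md --parallel_factor 2
```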

### Convert multiple files

To convert multiple PDF files to Markdown using Marker with Docker, run the following command:

```bash
docker run -v /path/to/input:/input -v /path/to/output:/output -v /path/to/cache:/app/.cache speeddemonau/marker-gpu multi /input /output [--workers N] [--max N] [--metadata_file FILE] [--min_length N]
```

- Replace `/path/to/input` with the path to the directory containing your input PDF files.
- Replace `/path/to/output` with the path to the directory where you want the output Markdown files to be saved.
- Replace `/path/to/cache` with the path to a directory for caching.
- Adjust the `--workers`, `--max`, `--metadata_file`, and `--min_length` options as needed (see [Convert multiple files](#convert-multiple-files) section for details).
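
A concrete multi-file run might look like this (again with illustrative values; `--min_length 10000` skips PDFs with fewer than 10,000 extractable characters):

```bash
docker run --gpus all \
  -v "$(pwd)/pdfs:/input" \
  -v "$(pwd)/out:/output" \
  -v "$(pwd)/.marker-cache:/app/.cache" \
  speeddemonau/marker-gpu multi /input /output --workers 4 --min_length 10000
```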

Make sure to use the appropriate Docker image tag (`speeddemonau/marker-cpu` or `speeddemonau/marker-gpu`) depending on whether you want to run Marker on CPU or GPU.
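
The CPU image uses the same interface; drop the GPU flag and swap the tag (paths are illustrative):

```bash
docker run \
  -v "$(pwd)/pdfs:/input" \
  -v "$(pwd)/out:/output" \
  -v "$(pwd)/.marker-cache:/app/.cache" \
  speeddemonau/marker-cpu single /input/paper.pdf /output/paper.md
```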

# Community

[Discord](https://discord.gg/KuZwXNGnfH) is where we discuss future development.
@@ -149,6 +191,12 @@ MIN_LENGTH=10000 METADATA_FILE=../pdf_meta.json NUM_DEVICES=4 NUM_WORKERS=15 bas

Note that the env variables above are specific to this script, and cannot be set in `local.env`.

# Additional Notes

- The Docker images are built with support for multiple languages. See the `TESSERACT_LANGUAGES` setting in `settings.py` for the list of supported languages or to add your own.
- The GPU image requires an NVIDIA GPU with CUDA support. Make sure the NVIDIA Docker runtime is installed before using it.
- The cache directory mounted at `/app/.cache` inside the container is used to store cached data and models. This can help speed up subsequent runs.
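
If you would rather not manage a host cache directory, a named Docker volume works too (the volume name `marker-cache` is illustrative):

```bash
# Create the volume once; models downloaded on the first run are reused afterwards.
docker volume create marker-cache
docker run --gpus all -v marker-cache:/app/.cache \
  -v "$(pwd)/pdfs:/input" -v "$(pwd)/out:/output" \
  speeddemonau/marker-gpu single /input/file.pdf /output/file.md
```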

# Benchmarks

Benchmarking PDF extraction quality is hard. I've created a test set by finding books and scientific papers that have a pdf version and a latex source. I convert the latex to text, and compare the reference to the output of text extraction methods.
55 changes: 55 additions & 0 deletions build-docker-containers.sh
@@ -0,0 +1,55 @@
#!/bin/sh

# Function to display help
show_help() {
    echo "Usage: $0 [OPTION]"
    echo "Options:"
    echo "  --build        Build both GPU and CPU images."
    echo "  --build gpu    Build only the GPU image."
    echo "  --build cpu    Build only the CPU image."
    echo "  --help         Display this help and exit."
    echo "If no options are provided, both images are built."
    echo "Example usage:"
    echo "  $0 --build gpu    Builds only the GPU image."
}

# Function to build images
build_images() {
    if [ "$1" = "gpu" ] || [ -z "$1" ]; then
        echo "Building marker-gpu"
        docker build -f gpu.Dockerfile -t marker-gpu .
    fi
    if [ "$1" = "cpu" ] || [ -z "$1" ]; then
        echo "Building marker-cpu"
        docker build -f cpu.Dockerfile -t marker-cpu .
    fi
}

# Main script starts here
case "$1" in
    --build)
        case "$2" in
            gpu|cpu)
                build_images "$2"
                ;;
            '')
                build_images
                ;;
            *)
                show_help
                exit 1
                ;;
        esac
        ;;
    --help)
        show_help
        ;;
    '')
        build_images
        echo "Done"
        ;;
    *)
        show_help
        exit 1
        ;;
esac
69 changes: 69 additions & 0 deletions cpu.Dockerfile
@@ -0,0 +1,69 @@
FROM python:3.9

# Set the working directory
WORKDIR /app

# Set locale environment variables (UTF-8)
ENV LC_ALL=C.UTF-8
ENV LANG=C.UTF-8
ENV LANGUAGE=C.UTF-8

# Set environment variables for TORCH to use CPU
ENV TORCH_DEVICE=cpu

# Install system requirements
RUN apt-get update && \
    apt-get install -y git curl wget unzip apt-transport-https \
    ghostscript lsb-release

# Clone the marker repository
RUN git clone https://github.com/VikParuchuri/marker.git .

# create a directory for the app and .cache
RUN mkdir -p /app/.cache

# Set the cache directory
ENV CACHE_DIR=/app/.cache


# Install tesseract 5 from the notesalexp repository
RUN echo "deb https://notesalexp.org/tesseract-ocr5/$(lsb_release -cs)/ $(lsb_release -cs) main" | tee /etc/apt/sources.list.d/notesalexp.list > /dev/null && \
    apt-get update -oAcquire::AllowInsecureRepositories=true && \
    apt-get install -y --allow-unauthenticated notesalexp-keyring && \
    apt-get update && \
    apt-get install -y --allow-unauthenticated tesseract-ocr libtesseract-dev \
    libmagic1 ocrmypdf tesseract-ocr-eng tesseract-ocr-deu \
    tesseract-ocr-por tesseract-ocr-spa tesseract-ocr-rus \
    tesseract-ocr-fra tesseract-ocr-chi-sim tesseract-ocr-jpn \
    tesseract-ocr-kor tesseract-ocr-hin

RUN pip install --no-cache-dir --upgrade pip
RUN pip install --no-cache-dir --upgrade setuptools wheel
RUN pip install --no-cache-dir poetry


# Disable virtual env creation by poetry (not needed in Docker)
# and install dependencies based on the lock file without updating
RUN poetry config virtualenvs.create false \
    && poetry lock --no-update \
    && poetry install --no-dev  # Exclude development dependencies

# Remove the torch pinned by the lock file; a CPU-only build is installed below
RUN poetry remove torch

RUN mkdir -p /app/static

# Install CPU-only PyTorch wheels
RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu



# Set the tesseract data folder path for tesseract 5
ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata

# Copy the entrypoint script
COPY entrypoint.sh /entrypoint.sh

# Set the entrypoint
ENTRYPOINT ["/entrypoint.sh"]

# Set the default command
CMD ["bash"]
71 changes: 71 additions & 0 deletions entrypoint.sh
@@ -0,0 +1,71 @@
#!/bin/bash

# Check if the correct number of arguments is provided
if [ "$#" -lt 2 ]; then
    echo "Usage: docker run -v /path/to/input:/input -v /path/to/output:/output -v /path/to/cache:/app/.cache image_name [COMMAND] [ARGS]"
    echo ""
    echo "Commands:"
    echo "  single /input/file.pdf /output/file.md [OPTIONS]"
    echo "    Convert a single file"
    echo "    Usage: docker run -v /path/to/input:/input -v /path/to/output:/output -v /path/to/cache:/app/.cache image_name single /input/file.pdf /output/file.md [--parallel_factor N] [--max_pages N]"
    echo "    Options:"
    echo "      --parallel_factor N   Increase batch size and parallel OCR workers by N (default: 1)"
    echo "      --max_pages N         Maximum number of pages to process (default: all)"
    echo ""
    echo "  multi /input /output [OPTIONS]"
    echo "    Convert multiple files"
    echo "    Usage: docker run -v /path/to/input:/input -v /path/to/output:/output -v /path/to/cache:/app/.cache image_name multi /input /output [--workers N] [--max N] [--metadata_file FILE] [--min_length N]"
    echo "    Options:"
    echo "      --workers N           Number of PDFs to convert in parallel (default: 1)"
    echo "      --max N               Maximum number of PDFs to convert (default: all)"
    echo "      --metadata_file FILE  Path to JSON file with per-PDF metadata (default: none)"
    echo "      --min_length N        Minimum number of characters to extract before processing (default: 0)"
    exit 1
fi

# Get the command
COMMAND=$1
shift

# Poetry virtualenv creation is disabled in the image, so there is no
# environment to activate; commands below are invoked via `poetry run`.

# Run the specified command with the provided arguments
case $COMMAND in
    single)
        # Check if the correct number of arguments is provided
        if [ "$#" -lt 2 ]; then
            echo "Usage: docker run -v /path/to/input:/input -v /path/to/output:/output -v /path/to/cache:/app/.cache image_name single /input/file.pdf /output/file.md [--parallel_factor N] [--max_pages N]"
            exit 1
        fi

        # Set the input file and output file from the arguments
        INPUT_FILE=$1
        OUTPUT_FILE=$2
        shift 2

        # Run the convert_single.py script with the provided arguments
        poetry run python /app/convert_single.py "$INPUT_FILE" "$OUTPUT_FILE" "$@"
        ;;

    multi)
        # Check if the correct number of arguments is provided
        if [ "$#" -lt 2 ]; then
            echo "Usage: docker run -v /path/to/input:/input -v /path/to/output:/output -v /path/to/cache:/app/.cache image_name multi /input /output [--workers N] [--max N] [--metadata_file FILE] [--min_length N]"
            exit 1
        fi

        # Set the input and output directories from the arguments
        INPUT_DIR=$1
        OUTPUT_DIR=$2
        shift 2

        # Run the convert.py script with the provided arguments
        poetry run python /app/convert.py "$INPUT_DIR" "$OUTPUT_DIR" "$@"
        ;;

    *)
        echo "Unknown command: $COMMAND"
        exit 1
        ;;
esac