Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,32 @@
# Changelog

## 2.3.1

### New: brotli-compressed `.socket.facts.json` upload

The reachability facts file (`.socket.facts.json`) is now brotli-compressed before it is
uploaded as part of a full scan. The Socket API transparently decompresses any multipart
part named exactly `.socket.facts.json.br` and stores it as plain `.socket.facts.json`, so
the stored result is unchanged — but the on-the-wire payload shrinks dramatically (a
~262 MB facts file compresses to roughly 15–30 MB).

This fixes large tier‑1 reachability scans that previously failed when the uncompressed
facts file exceeded the API's per‑file upload size cap (the failure surfaced to the CLI as
an HTTP 4xx or “502” error, leaving the scan stuck with no report).

Details:

- Compression happens at the upload boundary (`Core.create_full_scan`); the file on disk is
left untouched, so local consumers (SARIF/JSON output, tier‑1 finalize, alert selection)
continue to read the plain `.socket.facts.json`.
- Only a file whose basename is exactly `.socket.facts.json` is compressed (the API matches
that exact name). A custom `--reach-output-file` name is uploaded uncompressed, as before.
- Empty baseline-scan placeholder files are not compressed.
- Compression never blocks an upload: if it fails for any reason it falls back to uploading
the plain file, and a partially-written `.socket.facts.json.br` is removed rather than
left behind in the target directory.
- Adds a runtime dependency on `brotli` (CPython) or `brotlicffi` (PyPy and other non-CPython implementations).

## 2.3.0

### New: `--exit-code-on-api-error`
Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ build-backend = "hatchling.build"

[project]
name = "socketsecurity"
version = "2.3.0"
version = "2.3.1"
requires-python = ">= 3.11"
license = {"file" = "LICENSE"}
dependencies = [
Expand All @@ -19,6 +19,8 @@ dependencies = [
"socketdev>=3.0.33,<4.0.0",
"bs4>=0.0.2",
"markdown>=3.10",
"brotli>=1.0.9; platform_python_implementation == 'CPython'",
"brotlicffi>=1.0.9; platform_python_implementation != 'CPython'",
]
readme = "README.md"
description = "Socket Security CLI for CI/CD"
Expand Down
2 changes: 1 addition & 1 deletion socketsecurity/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
__author__ = 'socket.dev'
__version__ = '2.3.0'
__version__ = '2.3.1'
USER_AGENT = f'SocketPythonCLI/{__version__}'
130 changes: 129 additions & 1 deletion socketsecurity/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,26 @@

_HUMANIZE_BOUNDARY = re.compile(r"(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])")

# --- Brotli compression of the reachability facts file at upload time ---
#
# Per the Socket full-scan endpoint contract, a multipart part whose basename is exactly
# ``.socket.facts.json.br`` is brotli-decompressed server-side and stored as plain
# ``.socket.facts.json``. Uploading the facts file compressed keeps it far below the
# server's per-file size cap (a ~262 MB facts file shrinks to roughly 15-30 MB), which
# large reachability (tier 1) scans need in order to succeed.
#
# Because the server only recognises that *exact* name, compression is applied solely to
# files whose basename is exactly ``.socket.facts.json``. A custom ``--reach-output-file``
# name would not be decompressed server-side, so such a file is uploaded as-is.
SOCKET_FACTS_FILENAME = ".socket.facts.json"
SOCKET_FACTS_BROTLI_FILENAME = ".socket.facts.json.br"
# Brotli quality level (valid range 0-11); 5 balances speed against ratio for big JSON.
SOCKET_FACTS_BROTLI_QUALITY = 5
# log2 of the brotli window size; 24 (a 2**24-byte window) is the largest and helps the
# ratio on large facts files.
SOCKET_FACTS_BROTLI_LGWIN = 24
# Read size used while streaming the facts file (1 MiB), so a large file is never loaded
# into memory in one piece.
SOCKET_FACTS_BROTLI_CHUNK_SIZE = 1 << 20


def _humanize_alert_type(alert_type: str) -> str:
"""Convert a camelCase/PascalCase alert type into a Title-Cased label.
Expand Down Expand Up @@ -544,6 +564,102 @@ def finalize_tier1_scan(self, full_scan_id: str, facts_file_path: str) -> bool:
log.debug(f"Unable to finalize tier 1 scan: {e}")
return False

@staticmethod
def _compress_facts_file(source_path: str) -> str:
    """Stream-compress a facts file into a ``.socket.facts.json.br`` file beside it.

    The input is read in fixed-size chunks and fed through an incremental brotli
    compressor, so even a facts file of several hundred MB is never resident in memory
    all at once. The output lives in the same directory as the source so that the
    multipart key the SDK derives keeps its directory prefix and differs only by the
    ``.br`` basename. A pre-existing ``.socket.facts.json.br`` in that directory is
    overwritten. If anything goes wrong while compressing (for example the disk fills
    up mid-stream), the incomplete output is deleted before the error propagates, so no
    stray ``.br`` remains in the target directory.

    Args:
        source_path: Location of the plain ``.socket.facts.json`` file.

    Returns:
        Location of the compressed sibling file.
    """
    # Deferred import: the dependency is only required when a facts file is actually
    # being uploaded. brotlicffi is the API-compatible stand-in on PyPy / non-CPython.
    try:
        import brotli
    except ImportError:
        import brotlicffi as brotli

    target_path = os.path.join(os.path.dirname(source_path), SOCKET_FACTS_BROTLI_FILENAME)
    encoder = brotli.Compressor(
        quality=SOCKET_FACTS_BROTLI_QUALITY,
        lgwin=SOCKET_FACTS_BROTLI_LGWIN,
    )
    try:
        with open(source_path, "rb") as reader, open(target_path, "wb") as writer:
            while block := reader.read(SOCKET_FACTS_BROTLI_CHUNK_SIZE):
                encoded = encoder.process(block)
                # The compressor may buffer input and emit nothing for a given chunk.
                if encoded:
                    writer.write(encoded)
            # Flush whatever the compressor is still holding and terminate the stream.
            writer.write(encoder.finish())
    except BaseException:
        # The caller only learns the output path (and so only cleans it up) once this
        # function returns, so a half-written .br has to be removed here. BaseException
        # is intentional: interrupts must not leave a partial file either. The error is
        # then re-raised so the caller can fall back to the plain file.
        try:
            os.unlink(target_path)
        except OSError:
            pass
        raise
    return target_path

def _compress_facts_files_for_upload(self, files: List[str]) -> Tuple[List[str], List[str]]:
    """Swap each ``.socket.facts.json`` upload entry for a brotli-compressed ``.br`` sibling.

    The Socket full-scan endpoint transparently decompresses a multipart part named
    exactly ``.socket.facts.json.br``, so compressing here keeps a large facts file under
    the server's per-file size cap without changing the stored result. Entries whose
    basename is not exactly ``.socket.facts.json`` are passed through untouched (the
    server only matches that exact name), as are entries that are not regular files and
    empty placeholder files (e.g. baseline scans).

    Compression never blocks an upload: if it fails for any reason (the ``brotli``
    module cannot be imported, the directory is not writable, etc.) the original plain
    file is uploaded instead. Should a step *after* a successful compression fail, the
    already-written ``.br`` file is removed before falling back, so it is not left
    orphaned in the target directory.

    Args:
        files: The list of file paths about to be uploaded.

    Returns:
        ``(upload_files, temp_paths)`` where ``upload_files`` is the possibly-rewritten
        list to upload (same order as ``files``) and ``temp_paths`` are the compressed
        files the caller must delete once the upload completes.
    """
    upload_files: List[str] = []
    temp_paths: List[str] = []
    for file_path in files:
        # Set only once _compress_facts_file has returned, i.e. once a complete .br
        # file exists on disk that somebody has to clean up.
        compressed_path: Optional[str] = None
        try:
            if (
                os.path.basename(file_path) == SOCKET_FACTS_FILENAME
                and os.path.isfile(file_path)
                and os.path.getsize(file_path) > 0
            ):
                compressed_path = self._compress_facts_file(file_path)
                log.debug(
                    f"Brotli-compressed {file_path} for upload: "
                    f"{os.path.getsize(file_path)} -> {os.path.getsize(compressed_path)} bytes "
                    f"(uploading as {SOCKET_FACTS_BROTLI_FILENAME})"
                )
        except Exception as e:
            # Never let compression break an upload: fall back to the plain file.
            log.warning(
                f"Failed to brotli-compress facts file {file_path}, uploading uncompressed: {e}"
            )
            if compressed_path is not None:
                # Compression itself succeeded but a later step raised. The .br will
                # not be reported in temp_paths, so remove it here instead of leaving
                # it behind for nobody to clean up.
                try:
                    os.unlink(compressed_path)
                except OSError:
                    pass
                compressed_path = None

        if compressed_path is None:
            upload_files.append(file_path)
        else:
            upload_files.append(compressed_path)
            temp_paths.append(compressed_path)
    return upload_files, temp_paths

def create_full_scan(self, files: List[str], params: FullScanParams, base_paths: Optional[List[str]] = None) -> FullScan:
"""
Creates a new full scan via the Socket API.
Expand All @@ -559,7 +675,19 @@ def create_full_scan(self, files: List[str], params: FullScanParams, base_paths:
log.info("Creating new full scan")
create_full_start = time.time()

res = self.sdk.fullscans.post(files, params, use_types=True, use_lazy_loading=True, max_open_files=50, base_paths=base_paths)
# Brotli-compress the reachability facts file (if present) so it is uploaded as a
# `.socket.facts.json.br` part. The API decompresses it server-side, keeping a large
# facts file under the per-file upload size cap. See _compress_facts_files_for_upload.
upload_files, compressed_temp_files = self._compress_facts_files_for_upload(files)
try:
res = self.sdk.fullscans.post(upload_files, params, use_types=True, use_lazy_loading=True, max_open_files=50, base_paths=base_paths)
finally:
for temp_file in compressed_temp_files:
try:
os.unlink(temp_file)
log.debug(f"Cleaned up temporary compressed facts file: {temp_file}")
except OSError as cleanup_error:
log.debug(f"Failed to clean up temporary compressed facts file {temp_file}: {cleanup_error}")
if not res.success:
log.error(f"Error creating full scan: {res.message}, status: {res.status}")
raise Exception(f"Error creating full scan: {res.message}, status: {res.status}")
Expand Down
137 changes: 137 additions & 0 deletions tests/core/test_facts_compression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
"""Tests for brotli compression of the reachability facts file on upload.

The Socket full-scan endpoint transparently decompresses a multipart part named exactly
`.socket.facts.json.br`, so the CLI compresses the facts file before uploading it. These
tests cover the helpers in `Core` that do that rewriting.
"""
import json
import os

import pytest

try:
import brotli
except ImportError: # pragma: no cover - PyPy / non-CPython fallback
import brotlicffi as brotli

from socketsecurity.core import (
SOCKET_FACTS_BROTLI_FILENAME,
SOCKET_FACTS_FILENAME,
Core,
)


def _write(path, data: bytes):
    """Write ``data`` to ``path`` in binary mode and hand ``path`` back to the caller."""
    with open(path, "wb") as handle:
        handle.write(data)
    return path


def test_compress_facts_file_roundtrips(tmp_path):
    """Decompressing the ``.br`` sibling yields exactly the bytes of the original file."""
    components = [{"id": str(i)} for i in range(1000)]
    payload = json.dumps({"components": components}).encode()
    source = tmp_path / SOCKET_FACTS_FILENAME
    _write(str(source), payload)

    compressed_path = Core._compress_facts_file(str(source))

    # The output sits beside the source and is named exactly `.socket.facts.json.br`.
    expected_path = str(tmp_path / SOCKET_FACTS_BROTLI_FILENAME)
    assert compressed_path == expected_path
    assert os.path.basename(compressed_path) == SOCKET_FACTS_BROTLI_FILENAME
    # The source file must be left as it was (other code paths still read it locally).
    assert source.read_bytes() == payload
    # Decompressing the output restores the original payload.
    with open(compressed_path, "rb") as compressed_file:
        restored = brotli.decompress(compressed_file.read())
    assert restored == payload


def test_compress_for_upload_rewrites_facts_entry(tmp_path):
    """The facts entry is swapped for its `.br` sibling while other entries pass through."""
    # Bypass __init__ so no Core constructor arguments are needed for this helper.
    core = Core.__new__(Core)
    facts_path = _write(str(tmp_path / SOCKET_FACTS_FILENAME), b'{"a": 1}')
    manifest_path = _write(str(tmp_path / "package.json"), b"{}")
    br_path = str(tmp_path / SOCKET_FACTS_BROTLI_FILENAME)

    upload_files, temp_paths = core._compress_facts_files_for_upload([facts_path, manifest_path])

    assert upload_files == [br_path, manifest_path]
    assert temp_paths == [br_path]
    assert os.path.isfile(br_path)
    # Non-facts files are never compressed.
    assert manifest_path in upload_files


def test_compress_facts_file_removes_partial_output_on_failure(tmp_path, monkeypatch):
    """A failure part-way through compression must not leave a half-written .br behind."""

    class ExplodingCompressor:
        """Stand-in compressor that fails on the first chunk it is given."""

        def __init__(self, *args, **kwargs):
            pass

        def process(self, _data):
            raise RuntimeError("disk full")

        def finish(self):  # pragma: no cover - never reached
            return b""

    source = _write(str(tmp_path / SOCKET_FACTS_FILENAME), b'{"a": 1}' * 1000)
    # Patch the module the helper imports (brotli on CPython, brotlicffi elsewhere).
    monkeypatch.setattr(brotli, "Compressor", ExplodingCompressor)

    with pytest.raises(RuntimeError, match="disk full"):
        Core._compress_facts_file(source)

    # The target directory must not contain an orphaned .br file.
    leftover = tmp_path / SOCKET_FACTS_BROTLI_FILENAME
    assert not leftover.exists()


def test_compress_for_upload_preserves_directory_prefix(tmp_path):
    """The `.br` file is created in the facts file's own directory, keeping its prefix."""
    core = Core.__new__(Core)
    nested_dir = tmp_path / "nested"
    nested_dir.mkdir()
    facts_path = _write(str(nested_dir / SOCKET_FACTS_FILENAME), b'{"a": 1}')
    expected_br = str(nested_dir / SOCKET_FACTS_BROTLI_FILENAME)

    upload_files, temp_paths = core._compress_facts_files_for_upload([facts_path])

    assert upload_files == [expected_br]
    assert temp_paths == [expected_br]


def test_empty_facts_file_is_not_compressed(tmp_path):
    """A zero-byte placeholder facts file (e.g. baseline scans) is uploaded unchanged."""
    core = Core.__new__(Core)
    placeholder = _write(str(tmp_path / SOCKET_FACTS_FILENAME), b"")

    upload_files, temp_paths = core._compress_facts_files_for_upload([placeholder])

    assert upload_files == [placeholder]
    assert temp_paths == []
    # No compressed sibling may have been produced for the empty file.
    br_sibling = tmp_path / SOCKET_FACTS_BROTLI_FILENAME
    assert not br_sibling.exists()


def test_custom_named_facts_file_is_not_compressed(tmp_path):
    """A custom --reach-output-file name is left alone (server only matches the exact name)."""
    core = Core.__new__(Core)
    custom_path = _write(str(tmp_path / "custom.facts.json"), b'{"a": 1}')

    upload_files, temp_paths = core._compress_facts_files_for_upload([custom_path])

    # The entry is passed through as-is and nothing is registered for cleanup.
    assert upload_files == [custom_path]
    assert temp_paths == []


def test_compression_failure_falls_back_to_plain_file(tmp_path, monkeypatch):
    """When the compressor raises, the plain file is uploaded rather than the upload failing."""

    def _always_fails(_source_path):
        raise RuntimeError("brotli unavailable")

    # Swap in the failing compressor before exercising the upload-rewrite helper.
    monkeypatch.setattr(Core, "_compress_facts_file", staticmethod(_always_fails))
    core = Core.__new__(Core)
    facts_path = _write(str(tmp_path / SOCKET_FACTS_FILENAME), b'{"a": 1}')

    upload_files, temp_paths = core._compress_facts_files_for_upload([facts_path])

    assert upload_files == [facts_path]
    assert temp_paths == []
Loading
Loading