Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions src/anthias_server/celery_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -760,14 +760,16 @@ def _validate_remote_video_response(resp: Any, uri: str) -> None:
'live streams are not auto-downloaded'
)
# Accept ``video/*`` and ``application/octet-stream`` (some CDNs
# serve video files this way). An empty Content-Type also passes
# — a few origins omit it. Anything else (HTML error page, JSON
# error envelope) gets rejected so we don't store a 200 OK error
# page as the asset.
# serve video files this way). Reject everything else, including
# an empty Content-Type. Well-behaved origins always send one; a
# missing header is a stronger signal of a misbehaving origin
# than evidence of a real video — and accepting it would let an
# HTML error page land on disk as an asset that is never
# reclaimed, because the cleanup() sweep won't touch a file
# that's still referenced by an (errored) row.
if (
base_type.startswith('video/')
or base_type == 'application/octet-stream'
or base_type == ''
):
return
raise RemoteVideoDownloadError(
Expand Down
29 changes: 29 additions & 0 deletions tests/test_celery_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1208,6 +1208,35 @@ def test_download_remote_video_asset_accepts_octet_stream(
mock_dispatch.assert_called_once_with('rv-1')


@pytest.mark.django_db
def test_download_remote_video_asset_empty_content_type_aborts(
    remote_video_asset_dir: str,
) -> None:
    """Reject a 200 OK response whose Content-Type is empty.

    A missing Content-Type points to a misbehaving origin rather than
    a genuine video. If it were accepted, an HTML error page could be
    stored as the asset, and the cleanup sweep would not reclaim it
    because the (errored) row still references the file. The download
    must therefore fail with an error that mentions Content-Type and
    leave neither the final file nor the ``.part`` staging file behind.
    """
    _make_remote_video_asset(remote_video_asset_dir)

    # Response carrying a body but an empty Content-Type.
    headerless_response = _fake_response(content_type='', body=b'whatever')
    final_path = path.join(remote_video_asset_dir, 'rv-1.mp4')
    staging_path = f'{final_path}.part'

    with (
        mock.patch(
            'anthias_server.celery_tasks._session.get',
            return_value=headerless_response,
        ),
        mock.patch('anthias_server.processing.dispatch_normalize_video'),
        mock.patch('anthias_server.app.consumers.notify_asset_update'),
        pytest.raises(RemoteVideoDownloadError, match='Content-Type'),
    ):
        download_remote_video_asset(
            'rv-1', 'https://example.com/no-headers.mp4'
        )

    # Nothing may remain on disk: neither the asset nor its staging file.
    for leftover in (final_path, staging_path):
        assert not path.exists(leftover)


@pytest.mark.django_db
def test_download_remote_video_asset_zero_bytes_aborts(
remote_video_asset_dir: str,
Expand Down