diff --git a/src/anthias_server/celery_tasks.py b/src/anthias_server/celery_tasks.py index 60d4d3111..fdb70cc36 100755 --- a/src/anthias_server/celery_tasks.py +++ b/src/anthias_server/celery_tasks.py @@ -760,14 +760,16 @@ def _validate_remote_video_response(resp: Any, uri: str) -> None: 'live streams are not auto-downloaded' ) # Accept ``video/*`` and ``application/octet-stream`` (some CDNs - # serve video files this way). An empty Content-Type also passes - # — a few origins omit it. Anything else (HTML error page, JSON - # error envelope) gets rejected so we don't store a 200 OK error - # page as the asset. + # serve video files this way). Reject everything else, including + # an empty Content-Type. Well-behaved origins always send one; a + # missing header is a stronger signal of a misbehaving origin + # than evidence of a real video — and accepting it would let an + # HTML error page land on disk as a multi-GB asset, where the + # file stays stranded because the cleanup() sweep won't touch a + # file that's still referenced by an (errored) row. if ( base_type.startswith('video/') or base_type == 'application/octet-stream' - or base_type == '' ): return raise RemoteVideoDownloadError( diff --git a/tests/test_celery_tasks.py b/tests/test_celery_tasks.py index 0676b694f..ab87df7db 100644 --- a/tests/test_celery_tasks.py +++ b/tests/test_celery_tasks.py @@ -1208,6 +1208,35 @@ def test_download_remote_video_asset_accepts_octet_stream( mock_dispatch.assert_called_once_with('rv-1') +@pytest.mark.django_db +def test_download_remote_video_asset_empty_content_type_aborts( + remote_video_asset_dir: str, +) -> None: + """A 200 OK with no Content-Type header is a stronger signal of a + misbehaving origin than evidence of a real video. Accepting it + would let an HTML error page land on disk as a multi-GB asset + that the cleanup sweep can't recover (the row still references + the file, so it isn't an orphan). 
Reject and let the operator + see the explicit Content-Type-missing failure on the row.""" + _make_remote_video_asset(remote_video_asset_dir) + with ( + mock.patch( + 'anthias_server.celery_tasks._session.get', + return_value=_fake_response(content_type='', body=b'whatever'), + ), + mock.patch('anthias_server.processing.dispatch_normalize_video'), + mock.patch('anthias_server.app.consumers.notify_asset_update'), + ): + with pytest.raises(RemoteVideoDownloadError, match='Content-Type'): + download_remote_video_asset( + 'rv-1', 'https://example.com/no-headers.mp4' + ) + # Nothing landed on disk; the staging cleanup wiped the .part too. + dest = path.join(remote_video_asset_dir, 'rv-1.mp4') + assert not path.exists(dest) + assert not path.exists(f'{dest}.part') + + @pytest.mark.django_db def test_download_remote_video_asset_zero_bytes_aborts( remote_video_asset_dir: str,