From e08abbcef29b8cba7a7c0f8750f26630234de124 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Sat, 9 May 2026 15:43:19 +0530 Subject: [PATCH 1/5] Assessment (HotFix): Gemini Batch Fix --- backend/app/crud/assessment/batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/app/crud/assessment/batch.py b/backend/app/crud/assessment/batch.py index 7b5966d5e..b45603853 100644 --- a/backend/app/crud/assessment/batch.py +++ b/backend/app/crud/assessment/batch.py @@ -332,7 +332,7 @@ def build_google_jsonl( jsonl_data.append( { - "metadata": {"key": f"row_{idx}"}, + "key": f"row_{idx}", "request": request, } ) From 15ad20db42c9ffa32c4f8a301f9d70cb257149b3 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Tue, 12 May 2026 10:46:04 +0530 Subject: [PATCH 2/5] Add dataset preview functionality and update related models and endpoints --- .../app/api/docs/assessment/get_dataset.md | 4 + backend/app/api/routes/assessment/datasets.py | 37 ++++++- backend/app/models/assessment.py | 10 ++ backend/app/services/assessment/dataset.py | 104 ++++++++++++++++++ backend/app/tests/assessment/test_batch.py | 2 +- 5 files changed, 154 insertions(+), 3 deletions(-) diff --git a/backend/app/api/docs/assessment/get_dataset.md b/backend/app/api/docs/assessment/get_dataset.md index 5ba766d5b..a359a3d23 100644 --- a/backend/app/api/docs/assessment/get_dataset.md +++ b/backend/app/api/docs/assessment/get_dataset.md @@ -1,3 +1,7 @@ Get a single assessment dataset by ID. Optionally include a signed URL to download the original uploaded file. + +Pass `limit_rows=N` (1-100) to additionally include a lightweight preview +of the dataset's column headers and the first N data rows. When omitted, +the underlying file is not fetched and the response stays small. diff --git a/backend/app/api/routes/assessment/datasets.py b/backend/app/api/routes/assessment/datasets.py index d4e71d184..fd189823a 100644 --- a/backend/app/api/routes/assessment/datasets.py +++ b/backend/app/api/routes/assessment/datasets.py @@ -12,8 +12,14 @@ get_assessment_dataset_by_id, list_assessment_datasets, ) -from app.models.assessment import AssessmentDatasetResponse +from app.models.assessment import ( + AssessmentDatasetPreview, + AssessmentDatasetResponse, +) from app.models.evaluation import EvaluationDataset +from app.services.assessment.dataset import ( + preview_dataset as preview_assessment_dataset, +) from app.services.assessment.dataset import upload_dataset as upload_assessment_dataset from app.services.assessment.validators import validate_dataset_file from app.utils import APIResponse, load_description @@ -26,6 +32,7 @@ def _dataset_to_response( dataset: EvaluationDataset, signed_url: str | None = None, + preview: AssessmentDatasetPreview | None = None, ) -> AssessmentDatasetResponse: metadata = dataset.dataset_metadata or {} return AssessmentDatasetResponse( @@ -36,6 +43,7 @@ def _dataset_to_response( file_extension=metadata.get("file_extension"), object_store_url=dataset.object_store_url, signed_url=signed_url, + preview=preview, ) @@ -111,6 +119,16 @@ def get_dataset( include_signed_url: bool = Query( False, description="Include a signed URL for downloading the raw file from S3" ), + limit_rows: int + | None = Query( + None, + ge=1, + le=100, + description=( + "If set, fetch the underlying file and include a preview of the first " + "N data rows plus column headers. Skip to avoid the file download." + ), + ), ) -> APIResponse[AssessmentDatasetResponse]: """Get a specific assessment dataset.""" dataset = get_assessment_dataset_by_id( @@ -127,8 +145,23 @@ def get_dataset( ) signed_url = storage.get_signed_url(dataset.object_store_url) + preview: AssessmentDatasetPreview | None = None + if limit_rows is not None: + headers, rows = preview_assessment_dataset( + session=session, + dataset=dataset, + project_id=auth_context.project_.id, + limit=limit_rows, + ) + preview = AssessmentDatasetPreview( + headers=headers, + rows=rows, + returned_rows=len(rows), + truncated=len(rows) >= limit_rows, + ) + return APIResponse.success_response( - data=_dataset_to_response(dataset, signed_url=signed_url) + data=_dataset_to_response(dataset, signed_url=signed_url, preview=preview) ) diff --git a/backend/app/models/assessment.py b/backend/app/models/assessment.py index 78035a738..25ac0f00e 100644 --- a/backend/app/models/assessment.py +++ b/backend/app/models/assessment.py @@ -333,6 +333,15 @@ class AssessmentExportRow(BaseModel): updated_at: datetime +class AssessmentDatasetPreview(BaseModel): + """Lightweight preview of a dataset's columns and first N rows.""" + + headers: list[str] + rows: list[list[str]] + returned_rows: int = 0 + truncated: bool = False + + class AssessmentDatasetResponse(BaseModel): """Response model for assessment dataset.""" @@ -343,3 +352,4 @@ class AssessmentDatasetResponse(BaseModel): file_extension: str | None = None object_store_url: str | None = None signed_url: str | None = None + preview: AssessmentDatasetPreview | None = None diff --git a/backend/app/services/assessment/dataset.py b/backend/app/services/assessment/dataset.py index 943fb34d4..a98f0d91a 100644 --- a/backend/app/services/assessment/dataset.py +++ b/backend/app/services/assessment/dataset.py @@ -125,6 +125,110 @@ def _count_rows(content: bytes, file_ext: str) -> int: return _count_csv_rows(content) +def _stringify(value: object) -> str: + if value is None: + return "" + return str(value) + + +def _preview_csv(content: bytes, limit: int) -> tuple[list[str], list[list[str]]]: + for encoding in ("utf-8-sig", "utf-8", "latin-1"): + try: + text = content.decode(encoding) + break + except (UnicodeDecodeError, ValueError): + continue + else: + text = content.decode("utf-8", errors="replace") + + reader = csv.reader(io.StringIO(text)) + header = next(reader, None) or [] + headers = [_stringify(cell) for cell in header] + + rows: list[list[str]] = [] + for row in reader: + if not any(cell.strip() for cell in row): + continue + rows.append([_stringify(cell) for cell in row]) + if len(rows) >= limit: + break + return headers, rows + + +def _preview_excel(content: bytes, limit: int) -> tuple[list[str], list[list[str]]]: + import openpyxl + + wb = None + try: + wb = openpyxl.load_workbook(io.BytesIO(content), read_only=True, data_only=True) + ws = wb.active + if ws is None: + return [], [] + + rows_iter = ws.iter_rows(values_only=True) + header = next(rows_iter, None) or () + headers = [_stringify(cell) for cell in header] + + rows: list[list[str]] = [] + for row in rows_iter: + if not row or not any(cell is not None for cell in row): + continue + rows.append([_stringify(cell) for cell in row]) + if len(rows) >= limit: + break + return headers, rows + finally: + if wb is not None: + wb.close() + + +def preview_dataset( + session: Session, + dataset: EvaluationDataset, + project_id: int, + limit: int, +) -> tuple[list[str], list[list[str]]]: + """Return the first `limit` data rows (plus header) of a dataset file.""" + if not dataset.object_store_url: + raise HTTPException( + status_code=404, detail="Dataset has no underlying file to preview." + ) + + file_ext = (dataset.dataset_metadata or {}).get("file_extension") + if file_ext == ".xls": + raise HTTPException( + status_code=422, + detail="Legacy Excel format (.xls) is not supported.", + ) + + storage = get_cloud_storage(session=session, project_id=project_id) + try: + content = storage.get(dataset.object_store_url) + except Exception as e: + logger.warning( + f"[preview_dataset] Failed to fetch file | dataset_id={dataset.id} | {e}", + exc_info=True, + ) + raise HTTPException( + status_code=502, detail="Failed to fetch dataset file from storage." + ) from e + + try: + if file_ext == ".xlsx": + return _preview_excel(content, limit) + return _preview_csv(content, limit) + except InvalidFileException as e: + raise HTTPException(status_code=422, detail="Invalid XLSX file content.") from e + except Exception as e: + logger.warning( + f"[preview_dataset] Failed to parse file | dataset_id={dataset.id} | {e}", + exc_info=True, + ) + raise HTTPException( + status_code=422, detail="Unable to parse dataset file for preview." + ) from e + + def upload_dataset( session: Session, file_content: bytes, diff --git a/backend/app/tests/assessment/test_batch.py b/backend/app/tests/assessment/test_batch.py index b91e59b2c..6d524e81f 100644 --- a/backend/app/tests/assessment/test_batch.py +++ b/backend/app/tests/assessment/test_batch.py @@ -419,7 +419,7 @@ def test_build_openai_and_google_jsonl(self) -> None: google_params={"temperature": 0.2, "instructions": "system"}, ) assert len(google_jsonl) == 1 - assert google_jsonl[0]["metadata"]["key"] == "row_0" + assert google_jsonl[0]["key"] == "row_0" assert google_jsonl[0]["request"]["systemInstruction"] == { "parts": [{"text": "system"}] } From fa5e47636f60ae4d5d0b26dd863ff3e1e388f9f3 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Tue, 12 May 2026 12:10:05 +0530 Subject: [PATCH 3/5] Refactor: Update limit_rows parameter type to use Annotated for better validation --- backend/app/api/routes/assessment/datasets.py | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/backend/app/api/routes/assessment/datasets.py b/backend/app/api/routes/assessment/datasets.py index fd189823a..22f000c4d 100644 --- a/backend/app/api/routes/assessment/datasets.py +++ b/backend/app/api/routes/assessment/datasets.py @@ -1,6 +1,7 @@ """Assessment dataset endpoints.""" import logging +from typing import Annotated from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, UploadFile @@ -119,16 +120,18 @@ def get_dataset( include_signed_url: bool = Query( False, description="Include a signed URL for downloading the raw file from S3" ), - limit_rows: int - | None = Query( - None, - ge=1, - le=100, - description=( - "If set, fetch the underlying file and include a preview of the first " - "N data rows plus column headers. Skip to avoid the file download." + limit_rows: Annotated[ + int | None, + Query( + ge=1, + le=100, + description=( + "If set, fetch the underlying file and include a preview of the " + "first N data rows plus column headers. Skip to avoid the file " + "download." + ), ), - ), + ] = None, ) -> APIResponse[AssessmentDatasetResponse]: """Get a specific assessment dataset.""" dataset = get_assessment_dataset_by_id( From 0a20a8b37411d7121a6c32fd3cc57054597117af Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Tue, 12 May 2026 19:20:18 +0530 Subject: [PATCH 4/5] Add dataset preview tests and enhance get_dataset functionality to include preview with limit_rows --- backend/app/tests/assessment/test_dataset.py | 120 +++++++++++++++++++ backend/app/tests/assessment/test_routes.py | 33 +++++ 2 files changed, 153 insertions(+) diff --git a/backend/app/tests/assessment/test_dataset.py b/backend/app/tests/assessment/test_dataset.py index ceca9c854..d40910281 100644 --- a/backend/app/tests/assessment/test_dataset.py +++ b/backend/app/tests/assessment/test_dataset.py @@ -10,6 +10,9 @@ _count_csv_rows, _count_excel_rows, _count_rows, + _preview_csv, + _preview_excel, + preview_dataset, upload_dataset, ) @@ -131,6 +134,123 @@ def test_upload_dataset_success(self) -> None: create_ds.assert_called_once() assert create_ds.call_args.kwargs["dataset_metadata"]["total_items_count"] == 2 + def test_preview_csv_returns_headers_and_rows(self) -> None: + headers, rows = _preview_csv(b"a,b\n1,2\n\n3,4\n5,6\n", limit=2) + assert headers == ["a", "b"] + assert rows == [["1", "2"], ["3", "4"]] + + def test_preview_csv_handles_latin1_fallback(self) -> None: + # \xff is invalid utf-8 -> falls back to latin-1 + headers, rows = _preview_csv(b"name\nca\xfffe\n", limit=5) + assert headers == ["name"] + assert rows and rows[0][0].startswith("ca") + + def test_preview_excel_returns_headers_and_rows(self) -> None: + import io + + import openpyxl + + wb = openpyxl.Workbook() + ws = wb.active + ws.append(["x", "y"]) + ws.append([1, 2]) + ws.append([None, None]) + ws.append([3, 4]) + buf = io.BytesIO() + wb.save(buf) + headers, rows = _preview_excel(buf.getvalue(), limit=10) + assert headers == ["x", "y"] + assert rows == [["1", "2"], ["3", "4"]] + + def test_preview_excel_empty_workbook(self) -> None: + import io + + import openpyxl + + wb = openpyxl.Workbook() + buf = io.BytesIO() + wb.save(buf) + headers, rows = _preview_excel(buf.getvalue(), limit=10) + assert headers == [""] or headers == [] + assert rows == [] + + def test_preview_dataset_missing_url_returns_404(self) -> None: + ds = MagicMock() + ds.object_store_url = None + with pytest.raises(HTTPException) as exc_info: + preview_dataset(session=MagicMock(), dataset=ds, project_id=1, limit=10) + assert exc_info.value.status_code == 404 + + def test_preview_dataset_legacy_xls_returns_422(self) -> None: + ds = MagicMock() + ds.object_store_url = "s3://x" + ds.dataset_metadata = {"file_extension": ".xls"} + with pytest.raises(HTTPException) as exc_info: + preview_dataset(session=MagicMock(), dataset=ds, project_id=1, limit=10) + assert exc_info.value.status_code == 422 + + def test_preview_dataset_storage_failure_returns_502(self) -> None: + ds = MagicMock() + ds.object_store_url = "s3://x" + ds.dataset_metadata = {"file_extension": ".csv"} + storage = MagicMock() + storage.get.side_effect = RuntimeError("boom") + with patch( + "app.services.assessment.dataset.get_cloud_storage", return_value=storage + ): + with pytest.raises(HTTPException) as exc_info: + preview_dataset(session=MagicMock(), dataset=ds, project_id=1, limit=10) + assert exc_info.value.status_code == 502 + + def test_preview_dataset_invalid_xlsx_returns_422(self) -> None: + ds = MagicMock() + ds.object_store_url = "s3://x" + ds.dataset_metadata = {"file_extension": ".xlsx"} + storage = MagicMock() + storage.get.return_value = b"not-a-real-xlsx" + with patch( + "app.services.assessment.dataset.get_cloud_storage", return_value=storage + ), patch( + "app.services.assessment.dataset._preview_excel", + side_effect=InvalidFileException("bad"), + ): + with pytest.raises(HTTPException) as exc_info: + preview_dataset(session=MagicMock(), dataset=ds, project_id=1, limit=10) + assert exc_info.value.status_code == 422 + assert "Invalid XLSX" in exc_info.value.detail + + def test_preview_dataset_parse_error_returns_422(self) -> None: + ds = MagicMock() + ds.object_store_url = "s3://x" + ds.dataset_metadata = {"file_extension": ".csv"} + storage = MagicMock() + storage.get.return_value = b"a,b\n1,2\n" + with patch( + "app.services.assessment.dataset.get_cloud_storage", return_value=storage + ), patch( + "app.services.assessment.dataset._preview_csv", + side_effect=RuntimeError("boom"), + ): + with pytest.raises(HTTPException) as exc_info: + preview_dataset(session=MagicMock(), dataset=ds, project_id=1, limit=10) + assert exc_info.value.status_code == 422 + assert "Unable to parse" in exc_info.value.detail + + def test_preview_dataset_csv_success(self) -> None: + ds = MagicMock() + ds.object_store_url = "s3://x" + ds.dataset_metadata = {"file_extension": ".csv"} + storage = MagicMock() + storage.get.return_value = b"a,b\n1,2\n3,4\n" + with patch( + "app.services.assessment.dataset.get_cloud_storage", return_value=storage + ): + headers, rows = preview_dataset( + session=MagicMock(), dataset=ds, project_id=1, limit=10 + ) + assert headers == ["a", "b"] + assert rows == [["1", "2"], ["3", "4"]] + def test_upload_dataset_object_store_failure_returns_500(self) -> None: session = MagicMock() with patch( diff --git a/backend/app/tests/assessment/test_routes.py b/backend/app/tests/assessment/test_routes.py index 0271f8a2f..979036603 100644 --- a/backend/app/tests/assessment/test_routes.py +++ b/backend/app/tests/assessment/test_routes.py @@ -156,6 +156,39 @@ def test_get_dataset_with_signed_url(self) -> None: assert resp.data is not None assert resp.data.signed_url == "signed-url" + def test_get_dataset_with_limit_rows_includes_preview(self) -> None: + with patch( + "app.api.routes.assessment.datasets.get_assessment_dataset_by_id", + return_value=_dataset(), + ), patch( + "app.api.routes.assessment.datasets.preview_assessment_dataset", + return_value=(["a", "b"], [["1", "2"], ["3", "4"]]), + ) as preview_mock: + resp = get_dataset( + 7, + session=MagicMock(), + auth_context=_auth_context(), + limit_rows=2, + ) + preview_mock.assert_called_once() + assert resp.data is not None + assert resp.data.preview is not None + assert resp.data.preview.headers == ["a", "b"] + assert resp.data.preview.returned_rows == 2 + assert resp.data.preview.truncated is True + + def test_get_dataset_without_limit_rows_skips_preview(self) -> None: + with patch( + "app.api.routes.assessment.datasets.get_assessment_dataset_by_id", + return_value=_dataset(), + ), patch( + "app.api.routes.assessment.datasets.preview_assessment_dataset" + ) as preview_mock: + resp = get_dataset(7, session=MagicMock(), auth_context=_auth_context()) + preview_mock.assert_not_called() + assert resp.data is not None + assert resp.data.preview is None + def test_delete_dataset_success_and_error(self) -> None: with patch( "app.api.routes.assessment.datasets.get_assessment_dataset_by_id", From d4c56eeb6d6cc2b7f51976c8c7ccde0247d7aabb Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Wed, 13 May 2026 07:13:50 +0530 Subject: [PATCH 5/5] Enhance preview_dataset function to validate file extensions and add corresponding tests --- backend/app/services/assessment/dataset.py | 8 ++++- backend/app/tests/assessment/test_dataset.py | 32 ++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/backend/app/services/assessment/dataset.py b/backend/app/services/assessment/dataset.py index a98f0d91a..22ebaae70 100644 --- a/backend/app/services/assessment/dataset.py +++ b/backend/app/services/assessment/dataset.py @@ -194,12 +194,18 @@ def preview_dataset( status_code=404, detail="Dataset has no underlying file to preview." ) - file_ext = (dataset.dataset_metadata or {}).get("file_extension") + raw_ext = (dataset.dataset_metadata or {}).get("file_extension") + file_ext = raw_ext.strip().lower() if isinstance(raw_ext, str) else None if file_ext == ".xls": raise HTTPException( status_code=422, detail="Legacy Excel format (.xls) is not supported.", ) + if file_ext not in {".csv", ".xlsx"}: + raise HTTPException( + status_code=422, + detail="Unsupported or missing file extension.", + ) storage = get_cloud_storage(session=session, project_id=project_id) try: diff --git a/backend/app/tests/assessment/test_dataset.py b/backend/app/tests/assessment/test_dataset.py index d40910281..2535c2acd 100644 --- a/backend/app/tests/assessment/test_dataset.py +++ b/backend/app/tests/assessment/test_dataset.py @@ -181,6 +181,38 @@ def test_preview_dataset_missing_url_returns_404(self) -> None: preview_dataset(session=MagicMock(), dataset=ds, project_id=1, limit=10) assert exc_info.value.status_code == 404 + def test_preview_dataset_missing_extension_returns_422(self) -> None: + ds = MagicMock() + ds.object_store_url = "s3://x" + ds.dataset_metadata = {} + with pytest.raises(HTTPException) as exc_info: + preview_dataset(session=MagicMock(), dataset=ds, project_id=1, limit=10) + assert exc_info.value.status_code == 422 + assert "Unsupported or missing" in exc_info.value.detail + + def test_preview_dataset_unknown_extension_returns_422(self) -> None: + ds = MagicMock() + ds.object_store_url = "s3://x" + ds.dataset_metadata = {"file_extension": ".json"} + with pytest.raises(HTTPException) as exc_info: + preview_dataset(session=MagicMock(), dataset=ds, project_id=1, limit=10) + assert exc_info.value.status_code == 422 + + def test_preview_dataset_normalizes_extension_case(self) -> None: + ds = MagicMock() + ds.object_store_url = "s3://x" + ds.dataset_metadata = {"file_extension": " .CSV "} + storage = MagicMock() + storage.get.return_value = b"a,b\n1,2\n" + with patch( + "app.services.assessment.dataset.get_cloud_storage", return_value=storage + ): + headers, rows = preview_dataset( + session=MagicMock(), dataset=ds, project_id=1, limit=10 + ) + assert headers == ["a", "b"] + assert rows == [["1", "2"]] + def test_preview_dataset_legacy_xls_returns_422(self) -> None: ds = MagicMock() ds.object_store_url = "s3://x"