Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions backend/app/api/docs/assessment/get_dataset.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
Get a single assessment dataset by ID.

Optionally include a signed URL to download the original uploaded file.

Pass `limit_rows=N` (1-100) to additionally include a lightweight preview
of the dataset's column headers and the first N data rows. When omitted,
the underlying file is not fetched and the response stays small.
40 changes: 38 additions & 2 deletions backend/app/api/routes/assessment/datasets.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Assessment dataset endpoints."""

import logging
from typing import Annotated

from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, UploadFile

Expand All @@ -12,8 +13,14 @@
get_assessment_dataset_by_id,
list_assessment_datasets,
)
from app.models.assessment import AssessmentDatasetResponse
from app.models.assessment import (
AssessmentDatasetPreview,
AssessmentDatasetResponse,
)
from app.models.evaluation import EvaluationDataset
from app.services.assessment.dataset import (
preview_dataset as preview_assessment_dataset,
)
from app.services.assessment.dataset import upload_dataset as upload_assessment_dataset
from app.services.assessment.validators import validate_dataset_file
from app.utils import APIResponse, load_description
Expand All @@ -26,6 +33,7 @@
def _dataset_to_response(
dataset: EvaluationDataset,
signed_url: str | None = None,
preview: AssessmentDatasetPreview | None = None,
) -> AssessmentDatasetResponse:
metadata = dataset.dataset_metadata or {}
return AssessmentDatasetResponse(
Expand All @@ -36,6 +44,7 @@ def _dataset_to_response(
file_extension=metadata.get("file_extension"),
object_store_url=dataset.object_store_url,
signed_url=signed_url,
preview=preview,
)


Expand Down Expand Up @@ -111,6 +120,18 @@ def get_dataset(
include_signed_url: bool = Query(
False, description="Include a signed URL for downloading the raw file from S3"
),
limit_rows: Annotated[
int | None,
Query(
ge=1,
le=100,
description=(
"If set, fetch the underlying file and include a preview of the "
"first N data rows plus column headers. Skip to avoid the file "
"download."
),
),
] = None,
) -> APIResponse[AssessmentDatasetResponse]:
"""Get a specific assessment dataset."""
dataset = get_assessment_dataset_by_id(
Expand All @@ -127,8 +148,23 @@ def get_dataset(
)
signed_url = storage.get_signed_url(dataset.object_store_url)

preview: AssessmentDatasetPreview | None = None
if limit_rows is not None:
headers, rows = preview_assessment_dataset(
session=session,
dataset=dataset,
project_id=auth_context.project_.id,
limit=limit_rows,
)
preview = AssessmentDatasetPreview(
headers=headers,
rows=rows,
returned_rows=len(rows),
truncated=len(rows) >= limit_rows,
)
Comment thread
Ayush8923 marked this conversation as resolved.

return APIResponse.success_response(
data=_dataset_to_response(dataset, signed_url=signed_url)
data=_dataset_to_response(dataset, signed_url=signed_url, preview=preview)
)


Expand Down
2 changes: 1 addition & 1 deletion backend/app/crud/assessment/batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,7 @@ def build_google_jsonl(

jsonl_data.append(
{
"metadata": {"key": f"row_{idx}"},
"key": f"row_{idx}",
"request": request,
}
)
Expand Down
10 changes: 10 additions & 0 deletions backend/app/models/assessment.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,15 @@ class AssessmentExportRow(BaseModel):
updated_at: datetime


class AssessmentDatasetPreview(BaseModel):
    """Lightweight preview of a dataset's columns and first N rows.

    Attached to the dataset response only when the caller opts in via the
    `limit_rows` query parameter; otherwise the field stays None so the
    response remains small.
    """

    # Column names taken from the file's header row.
    headers: list[str]
    # Up to `limit_rows` data rows; every cell is pre-stringified.
    rows: list[list[str]]
    # Echo of len(rows) so clients do not need to recount.
    returned_rows: int = 0
    # True when the preview filled the requested limit, i.e. the underlying
    # file may contain more rows than shown.
    truncated: bool = False


class AssessmentDatasetResponse(BaseModel):
"""Response model for assessment dataset."""

Expand All @@ -343,3 +352,4 @@ class AssessmentDatasetResponse(BaseModel):
file_extension: str | None = None
object_store_url: str | None = None
signed_url: str | None = None
preview: AssessmentDatasetPreview | None = None
110 changes: 110 additions & 0 deletions backend/app/services/assessment/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,116 @@ def _count_rows(content: bytes, file_ext: str) -> int:
return _count_csv_rows(content)


def _stringify(value: object) -> str:
if value is None:
return ""
return str(value)


def _preview_csv(content: bytes, limit: int) -> tuple[list[str], list[list[str]]]:
for encoding in ("utf-8-sig", "utf-8", "latin-1"):
try:
text = content.decode(encoding)
break
except (UnicodeDecodeError, ValueError):
continue
else:
text = content.decode("utf-8", errors="replace")

reader = csv.reader(io.StringIO(text))
header = next(reader, None) or []
headers = [_stringify(cell) for cell in header]

rows: list[list[str]] = []
for row in reader:
if not any(cell.strip() for cell in row):
continue
rows.append([_stringify(cell) for cell in row])
if len(rows) >= limit:
break
return headers, rows


def _preview_excel(content: bytes, limit: int) -> tuple[list[str], list[list[str]]]:
    """Return (headers, first ``limit`` non-empty data rows) from XLSX bytes.

    Loads the workbook in read-only, values-only mode and always closes it,
    even when parsing raises partway through.
    """
    import openpyxl

    workbook = openpyxl.load_workbook(
        io.BytesIO(content), read_only=True, data_only=True
    )
    try:
        sheet = workbook.active
        if sheet is None:
            return [], []

        value_rows = sheet.iter_rows(values_only=True)
        first = next(value_rows, None) or ()
        headers = [_stringify(cell) for cell in first]

        collected: list[list[str]] = []
        for raw in value_rows:
            # Rows whose cells are all None (or an empty tuple) count as
            # blank and are skipped.
            if raw and any(cell is not None for cell in raw):
                collected.append([_stringify(cell) for cell in raw])
                if len(collected) >= limit:
                    break
        return headers, collected
    finally:
        workbook.close()


def preview_dataset(
    session: Session,
    dataset: EvaluationDataset,
    project_id: int,
    limit: int,
) -> tuple[list[str], list[list[str]]]:
    """Return the first `limit` data rows (plus header) of a dataset file.

    Args:
        session: Database session used to resolve the project's storage backend.
        dataset: Dataset whose underlying file should be previewed.
        project_id: Project that owns the storage configuration.
        limit: Maximum number of data rows to return (header excluded).

    Raises:
        HTTPException: 404 when the dataset has no stored file, 422 for an
            unsupported extension or unparseable content, 502 when the file
            cannot be fetched from storage.
    """
    if not dataset.object_store_url:
        raise HTTPException(
            status_code=404, detail="Dataset has no underlying file to preview."
        )

    raw_ext = (dataset.dataset_metadata or {}).get("file_extension")
    file_ext = raw_ext.strip().lower() if isinstance(raw_ext, str) else None
    # Reject legacy .xls explicitly so the caller gets a targeted message
    # rather than the generic "unsupported extension" error below.
    if file_ext == ".xls":
        raise HTTPException(
            status_code=422,
            detail="Legacy Excel format (.xls) is not supported.",
        )
    if file_ext not in {".csv", ".xlsx"}:
        raise HTTPException(
            status_code=422,
            detail="Unsupported or missing file extension.",
        )

    storage = get_cloud_storage(session=session, project_id=project_id)
    try:
        content = storage.get(dataset.object_store_url)
    except Exception as e:
        # Lazy %-style args: the message is only formatted if the record is
        # actually emitted (standard logging practice; avoids eager f-strings).
        logger.warning(
            "[preview_dataset] Failed to fetch file | dataset_id=%s | %s",
            dataset.id,
            e,
            exc_info=True,
        )
        raise HTTPException(
            status_code=502, detail="Failed to fetch dataset file from storage."
        ) from e

    try:
        if file_ext == ".xlsx":
            return _preview_excel(content, limit)
        return _preview_csv(content, limit)
    except InvalidFileException as e:
        raise HTTPException(status_code=422, detail="Invalid XLSX file content.") from e
    except Exception as e:
        logger.warning(
            "[preview_dataset] Failed to parse file | dataset_id=%s | %s",
            dataset.id,
            e,
            exc_info=True,
        )
        raise HTTPException(
            status_code=422, detail="Unable to parse dataset file for preview."
        ) from e


def upload_dataset(
session: Session,
file_content: bytes,
Expand Down
2 changes: 1 addition & 1 deletion backend/app/tests/assessment/test_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,7 @@ def test_build_openai_and_google_jsonl(self) -> None:
google_params={"temperature": 0.2, "instructions": "system"},
)
assert len(google_jsonl) == 1
assert google_jsonl[0]["metadata"]["key"] == "row_0"
assert google_jsonl[0]["key"] == "row_0"
assert google_jsonl[0]["request"]["systemInstruction"] == {
"parts": [{"text": "system"}]
}
Loading
Loading