diff --git a/.env.example b/.env.example index e48ea3faa..1722e336e 100644 --- a/.env.example +++ b/.env.example @@ -133,3 +133,5 @@ HISTORY_ACCESS_PUBLISHED_DATA_GROUPS="" HISTORY_ACCESS_POLICIES_GROUPS="" HISTORY_ACCESS_DATABLOCK_GROUPS="" HISTORY_ACCESS_ATTACHMENT_GROUPS="" + +DATAFILES_METADATA_SCHEMA="datafilesMetadataSchema.example.json" diff --git a/.gitignore b/.gitignore index 13e09a0a3..9ed1a8429 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ jobConfig.yaml metricsConfig.json publishedDataConfig.json openSearchConfig.json +datafilesMetadataSchema.json # Configs .env diff --git a/datafilesMetadataSchema.example.json b/datafilesMetadataSchema.example.json new file mode 100644 index 000000000..e14942a10 --- /dev/null +++ b/datafilesMetadataSchema.example.json @@ -0,0 +1,6 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": {}, + "additionalProperties": false +} \ No newline at end of file diff --git a/docs/developer-guide/datafiles_metadata.md b/docs/developer-guide/datafiles_metadata.md new file mode 100644 index 000000000..63a45437f --- /dev/null +++ b/docs/developer-guide/datafiles_metadata.md @@ -0,0 +1,200 @@ +# Datafile Metadata + +Datafile objects can carry optional file-specific metadata in the `metadata` +field. This field is available on each entry of an origdatablock +`dataFileList`. + +```json +{ + "path": "raw/run-0001.nxs", + "size": 1048576, + "time": "2026-06-02T08:00:00Z", + "chk": "2cf24dba5fb0a30e26e83b2ac5b9e29e", + "metadata": { + "duration": 12.5, + "measurement_type": "scan" + } +} +``` + +The backend stores `metadata` as a JSON object: + +```ts +metadata?: Record<string, unknown>; +``` + +>**Important**: +>This field is intended for facility-specific file-level metadata. For aggregate metadata that should be searchable and shown more prominently to users, prefer the dataset `scientificMetadata` field. Note that datafile metadata is not searchable at Dataset level. 
+ +## Configuration + +Datafile metadata validation is configured with the +`DATAFILES_METADATA_SCHEMA` environment variable. + +```sh +DATAFILES_METADATA_SCHEMA="datafilesMetadataSchema.example.json" +``` + +The environment variable points to a JSON Schema file. During application +configuration, `src/config/configuration.ts` reads and parses that file, then +exposes the parsed schema object through the Nest configuration key +`datafilesMetadataSchema`. + +If `DATAFILES_METADATA_SCHEMA` is not set, `configuration.ts` uses +`datafilesMetadataSchema.json` as the default schema path. If that file is +missing, it falls back to `datafilesMetadataSchema.example.json`. The default +schema shipped with the backend is closed and rejects non-empty metadata until a +facility configures allowed fields. If both the default schema file and example +schema file are missing, the configuration loader keeps its built-in closed schema +(`{type: "object", additionalProperties: false}`), which still rejects all non-empty metadata. + +The example schema is deliberately closed: + +```json +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": {}, + "additionalProperties": false +} +``` + +With this schema, omitted or empty datafile metadata is valid, but any +non-empty metadata object is rejected until the facility configures the allowed +metadata fields. + +## Schema Draft + +The validation pipe uses the default Ajv import, which validates draft-07 +schemas. 
Schema files should declare the draft-07 meta-schema: + +```json +{ + "$schema": "http://json-schema.org/draft-07/schema#" +} +``` + +## Example Facility Schema + +```json +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "duration": { + "type": "number", + "minimum": 0 + }, + "measurement_type": { + "type": "string", + "enum": ["scan", "calibration", "dark"] + }, + "detector": { + "type": "object", + "properties": { + "name": { "type": "string" }, + "distance_mm": { "type": "number" } + }, + "required": ["name"], + "additionalProperties": false + } + }, + "required": ["measurement_type"], + "additionalProperties": false +} +``` + +To allow arbitrary top-level metadata keys, configure +`additionalProperties: true` explicitly or simply remove it entirely: + +```json +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object" +} +``` + +For nested objects, define `additionalProperties` in the nested schema too if +unknown nested keys should be rejected. + +## Validation Mechanism + +Validation is implemented by `DatafilesMetadataValidationPipe` in +`src/origdatablocks/pipes/datafiles-metadata-validation.pipe.ts`. + +For each request body handled by this pipe: + +1. The pipe reads the configured schema. If none is configured, it falls back to: + ```json + { + "type": "object", + "additionalProperties": false + } + ``` +2. If the request body has no `dataFileList`, the pipe returns without metadata + validation. This allows partial update bodies that do not touch files. + +3. Each entry in `dataFileList` is checked. +4. Each datafile `metadata` value is validated. If a datafile omits `metadata`, + the pipe validates an empty object (`{}`). +5. Invalid metadata causes HTTP 400 with the validation error details. +6. A schema that cannot be compiled by Ajv causes HTTP 500 because it is a + server configuration problem. 
+ +Invalid metadata produces an error like: + +```text +Datafile metadata is not following the configured schema: metadata/duration must be number +``` + +## Validated Routes + +The pipe validates origdatablock request bodies on routes where +`@UsePipes(DatafilesMetadataValidationPipe)` is applied. + +Current covered routes: + +- `POST /origdatablocks` in the v3 controller +- `PATCH /origdatablocks/:id` in the v3 controller +- `POST /origdatablocks` in the v4 controller +- `POST /origdatablocks/isValid` in the v4 controller +- `PATCH /origdatablocks/:id` in the v4 controller + +Any future route that accepts origdatablock `dataFileList` input must also use +this pipe if datafile metadata should be validated there. + +## Request Examples + +Accepted when the configured schema allows `duration` and +`measurement_type`: + +```json +{ + "datasetId": "20.500.12345/example-dataset", + "size": 1048576, + "dataFileList": [ + { + "path": "raw/run-0001.nxs", + "size": 1048576, + "time": "2026-06-02T08:00:00Z", + "metadata": { + "duration": 12.5, + "measurement_type": "scan" + } + } + ] +} +``` + +Rejected by the closed example schema: + +```json +{ + "path": "raw/run-0001.nxs", + "size": 1048576, + "time": "2026-06-02T08:00:00Z", + "metadata": { + "operator_comment": "extra key" + } +} +``` \ No newline at end of file diff --git a/src/common/dto/datafile.dto.ts b/src/common/dto/datafile.dto.ts index a80e77ae2..e0bdd5db7 100644 --- a/src/common/dto/datafile.dto.ts +++ b/src/common/dto/datafile.dto.ts @@ -1,5 +1,11 @@ import { ApiProperty } from "@nestjs/swagger"; -import { IsDateString, IsNumber, IsOptional, IsString } from "class-validator"; +import { + IsDateString, + IsNumber, + IsOptional, + IsString, + IsObject, +} from "class-validator"; export class DataFileDto { @ApiProperty({ @@ -72,4 +78,14 @@ export class DataFileDto { @IsString() @IsOptional() readonly type: string; + + @ApiProperty({ + type: Object, + required: false, + description: + "File-specific metadata. 
The Dataset field scientificMetadata should be preferred for aggregate metadata, as it is searchable and displayed more prominently to users.", + }) + @IsObject() + @IsOptional() + readonly metadata: Record; } diff --git a/src/common/interfaces/common.interface.ts b/src/common/interfaces/common.interface.ts index bea0aa5e0..6b22eb456 100644 --- a/src/common/interfaces/common.interface.ts +++ b/src/common/interfaces/common.interface.ts @@ -67,6 +67,7 @@ export interface IDatafileFilter { gid?: string; perm?: string; type?: string; + metadata?: Record; } export type IFiltersV4 = Pick< diff --git a/src/common/schemas/datafile.schema.ts b/src/common/schemas/datafile.schema.ts index 247d9a68c..e7e503e02 100644 --- a/src/common/schemas/datafile.schema.ts +++ b/src/common/schemas/datafile.schema.ts @@ -94,6 +94,18 @@ export class DataFile { required: false, }) type?: string; + + @ApiProperty({ + type: Object, + required: false, + description: + "File-specific metadata. The Dataset field scientificMetadata should be preferred for aggregate metadata, as it is searchable and displayed more prominently to users.", + }) + @Prop({ + type: Object, + required: false, + }) + metadata?: Record; } export const DataFileSchema = SchemaFactory.createForClass(DataFile); diff --git a/src/config/configuration.ts b/src/config/configuration.ts index 54451bc08..36e38ed73 100644 --- a/src/config/configuration.ts +++ b/src/config/configuration.ts @@ -75,6 +75,7 @@ const configuration = () => { datasetTypes: {}, proposalTypes: {}, opensearchConfig: {}, + datafilesMetadataSchema: { type: "object", additionalProperties: false }, }; const jsonConfigFileList: { [key: string]: string } = { frontendConfig: @@ -89,6 +90,8 @@ const configuration = () => { process.env.PUBLISHED_DATA_CONFIG_FILE || "publishedDataConfig.json", opensearchConfig: process.env.OPENSEARCH_CONFIG_FILE || "opensearchConfig.json", + datafilesMetadataSchema: + process.env.DATAFILES_METADATA_SCHEMA || 
"datafilesMetadataSchema.json", }; Object.keys(jsonConfigFileList).forEach((key) => { const filePath = jsonConfigFileList[key]; @@ -106,6 +109,7 @@ const configuration = () => { const configsWithExampleFallback = [ "publishedDataConfig", "opensearchConfig", + "datafilesMetadataSchema", ]; if (configsWithExampleFallback.includes(key)) { console.warn( @@ -458,6 +462,7 @@ const configuration = () => { publishedDataConfig: jsonConfigMap.publishedDataConfig, ajvCustomDefinitions: ajvCustomDefinitions, opensearchConfig: jsonConfigMap.opensearchConfig, + datafilesMetadataSchema: jsonConfigMap.datafilesMetadataSchema, }; return merge(config, localconfiguration); }; diff --git a/src/datablocks/datablocks.service.spec.ts b/src/datablocks/datablocks.service.spec.ts index c8881f80f..e3ed4a796 100644 --- a/src/datablocks/datablocks.service.spec.ts +++ b/src/datablocks/datablocks.service.spec.ts @@ -29,6 +29,9 @@ const mockDatablock: Datablock = { uid: "testUid", gid: "testGid", perm: "testPerm", + metadata: { + key: "value", + }, }, ], }; diff --git a/src/origdatablocks/origdatablocks.controller.ts b/src/origdatablocks/origdatablocks.controller.ts index 33d99e5dd..9833b5015 100644 --- a/src/origdatablocks/origdatablocks.controller.ts +++ b/src/origdatablocks/origdatablocks.controller.ts @@ -13,6 +13,7 @@ import { Req, ForbiddenException, NotFoundException, + UsePipes, } from "@nestjs/common"; import { Request } from "express"; import { OrigDatablocksService } from "./origdatablocks.service"; @@ -48,6 +49,7 @@ import { CreateRawDatasetObsoleteDto } from "src/datasets/dto/create-raw-dataset import { CreateDerivedDatasetObsoleteDto } from "src/datasets/dto/create-derived-dataset-obsolete.dto"; import { logger } from "@user-office-software/duo-logger"; import { FullFacetFilters, FullFacetResponse } from "src/common/types"; +import { DatafilesMetadataValidationPipe } from "src/origdatablocks/pipes/datafiles-metadata-validation.pipe"; @ApiBearerAuth() @ApiTags("origdatablocks") @@ 
-163,6 +165,7 @@ export class OrigDatablocksController { // POST /origdatablocks @UseGuards(PoliciesGuard) + @UsePipes(DatafilesMetadataValidationPipe) @CheckPolicies("origdatablocks", (ability: AppAbility) => ability.can(Action.OrigdatablockCreate, OrigDatablock), ) @@ -619,6 +622,7 @@ export class OrigDatablocksController { @CheckPolicies("origdatablocks", (ability: AppAbility) => ability.can(Action.OrigdatablockUpdate, OrigDatablock), ) + @UsePipes(DatafilesMetadataValidationPipe) @Patch("/:id") @ApiOperation({ summary: "It updates the origdatablock.", diff --git a/src/origdatablocks/origdatablocks.module.ts b/src/origdatablocks/origdatablocks.module.ts index c9224fde3..39b8a7c0a 100644 --- a/src/origdatablocks/origdatablocks.module.ts +++ b/src/origdatablocks/origdatablocks.module.ts @@ -10,6 +10,7 @@ import { OrigDatablocksPublicV4Controller } from "./origdatablocks-public.v4.con import { OrigDatablocksV4Controller } from "./origdatablocks.v4.controller"; import { CaslModule } from "src/casl/casl.module"; import { DatasetsModule } from "src/datasets/datasets.module"; +import { DatafilesMetadataValidationPipe } from "./pipes/datafiles-metadata-validation.pipe"; @Module({ imports: [ @@ -28,6 +29,6 @@ import { DatasetsModule } from "src/datasets/datasets.module"; OrigDatablocksV4Controller, ], exports: [OrigDatablocksService], - providers: [OrigDatablocksService], + providers: [OrigDatablocksService, DatafilesMetadataValidationPipe], }) export class OrigDatablocksModule {} diff --git a/src/origdatablocks/origdatablocks.v4.controller.ts b/src/origdatablocks/origdatablocks.v4.controller.ts index ac621fff6..8816c24da 100644 --- a/src/origdatablocks/origdatablocks.v4.controller.ts +++ b/src/origdatablocks/origdatablocks.v4.controller.ts @@ -14,6 +14,7 @@ import { ForbiddenException, NotFoundException, InternalServerErrorException, + UsePipes, } from "@nestjs/common"; import { Request } from "express"; import { OrigDatablocksService } from "./origdatablocks.service"; 
@@ -67,6 +68,7 @@ import { } from "./types/origdatablock-lookup"; import { IncludeValidationPipe } from "src/common/pipes/include-validation.pipe"; import { FilterValidationPipe } from "src/common/pipes/filter-validation.pipe"; +import { DatafilesMetadataValidationPipe } from "./pipes/datafiles-metadata-validation.pipe"; import { parseDate } from "src/common/utils"; @ApiBearerAuth() @@ -387,6 +389,7 @@ export class OrigDatablocksV4Controller { @CheckPolicies("origdatablocks", (ability: AppAbility) => ability.can(Action.OrigdatablockCreate, OrigDatablock), ) + @UsePipes(DatafilesMetadataValidationPipe) @HttpCode(HttpStatus.CREATED) @Post() @ApiOperation({ @@ -444,6 +447,7 @@ export class OrigDatablocksV4Controller { @CheckPolicies("origdatablocks", (ability: AppAbility) => ability.can(Action.OrigdatablockCreate, OrigDatablock), ) + @UsePipes(DatafilesMetadataValidationPipe) @HttpCode(HttpStatus.OK) @Post("/isValid") @ApiOperation({ @@ -747,6 +751,7 @@ export class OrigDatablocksV4Controller { @CheckPolicies("origdatablocks", (ability: AppAbility) => ability.can(Action.OrigdatablockUpdate, OrigDatablock), ) + @UsePipes(DatafilesMetadataValidationPipe) @Patch("/:id") @ApiOperation({ summary: "It updates the origdatablock",
diff --git a/src/origdatablocks/pipes/datafiles-metadata-validation.pipe.spec.ts b/src/origdatablocks/pipes/datafiles-metadata-validation.pipe.spec.ts
new file mode 100644
index 000000000..f9ab2f640
--- /dev/null
+++ b/src/origdatablocks/pipes/datafiles-metadata-validation.pipe.spec.ts
@@ -0,0 +1,195 @@
+import { BadRequestException } from "@nestjs/common";
+import { ConfigService } from "@nestjs/config";
+import { DatafilesMetadataValidationPipe } from "./datafiles-metadata-validation.pipe";
+
+// Input type accepted by the pipe, derived from its `transform` signature.
+type OrigdatablockInput = Parameters<
+  DatafilesMetadataValidationPipe["transform"]
+>[0];
+
+// Open schema: declared keys are type-checked, undeclared keys are allowed.
+const schema = {
+  $schema: "http://json-schema.org/draft-07/schema#",
+  type: "object",
+  properties: {
+    duration: { type: "number" },
+    measurement_type: { type: "string" },
+  },
+};
+
+// Builds the pipe around a stubbed ConfigService that only knows the
+// `datafilesMetadataSchema` key; omit the argument to simulate a missing schema.
+const createPipe = (
+  datafilesMetadataSchema?: Record<string, unknown>,
+): DatafilesMetadataValidationPipe => {
+  const configService = {
+    get: jest.fn((key: string) =>
+      key === "datafilesMetadataSchema" ? datafilesMetadataSchema : undefined,
+    ),
+  } as unknown as ConfigService;
+
+  return new DatafilesMetadataValidationPipe(configService);
+};
+
+// Minimal origdatablock body with a single datafile; `metadata` is only set
+// on the datafile when it is passed in.
+const createOrigdatablockDto = (metadata?: Record<string, unknown>) =>
+  ({
+    datasetId: "test-dataset",
+    size: 1,
+    dataFileList: [
+      {
+        path: "file.nxs",
+        size: 10000,
+        time: "2026-06-01T00:00:00.000Z",
+        ...(metadata === undefined ? {} : { metadata }),
+      },
+    ],
+  }) as unknown as OrigdatablockInput;
+
+// Asserts that `callback` throws a BadRequestException whose message contains
+// `message`; fails when nothing is thrown.
+const expectBadRequest = (callback: () => void, message: string) => {
+  let thrownError: unknown;
+
+  try {
+    callback();
+  } catch (error) {
+    thrownError = error;
+  }
+
+  expect(thrownError).toBeInstanceOf(BadRequestException);
+  expect((thrownError as BadRequestException).message).toContain(message);
+};
+
+describe("DatafilesMetadataValidationPipe", () => {
+  afterEach(() => {
+    jest.restoreAllMocks();
+  });
+
+  it("should allow datafiles without metadata when no schema is configured", () => {
+    const pipe = createPipe();
+    const dto = createOrigdatablockDto();
+
+    expect(pipe.transform(dto)).toEqual(dto);
+  });
+
+  it("should reject datafile metadata when no schema is configured", () => {
+    const pipe = createPipe();
+    const dto = createOrigdatablockDto({ duration: 12.5 });
+
+    expectBadRequest(
+      () => pipe.transform(dto),
+      "metadata must NOT have additional properties",
+    );
+  });
+
+  it("should allow datafile metadata that matches the configured schema", () => {
+    const pipe = createPipe(schema);
+    const dto = createOrigdatablockDto({
+      duration: 12.5,
+      measurement_type: "scan",
+    });
+
+    expect(pipe.transform(dto)).toEqual(dto);
+  });
+
+  it("should reject datafile metadata that does not match the configured schema", () => {
+    const pipe = createPipe(schema);
+    const dto = createOrigdatablockDto({
+      duration: "12.5",
+      measurement_type: "scan",
+    });
+
+    expectBadRequest(
+      () => pipe.transform(dto),
+      "Datafile metadata is not following the configured schema",
+    );
+  });
+
+  it("should allow undeclared top-level metadata keys by default", () => {
+    // No `additionalProperties` keyword: JSON Schema allows extra keys.
+    const pipe = createPipe({
+      type: "object",
+      properties: {
+        duration: { type: "number" },
+      },
+    });
+    const dto = createOrigdatablockDto({
+      duration: 12.5,
+      operator_comment: "extra field",
+    });
+
+    expect(pipe.transform(dto)).toEqual(dto);
+  });
+
+  it("should allow undeclared top-level metadata keys when additionalProperties is true", () => {
+    const pipe = createPipe({
+      type: "object",
+      properties: {
+        duration: { type: "number" },
+      },
+      additionalProperties: true,
+    });
+    const dto = createOrigdatablockDto({
+      duration: 12.5,
+      operator_comment: "extra field",
+    });
+
+    expect(pipe.transform(dto)).toEqual(dto);
+  });
+
+  it("should reject non-empty metadata with default schema", () => {
+    const pipe = createPipe();
+    const dto = createOrigdatablockDto({
+      operator_comment: "extra field",
+    });
+
+    expectBadRequest(
+      () => pipe.transform(dto),
+      "Datafile metadata is not following the configured schema",
+    );
+  });
+
+  it("should allow patch Origdatablock bodies without dataFileList / no validation", () => {
+    const pipe = createPipe(schema);
+    const dto = {
+      ownerGroup: "group1",
+    } as unknown as OrigdatablockInput;
+
+    expect(pipe.transform(dto)).toEqual(dto);
+  });
+
+  it("should validate every datafile in dataFileList", () => {
+    const pipe = createPipe(schema);
+    const dto = {
+      datasetId: "test-dataset",
+      size: 2,
+      dataFileList: [
+        {
+          path: "valid.nxs",
+          size: 1,
+          time: "2026-01-01T00:00:00.000Z",
+          metadata: {
+            duration: 12.5,
+            measurement_type: "scan",
+          },
+        },
+        {
+          path: "invalid.nxs",
+          size: 1,
+          time: "2026-01-02T00:00:00.000Z",
+          metadata: {
+            duration: "12.5",
+            measurement_type: "scan",
+          },
+        },
+      ],
+    } as unknown as OrigdatablockInput;
+
+    expectBadRequest(
+      () => pipe.transform(dto),
+      "Datafile metadata is not following the configured schema",
+    );
+  });
+});
diff --git a/src/origdatablocks/pipes/datafiles-metadata-validation.pipe.ts b/src/origdatablocks/pipes/datafiles-metadata-validation.pipe.ts
new file mode 100644
index 000000000..ebf794110
--- /dev/null
+++ 
b/src/origdatablocks/pipes/datafiles-metadata-validation.pipe.ts
@@ -0,0 +1,143 @@
+import Ajv, { ValidateFunction } from "ajv";
+import addFormats from "ajv-formats";
+import {
+  ArgumentMetadata,
+  BadRequestException,
+  InternalServerErrorException,
+  PipeTransform,
+  Injectable,
+} from "@nestjs/common";
+import { CreateOrigDatablockDto } from "../dto/create-origdatablock.dto";
+import {
+  UpdateOrigDatablockDto,
+  PartialUpdateOrigDatablockDto,
+} from "../dto/update-origdatablock.dto";
+import { CreateDatasetOrigDatablockDto } from "../dto/create-dataset-origdatablock";
+
+import { ConfigService } from "@nestjs/config";
+
+type OrigdatablockDto =
+  | CreateOrigDatablockDto
+  | UpdateOrigDatablockDto
+  | PartialUpdateOrigDatablockDto
+  | CreateDatasetOrigDatablockDto;
+
+// Closed fallback schema used when no `datafilesMetadataSchema` is configured:
+// only omitted or empty metadata passes.
+const CLOSED_METADATA_SCHEMA: Record<string, unknown> = {
+  type: "object",
+  additionalProperties: false,
+};
+
+/**
+ * Validates the `metadata` of every entry in an origdatablock `dataFileList`
+ * against the JSON Schema stored under the `datafilesMetadataSchema`
+ * configuration key.
+ */
+@Injectable()
+export class DatafilesMetadataValidationPipe implements PipeTransform<
+  OrigdatablockDto,
+  OrigdatablockDto
+> {
+  private readonly ajv: Ajv;
+
+  // Compiled on first use and then reused, so the schema is not recompiled
+  // for every request handled by this pipe instance.
+  private validateMetadata?: ValidateFunction;
+
+  constructor(private readonly configService: ConfigService) {
+    this.ajv = new Ajv({
+      allErrors: true,
+      strict: false,
+    });
+    addFormats(this.ajv);
+  }
+
+  /**
+   * Checks each datafile's metadata and hands the DTO on as a JSON clone.
+   *
+   * @param origdatablockDto - value of the handler argument being piped
+   * @param metadata - Nest argument metadata; optional so direct calls that
+   *   pass only the DTO keep working
+   * @returns a JSON round-tripped copy of a request body; any non-body or
+   *   non-object argument is returned untouched
+   * @throws BadRequestException when `dataFileList` is not an array or a
+   *   datafile's metadata does not match the schema
+   * @throws InternalServerErrorException when the schema cannot be compiled
+   */
+  transform(
+    origdatablockDto: OrigdatablockDto,
+    metadata?: ArgumentMetadata,
+  ): OrigdatablockDto {
+    // A handler-level `@UsePipes` also feeds route params and query values
+    // through this pipe. Skip them, and skip non-object values entirely:
+    // `JSON.parse(JSON.stringify(undefined))` would throw a SyntaxError.
+    if (
+      (metadata !== undefined && metadata.type !== "body") ||
+      origdatablockDto === null ||
+      typeof origdatablockDto !== "object"
+    ) {
+      return origdatablockDto;
+    }
+
+    const jsonDto: OrigdatablockDto = JSON.parse(
+      JSON.stringify(origdatablockDto),
+    );
+    const datafiles = origdatablockDto.dataFileList;
+
+    // Partial updates that do not touch files carry no `dataFileList`.
+    if (!datafiles) {
+      return jsonDto;
+    }
+    if (!Array.isArray(datafiles)) {
+      throw new BadRequestException("dataFileList must be an array");
+    }
+
+    const validateMetadata = this.getMetadataValidator();
+
+    for (const datafile of datafiles) {
+      // Omitted metadata is validated as an empty object.
+      const jsonMetadata = JSON.parse(JSON.stringify(datafile?.metadata ?? {}));
+      if (!validateMetadata(jsonMetadata)) {
+        const validationErrors = this.ajv.errorsText(validateMetadata.errors, {
+          dataVar: "metadata",
+          separator: "; ",
+        });
+        throw new BadRequestException(
+          `Datafile metadata is not following the configured schema: ${validationErrors}`,
+        );
+      }
+    }
+    return jsonDto;
+  }
+
+  // Returns the cached validator, compiling the configured schema (or the
+  // closed fallback) on first use. A failed compile is not cached, so every
+  // affected request keeps reporting the configuration error.
+  private getMetadataValidator(): ValidateFunction {
+    if (this.validateMetadata === undefined) {
+      const schema =
+        this.configService.get<Record<string, unknown>>(
+          "datafilesMetadataSchema",
+        ) ?? CLOSED_METADATA_SCHEMA;
+      this.validateMetadata = this.compileSchema(this.ajv, schema);
+    }
+    return this.validateMetadata;
+  }
+
+  // Compiles `schema`, turning an Ajv compile failure into HTTP 500 because a
+  // broken schema is a server configuration problem, not a client error.
+  private compileSchema(
+    ajv: Ajv,
+    schema: Record<string, unknown>,
+  ): ValidateFunction {
+    try {
+      return ajv.compile(schema);
+    } catch (error) {
+      const message = error instanceof Error ? error.message : `${error}`;
+      throw new InternalServerErrorException(
+        `Datafile metadata schema file could not be compiled: ${message}`,
+      );
+    }
+  }
+}