Merge branch 'release/v0.25.0'

ONSdigital · Mar 30, 2023 · 3b7b745 · 3b7b745
2 parents f098b27 + 048662c
commit 3b7b745
Show file tree

Hide file tree

Showing 6 changed files with 150 additions and 49 deletions.
diff --git a/features/publish_data_dataset.feature b/features/publish_data_dataset.feature
@@ -51,9 +51,9 @@ Feature: Data extractor should listen to the relevant topic and publish extracte
       "description":  "description",
       "keywords":     [ "keyword1", "keyword2" ],
       "dimensions": [
-         { "id": "dim1", "label": "label 1 (11 categories)" },
-         { "id": "dim2", "label": "label 2 (22 categories)", "is_area_type": true },
-         { "id": "dim3", "label": "label 3 (33 categories)" }
+         { "id": "dim1", "label": "label 1 (11 categories)", "is_area_type": true },
+         { "id": "dim3.0", "label": "label 3 (33 categories)" },
+         { "id": "dim3.1", "label": "label 3 (40 categories)" }
       ]
     }
     """
@@ -77,13 +77,13 @@ Feature: Data extractor should listen to the relevant topic and publish extracte
         "Title":       "title",
         "Topics":      [],
         "PopulationType": {
+          "Key": "all-usual-residents-in-households",
+          "AggKey": "all-usual-residents-in-households###All usual residents in households",
           "Name":  "UR_HH",
-          "Label": "All usual residents in households",
-          "AggKey": "UR_HH###All usual residents in households"
+          "Label": "All usual residents in households"
         },
         "Dimensions": [
-          { "Name": "dim1", "Label": "label 1", "RawLabel": "label 1 (11 categories)", "AggKey": "dim1###label 1" },
-          { "Name": "dim3", "Label": "label 3", "RawLabel": "label 3 (33 categories)", "AggKey": "dim3###label 3"}
+          { "Key": "label-3", "AggKey": "label-3###label 3", "Name": "dim3.0,dim3.1", "Label": "label 3", "RawLabel": "label 3 (33 categories),label 3 (40 categories)"}
         ]
       }
       """
diff --git a/handler/datasets.go b/handler/datasets.go
@@ -39,7 +39,7 @@ func (h *ContentPublished) handleDatasetDataType(ctx context.Context, cpEvent *m
 	// Make a call to DatasetAPI
 	datasetMetadataPublished, err := h.DatasetCli.GetVersionMetadata(ctx, "", h.Cfg.ServiceAuthToken, cpEvent.CollectionID, datasetID, edition, version)
 	if err != nil {
-		log.Error(ctx, "cannot get dataset published contents version %s from api", err)
+		log.Error(ctx, "cannot get dataset published metadata from api", err)
 		return err
 	}
 	log.Info(ctx, "successfully obtained metadata from dataset api", log.Data{

diff --git a/models/event.go b/models/event.go
@@ -48,16 +48,18 @@ type ReleaseDateDetails struct {
 // Dimension represents the required information for each dataset dimension: name (unique ID) and label
 // and an aggregation key which combines name and label
 type Dimension struct {
+	Key      string `avro:"key"`
+	AggKey   string `avro:"agg_key"`
 	Name     string `avro:"name"`
-	RawLabel string `avro:"raw_label"`
 	Label    string `avro:"label"`
-	AggKey   string `avro:"agg_key"`
+	RawLabel string `avro:"raw_label"`
 }
 
 // PopulationType represents the population type name (unique ID) and label
 // and an aggregation key which combines name and label
 type PopulationType struct {
+	Key    string `avro:"key"`
+	AggKey string `avro:"agg_key"`
 	Name   string `avro:"name"`
 	Label  string `avro:"label"`
-	AggKey string `avro:"agg_key"`
 }
diff --git a/models/mapper_dataset.go b/models/mapper_dataset.go
@@ -88,36 +88,76 @@ func (s *SearchDataImport) PopulateCantabularFields(ctx context.Context, metadat
 		"num_dimensions": len(metadata.Dimensions)},
 	)
 
-	s.Dimensions = []Dimension{}
-	for i := range metadata.Dimensions {
+	s.Dimensions = MapDimensions(ctx, metadata.Dimensions)
+	s.PopulationType = MapPopulationType(ctx, metadata.DatasetDetails.IsBasedOn.ID)
+}
+
+// MapDimensions returns a slice of dimensions corresponding to the provided slice of dataset versionDimensions, skipping area-type dimensions.
+// The new dimensions are keyed by human friendly label. If multiple dimensions have the same key, they are collapsed into a single dimension.
+// Collapsed dimensions keep all the original names and raw labels as comma-separated values, so it is possible to know what was combined. The order of the returned slice is not guaranteed.
+func MapDimensions(ctx context.Context, dimensions []dataset.VersionDimension) []Dimension {
+	dimensionsByKey := map[string]*Dimension{}
+	for i := range dimensions {
 		// Using pointers to prevent copying lots of data.
 		// TODO consider changing type to []*VersionDimension in dp-api-clients-go
-		dim := &metadata.Dimensions[i]
+		dim := &dimensions[i]
 		if dim.IsAreaType != nil && *dim.IsAreaType {
 			continue
 		}
-		label := cleanDimensionLabel(dim.Label)
-		s.Dimensions = append(s.Dimensions, Dimension{
-			Name:     dim.ID,
-			RawLabel: dim.Label,
-			Label:    label,
-			AggKey:   aggregationKey(ctx, dim.ID, label),
-		})
+
+		lbl := cleanDimensionLabel(dim.Label)
+		k := key(lbl)
+		_, ok := dimensionsByKey[k]
+		if !ok {
+			// If no dimension with the same key exists, create a new one
+			dimensionsByKey[k] = &Dimension{
+				Key:      k,
+				AggKey:   aggregationKey(ctx, k, lbl),
+				Name:     dim.ID,
+				Label:    lbl,
+				RawLabel: dim.Label,
+			}
+		} else {
+			// If the dimension key already exists, they collapse into a single searchable dimension,
+			// but we keep the name and raw label for all the original dimensions before collapsing as csv values
+			if dim.ID != "" {
+				dimensionsByKey[k].Name += fmt.Sprintf(",%s", dim.ID)
+			}
+			if dim.Label != "" {
+				dimensionsByKey[k].RawLabel += fmt.Sprintf(",%s", dim.Label)
+			}
+		}
+	}
+
+	// efficiently create the slice to be returned from the map of dimensions
+	dims := make([]Dimension, len(dimensionsByKey))
+	i := 0
+	for _, dim := range dimensionsByKey {
+		dims[i] = *dim
+		i++
 	}
+	return dims
+}
 
-	popTypeLabel, ok := PopulationTypes[metadata.DatasetDetails.IsBasedOn.ID]
+// MapPopulationType returns the PopulationType corresponding to the provided basedOnID, which is kept as the Name.
+// The label is looked up in PopulationTypes, the key is generated from that label, and the aggregation key combines key and label.
+// If basedOnID is not a known population type, a warning is logged and the label, key and aggregation key are left empty.
+func MapPopulationType(ctx context.Context, basedOnID string) PopulationType {
+	lbl, ok := PopulationTypes[basedOnID]
 	if !ok {
 		log.Warn(ctx, "population type not identified",
 			log.Data{
-				"pop_type":    metadata.DatasetDetails.IsBasedOn.ID,
+				"pop_type":    basedOnID,
 				"valid_types": PopulationTypes,
 			},
 		)
 	}
-	s.PopulationType = PopulationType{
-		Name:   metadata.DatasetDetails.IsBasedOn.ID,
-		Label:  popTypeLabel,
-		AggKey: aggregationKey(ctx, metadata.DatasetDetails.IsBasedOn.ID, popTypeLabel),
+	k := key(lbl)
+	return PopulationType{
+		Key:    k,
+		AggKey: aggregationKey(ctx, k, lbl),
+		Name:   basedOnID,
+		Label:  lbl,
 	}
 }
 
@@ -142,18 +182,29 @@ func GetURI(metadata *dataset.Metadata) string {
 	return metadata.Version.Links.Version.URL
 }
 
-// aggregationKey generates an aggregation key from the provided name (unique ID) and label (human friendly string)
-func aggregationKey(ctx context.Context, name, label string) string {
-	if name == "" && label == "" {
+// key generates a key from the provided label by trimming surrounding whitespace, lower casing and converting spaces to hyphens
+func key(label string) string {
+	return strings.ReplaceAll(
+		strings.ToLower(
+			strings.TrimSpace(label),
+		),
+		" ", "-",
+	)
+}
+
+// aggregationKey generates an aggregation key
+// from the provided key (unique identifier) and label (human friendly string)
+func aggregationKey(ctx context.Context, key, label string) string {
+	if key == "" && label == "" {
 		return ""
 	}
 
-	if strings.Contains(name, aggSep) {
-		log.Warn(ctx, "found aggregation key separator in name", log.Data{"name": name, "separator": aggSep})
+	if strings.Contains(key, aggSep) {
+		log.Warn(ctx, "found aggregation key separator in name", log.Data{"name": key, "separator": aggSep})
 	}
 	if strings.Contains(label, aggSep) {
 		log.Warn(ctx, "found aggregation key separator in label", log.Data{"label": label, "separator": aggSep})
 	}
 
-	return fmt.Sprintf("%s%s%s", name, aggSep, label)
+	return fmt.Sprintf("%s%s%s", key, aggSep, label)
 }
diff --git a/models/mapper_dataset_test.go b/models/mapper_dataset_test.go
@@ -133,9 +133,7 @@ func TestPopulateCantabularFields(t *testing.T) {
 		})
 	})
 
-	Convey("Given a dataset metadata with is_based_on field with a cantabular type and 4 dimensions, one being area type", t, func() {
-		areaTypeTrue := true
-		areaTypeFalse := false
+	Convey("Given a dataset metadata with is_based_on field with a cantabular type with a dimension", t, func() {
 		metadata := &dataset.Metadata{
 			DatasetDetails: dataset.DatasetDetails{
 				IsBasedOn: &dataset.IsBasedOn{
@@ -144,10 +142,7 @@ func TestPopulateCantabularFields(t *testing.T) {
 			},
 			Version: dataset.Version{
 				Dimensions: []dataset.VersionDimension{
-					{ID: "dim1", Label: "label 1 (10 categories)"},
-					{ID: "dim2", Label: "label 2 (12 Categories)", IsAreaType: &areaTypeFalse},
-					{ID: "dim3", IsAreaType: &areaTypeTrue},
-					{ID: "dim4", Label: "label 4 (1 category)"},
+					{ID: "dim1", Label: "Label 1 (10 categories)"},
 				},
 			},
 		}
@@ -159,14 +154,12 @@ func TestPopulateCantabularFields(t *testing.T) {
 			}
 			s.PopulateCantabularFields(ctx, metadata)
 
-			Convey("Then only the non-area-type dimensions are populated, with the expected values", func() {
+			Convey("Then the expeced dimension is populated", func() {
 				So(*s, ShouldResemble, models.SearchDataImport{
 					Summary:  testSummary,
 					DataType: "dataset_landing_page",
 					Dimensions: []models.Dimension{
-						{Name: "dim1", RawLabel: "label 1 (10 categories)", Label: "label 1", AggKey: "dim1###label 1"},
-						{Name: "dim2", RawLabel: "label 2 (12 Categories)", Label: "label 2", AggKey: "dim2###label 2"},
-						{Name: "dim4", RawLabel: "label 4 (1 category)", Label: "label 4", AggKey: "dim4###label 4"},
+						{Key: "label-1", AggKey: "label-1###Label 1", Name: "dim1", Label: "Label 1", RawLabel: "Label 1 (10 categories)"},
 					},
 				})
 			})
@@ -196,16 +189,69 @@ func TestPopulateCantabularFields(t *testing.T) {
 					DataType:   "dataset_landing_page",
 					Dimensions: []models.Dimension{},
 					PopulationType: models.PopulationType{
+						Key:    "all-usual-residents-in-households",
+						AggKey: "all-usual-residents-in-households###All usual residents in households",
 						Name:   "UR_HH",
 						Label:  "All usual residents in households",
-						AggKey: "UR_HH###All usual residents in households",
 					},
 				})
 			})
 		})
 	})
 }
 
+func TestMapDimensions(t *testing.T) {
+	ctx := context.Background()
+
+	Convey("Given 2 dimensions with the same label and different number of categories", t, func() {
+		dims := []dataset.VersionDimension{
+			{ID: "dim1", Label: "Label 1 (10 categories)"},
+			{ID: "dim2", Label: "Label 1 (1 category)"},
+		}
+
+		Convey("Then MapDimensions collapses them into a single dimension with the expected values", func() {
+			mappedDimensions := models.MapDimensions(ctx, dims)
+			So(mappedDimensions, ShouldHaveLength, 1)
+			So(mappedDimensions[0], ShouldResemble, models.Dimension{
+				Key:      "label-1",
+				AggKey:   "label-1###Label 1",
+				Name:     "dim1,dim2",
+				Label:    "Label 1",
+				RawLabel: "Label 1 (10 categories),Label 1 (1 category)",
+			})
+		})
+	})
+
+	Convey("Given 3 dimensions, only one being area type", t, func() {
+		areaTypeTrue := true
+		areaTypeFalse := false
+		dims := []dataset.VersionDimension{
+			{ID: "dim1", Label: "Label 1 (10 categories)"},
+			{ID: "dim2", Label: "Label 2", IsAreaType: &areaTypeTrue},
+			{ID: "dim3", Label: "Label 3", IsAreaType: &areaTypeFalse},
+		}
+
+		Convey("Then only the non-area type dimensions are mapped", func() {
+			mappedDimensions := models.MapDimensions(ctx, dims)
+			So(mappedDimensions, ShouldHaveLength, 2)
+			So(mappedDimensions, ShouldContain, models.Dimension{
+				Key:      "label-1",
+				AggKey:   "label-1###Label 1",
+				Name:     "dim1",
+				Label:    "Label 1",
+				RawLabel: "Label 1 (10 categories)",
+			})
+			So(mappedDimensions, ShouldContain, models.Dimension{
+				Key:      "label-3",
+				AggKey:   "label-3###Label 3",
+				Name:     "dim3",
+				Label:    "Label 3",
+				RawLabel: "Label 3",
+			})
+		})
+	})
+}
+
 func TestGetURI(t *testing.T) {
 	dl := dataset.Links{
 		LatestVersion: dataset.Link{

diff --git a/schema/schema.go b/schema/schema.go
@@ -60,17 +60,19 @@ var searchDataImport = `{
       "name": "Dimension",
       "type" : "record",
       "fields": [
+        { "name": "key", "type": "string", "default": "" },
+        { "name": "agg_key", "type": "string", "default": "" },
         { "name": "name", "type": "string", "default": "" },
-        { "name": "raw_label", "type": "string", "default": "" },
         { "name": "label", "type": "string", "default": "" },
-        { "name": "agg_key", "type": "string", "default": "" }
+        { "name": "raw_label", "type": "string", "default": "" }
       ]
     }}},
     {"name": "population_type", "type": {
       "name": "PopulationType", "type": "record", "fields": [
+        { "name": "key", "type": "string", "default": "" },
+        { "name": "agg_key", "type": "string", "default": "" },
         { "name": "name", "type": "string", "default": ""},
-        { "name": "label", "type": "string", "default": ""},
-        { "name": "agg_key", "type": "string", "default": "" }
+        { "name": "label", "type": "string", "default": ""}
       ]
     }}
   ]