Skip to content

Commit

Permalink
Merge branch 'release/v0.25.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
DavidSubiros committed Mar 30, 2023
2 parents f098b27 + 048662c commit 3b7b745
Show file tree
Hide file tree
Showing 6 changed files with 150 additions and 49 deletions.
14 changes: 7 additions & 7 deletions features/publish_data_dataset.feature
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,9 @@ Feature: Data extractor should listen to the relevant topic and publish extracte
"description": "description",
"keywords": [ "keyword1", "keyword2" ],
"dimensions": [
{ "id": "dim1", "label": "label 1 (11 categories)" },
{ "id": "dim2", "label": "label 2 (22 categories)", "is_area_type": true },
{ "id": "dim3", "label": "label 3 (33 categories)" }
{ "id": "dim1", "label": "label 1 (11 categories)", "is_area_type": true },
{ "id": "dim3.0", "label": "label 3 (33 categories)" },
{ "id": "dim3.1", "label": "label 3 (40 categories)" }
]
}
"""
Expand All @@ -77,13 +77,13 @@ Feature: Data extractor should listen to the relevant topic and publish extracte
"Title": "title",
"Topics": [],
"PopulationType": {
"Key": "all-usual-residents-in-households",
"AggKey": "all-usual-residents-in-households###All usual residents in households",
"Name": "UR_HH",
"Label": "All usual residents in households",
"AggKey": "UR_HH###All usual residents in households"
"Label": "All usual residents in households"
},
"Dimensions": [
{ "Name": "dim1", "Label": "label 1", "RawLabel": "label 1 (11 categories)", "AggKey": "dim1###label 1" },
{ "Name": "dim3", "Label": "label 3", "RawLabel": "label 3 (33 categories)", "AggKey": "dim3###label 3"}
{ "Key": "label-3", "AggKey": "label-3###label 3", "Name": "dim3.0,dim3.1", "Label": "label 3", "RawLabel": "label 3 (33 categories),label 3 (40 categories)"}
]
}
"""
2 changes: 1 addition & 1 deletion handler/datasets.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ func (h *ContentPublished) handleDatasetDataType(ctx context.Context, cpEvent *m
// Make a call to DatasetAPI
datasetMetadataPublished, err := h.DatasetCli.GetVersionMetadata(ctx, "", h.Cfg.ServiceAuthToken, cpEvent.CollectionID, datasetID, edition, version)
if err != nil {
log.Error(ctx, "cannot get dataset published contents version %s from api", err)
log.Error(ctx, "cannot get dataset published metadata from api", err)
return err
}
log.Info(ctx, "successfully obtained metadata from dataset api", log.Data{
Expand Down
8 changes: 5 additions & 3 deletions models/event.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,16 +48,18 @@ type ReleaseDateDetails struct {
// Dimension represents the searchable information for a dataset dimension.
// Several source dimensions that share the same Key are collapsed into one
// Dimension by MapDimensions.
type Dimension struct {
	// Key is generated from the cleaned label (see key) and identifies the
	// collapsed dimension.
	Key string `avro:"key"`
	// AggKey is the aggregation key, combining Key and Label.
	AggKey string `avro:"agg_key"`
	// Name holds the original dimension ID; when dimensions are collapsed it
	// holds all their IDs as comma-separated values.
	Name string `avro:"name"`
	// Label is the cleaned, human friendly label.
	Label string `avro:"label"`
	// RawLabel holds the original label; when dimensions are collapsed it
	// holds all their labels as comma-separated values.
	RawLabel string `avro:"raw_label"`
}

// PopulationType represents a dataset population type as built by
// MapPopulationType.
type PopulationType struct {
	// Key is generated from the label (see key).
	Key string `avro:"key"`
	// AggKey is the aggregation key, combining Key and Label.
	AggKey string `avro:"agg_key"`
	// Name is the population type ID taken from the dataset is_based_on field.
	Name string `avro:"name"`
	// Label is the human friendly label looked up for Name.
	Label string `avro:"label"`
}
95 changes: 73 additions & 22 deletions models/mapper_dataset.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,36 +88,76 @@ func (s *SearchDataImport) PopulateCantabularFields(ctx context.Context, metadat
"num_dimensions": len(metadata.Dimensions)},
)

s.Dimensions = []Dimension{}
for i := range metadata.Dimensions {
s.Dimensions = MapDimensions(ctx, metadata.Dimensions)
s.PopulationType = MapPopulationType(ctx, metadata.DatasetDetails.IsBasedOn.ID)
}

// MapDimensions returns a slice of dimensions corresponding to the provided slice of dataset version dimensions.
//
// Area-type dimensions (IsAreaType set and true) are skipped. Every other
// dimension is keyed by its cleaned, human friendly label; dimensions that
// share a key are collapsed into one single Dimension. A collapsed Dimension
// keeps all the original IDs (Name) and labels (RawLabel) as comma-separated
// values, so it is still possible to know what was combined.
//
// The returned slice is never nil and lists the dimensions in the order in
// which each key first appears in the input, so the output is deterministic.
func MapDimensions(ctx context.Context, dimensions []dataset.VersionDimension) []Dimension {
	// indexByKey maps a dimension key to its position in dims. Tracking the
	// position (rather than ranging over a map at the end) preserves input order.
	indexByKey := make(map[string]int, len(dimensions))
	dims := make([]Dimension, 0, len(dimensions))

	for i := range dimensions {
		// Using pointers to prevent copying lots of data.
		// TODO consider changing type to []*VersionDimension in dp-api-clients-go
		dim := &dimensions[i]
		if dim.IsAreaType != nil && *dim.IsAreaType {
			continue
		}

		lbl := cleanDimensionLabel(dim.Label)
		k := key(lbl)

		idx, ok := indexByKey[k]
		if !ok {
			// If no dimension with the same key exists, create a new one
			indexByKey[k] = len(dims)
			dims = append(dims, Dimension{
				Key:      k,
				AggKey:   aggregationKey(ctx, k, lbl),
				Name:     dim.ID,
				Label:    lbl,
				RawLabel: dim.Label,
			})
			continue
		}

		// If the dimension key already exists, they collapse into a single searchable dimension,
		// but we keep the name and raw label for all the original dimensions before collapsing as csv values
		collapsed := &dims[idx]
		if dim.ID != "" {
			collapsed.Name += "," + dim.ID
		}
		if dim.Label != "" {
			collapsed.RawLabel += "," + dim.Label
		}
	}

	return dims
}

popTypeLabel, ok := PopulationTypes[metadata.DatasetDetails.IsBasedOn.ID]
// MapPopulationType a PopulationType that contains a
// The new dimensions are keyed by human friendly label. If multiple dimensions have the same key, they will be collapsed into 1 single dimension.
// Collapsed dimensions keep all the original names and labels as csv values, as this information is very valuable to know what was combined, if necessary.
func MapPopulationType(ctx context.Context, basedOnID string) PopulationType {
lbl, ok := PopulationTypes[basedOnID]
if !ok {
log.Warn(ctx, "population type not identified",
log.Data{
"pop_type": metadata.DatasetDetails.IsBasedOn.ID,
"pop_type": basedOnID,
"valid_types": PopulationTypes,
},
)
}
s.PopulationType = PopulationType{
Name: metadata.DatasetDetails.IsBasedOn.ID,
Label: popTypeLabel,
AggKey: aggregationKey(ctx, metadata.DatasetDetails.IsBasedOn.ID, popTypeLabel),
k := key(lbl)
return PopulationType{
Key: k,
AggKey: aggregationKey(ctx, k, lbl),
Name: basedOnID,
Label: lbl,
}
}

Expand All @@ -142,18 +182,29 @@ func GetURI(metadata *dataset.Metadata) string {
return metadata.Version.Links.Version.URL
}

// aggregationKey generates an aggregation key from the provided name (unique ID) and label (human friendly string)
func aggregationKey(ctx context.Context, name, label string) string {
if name == "" && label == "" {
// key builds a key out of a human friendly label: surrounding whitespace is
// trimmed, the text is lower cased and every remaining space becomes a hyphen.
func key(label string) string {
	trimmed := strings.TrimSpace(label)
	lowered := strings.ToLower(trimmed)
	return strings.ReplaceAll(lowered, " ", "-")
}

// aggregationKey generates an aggregation key
// from the provided key (unique identifier) and label (human friendly string)
func aggregationKey(ctx context.Context, key, label string) string {
if key == "" && label == "" {
return ""
}

if strings.Contains(name, aggSep) {
log.Warn(ctx, "found aggregation key separator in name", log.Data{"name": name, "separator": aggSep})
if strings.Contains(key, aggSep) {
log.Warn(ctx, "found aggregation key separator in name", log.Data{"name": key, "separator": aggSep})
}
if strings.Contains(label, aggSep) {
log.Warn(ctx, "found aggregation key separator in label", log.Data{"label": label, "separator": aggSep})
}

return fmt.Sprintf("%s%s%s", name, aggSep, label)
return fmt.Sprintf("%s%s%s", key, aggSep, label)
}
70 changes: 58 additions & 12 deletions models/mapper_dataset_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,9 +133,7 @@ func TestPopulateCantabularFields(t *testing.T) {
})
})

Convey("Given a dataset metadata with is_based_on field with a cantabular type and 4 dimensions, one being area type", t, func() {
areaTypeTrue := true
areaTypeFalse := false
Convey("Given a dataset metadata with is_based_on field with a cantabular type with a dimension", t, func() {
metadata := &dataset.Metadata{
DatasetDetails: dataset.DatasetDetails{
IsBasedOn: &dataset.IsBasedOn{
Expand All @@ -144,10 +142,7 @@ func TestPopulateCantabularFields(t *testing.T) {
},
Version: dataset.Version{
Dimensions: []dataset.VersionDimension{
{ID: "dim1", Label: "label 1 (10 categories)"},
{ID: "dim2", Label: "label 2 (12 Categories)", IsAreaType: &areaTypeFalse},
{ID: "dim3", IsAreaType: &areaTypeTrue},
{ID: "dim4", Label: "label 4 (1 category)"},
{ID: "dim1", Label: "Label 1 (10 categories)"},
},
},
}
Expand All @@ -159,14 +154,12 @@ func TestPopulateCantabularFields(t *testing.T) {
}
s.PopulateCantabularFields(ctx, metadata)

Convey("Then only the non-area-type dimensions are populated, with the expected values", func() {
Convey("Then the expeced dimension is populated", func() {
So(*s, ShouldResemble, models.SearchDataImport{
Summary: testSummary,
DataType: "dataset_landing_page",
Dimensions: []models.Dimension{
{Name: "dim1", RawLabel: "label 1 (10 categories)", Label: "label 1", AggKey: "dim1###label 1"},
{Name: "dim2", RawLabel: "label 2 (12 Categories)", Label: "label 2", AggKey: "dim2###label 2"},
{Name: "dim4", RawLabel: "label 4 (1 category)", Label: "label 4", AggKey: "dim4###label 4"},
{Key: "label-1", AggKey: "label-1###Label 1", Name: "dim1", Label: "Label 1", RawLabel: "Label 1 (10 categories)"},
},
})
})
Expand Down Expand Up @@ -196,16 +189,69 @@ func TestPopulateCantabularFields(t *testing.T) {
DataType: "dataset_landing_page",
Dimensions: []models.Dimension{},
PopulationType: models.PopulationType{
Key: "all-usual-residents-in-households",
AggKey: "all-usual-residents-in-households###All usual residents in households",
Name: "UR_HH",
Label: "All usual residents in households",
AggKey: "UR_HH###All usual residents in households",
},
})
})
})
})
}

// TestMapDimensions checks that MapDimensions collapses dimensions sharing a
// label into one, and that area-type dimensions are left out of the result.
func TestMapDimensions(t *testing.T) {
	ctx := context.Background()

	Convey("Given 2 dimensions with the same label and different number of categories", t, func() {
		input := []dataset.VersionDimension{
			{ID: "dim1", Label: "Label 1 (10 categories)"},
			{ID: "dim2", Label: "Label 1 (1 category)"},
		}
		collapsed := models.Dimension{
			Key:      "label-1",
			AggKey:   "label-1###Label 1",
			Name:     "dim1,dim2",
			Label:    "Label 1",
			RawLabel: "Label 1 (10 categories),Label 1 (1 category)",
		}

		Convey("Then MapDimensions collapses them into a single dimension with the expected values", func() {
			got := models.MapDimensions(ctx, input)
			So(got, ShouldHaveLength, 1)
			So(got[0], ShouldResemble, collapsed)
		})
	})

	Convey("Given 3 dimensions, only one being area type", t, func() {
		isAreaType, notAreaType := true, false
		input := []dataset.VersionDimension{
			{ID: "dim1", Label: "Label 1 (10 categories)"},
			{ID: "dim2", Label: "Label 2", IsAreaType: &isAreaType},
			{ID: "dim3", Label: "Label 3", IsAreaType: &notAreaType},
		}
		// Only the dimensions that are not flagged as area type are expected.
		expected := []models.Dimension{
			{
				Key:      "label-1",
				AggKey:   "label-1###Label 1",
				Name:     "dim1",
				Label:    "Label 1",
				RawLabel: "Label 1 (10 categories)",
			},
			{
				Key:      "label-3",
				AggKey:   "label-3###Label 3",
				Name:     "dim3",
				Label:    "Label 3",
				RawLabel: "Label 3",
			},
		}

		Convey("Then only the non-area type dimensions are mapped", func() {
			got := models.MapDimensions(ctx, input)
			So(got, ShouldHaveLength, 2)
			So(got, ShouldContain, expected[0])
			So(got, ShouldContain, expected[1])
		})
	})
}

func TestGetURI(t *testing.T) {
dl := dataset.Links{
LatestVersion: dataset.Link{
Expand Down
10 changes: 6 additions & 4 deletions schema/schema.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,17 +60,19 @@ var searchDataImport = `{
"name": "Dimension",
"type" : "record",
"fields": [
{ "name": "key", "type": "string", "default": "" },
{ "name": "agg_key", "type": "string", "default": "" },
{ "name": "name", "type": "string", "default": "" },
{ "name": "raw_label", "type": "string", "default": "" },
{ "name": "label", "type": "string", "default": "" },
{ "name": "agg_key", "type": "string", "default": "" }
{ "name": "raw_label", "type": "string", "default": "" }
]
}}},
{"name": "population_type", "type": {
"name": "PopulationType", "type": "record", "fields": [
{ "name": "key", "type": "string", "default": "" },
{ "name": "agg_key", "type": "string", "default": "" },
{ "name": "name", "type": "string", "default": ""},
{ "name": "label", "type": "string", "default": ""},
{ "name": "agg_key", "type": "string", "default": "" }
{ "name": "label", "type": "string", "default": ""}
]
}}
]
Expand Down

0 comments on commit 3b7b745

Please sign in to comment.