Skip to content

Commit

Permalink
Cache search better in UniqueAttributeValueConstraint (#924)
Browse files Browse the repository at this point in the history
If a document happened to have a large number of elements that have the
UniqueAttributeValueConstraint validation, it will end up recalculating
the values for the constraint way too often. This was because the
constraint was generating the cached lookup with a key using the
attribute text itself. This change updates the lookup to cache all
possible duplicates for the element in question so it only has to be
searched once.
  • Loading branch information
twsouthwick committed May 4, 2021
1 parent 537a280 commit 6184b08
Show file tree
Hide file tree
Showing 11 changed files with 134 additions and 45 deletions.
7 changes: 5 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,18 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

## Unreleased

### Deprecated
- Deprecated Office2013.Word.Person.Contact property. It no longer persists and will be removed in a future version (#912)
### Fixed
- Fixed massive performance bottleneck when `UniqueAttributeValueConstraint` is involved (#924)

Release Notes:
## Version 2.13.0-beta2 - 2021-04-20

### Added
- Additional O19 types to match Open Specifications (#916)

### Deprecated
- Deprecated Office2013.Word.Person.Contact property. It no longer persists and will be removed in a future version (#912)

## Version 2.13.0-beta1 - 2021-03-09

### Added
Expand Down
2 changes: 1 addition & 1 deletion global.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"sdk": {
"version": "5.0.101",
"rollForward": "feature"
"rollForward": "latestFeature"
}
}
4 changes: 2 additions & 2 deletions src/DocumentFormat.OpenXml/OpenXmlElement.cs
Original file line number Diff line number Diff line change
Expand Up @@ -195,11 +195,11 @@ internal virtual void ConfigureMetadata(ElementMetadata.Builder builder)
{
}

private protected void SetAttribute<TSimpleType>(TSimpleType? value, [CallerMemberName] string propertyName = null)
private protected void SetAttribute<TSimpleType>(TSimpleType? value, [CallerMemberName] string propertyName = null!)
where TSimpleType : OpenXmlSimpleType
=> ParsedState.Attributes.GetProperty(propertyName).Value = value;

private protected TSimpleType? GetAttribute<TSimpleType>([CallerMemberName] string propertyName = null)
private protected TSimpleType? GetAttribute<TSimpleType>([CallerMemberName] string propertyName = null!)
where TSimpleType : OpenXmlSimpleType
=> ParsedState.Attributes.GetProperty(propertyName).Value as TSimpleType;

Expand Down
6 changes: 6 additions & 0 deletions src/DocumentFormat.OpenXml/SimpleTypes/EnumInfoLookup.cs
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,12 @@ public EnumStringLookupImpl()
{
var field = enumType.GetDeclaredField(enumVal.ToString()!);
var enumString = field!.GetCustomAttribute<EnumStringAttribute>();

if (field is null)
{
return;
}

var officeAvailability = field.GetCustomAttribute<OfficeAvailabilityAttribute>();

if (enumString is null)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,13 +82,13 @@ private PartHolder<int> GetRefElementCount(ValidationContext context)
return new PartHolder<int>(0, null);
}

var result = context.State.Get(new { part.Uri, _refElement, _attribute, _refElementParent }, () =>
var result = context.State.GetOrCreate(new { part, constraint = this }, static (key, context) =>
{
var count = 0;
foreach (var element in part.RootElement.Descendants(context.FileFormat, TraversalOptions.SelectAlternateContent))
foreach (var element in key.part.RootElement.Descendants(context.FileFormat, TraversalOptions.SelectAlternateContent))
{
if (_refElementParent is null || element.Parent?.GetType() == _refElementParent)
if (key.constraint._refElementParent is null || element.Parent?.GetType() == key.constraint._refElementParent)
{
count++;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,15 +82,15 @@ private PartHolder<ICollection<string>> GetReferencedAttributes(ValidationContex
return new PartHolder<ICollection<string>>(Cached.Array<string>(), part);
}

var result = context.State.Get(new { part.Uri, _partPath, _element, _attribute }, () =>
var result = context.State.GetOrCreate(new { part, constraint = this }, static (key, context) =>
{
var referencedAttributes = new HashSet<string>(StringComparer.Ordinal);
foreach (var element in part.RootElement.Descendants(context.FileFormat, TraversalOptions.SelectAlternateContent))
foreach (var element in key.part.RootElement.Descendants(context.FileFormat, TraversalOptions.SelectAlternateContent))
{
if (element.GetType() == _element)
if (element.GetType() == key.constraint._element)
{
var attribute = element.ParsedState.Attributes[_attribute];
var attribute = element.ParsedState.Attributes[key.constraint._attribute];
//Attributes whose value is empty string or null don't need to be cached.
if (attribute.Value is not null && !attribute.Value.InnerText.IsNullOrEmpty())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// Licensed under the MIT license. See LICENSE file in the project root for full license information.

using System;
using System.Collections.Generic;

namespace DocumentFormat.OpenXml.Validation.Semantic
{
Expand Down Expand Up @@ -37,51 +38,46 @@ public UniqueAttributeValueConstraint(byte attribute, bool caseSensitive, Type?
}

var attribute = element.ParsedState.Attributes[_attribute];
var elementType = element.GetType();

//if the attribute is omitted, semantic validation will do nothing
if (attribute.Value is null || string.IsNullOrEmpty(attribute.Value.InnerText))
{
return null;
}

var part = element.GetPart();
var root = GetRoot(element);

if (root is null)
{
return null;
}

if (part is null)
{
return null;
}

var attributeText = attribute.Value.InnerText;

var added = false;
var isDuplicate = context.State.Get(new { part.Uri, elementType, _parent, attributeText, _attribute, _comparer }, () =>
var elementType = element.GetType();
var textValues = context.State.GetOrCreate(new { elementType, root, constraint = this }, static (key, context) =>
{
added = true;
var set = new DuplicateFinder(key.constraint._comparer);
foreach (var e in root.Descendants(context.FileFormat, TraversalOptions.SelectAlternateContent))
foreach (var e in key.root.Descendants(context.FileFormat, TraversalOptions.SelectAlternateContent))
{
if (e != element && e.GetType() == elementType)
if (e.GetType() == key.elementType)
{
var eValue = e.ParsedState.Attributes[_attribute];
var eValue = e.ParsedState.Attributes[key.constraint._attribute];
if (eValue.Value is not null && _comparer.Equals(attributeText, eValue.Value.InnerText))
if (eValue.Value is not null)
{
return true;
set.Add(eValue.Value.InnerText);
}
}
}
return false;
set.Complete();
return set;
});

if (!isDuplicate || !added)
var isDuplicate = textValues.IsDuplicate(attribute.Value.InnerText);

if (!isDuplicate)
{
return null;
}
Expand Down Expand Up @@ -115,5 +111,67 @@ public UniqueAttributeValueConstraint(byte attribute, bool caseSensitive, Type?

return null;
}

private class DuplicateFinder
{
private readonly StringComparer _comparer;

private bool _isCompleted;
private HashSet<string?>? _set;
private HashSet<string?>? _duplicate;

public DuplicateFinder(StringComparer comparer)
{
_comparer = comparer;
}

/// <summary>
/// Add a text value and track whether it has been seen before or not.
/// </summary>
public void Add(string? text)
{
if (_isCompleted)
{
throw new InvalidOperationException();
}

if (_set is null)
{
_set = new HashSet<string?>(_comparer);
}

if (!_set.Add(text))
{
if (_duplicate is null)
{
_duplicate = new HashSet<string?>(_comparer);
}

_duplicate.Add(text);
}
}

/// <summary>
/// Clear the tracking set to free up space
/// </summary>
public void Complete()
{
_isCompleted = true;
_set = null;
}

/// <summary>
/// Checks if a duplicate was detected. Once a duplicate is checked, subsequent calls will result in <c>false</c> so we only raise the error once.
/// </summary>
public bool IsDuplicate(string? text)
{
if (_duplicate is null)
{
return false;
}

return _duplicate.Remove(text);
}
}
}
}
28 changes: 23 additions & 5 deletions src/DocumentFormat.OpenXml/Validation/StateManager.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,28 +8,46 @@ namespace DocumentFormat.OpenXml.Validation
{
internal class StateManager
{
private readonly ValidationContext _context;

private Dictionary<object, object>? _state;

public T Get<T>(object key, Func<T> factory)
where T : notnull
public StateManager(ValidationContext context)
{
_context = context;
}

/// <summary>
/// Method to get or create a cached value. To minimize allocations, the key should track everything that is
/// required to generate the item in the factory. If so, then a static lambda can be used to ensure nothing
/// else is required and that the key will be correct.
/// </summary>
/// <typeparam name="TValue">Type of the value produced.</typeparam>
/// <typeparam name="TKey">Type of the key provided.</typeparam>
/// <param name="key">Provided key that should identify the cached value uniquely.</param>
/// <param name="factory">A factory method to create the value.</param>
/// <returns>The created or cached value.</returns>
public TValue GetOrCreate<TValue, TKey>(TKey key, Func<TKey, ValidationContext, TValue> factory)
where TValue : notnull
where TKey : notnull
{
if (_state is null)
{
_state = new Dictionary<object, object>();
}
else if (_state.TryGetValue(key, out var value))
{
if (value is T t)
if (value is TValue t)
{
return t;
}
else
{
throw new InvalidOperationException(SR.Format("Value of incorrect type: '{0}'. Expecting '{1}'", value.GetType(), typeof(T)));
throw new InvalidOperationException(SR.Format("Value of incorrect type: '{0}'. Expecting '{1}'", value.GetType(), typeof(TValue)));
}
}

var result = factory();
var result = factory(key, _context);

_state.Add(key, result);

Expand Down
3 changes: 2 additions & 1 deletion src/DocumentFormat.OpenXml/Validation/ValidationContext.cs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ public ValidationContext(ValidationSettings settings, ValidationCache cache, Can
McContext = new MCContext(false);

Stack = new ValidationStack();
State = new StateManager(this);

Stack.Push(Errors.Add);
}
Expand Down Expand Up @@ -63,7 +64,7 @@ public bool CheckIfCancelled()

public void Clear() => Errors.Clear();

public StateManager State { get; } = new StateManager();
public StateManager State { get; }

/// <summary>
/// Gets used to track MC context.
Expand Down
9 changes: 6 additions & 3 deletions test/DocumentFormat.OpenXml.Tests/TestDocx01.cs
Original file line number Diff line number Diff line change
Expand Up @@ -753,10 +753,13 @@ public void W015_InsertBeforeSelf()
firstPara.InsertBeforeSelf(newPara);

var v = new OpenXmlValidator(FileFormatVersions.Office2013);
var errs = v.Validate(doc);
var cnt = errs.Count();

Assert.Single(v.Validate(doc));
Assert.Collection(
v.Validate(doc),
e =>
{
Assert.Equal("Sem_UniqueAttributeValue", e.Id);
});
}
}

Expand Down
10 changes: 5 additions & 5 deletions test/DocumentFormat.OpenXml.Tests/ofapiTest/BugRegressionTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -95,16 +95,16 @@ public void Bug704004(FileFormatVersions version)
var errors = validator.Validate(p); // should no hang, no OOM

Assert.Collection(
errors,
errors.OrderBy(e => e.Id),
error =>
{
Assert.Equal("Sch_UnexpectedElementContentExpectingComplex", error.Id);
Assert.Same(p.FirstChild.NextSibling().FirstChild.NextSibling(), error.RelatedNode);
Assert.Equal("Sch_IncompleteContentExpectingComplex", error.Id);
Assert.Same(p.FirstChild, error.Node); // acb should have a choice
},
error =>
{
Assert.Equal("Sch_IncompleteContentExpectingComplex", error.Id);
Assert.Same(p.FirstChild, error.Node); // acb should have a choice
Assert.Equal("Sch_UnexpectedElementContentExpectingComplex", error.Id);
Assert.Same(p.FirstChild.NextSibling().FirstChild.NextSibling(), error.RelatedNode);
});

// append an empty "Choice"
Expand Down

0 comments on commit 6184b08

Please sign in to comment.