forked from apache/lucenenet
-
Notifications
You must be signed in to change notification settings - Fork 3
/
FuzzyTermsEnum.cs
507 lines (442 loc) · 20.3 KB
/
FuzzyTermsEnum.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
using J2N;
using Lucene.Net.Index;
using Lucene.Net.Support;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.Search
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using Attribute = Lucene.Net.Util.Attribute;
using AttributeSource = Lucene.Net.Util.AttributeSource;
using Automaton = Lucene.Net.Util.Automaton.Automaton;
using BasicAutomata = Lucene.Net.Util.Automaton.BasicAutomata;
using BasicOperations = Lucene.Net.Util.Automaton.BasicOperations;
using IBits = Lucene.Net.Util.IBits;
using ByteRunAutomaton = Lucene.Net.Util.Automaton.ByteRunAutomaton;
using BytesRef = Lucene.Net.Util.BytesRef;
using CompiledAutomaton = Lucene.Net.Util.Automaton.CompiledAutomaton;
using DocsAndPositionsEnum = Lucene.Net.Index.DocsAndPositionsEnum;
using DocsEnum = Lucene.Net.Index.DocsEnum;
using FilteredTermsEnum = Lucene.Net.Index.FilteredTermsEnum;
using LevenshteinAutomata = Lucene.Net.Util.Automaton.LevenshteinAutomata;
using Term = Lucene.Net.Index.Term;
using Terms = Lucene.Net.Index.Terms;
using TermsEnum = Lucene.Net.Index.TermsEnum;
using TermState = Lucene.Net.Index.TermState;
using UnicodeUtil = Lucene.Net.Util.UnicodeUtil;
/// <summary>
/// Subclass of <see cref="TermsEnum"/> for enumerating all terms that are similar
/// to the specified filter term.
///
/// <para>Term enumerations are always ordered by
/// <see cref="Comparer"/>. Each term in the enumeration is
/// greater than all that precede it.</para>
/// </summary>
public class FuzzyTermsEnum : TermsEnum
{
private void InitializeInstanceFields()
{
boostAtt = Attributes.AddAttribute<IBoostAttribute>();
}
private TermsEnum actualEnum;
private IBoostAttribute actualBoostAtt;
private IBoostAttribute boostAtt;
private readonly IMaxNonCompetitiveBoostAttribute maxBoostAtt;
private readonly ILevenshteinAutomataAttribute dfaAtt;
private float bottom;
private BytesRef bottomTerm;
// TODO: chicken-and-egg
private readonly IComparer<BytesRef> termComparer = BytesRef.UTF8SortedAsUnicodeComparer;
protected readonly float m_minSimilarity;
protected readonly float m_scaleFactor;
protected readonly int m_termLength;
protected int m_maxEdits;
protected readonly bool m_raw;
protected readonly Terms m_terms;
private readonly Term term;
protected readonly int[] m_termText;
protected readonly int m_realPrefixLength;
private readonly bool transpositions;
/// <summary>
/// Constructor for enumeration of all terms from specified <c>reader</c> which share a prefix of
/// length <paramref name="prefixLength"/> with <paramref name="term"/> and which have a fuzzy similarity >
/// <paramref name="minSimilarity"/>.
/// <para/>
/// After calling the constructor the enumeration is already pointing to the first
/// valid term if such a term exists.
/// </summary>
/// <param name="terms"> Delivers terms. </param>
/// <param name="atts"> <see cref="AttributeSource"/> created by the rewrite method of <see cref="MultiTermQuery"/>
/// thats contains information about competitive boosts during rewrite. It is also used
/// to cache DFAs between segment transitions. </param>
/// <param name="term"> Pattern term. </param>
/// <param name="minSimilarity"> Minimum required similarity for terms from the reader. Pass an integer value
/// representing edit distance. Passing a fraction is deprecated. </param>
/// <param name="prefixLength"> Length of required common prefix. Default value is 0. </param>
/// <param name="transpositions"> Transpositions </param>
/// <exception cref="System.IO.IOException"> if there is a low-level IO error </exception>
public FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, float minSimilarity, int prefixLength, bool transpositions)
{
InitializeInstanceFields();
if (minSimilarity >= 1.0f && minSimilarity != (int)minSimilarity)
{
throw new ArgumentException("fractional edit distances are not allowed");
}
if (minSimilarity < 0.0f)
{
throw new ArgumentException("minimumSimilarity cannot be less than 0");
}
if (prefixLength < 0)
{
throw new ArgumentException("prefixLength cannot be less than 0");
}
this.m_terms = terms;
this.term = term;
// convert the string into a utf32 int[] representation for fast comparisons
string utf16 = term.Text();
this.m_termText = new int[utf16.CodePointCount(0, utf16.Length)];
for (int cp, i = 0, j = 0; i < utf16.Length; i += Character.CharCount(cp))
{
m_termText[j++] = cp = utf16.CodePointAt(i);
}
this.m_termLength = m_termText.Length;
this.dfaAtt = atts.AddAttribute<ILevenshteinAutomataAttribute>();
//The prefix could be longer than the word.
//It's kind of silly though. It means we must match the entire word.
this.m_realPrefixLength = prefixLength > m_termLength ? m_termLength : prefixLength;
// if minSimilarity >= 1, we treat it as number of edits
if (minSimilarity >= 1f)
{
this.m_minSimilarity = 0; // just driven by number of edits
m_maxEdits = (int)minSimilarity;
m_raw = true;
}
else
{
this.m_minSimilarity = minSimilarity;
// calculate the maximum k edits for this similarity
m_maxEdits = InitialMaxDistance(this.m_minSimilarity, m_termLength);
m_raw = false;
}
if (transpositions && m_maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE)
{
throw new System.NotSupportedException("with transpositions enabled, distances > " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + " are not supported ");
}
this.transpositions = transpositions;
this.m_scaleFactor = 1.0f / (1.0f - this.m_minSimilarity);
this.maxBoostAtt = atts.AddAttribute<IMaxNonCompetitiveBoostAttribute>();
bottom = maxBoostAtt.MaxNonCompetitiveBoost;
bottomTerm = maxBoostAtt.CompetitiveTerm;
BottomChanged(null, true);
}
/// <summary>
/// Return an automata-based enum for matching up to <paramref name="editDistance"/> from
/// <paramref name="lastTerm"/>, if possible
/// </summary>
protected virtual TermsEnum GetAutomatonEnum(int editDistance, BytesRef lastTerm)
{
IList<CompiledAutomaton> runAutomata = InitAutomata(editDistance);
if (editDistance < runAutomata.Count)
{
//if (BlockTreeTermsWriter.DEBUG) System.out.println("FuzzyTE.getAEnum: ed=" + editDistance + " lastTerm=" + (lastTerm==null ? "null" : lastTerm.utf8ToString()));
CompiledAutomaton compiled = runAutomata[editDistance];
return new AutomatonFuzzyTermsEnum(this, m_terms.Intersect(compiled, lastTerm == null ? null : compiled.Floor(lastTerm, new BytesRef())), runAutomata.SubList(0, editDistance + 1).ToArray(/*new CompiledAutomaton[editDistance + 1]*/));
}
else
{
return null;
}
}
/// <summary>
/// Initialize levenshtein DFAs up to maxDistance, if possible </summary>
private IList<CompiledAutomaton> InitAutomata(int maxDistance)
{
IList<CompiledAutomaton> runAutomata = dfaAtt.Automata;
//System.out.println("cached automata size: " + runAutomata.size());
if (runAutomata.Count <= maxDistance && maxDistance <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE)
{
LevenshteinAutomata builder = new LevenshteinAutomata(UnicodeUtil.NewString(m_termText, m_realPrefixLength, m_termText.Length - m_realPrefixLength), transpositions);
for (int i = runAutomata.Count; i <= maxDistance; i++)
{
Automaton a = builder.ToAutomaton(i);
//System.out.println("compute automaton n=" + i);
// constant prefix
if (m_realPrefixLength > 0)
{
Automaton prefix = BasicAutomata.MakeString(UnicodeUtil.NewString(m_termText, 0, m_realPrefixLength));
a = BasicOperations.Concatenate(prefix, a);
}
runAutomata.Add(new CompiledAutomaton(a, true, false));
}
}
return runAutomata;
}
/// <summary>
/// Swap in a new actual enum to proxy to </summary>
protected virtual void SetEnum(TermsEnum actualEnum)
{
this.actualEnum = actualEnum;
this.actualBoostAtt = actualEnum.Attributes.AddAttribute<IBoostAttribute>();
}
/// <summary>
/// Fired when the max non-competitive boost has changed. This is the hook to
/// swap in a smarter actualEnum
/// </summary>
private void BottomChanged(BytesRef lastTerm, bool init)
{
int oldMaxEdits = m_maxEdits;
// true if the last term encountered is lexicographically equal or after the bottom term in the PQ
bool termAfter = bottomTerm == null || (lastTerm != null && termComparer.Compare(lastTerm, bottomTerm) >= 0);
// as long as the max non-competitive boost is >= the max boost
// for some edit distance, keep dropping the max edit distance.
while (m_maxEdits > 0 && (termAfter ? bottom >= CalculateMaxBoost(m_maxEdits) : bottom > CalculateMaxBoost(m_maxEdits)))
{
m_maxEdits--;
}
if (oldMaxEdits != m_maxEdits || init) // the maximum n has changed
{
MaxEditDistanceChanged(lastTerm, m_maxEdits, init);
}
}
protected virtual void MaxEditDistanceChanged(BytesRef lastTerm, int maxEdits, bool init)
{
TermsEnum newEnum = GetAutomatonEnum(maxEdits, lastTerm);
// instead of assert, we do a hard check in case someone uses our enum directly
// assert newEnum != null;
if (newEnum == null)
{
Debug.Assert(maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
throw new System.ArgumentException("maxEdits cannot be > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE");
}
SetEnum(newEnum);
}
// for some raw min similarity and input term length, the maximum # of edits
private int InitialMaxDistance(float minimumSimilarity, int termLen)
{
return (int)((1D - minimumSimilarity) * termLen);
}
// for some number of edits, the maximum possible scaled boost
private float CalculateMaxBoost(int nEdits)
{
float similarity = 1.0f - ((float)nEdits / (float)(m_termLength));
return (similarity - m_minSimilarity) * m_scaleFactor;
}
private BytesRef queuedBottom = null;
public override BytesRef Next()
{
if (queuedBottom != null)
{
BottomChanged(queuedBottom, false);
queuedBottom = null;
}
BytesRef term = actualEnum.Next();
boostAtt.Boost = actualBoostAtt.Boost;
float bottom = maxBoostAtt.MaxNonCompetitiveBoost;
BytesRef bottomTerm = maxBoostAtt.CompetitiveTerm;
if (term != null && (bottom != this.bottom || bottomTerm != this.bottomTerm))
{
this.bottom = bottom;
this.bottomTerm = bottomTerm;
// clone the term before potentially doing something with it
// this is a rare but wonderful occurrence anyway
queuedBottom = BytesRef.DeepCopyOf(term);
}
return term;
}
// proxy all other enum calls to the actual enum
public override int DocFreq
{
get { return actualEnum.DocFreq; }
}
public override long TotalTermFreq
{
get { return actualEnum.TotalTermFreq; }
}
public override DocsEnum Docs(IBits liveDocs, DocsEnum reuse, DocsFlags flags)
{
return actualEnum.Docs(liveDocs, reuse, flags);
}
public override DocsAndPositionsEnum DocsAndPositions(IBits liveDocs, DocsAndPositionsEnum reuse, DocsAndPositionsFlags flags)
{
return actualEnum.DocsAndPositions(liveDocs, reuse, flags);
}
public override void SeekExact(BytesRef term, TermState state)
{
actualEnum.SeekExact(term, state);
}
public override TermState GetTermState()
{
return actualEnum.GetTermState();
}
public override IComparer<BytesRef> Comparer => actualEnum.Comparer;
public override long Ord => actualEnum.Ord;
public override bool SeekExact(BytesRef text)
{
return actualEnum.SeekExact(text);
}
public override SeekStatus SeekCeil(BytesRef text)
{
return actualEnum.SeekCeil(text);
}
public override void SeekExact(long ord)
{
actualEnum.SeekExact(ord);
}
public override BytesRef Term => actualEnum.Term;
/// <summary>
/// Implement fuzzy enumeration with <see cref="Terms.Intersect(CompiledAutomaton, BytesRef)"/>.
/// <para/>
/// This is the fastest method as opposed to LinearFuzzyTermsEnum:
/// as enumeration is logarithmic to the number of terms (instead of linear)
/// and comparison is linear to length of the term (rather than quadratic)
/// </summary>
private class AutomatonFuzzyTermsEnum : FilteredTermsEnum
{
internal virtual void InitializeInstanceFields()
{
boostAtt = Attributes.AddAttribute<IBoostAttribute>();
}
private readonly FuzzyTermsEnum outerInstance;
private readonly ByteRunAutomaton[] matchers;
private readonly BytesRef termRef;
private IBoostAttribute boostAtt;
public AutomatonFuzzyTermsEnum(FuzzyTermsEnum outerInstance, TermsEnum tenum, CompiledAutomaton[] compiled)
: base(tenum, false)
{
this.outerInstance = outerInstance;
InitializeInstanceFields();
this.matchers = new ByteRunAutomaton[compiled.Length];
for (int i = 0; i < compiled.Length; i++)
{
this.matchers[i] = compiled[i].RunAutomaton;
}
termRef = new BytesRef(outerInstance.term.Text());
}
/// <summary>
/// Finds the smallest Lev(n) DFA that accepts the term. </summary>
protected override AcceptStatus Accept(BytesRef term)
{
//System.out.println("AFTE.accept term=" + term);
int ed = matchers.Length - 1;
// we are wrapping either an intersect() TermsEnum or an AutomatonTermsENum,
// so we know the outer DFA always matches.
// now compute exact edit distance
while (ed > 0)
{
if (Matches(term, ed - 1))
{
ed--;
}
else
{
break;
}
}
//System.out.println("CHECK term=" + term.utf8ToString() + " ed=" + ed);
// scale to a boost and return (if similarity > minSimilarity)
if (ed == 0) // exact match
{
boostAtt.Boost = 1.0F;
//System.out.println(" yes");
return AcceptStatus.YES;
}
else
{
int codePointCount = UnicodeUtil.CodePointCount(term);
float similarity = 1.0f - ((float)ed / (float)(Math.Min(codePointCount, outerInstance.m_termLength)));
if (similarity > outerInstance.m_minSimilarity)
{
boostAtt.Boost = (similarity - outerInstance.m_minSimilarity) * outerInstance.m_scaleFactor;
//System.out.println(" yes");
return AcceptStatus.YES;
}
else
{
return AcceptStatus.NO;
}
}
}
/// <summary>
/// Returns <c>true</c> if <paramref name="term"/> is within <paramref name="k"/> edits of the query term </summary>
internal bool Matches(BytesRef term, int k)
{
return k == 0 ? term.Equals(termRef) : matchers[k].Run(term.Bytes, term.Offset, term.Length);
}
}
/// <summary>
/// @lucene.internal </summary>
public virtual float MinSimilarity => m_minSimilarity;
/// <summary>
/// @lucene.internal </summary>
public virtual float ScaleFactor => m_scaleFactor;
/// <summary>
/// Reuses compiled automata across different segments,
/// because they are independent of the index
/// <para/>
/// @lucene.internal
/// </summary>
public interface ILevenshteinAutomataAttribute : IAttribute
{
IList<CompiledAutomaton> Automata { get; }
}
/// <summary>
/// Stores compiled automata as a list (indexed by edit distance)
/// <para/>
/// @lucene.internal
/// </summary>
public sealed class LevenshteinAutomataAttribute : Attribute, ILevenshteinAutomataAttribute
{
// LUCENENET NOTE: Must use JCG.List for Equals and GetHashCode()
private readonly IList<CompiledAutomaton> automata = new JCG.List<CompiledAutomaton>();
public IList<CompiledAutomaton> Automata
{
get { return automata; }
}
public override void Clear()
{
automata.Clear();
}
public override int GetHashCode()
{
return automata.GetHashCode();
}
public override bool Equals(object other)
{
if (this == other)
{
return true;
}
if (!(other is LevenshteinAutomataAttribute))
{
return false;
}
return automata.Equals(((LevenshteinAutomataAttribute)other).automata);
}
public override void CopyTo(IAttribute target)
{
IList<CompiledAutomaton> targetAutomata = ((LevenshteinAutomataAttribute)target).Automata;
targetAutomata.Clear();
targetAutomata.AddRange(automata);
}
}
}
}