LimitTokenCountFilter.cs
namespace Lucene.Net.Analysis.Miscellaneous
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/// <summary>
/// This <see cref="TokenFilter"/> limits the number of tokens while indexing. It is
/// a replacement for the maximum field length setting inside <see cref="Index.IndexWriter"/>.
/// <para>
/// By default, this filter ignores any tokens in the wrapped <see cref="TokenStream"/>
/// once the limit has been reached, which can result in <see cref="Reset"/> being
/// called prior to <see cref="IncrementToken"/> returning <c>false</c>. For most
/// <see cref="TokenStream"/> implementations this should be acceptable, and faster
/// than consuming the full stream. If you are wrapping a <see cref="TokenStream"/>
/// which requires that the full stream of tokens be exhausted in order to
/// function properly, use the
/// <see cref="LimitTokenCountFilter.LimitTokenCountFilter(TokenStream,int,bool)"/> consumeAllTokens
/// option.
/// </para>
/// </summary>
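/// <example>
/// A minimal usage sketch (the <c>WhitespaceTokenizer</c>, <c>LuceneVersion.LUCENE_48</c>, and
/// <c>reader</c> below are illustrative assumptions; substitute your own analysis chain):
/// <code>
/// TokenStream stream = new WhitespaceTokenizer(LuceneVersion.LUCENE_48, reader);
/// // Emit at most 100 tokens, then ignore the rest of the wrapped stream.
/// stream = new LimitTokenCountFilter(stream, 100);
/// // Or: keep consuming the wrapped stream even after the limit is reached.
/// // stream = new LimitTokenCountFilter(stream, 100, consumeAllTokens: true);
/// </code>
/// </example>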
public sealed class LimitTokenCountFilter : TokenFilter
{
private readonly int maxTokenCount;
private readonly bool consumeAllTokens;
private int tokenCount = 0;
private bool exhausted = false;

/// <summary>
/// Build a filter that only accepts tokens up to a maximum number.
/// This filter will not consume any tokens beyond the <paramref name="maxTokenCount"/> limit
/// </summary>
/// <param name="in"> the stream to wrap </param>
/// <param name="maxTokenCount"> max number of tokens to produce </param>
/// <seealso cref="LimitTokenCountFilter(TokenStream,int,bool)"/>
public LimitTokenCountFilter(TokenStream @in, int maxTokenCount)
: this(@in, maxTokenCount, false)
{
}

/// <summary>
/// Build a filter that limits the maximum number of tokens per field. </summary>
/// <param name="in"> the stream to wrap </param>
/// <param name="maxTokenCount"> max number of tokens to produce </param>
/// <param name="consumeAllTokens"> whether all tokens from the input must be consumed even if <paramref name="maxTokenCount"/> is reached. </param>
public LimitTokenCountFilter(TokenStream @in, int maxTokenCount, bool consumeAllTokens)
: base(@in)
{
if (maxTokenCount < 1)
{
throw new System.ArgumentOutOfRangeException(nameof(maxTokenCount), "maxTokenCount must be greater than zero");
}
this.maxTokenCount = maxTokenCount;
this.consumeAllTokens = consumeAllTokens;
}

public override bool IncrementToken()
{
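// Once the wrapped stream has reported end-of-stream, do not call it again.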
if (exhausted)
{
return false;
}
else if (tokenCount < maxTokenCount)
{
if (m_input.IncrementToken())
{
tokenCount++;
return true;
}
else
{
exhausted = true;
return false;
}
}
else
{
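// The limit has been reached; if requested, drain the remaining tokens from the wrapped stream.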
while (consumeAllTokens && m_input.IncrementToken()) // NOOP
{
}
return false;
}
}

public override void Reset()
{
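// Clear the per-stream counters so the filter can be reused after the wrapped stream is reset.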
base.Reset();
tokenCount = 0;
exhausted = false;
}
}
}