forked from apache/lucenenet
-
Notifications
You must be signed in to change notification settings - Fork 3
/
TopDocsCollector.cs
302 lines (277 loc) · 14.4 KB
/
TopDocsCollector.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
using Lucene.Net.Index;
using Lucene.Net.Util;
using System;
namespace Lucene.Net.Search
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Common base for collectors whose output is a <see cref="TopDocs"/>. The single
/// constructor takes a <see cref="Util.PriorityQueue{T}"/>; that queue and a counter
/// of the total number of hits are exposed to subclasses as protected members.
/// <para/>
/// Subclasses may override any of the virtual members. A subclass may also bypass the
/// priority queue completely by passing <c>null</c> to
/// <see cref="TopDocsCollector(Util.PriorityQueue{T})"/>; in that case it should
/// consider overriding every member that touches the queue, otherwise a
/// <see cref="NullReferenceException"/> will be thrown.
/// </summary>
public abstract class TopDocsCollector<T> : ICollector, ITopDocsCollector where T : ScoreDoc
{
    /// <summary>
    /// Shared empty result. <see cref="NewTopDocs(ScoreDoc[], int)"/> returns it when it is
    /// handed a <c>null</c> results array, which is what
    /// <see cref="GetTopDocs(int, int)"/> passes for an invalid or out-of-range request.
    /// </summary>
    protected static readonly TopDocs EMPTY_TOPDOCS = new TopDocs(0, new ScoreDoc[0], float.NaN);

    /// <summary>
    /// The priority queue holding the top documents. What 'top' means depends on the
    /// <see cref="PriorityQueue{T}"/> implementation: <see cref="HitQueue"/>, for example,
    /// keeps the top scoring documents, while other queues may order by other criteria.
    /// </summary>
    protected PriorityQueue<T> m_pq;

    /// <summary>
    /// The total number of documents that the collector encountered.
    /// </summary>
    protected int m_totalHits;

    /// <summary>
    /// Sole constructor; stores <paramref name="pq"/> in <see cref="m_pq"/>.
    /// </summary>
    /// <param name="pq">The queue to collect into. It is stored as-is, without a null check.</param>
    protected TopDocsCollector(PriorityQueue<T> pq)
    {
        m_pq = pq;
    }

    /// <summary>
    /// Fills <paramref name="results"/> by popping <paramref name="howMany"/> entries off the
    /// queue and writing them from index <c>howMany - 1</c> down to index <c>0</c>.
    /// Override this if a different <see cref="ScoreDoc"/> type should be returned.
    /// </summary>
    /// <param name="results">Destination array.</param>
    /// <param name="howMany">Number of entries to pop and store.</param>
    protected virtual void PopulateResults(ScoreDoc[] results, int howMany)
    {
        // Each Pop() lands one slot lower than the previous one, so the first
        // popped entry ends up last in the array.
        int slot = howMany;
        while (slot > 0)
        {
            slot--;
            results[slot] = m_pq.Pop();
        }
    }

    /// <summary>
    /// Wraps <paramref name="results"/> in a <see cref="TopDocs"/> carrying the current
    /// total hit count. A <c>null</c> <paramref name="results"/> means there is nothing
    /// to return, and <see cref="EMPTY_TOPDOCS"/> is returned instead.
    /// </summary>
    /// <param name="results">The collected documents, or <c>null</c> when there are none.</param>
    /// <param name="start">Start offset of the request; not used by this implementation.</param>
    /// <returns>A new <see cref="TopDocs"/>, or <see cref="EMPTY_TOPDOCS"/>.</returns>
    protected virtual TopDocs NewTopDocs(ScoreDoc[] results, int start)
    {
        if (results == null)
        {
            return EMPTY_TOPDOCS;
        }

        return new TopDocs(m_totalHits, results);
    }

    /// <summary>
    /// The total number of documents that matched this query, as tracked in
    /// <see cref="m_totalHits"/>. The setter is internal.
    /// </summary>
    public virtual int TotalHits
    {
        get
        {
            return this.m_totalHits;
        }
        internal set
        {
            this.m_totalHits = value;
        }
    }

    /// <summary>
    /// The number of valid priority queue entries: the smaller of
    /// <see cref="m_totalHits"/> and the queue's <c>Count</c>.
    /// </summary>
    protected virtual int TopDocsCount
    {
        get
        {
            // If the queue was pre-filled with sentinel values it can hold more
            // entries than there were hits, so the count is capped by totalHits.
            return Math.Min(m_totalHits, m_pq.Count);
        }
    }

    /// <summary>
    /// Returns the top docs that were collected by this collector.
    /// </summary>
    /// <returns>The result of <see cref="GetTopDocs(int, int)"/> for the range starting at 0
    /// and spanning <see cref="TopDocsCount"/> entries.</returns>
    public virtual TopDocs GetTopDocs()
    {
        // Ask for exactly the number of valid entries (see TopDocsCount).
        int available = TopDocsCount;
        return GetTopDocs(0, available);
    }

    /// <summary>
    /// Returns the documents in the range [<paramref name="start"/> .. <see cref="TopDocsCount"/>)
    /// that were collected by this collector. If <paramref name="start"/> is not below
    /// <see cref="TopDocsCount"/>, an empty <see cref="TopDocs"/> is returned.
    /// <para/>
    /// This method is convenient to call if the application always asks for the
    /// last results, starting from the last 'page'.
    /// <para/>
    /// <b>NOTE:</b> you cannot call this method more than once for each search
    /// execution, because it pops entries off the queue. If you need several
    /// different <paramref name="start"/> values, call <see cref="GetTopDocs()"/> once and work
    /// with the returned <see cref="TopDocs"/> object, which will contain all the
    /// results this search execution collected.
    /// </summary>
    /// <param name="start">Zero-based offset of the first document to return.</param>
    public virtual TopDocs GetTopDocs(int start)
    {
        // Ask for up to the number of valid entries (see TopDocsCount).
        int available = TopDocsCount;
        return GetTopDocs(start, available);
    }

    /// <summary>
    /// Returns the documents in the range
    /// [<paramref name="start"/> .. <paramref name="start"/>+<paramref name="howMany"/>) that were
    /// collected by this collector. If <paramref name="start"/> is negative or not below
    /// <see cref="TopDocsCount"/>, or <paramref name="howMany"/> is not positive, an empty
    /// <see cref="TopDocs"/> is returned. If fewer than <paramref name="howMany"/> documents
    /// are available from <paramref name="start"/>, only the available ones are returned.
    /// <para/>
    /// This method is useful when the search application paginates its results, and it
    /// allocates no more than <paramref name="howMany"/> result slots.
    /// <para/>
    /// <b>NOTE:</b> you cannot call this method more than once for each search
    /// execution, because it pops entries off the queue. If you need several
    /// different ranges, call <see cref="GetTopDocs()"/> once and work with the
    /// returned <see cref="TopDocs"/> object, which will contain all the results this
    /// search execution collected.
    /// </summary>
    /// <param name="start">Zero-based offset of the first document to return.</param>
    /// <param name="howMany">Maximum number of documents to return.</param>
    public virtual TopDocs GetTopDocs(int start, int howMany)
    {
        // Number of valid entries; may be smaller than the queue's Count when
        // the queue was populated with sentinel values.
        int available = TopDocsCount;

        // Don't bother to throw an exception, just return an empty TopDocs in case
        // the parameters are invalid or out of range.
        // TODO: shouldn't we throw IAE if apps give bad params here so they dont
        // have sneaky silent bugs?
        bool validRequest = start >= 0 && start < available && howMany > 0;
        if (!validRequest)
        {
            return NewTopDocs(null, start);
        }

        // start < available is guaranteed here, so only the length needs clamping.
        int count = Math.Min(available - start, howMany);
        var page = new ScoreDoc[count];

        // Pop() hands back the 'least' element first, so everything that ranks
        // below the requested range has to be thrown away before the range itself
        // can be read. Usually nothing is discarded, because callers typically ask
        // for the last howMany results; the loop is here for completeness.
        int toDiscard = m_pq.Count - start - count;
        while (toDiscard > 0)
        {
            m_pq.Pop();
            toDiscard--;
        }

        // Get the requested results from pq.
        PopulateResults(page, count);

        return NewTopDocs(page, start);
    }

    // LUCENENET specific - we need to implement these here, since our abstract base class
    // is now an interface.

    /// <summary>
    /// Called before successive calls to <see cref="Collect(int)"/>. Implementations
    /// that need the score of the current document (the one passed to
    /// <see cref="Collect(int)"/>) should keep the supplied <see cref="Scorer"/> and call
    /// <see cref="Scorer.GetScore()"/> when needed.
    /// </summary>
    /// <param name="scorer">The scorer for the documents about to be collected.</param>
    public abstract void SetScorer(Scorer scorer);

    /// <summary>
    /// Called once for every document matching a query, with the unbased document
    /// number.
    /// <para/>Note: The collection of the current segment can be terminated by throwing
    /// a <see cref="CollectionTerminatedException"/>. In this case, the last docs of the
    /// current <see cref="AtomicReaderContext"/> will be skipped and <see cref="IndexSearcher"/>
    /// will swallow the exception and continue collection with the next leaf.
    /// <para/>
    /// Note: this is called in an inner search loop. For good search performance,
    /// implementations of this method should not call <see cref="IndexSearcher.Doc(int)"/> or
    /// <see cref="Lucene.Net.Index.IndexReader.Document(int)"/> on every hit.
    /// Doing so can slow searches by an order of magnitude or more.
    /// </summary>
    /// <param name="doc">The unbased document number of the hit.</param>
    public abstract void Collect(int doc);

    /// <summary>
    /// Called before collecting from each <see cref="AtomicReaderContext"/>. All doc ids in
    /// <see cref="Collect(int)"/> will correspond to <see cref="Index.IndexReaderContext.Reader"/>.
    /// <para/>
    /// Add <see cref="AtomicReaderContext.DocBase"/> to the current <see cref="Index.IndexReaderContext.Reader"/>'s
    /// internal document id to re-base ids in <see cref="Collect(int)"/>.
    /// </summary>
    /// <param name="context">Next atomic reader context </param>
    public abstract void SetNextReader(AtomicReaderContext context);

    /// <summary>
    /// Return <c>true</c> if this collector does not
    /// require the matching docIDs to be delivered in int sort
    /// order (smallest to largest) to <see cref="Collect(int)"/>.
    ///
    /// <para> Most Lucene Query implementations will visit
    /// matching docIDs in order. However, some queries
    /// (currently limited to certain cases of <see cref="BooleanQuery"/>)
    /// can achieve faster searching if the
    /// <see cref="ICollector"/> allows them to deliver the
    /// docIDs out of order.</para>
    ///
    /// <para> Many collectors don't mind getting docIDs out of
    /// order, so it's important to return <c>true</c>
    /// here.</para>
    /// </summary>
    public abstract bool AcceptsDocsOutOfOrder { get; }
}
/// <summary>
/// LUCENENET specific interface used to reference <see cref="TopDocsCollector{T}"/>
/// without referencing its generic type.
/// </summary>
public interface ITopDocsCollector : ICollector
{
// From TopDocsCollector<T>
/// <summary>
/// The total number of documents that matched this query. </summary>
int TotalHits { get; }
/// <summary>
/// Returns the top docs that were collected by this collector. </summary>
/// <returns>A <see cref="TopDocs"/> holding the collected results.</returns>
TopDocs GetTopDocs();
/// <summary>
/// Returns the documents in the range [<paramref name="start"/> .. pq.Count) that were collected
/// by this collector. Note that if <paramref name="start"/> &gt;= pq.Count, an empty <see cref="TopDocs"/> is
/// returned.
/// <para/>
/// This method is convenient to call if the application always asks for the
/// last results, starting from the last 'page'.
/// <para/>
/// <b>NOTE:</b> you cannot call this method more than once for each search
/// execution. If you need to call it more than once, passing each time a
/// different <paramref name="start"/>, you should call <see cref="GetTopDocs()"/> and work
/// with the returned <see cref="TopDocs"/> object, which will contain all the
/// results this search execution collected.
/// </summary>
/// <param name="start">Zero-based offset of the first document to return.</param>
/// <returns>A <see cref="TopDocs"/> holding the requested results, possibly empty.</returns>
TopDocs GetTopDocs(int start);
/// <summary>
/// Returns the documents in the range [<paramref name="start"/> .. <paramref name="start"/>+<paramref name="howMany"/>) that were
/// collected by this collector. Note that if <paramref name="start"/> &gt;= pq.Count, an empty
/// <see cref="TopDocs"/> is returned, and if pq.Count - <paramref name="start"/> &lt; <paramref name="howMany"/>, then only the
/// available documents in [<paramref name="start"/> .. pq.Count) are returned.
/// <para/>
/// This method is useful to call in case pagination of search results is
/// allowed by the search application, as well as it attempts to optimize the
/// memory used by allocating only as much as requested by <paramref name="howMany"/>.
/// <para/>
/// <b>NOTE:</b> you cannot call this method more than once for each search
/// execution. If you need to call it more than once, passing each time a
/// different range, you should call <see cref="GetTopDocs()"/> and work with the
/// returned <see cref="TopDocs"/> object, which will contain all the results this
/// search execution collected.
/// </summary>
/// <param name="start">Zero-based offset of the first document to return.</param>
/// <param name="howMany">Maximum number of documents to return.</param>
/// <returns>A <see cref="TopDocs"/> holding the requested results, possibly empty.</returns>
TopDocs GetTopDocs(int start, int howMany);
}
}