forked from apache/lucenenet
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Lucene40PostingsWriter.cs
369 lines (325 loc) · 14 KB
/
Lucene40PostingsWriter.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
using Lucene.Net.Index;
using Lucene.Net.Store;
using Lucene.Net.Util;
using Debug = Lucene.Net.Diagnostics.Debug; // LUCENENET NOTE: We cannot use System.Diagnostics.Debug because those calls will be optimized out of the release!
namespace Lucene.Net.Codecs.Lucene40
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Concrete class that writes the 4.0 frq/prx postings format.
/// <para/>
/// @lucene.experimental
/// </summary>
/// <seealso cref="Lucene40PostingsFormat"/>
#pragma warning disable 612, 618
public sealed class Lucene40PostingsWriter : PostingsWriterBase
{
    // Output for the .frq file (doc deltas + term frequencies).
    internal readonly IndexOutput freqOut;
    // Output for the .prx file (positions/payloads/offsets); null when every
    // field omits term frequencies (see the HasProx branch in the constructor).
    internal readonly IndexOutput proxOut;
    // Writes multi-level skip data interleaved into freqOut at FinishTerm time.
    internal readonly Lucene40SkipListWriter skipListWriter;

    /// <summary>
    /// Expert: The fraction of TermDocs entries stored in skip tables,
    /// used to accelerate <see cref="Search.DocIdSetIterator.Advance(int)"/>. Larger values result in
    /// smaller indexes, greater acceleration, but fewer accelerable cases, while
    /// smaller values result in bigger indexes, less acceleration and more
    /// accelerable cases. More detailed experiments would be useful here.
    /// </summary>
    internal const int DEFAULT_SKIP_INTERVAL = 16;

    // Skip interval actually in use for this writer (a skip entry is buffered
    // every skipInterval-th document in StartDoc).
    internal readonly int skipInterval;

    /// <summary>
    /// Expert: minimum docFreq to write any skip data at all
    /// </summary>
    internal readonly int skipMinimum;

    /// <summary>
    /// Expert: The maximum number of skip levels. Smaller values result in
    /// slightly smaller indexes, but slower skipping in big posting lists.
    /// </summary>
    internal readonly int maxSkipLevels = 10;

    // Total doc count of the segment; used only to assert docIDs stay in range.
    internal readonly int totalNumDocs;

    // Per-field state, (re)assigned in SetField:
    internal IndexOptions indexOptions;
    internal bool storePayloads;
    internal bool storeOffsets;

    // Per-term state, reset in StartTerm:
    // File pointers into freqOut/proxOut captured when the term starts; copied
    // into the term's StandardTermState in FinishTerm.
    internal long freqStart;
    internal long proxStart;
    internal FieldInfo fieldInfo;
    // -1 forces the first payload/offset of a term to write its length explicitly
    // (lengths are only re-written in AddPosition when they change).
    internal int lastPayloadLength;
    internal int lastOffsetLength;
    // Per-document delta-encoding state, reset in StartDoc:
    internal int lastPosition;
    internal int lastOffset;

    // Sentinel "all zeros" state used as the delta base for the first term of a
    // block (see EncodeTerm's absolute branch). Shared and never mutated.
    internal static readonly StandardTermState emptyState = new StandardTermState();
    internal StandardTermState lastState;

    // private String segment;

    /// <summary>
    /// Creates a <see cref="Lucene40PostingsWriter"/>, with the
    /// <see cref="DEFAULT_SKIP_INTERVAL"/>.
    /// </summary>
    public Lucene40PostingsWriter(SegmentWriteState state)
        : this(state, DEFAULT_SKIP_INTERVAL)
    {
    }

    /// <summary>
    /// Creates a <see cref="Lucene40PostingsWriter"/>, with the
    /// specified <paramref name="skipInterval"/>.
    /// </summary>
    /// <remarks>
    /// Creates the .frq output immediately and, if at least one field indexes
    /// positions, the .prx output as well; both get codec headers. On any
    /// failure, the partially opened outputs are disposed (while preserving the
    /// original exception) so no file handles leak.
    /// </remarks>
    public Lucene40PostingsWriter(SegmentWriteState state, int skipInterval)
        : base()
    {
        this.skipInterval = skipInterval;
        this.skipMinimum = skipInterval; // set to the same for now
        // this.segment = state.segmentName;
        string fileName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, Lucene40PostingsFormat.FREQ_EXTENSION);
        freqOut = state.Directory.CreateOutput(fileName, state.Context);
        bool success = false;
        IndexOutput proxOut = null;
        try
        {
            CodecUtil.WriteHeader(freqOut, Lucene40PostingsReader.FRQ_CODEC, Lucene40PostingsReader.VERSION_CURRENT);
            // TODO: this is a best effort, if one of these fields has no postings
            // then we make an empty prx file, same as if we are wrapped in
            // per-field postingsformat. maybe... we shouldn't
            // bother w/ this opto? just create empty prx file...?
            if (state.FieldInfos.HasProx)
            {
                // At least one field does not omit TF, so create the
                // prox file
                fileName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, Lucene40PostingsFormat.PROX_EXTENSION);
                proxOut = state.Directory.CreateOutput(fileName, state.Context);
                CodecUtil.WriteHeader(proxOut, Lucene40PostingsReader.PRX_CODEC, Lucene40PostingsReader.VERSION_CURRENT);
            }
            else
            {
                // Every field omits TF so we will write no prox file
                proxOut = null;
            }
            this.proxOut = proxOut;
            success = true;
        }
        finally
        {
            if (!success)
            {
                // Dispose what was opened without masking the in-flight exception.
                IOUtils.DisposeWhileHandlingException(freqOut, proxOut);
            }
        }
        totalNumDocs = state.SegmentInfo.DocCount;
        skipListWriter = new Lucene40SkipListWriter(skipInterval, maxSkipLevels, totalNumDocs, freqOut, proxOut);
    }

    /// <summary>
    /// Writes the terms-dictionary header plus the skip parameters the reader
    /// needs to mirror this writer's skip-list layout.
    /// </summary>
    public override void Init(IndexOutput termsOut)
    {
        CodecUtil.WriteHeader(termsOut, Lucene40PostingsReader.TERMS_CODEC, Lucene40PostingsReader.VERSION_CURRENT);
        termsOut.WriteInt32(skipInterval); // write skipInterval
        termsOut.WriteInt32(maxSkipLevels); // write maxSkipLevels
        termsOut.WriteInt32(skipMinimum); // write skipMinimum
    }

    public override BlockTermState NewTermState()
    {
        return new StandardTermState();
    }

    /// <summary>
    /// Begins a new term: snapshots the current frq/prx file pointers and
    /// resets the per-term delta/skip state.
    /// </summary>
    public override void StartTerm()
    {
        freqStart = freqOut.GetFilePointer();
        //if (DEBUG) System.out.println("SPW: startTerm freqOut.fp=" + freqStart);
        if (proxOut != null)
        {
            proxStart = proxOut.GetFilePointer();
        }
        // force first payload to write its length
        lastPayloadLength = -1;
        // force first offset to write its length
        lastOffsetLength = -1;
        skipListWriter.ResetSkip();
    }

    // Currently, this instance is re-used across fields, so
    // our parent calls setField whenever the field changes
    /// <summary>
    /// Switches to a new field, caching its index options / payload / offset
    /// flags, and resets the term-metadata delta base to <see cref="emptyState"/>.
    /// Returns 0 — presumably meaning this format stores no <c>long[]</c> term
    /// metadata (EncodeTerm's first parameter is named <c>empty</c>); TODO confirm
    /// against PostingsWriterBase's contract.
    /// </summary>
    public override int SetField(FieldInfo fieldInfo)
    {
        //System.out.println("SPW: setField");
        /*
        if (BlockTreeTermsWriter.DEBUG && fieldInfo.Name.Equals("id", StringComparison.Ordinal)) {
          DEBUG = true;
        } else {
          DEBUG = false;
        }
        */
        this.fieldInfo = fieldInfo;
        indexOptions = fieldInfo.IndexOptions;

        // Offsets are stored only when the field indexes at least
        // DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.
        storeOffsets = indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
        storePayloads = fieldInfo.HasPayloads;
        lastState = emptyState;
        //System.out.println("  set init blockFreqStart=" + freqStart);
        //System.out.println("  set init blockProxStart=" + proxStart);
        return 0;
    }

    // Last docID written for the current term (delta base), and the running
    // doc count (df) for the current term; both reset in FinishTerm.
    internal int lastDocID;
    internal int df;

    /// <summary>
    /// Writes one document entry for the current term into the .frq file.
    /// Encoding: DOCS_ONLY writes the raw doc delta; otherwise the delta is
    /// shifted left one bit, with the low bit set when the term frequency is 1
    /// (so the frequency itself can be omitted), else the frequency follows as
    /// a separate vint. Every skipInterval-th doc also buffers a skip entry.
    /// </summary>
    /// <exception cref="CorruptIndexException">if docs arrive out of order or with a negative id.</exception>
    public override void StartDoc(int docID, int termDocFreq)
    {
        // if (DEBUG) System.out.println("SPW:   startDoc seg=" + segment + " docID=" + docID + " tf=" + termDocFreq + " freqOut.fp=" + freqOut.getFilePointer());

        int delta = docID - lastDocID;

        if (docID < 0 || (df > 0 && delta <= 0))
        {
            throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " ) (freqOut: " + freqOut + ")");
        }

        if ((++df % skipInterval) == 0)
        {
            // Skip point carries the *previous* doc's state so a reader landing
            // here can resume delta decoding correctly.
            skipListWriter.SetSkipData(lastDocID, storePayloads, lastPayloadLength, storeOffsets, lastOffsetLength);
            skipListWriter.BufferSkip(df);
        }

        Debug.Assert(docID < totalNumDocs, "docID=" + docID + " totalNumDocs=" + totalNumDocs);

        lastDocID = docID;
        if (indexOptions == IndexOptions.DOCS_ONLY)
        {
            freqOut.WriteVInt32(delta);
        }
        else if (1 == termDocFreq)
        {
            freqOut.WriteVInt32((delta << 1) | 1);
        }
        else
        {
            freqOut.WriteVInt32(delta << 1);
            freqOut.WriteVInt32(termDocFreq);
        }

        lastPosition = 0;
        lastOffset = 0;
    }

    /// <summary>
    /// Add a new <paramref name="position"/> &amp; <paramref name="payload"/>.
    /// Writes the position delta into the .prx file. With payloads enabled, the
    /// delta is shifted left one bit and the low bit flags "payload length
    /// changed", in which case the new length follows (and the payload bytes are
    /// appended at the end). Offsets, when stored, use the same
    /// changed-length-flag scheme on the start-offset delta.
    /// </summary>
    public override void AddPosition(int position, BytesRef payload, int startOffset, int endOffset)
    {
        //if (DEBUG) System.out.println("SPW:     addPos pos=" + position + " payload=" + (payload == null ? "null" : (payload.Length + " bytes")) + " proxFP=" + proxOut.getFilePointer());
        Debug.Assert(indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0, "invalid indexOptions: " + indexOptions);
        Debug.Assert(proxOut != null);

        int delta = position - lastPosition;

        Debug.Assert(delta >= 0, "position=" + position + " lastPosition=" + lastPosition); // not quite right (if pos=0 is repeated twice we don't catch it)

        lastPosition = position;

        int payloadLength = 0;

        if (storePayloads)
        {
            payloadLength = payload == null ? 0 : payload.Length;

            if (payloadLength != lastPayloadLength)
            {
                // Length changed: set the low bit and write the new length.
                lastPayloadLength = payloadLength;
                proxOut.WriteVInt32((delta << 1) | 1);
                proxOut.WriteVInt32(payloadLength);
            }
            else
            {
                proxOut.WriteVInt32(delta << 1);
            }
        }
        else
        {
            proxOut.WriteVInt32(delta);
        }

        if (storeOffsets)
        {
            // don't use startOffset - lastEndOffset, because this creates lots of negative vints for synonyms,
            // and the numbers aren't that much smaller anyways.
            int offsetDelta = startOffset - lastOffset;
            int offsetLength = endOffset - startOffset;
            Debug.Assert(offsetDelta >= 0 && offsetLength >= 0, "startOffset=" + startOffset + ",lastOffset=" + lastOffset + ",endOffset=" + endOffset);
            if (offsetLength != lastOffsetLength)
            {
                proxOut.WriteVInt32(offsetDelta << 1 | 1);
                proxOut.WriteVInt32(offsetLength);
            }
            else
            {
                proxOut.WriteVInt32(offsetDelta << 1);
            }
            lastOffset = startOffset;
            lastOffsetLength = offsetLength;
        }

        if (payloadLength > 0)
        {
            proxOut.WriteBytes(payload.Bytes, payload.Offset, payloadLength);
        }
    }

    // Nothing to flush per-document; all state is written eagerly in
    // StartDoc/AddPosition.
    public override void FinishDoc()
    {
    }

    /// <summary>
    /// Term metadata for this format: the term's start pointers into the .frq
    /// and .prx files, and the skip-data offset relative to <see cref="FreqStart"/>
    /// (-1 when the term had too few docs for skip data).
    /// </summary>
    internal class StandardTermState : BlockTermState
    {
        public long FreqStart { get; set; }
        public long ProxStart { get; set; }
        public long SkipOffset { get; set; }
    }

    /// <summary>
    /// Called when we are done adding docs to this term. Records the term's
    /// frq/prx start pointers, flushes buffered skip data (only when
    /// <c>df &gt;= skipMinimum</c>), and resets the per-term doc state.
    /// </summary>
    public override void FinishTerm(BlockTermState state)
    {
        StandardTermState state_ = (StandardTermState)state;
        // if (DEBUG) System.out.println("SPW: finishTerm seg=" + segment + " freqStart=" + freqStart);
        Debug.Assert(state_.DocFreq > 0);

        // TODO: wasteful we are counting this (counting # docs
        // for this term) in two places?
        Debug.Assert(state_.DocFreq == df);
        state_.FreqStart = freqStart;
        state_.ProxStart = proxStart;
        if (df >= skipMinimum)
        {
            // SkipOffset is stored relative to freqStart so it vint-encodes small.
            state_.SkipOffset = skipListWriter.WriteSkip(freqOut) - freqStart;
        }
        else
        {
            state_.SkipOffset = -1;
        }
        lastDocID = 0;
        df = 0;
    }

    /// <summary>
    /// Encodes a term's metadata into the terms dictionary. Frq/prx pointers
    /// are delta-encoded against the previous term's state (<see cref="lastState"/>);
    /// when <paramref name="absolute"/> the base is reset to <see cref="emptyState"/>
    /// (all zeros), making the deltas absolute values. The prx pointer is only
    /// written for fields that index positions.
    /// </summary>
    public override void EncodeTerm(long[] empty, DataOutput @out, FieldInfo fieldInfo, BlockTermState state, bool absolute)
    {
        StandardTermState state_ = (StandardTermState)state;
        if (absolute)
        {
            lastState = emptyState;
        }
        @out.WriteVInt64(state_.FreqStart - lastState.FreqStart);
        if (state_.SkipOffset != -1)
        {
            Debug.Assert(state_.SkipOffset > 0);
            @out.WriteVInt64(state_.SkipOffset);
        }
        if (indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0)
        {
            @out.WriteVInt64(state_.ProxStart - lastState.ProxStart);
        }
        lastState = state_;
    }

    /// <summary>
    /// Disposes both outputs; the try/finally guarantees proxOut is disposed
    /// even when disposing freqOut throws.
    /// </summary>
    protected override void Dispose(bool disposing)
    {
        if (disposing)
        {
            try
            {
                freqOut.Dispose();
            }
            finally
            {
                if (proxOut != null)
                {
                    proxOut.Dispose();
                }
            }
        }
    }
}
#pragma warning restore 612, 618
}