using J2N.Threading.Atomic;
using Lucene.Net.Support;
using System.Collections.Generic;

namespace Lucene.Net.Index
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements. See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License. You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    using BinaryDocValuesUpdate = Lucene.Net.Index.DocValuesUpdate.BinaryDocValuesUpdate;
    using NumericDocValuesUpdate = Lucene.Net.Index.DocValuesUpdate.NumericDocValuesUpdate;
    using Query = Lucene.Net.Search.Query;
    using RamUsageEstimator = Lucene.Net.Util.RamUsageEstimator;

    /// <summary>
    /// Holds buffered deletes and updates, by docID, term or query, for a
    /// single segment. This is used to hold buffered pending
    /// deletes and updates against the to-be-flushed segment. Once the
    /// deletes and updates are pushed (on flush in <see cref="DocumentsWriter"/>), they
    /// are converted to a <c>FrozenBufferedUpdates</c> instance.
    /// <para/>
    /// NOTE: instances of this class are accessed either via a private
    /// instance on <see cref="DocumentsWriterPerThread"/>, or via sync'd code by
    /// <see cref="DocumentsWriterDeleteQueue"/>.
    /// </summary>
    public class BufferedUpdates // LUCENENET NOTE: Made public rather than internal because it is available through a public API
    {
        /* Rough logic: HashMap has an array[Entry] w/ varying
           load factor (say 2 * POINTER). Entry is object w/ Term
           key, Integer val, int hash, Entry next
           (OBJ_HEADER + 3*POINTER + INT). Term is object w/
           String field and String text (OBJ_HEADER + 2*POINTER).
           Term's field is String (OBJ_HEADER + 4*INT + POINTER +
           OBJ_HEADER + string.length*CHAR).
           Term's text is String (OBJ_HEADER + 4*INT + POINTER +
           OBJ_HEADER + string.length*CHAR). Integer is
           OBJ_HEADER + INT. */
        internal static readonly int BYTES_PER_DEL_TERM = 9 * RamUsageEstimator.NUM_BYTES_OBJECT_REF + 7 * RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + 10 * RamUsageEstimator.NUM_BYTES_INT32;
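
        // Worked example of the estimate above (illustrative only; the actual
        // constants come from RamUsageEstimator and vary by runtime): assuming
        // 8-byte object references, 16-byte object headers and 4-byte ints on a
        // 64-bit runtime, BYTES_PER_DEL_TERM = 9*8 + 7*16 + 10*4 = 224 bytes,
        // before the variable-length term bytes and field chars charged in
        // AddTerm below. The other BYTES_PER_* constants are derived the same way.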

        /* Rough logic: del docIDs are List<Integer>. Say list
           allocates ~2X size (2*POINTER). Integer is OBJ_HEADER
           + int */
        internal static readonly int BYTES_PER_DEL_DOCID = 2 * RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + RamUsageEstimator.NUM_BYTES_INT32;

        /* Rough logic: HashMap has an array[Entry] w/ varying
           load factor (say 2 * POINTER). Entry is object w/
           Query key, Integer val, int hash, Entry next
           (OBJ_HEADER + 3*POINTER + INT). Query we often
           undercount (say 24 bytes). Integer is OBJ_HEADER + INT. */
        internal static readonly int BYTES_PER_DEL_QUERY = 5 * RamUsageEstimator.NUM_BYTES_OBJECT_REF + 2 * RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + 2 * RamUsageEstimator.NUM_BYTES_INT32 + 24;

        /* Rough logic: NumericUpdate calculates its actual size,
         * including the update Term and DV field (String). The
         * per-field map holds a reference to the updated field, and
         * therefore we only account for the object reference and
         * map space itself. This is incremented when we first see
         * an updated field.
         *
         * HashMap has an array[Entry] w/ varying load
         * factor (say 2*POINTER). Entry is an object w/ String key,
         * LinkedHashMap val, int hash, Entry next (OBJ_HEADER + 3*POINTER + INT).
         *
         * LinkedHashMap (val) is counted as OBJ_HEADER, array[Entry] ref + header, 4*INT, 1*FLOAT,
         * Set (entrySet) (2*OBJ_HEADER + ARRAY_HEADER + 2*POINTER + 4*INT + FLOAT)
         */
        internal static readonly int BYTES_PER_NUMERIC_FIELD_ENTRY = 7 * RamUsageEstimator.NUM_BYTES_OBJECT_REF + 3 * RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + 5 * RamUsageEstimator.NUM_BYTES_INT32 + RamUsageEstimator.NUM_BYTES_SINGLE;

        /* Rough logic: Incremented when we see another Term for an already updated
         * field.
         * LinkedHashMap has an array[Entry] w/ varying load factor
         * (say 2*POINTER). Entry is an object w/ Term key, NumericUpdate val,
         * int hash, Entry next, Entry before, Entry after (OBJ_HEADER + 5*POINTER + INT).
         *
         * Term (key) is counted only as a POINTER.
         * NumericUpdate (val) counts its own size and isn't accounted for here.
         */
        internal static readonly int BYTES_PER_NUMERIC_UPDATE_ENTRY = 7 * RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + RamUsageEstimator.NUM_BYTES_INT32;

        /* Rough logic: BinaryUpdate calculates its actual size,
         * including the update Term and DV field (String). The
         * per-field map holds a reference to the updated field, and
         * therefore we only account for the object reference and
         * map space itself. This is incremented when we first see
         * an updated field.
         *
         * HashMap has an array[Entry] w/ varying load
         * factor (say 2*POINTER). Entry is an object w/ String key,
         * LinkedHashMap val, int hash, Entry next (OBJ_HEADER + 3*POINTER + INT).
         *
         * LinkedHashMap (val) is counted as OBJ_HEADER, array[Entry] ref + header, 4*INT, 1*FLOAT,
         * Set (entrySet) (2*OBJ_HEADER + ARRAY_HEADER + 2*POINTER + 4*INT + FLOAT)
         */
        internal static readonly int BYTES_PER_BINARY_FIELD_ENTRY = 7 * RamUsageEstimator.NUM_BYTES_OBJECT_REF + 3 * RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + 5 * RamUsageEstimator.NUM_BYTES_INT32 + RamUsageEstimator.NUM_BYTES_SINGLE;

        /* Rough logic: Incremented when we see another Term for an already updated
         * field.
         * LinkedHashMap has an array[Entry] w/ varying load factor
         * (say 2*POINTER). Entry is an object w/ Term key, BinaryUpdate val,
         * int hash, Entry next, Entry before, Entry after (OBJ_HEADER + 5*POINTER + INT).
         *
         * Term (key) is counted only as a POINTER.
         * BinaryUpdate (val) counts its own size and isn't accounted for here.
         */
        internal static readonly int BYTES_PER_BINARY_UPDATE_ENTRY = 7 * RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + RamUsageEstimator.NUM_BYTES_INT32;
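
        // Same style of worked example for the per-entry constants (illustrative;
        // assumes 8-byte references, 16-byte headers and 4-byte ints):
        //   BYTES_PER_NUMERIC_UPDATE_ENTRY = BYTES_PER_BINARY_UPDATE_ENTRY
        //     = 7*8 + 16 + 4 = 76 bytes per buffered update Term.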

        internal readonly AtomicInt32 numTermDeletes = new AtomicInt32();
        internal readonly AtomicInt32 numNumericUpdates = new AtomicInt32();
        internal readonly AtomicInt32 numBinaryUpdates = new AtomicInt32();
        internal readonly IDictionary<Term, int?> terms = new Dictionary<Term, int?>();
        internal readonly IDictionary<Query, int?> queries = new Dictionary<Query, int?>();
        internal readonly IList<int?> docIDs = new List<int?>();

        // Map<dvField,Map<updateTerm,NumericUpdate>>
        // For each field we keep an ordered list of NumericUpdates, keyed by the
        // update Term. LinkedHashMap guarantees that we will later traverse the
        // map in insertion order (so that if two terms affect the same document,
        // the last one that came in wins), and lets us quickly detect whether the
        // same Term is used to update the same field multiple times (so we later
        // traverse it only once).
        internal readonly IDictionary<string, LinkedHashMap<Term, NumericDocValuesUpdate>> numericUpdates = new Dictionary<string, LinkedHashMap<Term, NumericDocValuesUpdate>>();

        // Map<dvField,Map<updateTerm,BinaryUpdate>>
        // For each field we keep an ordered list of BinaryUpdates, keyed by the
        // update Term. LinkedHashMap guarantees that we will later traverse the
        // map in insertion order (so that if two terms affect the same document,
        // the last one that came in wins), and lets us quickly detect whether the
        // same Term is used to update the same field multiple times (so we later
        // traverse it only once).
        internal readonly IDictionary<string, LinkedHashMap<Term, BinaryDocValuesUpdate>> binaryUpdates = new Dictionary<string, LinkedHashMap<Term, BinaryDocValuesUpdate>>();
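
        // Illustrative sketch of the insertion-order contract described above
        // (hypothetical field and term values; assumes the
        // NumericDocValuesUpdate(Term, string, long?) constructor):
        //
        //   var updates = new BufferedUpdates();
        //   updates.AddNumericUpdate(new NumericDocValuesUpdate(new Term("id", "1"), "price", 10L), docIDUpto: 5);
        //   updates.AddNumericUpdate(new NumericDocValuesUpdate(new Term("sku", "A"), "price", 20L), docIDUpto: 6);
        //
        // If both terms match the same document, flush-time traversal in insertion
        // order means the later update (value 20) is applied last and wins.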

        /// <summary>
        /// NOTE: This was MAX_INT in Lucene.
        /// </summary>
        public static readonly int MAX_INT32 = int.MaxValue;

        internal readonly AtomicInt64 bytesUsed;

        private static bool VERBOSE_DELETES = false;

        internal long gen;

        internal BufferedUpdates() // LUCENENET NOTE: Made internal rather than public, since this class is intended to be internal but couldn't be because it is exposed through a public API
        {
            this.bytesUsed = new AtomicInt64();
        }

        public override string ToString()
        {
            if (VERBOSE_DELETES)
            {
                return "gen=" + gen + " numTerms=" + numTermDeletes + ", terms=" + string.Format(J2N.Text.StringFormatter.InvariantCulture, "{0}", terms)
                    + ", queries=" + string.Format(J2N.Text.StringFormatter.InvariantCulture, "{0}", queries) + ", docIDs=" + string.Format(J2N.Text.StringFormatter.InvariantCulture, "{0}", docIDs)
                    + ", numericUpdates=" + string.Format(J2N.Text.StringFormatter.InvariantCulture, "{0}", numericUpdates)
                    + ", binaryUpdates=" + string.Format(J2N.Text.StringFormatter.InvariantCulture, "{0}", binaryUpdates) + ", bytesUsed=" + bytesUsed;
            }
            else
            {
                string s = "gen=" + gen;
                if (numTermDeletes != 0)
                {
                    s += " " + numTermDeletes.Value + " deleted terms (unique count=" + terms.Count + ")";
                }
                if (queries.Count != 0)
                {
                    s += " " + queries.Count + " deleted queries";
                }
                if (docIDs.Count != 0)
                {
                    s += " " + docIDs.Count + " deleted docIDs";
                }
                if (numNumericUpdates != 0)
                {
                    s += " " + numNumericUpdates + " numeric updates (unique count=" + numericUpdates.Count + ")";
                }
                if (numBinaryUpdates != 0)
                {
                    s += " " + numBinaryUpdates + " binary updates (unique count=" + binaryUpdates.Count + ")";
                }
                if (bytesUsed != 0)
                {
                    s += " bytesUsed=" + bytesUsed;
                }
                return s;
            }
        }

        public virtual void AddQuery(Query query, int docIDUpto)
        {
            int? prev;
            queries.TryGetValue(query, out prev);
            queries[query] = docIDUpto;
            // Increment bytes used only if the query wasn't added so far.
            if (prev == null)
            {
                bytesUsed.AddAndGet(BYTES_PER_DEL_QUERY);
            }
        }
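
        // Example of the accounting above, given a BufferedUpdates instance
        // `updates` (sketch; TermQuery is just one concrete Query): buffering the
        // same query twice charges BYTES_PER_DEL_QUERY only once, while the
        // docIDUpto limit is refreshed:
        //
        //   var q = new Lucene.Net.Search.TermQuery(new Term("id", "1"));
        //   updates.AddQuery(q, docIDUpto: 3); // new key: bytesUsed += BYTES_PER_DEL_QUERY
        //   updates.AddQuery(q, docIDUpto: 7); // existing key: only docIDUpto advances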

        public virtual void AddDocID(int docID)
        {
            docIDs.Add(docID);
            bytesUsed.AddAndGet(BYTES_PER_DEL_DOCID);
        }

        public virtual void AddTerm(Term term, int docIDUpto)
        {
            int? current;
            terms.TryGetValue(term, out current);
            if (current != null && docIDUpto < current)
            {
                // Only record the new number if it's greater than the
                // current one. This is important because if multiple
                // threads are replacing the same doc at nearly the
                // same time, it's possible that one thread that got a
                // higher docID is scheduled before the other
                // threads. If we blindly replace, then we can
                // incorrectly get both docs indexed.
                return;
            }
            terms[term] = docIDUpto;
            // Note that if current != null, there's already a buffered delete on
            // that term; therefore we seem to over-count. This over-counting is
            // done to respect IndexWriterConfig.MaxBufferedDeleteTerms.
            numTermDeletes.IncrementAndGet();
            if (current == null)
            {
                bytesUsed.AddAndGet(BYTES_PER_DEL_TERM + term.Bytes.Length + (RamUsageEstimator.NUM_BYTES_CHAR * term.Field.Length));
            }
        }
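
        // Sketch of the race the check above guards against, given a
        // BufferedUpdates instance `updates` (hypothetical docIDs):
        //
        //   updates.AddTerm(new Term("id", "1"), docIDUpto: 7); // thread A's replacement, recorded
        //   updates.AddTerm(new Term("id", "1"), docIDUpto: 5); // thread B, scheduled late: ignored (5 < 7)
        //
        // Blindly overwriting 7 with 5 would shrink the range of documents the
        // delete applies to, and both replacement documents could remain indexed.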

        public virtual void AddNumericUpdate(NumericDocValuesUpdate update, int docIDUpto)
        {
            LinkedHashMap<Term, NumericDocValuesUpdate> fieldUpdates;
            if (!numericUpdates.TryGetValue(update.field, out fieldUpdates))
            {
                fieldUpdates = new LinkedHashMap<Term, NumericDocValuesUpdate>();
                numericUpdates[update.field] = fieldUpdates;
                bytesUsed.AddAndGet(BYTES_PER_NUMERIC_FIELD_ENTRY);
            }
            NumericDocValuesUpdate current;
            if (fieldUpdates.TryGetValue(update.term, out current) && current != null && docIDUpto < current.docIDUpto)
            {
                // Only record the new number if it's greater than or equal to the current
                // one. This is important because if multiple threads are replacing the
                // same doc at nearly the same time, it's possible that one thread that
                // got a higher docID is scheduled before the other threads.
                return;
            }
            update.docIDUpto = docIDUpto;
            // Since it's a LinkedHashMap, we must first remove the Term entry so that
            // it's added last (we're interested in insertion order).
            if (current != null)
            {
                fieldUpdates.Remove(update.term);
            }
            fieldUpdates[update.term] = update;
            numNumericUpdates.IncrementAndGet();
            if (current == null)
            {
                bytesUsed.AddAndGet(BYTES_PER_NUMERIC_UPDATE_ENTRY + update.GetSizeInBytes());
            }
        }
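
        // Note on the remove-then-add step above: like Java's LinkedHashMap, the
        // LinkedHashMap used here keeps an existing key's original position when
        // its value is overwritten in place. Removing the entry first pushes the
        // refreshed update to the end of the iteration order, which is what the
        // flush-time "last update wins" traversal relies on.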

        public virtual void AddBinaryUpdate(BinaryDocValuesUpdate update, int docIDUpto)
        {
            LinkedHashMap<Term, BinaryDocValuesUpdate> fieldUpdates;
            if (!binaryUpdates.TryGetValue(update.field, out fieldUpdates))
            {
                fieldUpdates = new LinkedHashMap<Term, BinaryDocValuesUpdate>();
                binaryUpdates[update.field] = fieldUpdates;
                bytesUsed.AddAndGet(BYTES_PER_BINARY_FIELD_ENTRY);
            }
            BinaryDocValuesUpdate current;
            if (fieldUpdates.TryGetValue(update.term, out current) && current != null && docIDUpto < current.docIDUpto)
            {
                // Only record the new number if it's greater than or equal to the current
                // one. This is important because if multiple threads are replacing the
                // same doc at nearly the same time, it's possible that one thread that
                // got a higher docID is scheduled before the other threads.
                return;
            }
            update.docIDUpto = docIDUpto;
            // Since it's a LinkedHashMap, we must first remove the Term entry so that
            // it's added last (we're interested in insertion order).
            if (current != null)
            {
                fieldUpdates.Remove(update.term);
            }
            fieldUpdates[update.term] = update;
            numBinaryUpdates.IncrementAndGet();
            if (current == null)
            {
                bytesUsed.AddAndGet(BYTES_PER_BINARY_UPDATE_ENTRY + update.GetSizeInBytes());
            }
        }

        internal virtual void Clear()
        {
            terms.Clear();
            queries.Clear();
            docIDs.Clear();
            numericUpdates.Clear();
            binaryUpdates.Clear();
            numTermDeletes.Value = 0;
            numNumericUpdates.Value = 0;
            numBinaryUpdates.Value = 0;
            bytesUsed.Value = 0;
        }

        internal virtual bool Any()
        {
            return terms.Count > 0 || docIDs.Count > 0 || queries.Count > 0 || numericUpdates.Count > 0 || binaryUpdates.Count > 0;
        }
    }
}