forked from apache/lucenenet
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Lucene40PostingsWriter.cs
369 lines (325 loc) · 14 KB
/
Lucene40PostingsWriter.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
using Lucene.Net.Index;
using Lucene.Net.Store;
using Lucene.Net.Util;
using Debug = Lucene.Net.Diagnostics.Debug; // LUCENENET NOTE: We cannot use System.Diagnostics.Debug because those calls will be optimized out of the release!
namespace Lucene.Net.Codecs.Lucene40
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Concrete class that writes the 4.0 frq/prx postings format.
/// <para/>
/// @lucene.experimental
/// </summary>
/// <seealso cref="Lucene40PostingsFormat"/>
#pragma warning disable 612, 618
public sealed class Lucene40PostingsWriter : PostingsWriterBase
{
    // Output for the .frq file (doc deltas + term frequencies).
    internal readonly IndexOutput freqOut;
    // Output for the .prx file (positions/payloads/offsets); null when every
    // field omits term frequencies (see the HasProx branch in the constructor).
    internal readonly IndexOutput proxOut;
    // Writes multi-level skip data interleaved into freqOut at FinishTerm time.
    internal readonly Lucene40SkipListWriter skipListWriter;

    /// <summary>
    /// Expert: The fraction of TermDocs entries stored in skip tables,
    /// used to accelerate <see cref="Search.DocIdSetIterator.Advance(int)"/>. Larger values result in
    /// smaller indexes, greater acceleration, but fewer accelerable cases, while
    /// smaller values result in bigger indexes, less acceleration and more
    /// accelerable cases. More detailed experiments would be useful here.
    /// </summary>
    internal const int DEFAULT_SKIP_INTERVAL = 16;

    // Skip interval actually in use for this writer (a skip entry is buffered
    // every skipInterval-th document in StartDoc).
    internal readonly int skipInterval;

    /// <summary>
    /// Expert: minimum docFreq to write any skip data at all
    /// </summary>
    internal readonly int skipMinimum;

    /// <summary>
    /// Expert: The maximum number of skip levels. Smaller values result in
    /// slightly smaller indexes, but slower skipping in big posting lists.
    /// </summary>
    internal readonly int maxSkipLevels = 10;

    // Total doc count of the segment; used only to assert docIDs stay in range.
    internal readonly int totalNumDocs;

    // Per-field state, (re)assigned in SetField:
    internal IndexOptions indexOptions;
    internal bool storePayloads;
    internal bool storeOffsets;

    // Per-term state, reset in StartTerm:
    // File pointers into freqOut/proxOut captured when the term starts; copied
    // into the term's StandardTermState in FinishTerm.
    internal long freqStart;
    internal long proxStart;
    internal FieldInfo fieldInfo;
    // -1 forces the first payload/offset of a term to write its length explicitly
    // (lengths are only re-written in AddPosition when they change).
    internal int lastPayloadLength;
    internal int lastOffsetLength;
    // Per-document delta-encoding state, reset in StartDoc:
    internal int lastPosition;
    internal int lastOffset;

    // Sentinel "all zeros" state used as the delta base for the first term of a
    // block (see EncodeTerm's absolute branch). Shared and never mutated.
    internal static readonly StandardTermState emptyState = new StandardTermState();
    internal StandardTermState lastState;

    // private String segment;

    /// <summary>
    /// Creates a <see cref="Lucene40PostingsWriter"/>, with the
    /// <see cref="DEFAULT_SKIP_INTERVAL"/>.
    /// </summary>
    public Lucene40PostingsWriter(SegmentWriteState state)
        : this(state, DEFAULT_SKIP_INTERVAL)
    {
    }

    /// <summary>
    /// Creates a <see cref="Lucene40PostingsWriter"/>, with the
    /// specified <paramref name="skipInterval"/>.
    /// </summary>
    /// <remarks>
    /// Creates the .frq output immediately and, if at least one field indexes
    /// positions, the .prx output as well; both get codec headers. On any
    /// failure, the partially opened outputs are disposed (while preserving the
    /// original exception) so no file handles leak.
    /// </remarks>
    public Lucene40PostingsWriter(SegmentWriteState state, int skipInterval)
        : base()
    {
        this.skipInterval = skipInterval;
        this.skipMinimum = skipInterval; // set to the same for now
        // this.segment = state.segmentName;
        string fileName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, Lucene40PostingsFormat.FREQ_EXTENSION);
        freqOut = state.Directory.CreateOutput(fileName, state.Context);
        bool success = false;
        IndexOutput proxOut = null;
        try
        {
            CodecUtil.WriteHeader(freqOut, Lucene40PostingsReader.FRQ_CODEC, Lucene40PostingsReader.VERSION_CURRENT);
            // TODO: this is a best effort, if one of these fields has no postings
            // then we make an empty prx file, same as if we are wrapped in
            // per-field postingsformat. maybe... we shouldn't
            // bother w/ this opto? just create empty prx file...?
            if (state.FieldInfos.HasProx)
            {
                // At least one field does not omit TF, so create the
                // prox file
                fileName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, Lucene40PostingsFormat.PROX_EXTENSION);
                proxOut = state.Directory.CreateOutput(fileName, state.Context);
                CodecUtil.WriteHeader(proxOut, Lucene40PostingsReader.PRX_CODEC, Lucene40PostingsReader.VERSION_CURRENT);
            }
            else
            {
                // Every field omits TF so we will write no prox file
                proxOut = null;
            }
            this.proxOut = proxOut;
            success = true;
        }
        finally
        {
            if (!success)
            {
                // Dispose what was opened without masking the in-flight exception.
                IOUtils.DisposeWhileHandlingException(freqOut, proxOut);
            }
        }
        totalNumDocs = state.SegmentInfo.DocCount;
        skipListWriter = new Lucene40SkipListWriter(skipInterval, maxSkipLevels, totalNumDocs, freqOut, proxOut);
    }

    /// <summary>
    /// Writes the terms-dictionary header plus the skip parameters the reader
    /// needs to mirror this writer's skip-list layout.
    /// </summary>
    public override void Init(IndexOutput termsOut)
    {
        CodecUtil.WriteHeader(termsOut, Lucene40PostingsReader.TERMS_CODEC, Lucene40PostingsReader.VERSION_CURRENT);
        termsOut.WriteInt32(skipInterval); // write skipInterval
        termsOut.WriteInt32(maxSkipLevels); // write maxSkipLevels
        termsOut.WriteInt32(skipMinimum); // write skipMinimum
    }

    public override BlockTermState NewTermState()
    {
        return new StandardTermState();
    }

    /// <summary>
    /// Begins a new term: snapshots the current frq/prx file pointers and
    /// resets the per-term delta/skip state.
    /// </summary>
    public override void StartTerm()
    {
        freqStart = freqOut.GetFilePointer();
        //if (DEBUG) System.out.println("SPW: startTerm freqOut.fp=" + freqStart);
        if (proxOut != null)
        {
            proxStart = proxOut.GetFilePointer();
        }
        // force first payload to write its length
        lastPayloadLength = -1;
        // force first offset to write its length
        lastOffsetLength = -1;
        skipListWriter.ResetSkip();
    }

    // Currently, this instance is re-used across fields, so
    // our parent calls setField whenever the field changes
    /// <summary>
    /// Switches to a new field, caching its index options / payload / offset
    /// flags, and resets the term-metadata delta base to <see cref="emptyState"/>.
    /// Returns 0 — presumably meaning this format stores no <c>long[]</c> term
    /// metadata (EncodeTerm's first parameter is named <c>empty</c>); TODO confirm
    /// against PostingsWriterBase's contract.
    /// </summary>
    public override int SetField(FieldInfo fieldInfo)
    {
        //System.out.println("SPW: setField");
        /*
        if (BlockTreeTermsWriter.DEBUG && fieldInfo.Name.Equals("id", StringComparison.Ordinal)) {
          DEBUG = true;
        } else {
          DEBUG = false;
        }
        */
        this.fieldInfo = fieldInfo;
        indexOptions = fieldInfo.IndexOptions;

        // Offsets are stored only when the field indexes at least
        // DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.
        storeOffsets = indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
        storePayloads = fieldInfo.HasPayloads;
        lastState = emptyState;
        //System.out.println("  set init blockFreqStart=" + freqStart);
        //System.out.println("  set init blockProxStart=" + proxStart);
        return 0;
    }

    // Last docID written for the current term (delta base), and the running
    // doc count (df) for the current term; both reset in FinishTerm.
    internal int lastDocID;
    internal int df;

    /// <summary>
    /// Writes one document entry for the current term into the .frq file.
    /// Encoding: DOCS_ONLY writes the raw doc delta; otherwise the delta is
    /// shifted left one bit, with the low bit set when the term frequency is 1
    /// (so the frequency itself can be omitted), else the frequency follows as
    /// a separate vint. Every skipInterval-th doc also buffers a skip entry.
    /// </summary>
    /// <exception cref="CorruptIndexException">if docs arrive out of order or with a negative id.</exception>
    public override void StartDoc(int docID, int termDocFreq)
    {
        // if (DEBUG) System.out.println("SPW:   startDoc seg=" + segment + " docID=" + docID + " tf=" + termDocFreq + " freqOut.fp=" + freqOut.getFilePointer());

        int delta = docID - lastDocID;

        if (docID < 0 || (df > 0 && delta <= 0))
        {
            throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " ) (freqOut: " + freqOut + ")");
        }

        if ((++df % skipInterval) == 0)
        {
            // Skip point carries the *previous* doc's state so a reader landing
            // here can resume delta decoding correctly.
            skipListWriter.SetSkipData(lastDocID, storePayloads, lastPayloadLength, storeOffsets, lastOffsetLength);
            skipListWriter.BufferSkip(df);
        }

        Debug.Assert(docID < totalNumDocs, "docID=" + docID + " totalNumDocs=" + totalNumDocs);

        lastDocID = docID;
        if (indexOptions == IndexOptions.DOCS_ONLY)
        {
            freqOut.WriteVInt32(delta);
        }
        else if (1 == termDocFreq)
        {
            freqOut.WriteVInt32((delta << 1) | 1);
        }
        else
        {
            freqOut.WriteVInt32(delta << 1);
            freqOut.WriteVInt32(termDocFreq);
        }

        lastPosition = 0;
        lastOffset = 0;
    }

    /// <summary>
    /// Add a new <paramref name="position"/> &amp; <paramref name="payload"/>.
    /// Writes the position delta into the .prx file. With payloads enabled, the
    /// delta is shifted left one bit and the low bit flags "payload length
    /// changed", in which case the new length follows (and the payload bytes are
    /// appended at the end). Offsets, when stored, use the same
    /// changed-length-flag scheme on the start-offset delta.
    /// </summary>
    public override void AddPosition(int position, BytesRef payload, int startOffset, int endOffset)
    {
        //if (DEBUG) System.out.println("SPW:     addPos pos=" + position + " payload=" + (payload == null ? "null" : (payload.Length + " bytes")) + " proxFP=" + proxOut.getFilePointer());
        Debug.Assert(indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0, "invalid indexOptions: " + indexOptions);
        Debug.Assert(proxOut != null);

        int delta = position - lastPosition;

        Debug.Assert(delta >= 0, "position=" + position + " lastPosition=" + lastPosition); // not quite right (if pos=0 is repeated twice we don't catch it)

        lastPosition = position;

        int payloadLength = 0;

        if (storePayloads)
        {
            payloadLength = payload == null ? 0 : payload.Length;

            if (payloadLength != lastPayloadLength)
            {
                // Length changed: set the low bit and write the new length.
                lastPayloadLength = payloadLength;
                proxOut.WriteVInt32((delta << 1) | 1);
                proxOut.WriteVInt32(payloadLength);
            }
            else
            {
                proxOut.WriteVInt32(delta << 1);
            }
        }
        else
        {
            proxOut.WriteVInt32(delta);
        }

        if (storeOffsets)
        {
            // don't use startOffset - lastEndOffset, because this creates lots of negative vints for synonyms,
            // and the numbers aren't that much smaller anyways.
            int offsetDelta = startOffset - lastOffset;
            int offsetLength = endOffset - startOffset;
            Debug.Assert(offsetDelta >= 0 && offsetLength >= 0, "startOffset=" + startOffset + ",lastOffset=" + lastOffset + ",endOffset=" + endOffset);
            if (offsetLength != lastOffsetLength)
            {
                proxOut.WriteVInt32(offsetDelta << 1 | 1);
                proxOut.WriteVInt32(offsetLength);
            }
            else
            {
                proxOut.WriteVInt32(offsetDelta << 1);
            }
            lastOffset = startOffset;
            lastOffsetLength = offsetLength;
        }

        if (payloadLength > 0)
        {
            proxOut.WriteBytes(payload.Bytes, payload.Offset, payloadLength);
        }
    }

    // Nothing to flush per-document; all state is written eagerly in
    // StartDoc/AddPosition.
    public override void FinishDoc()
    {
    }

    /// <summary>
    /// Term metadata for this format: the term's start pointers into the .frq
    /// and .prx files, and the skip-data offset relative to <see cref="FreqStart"/>
    /// (-1 when the term had too few docs for skip data).
    /// </summary>
    internal class StandardTermState : BlockTermState
    {
        public long FreqStart { get; set; }
        public long ProxStart { get; set; }
        public long SkipOffset { get; set; }
    }

    /// <summary>
    /// Called when we are done adding docs to this term. Records the term's
    /// frq/prx start pointers, flushes buffered skip data (only when
    /// <c>df &gt;= skipMinimum</c>), and resets the per-term doc state.
    /// </summary>
    public override void FinishTerm(BlockTermState state)
    {
        StandardTermState state_ = (StandardTermState)state;
        // if (DEBUG) System.out.println("SPW: finishTerm seg=" + segment + " freqStart=" + freqStart);
        Debug.Assert(state_.DocFreq > 0);

        // TODO: wasteful we are counting this (counting # docs
        // for this term) in two places?
        Debug.Assert(state_.DocFreq == df);
        state_.FreqStart = freqStart;
        state_.ProxStart = proxStart;
        if (df >= skipMinimum)
        {
            // SkipOffset is stored relative to freqStart so it vint-encodes small.
            state_.SkipOffset = skipListWriter.WriteSkip(freqOut) - freqStart;
        }
        else
        {
            state_.SkipOffset = -1;
        }
        lastDocID = 0;
        df = 0;
    }

    /// <summary>
    /// Encodes a term's metadata into the terms dictionary. Frq/prx pointers
    /// are delta-encoded against the previous term's state (<see cref="lastState"/>);
    /// when <paramref name="absolute"/> the base is reset to <see cref="emptyState"/>
    /// (all zeros), making the deltas absolute values. The prx pointer is only
    /// written for fields that index positions.
    /// </summary>
    public override void EncodeTerm(long[] empty, DataOutput @out, FieldInfo fieldInfo, BlockTermState state, bool absolute)
    {
        StandardTermState state_ = (StandardTermState)state;
        if (absolute)
        {
            lastState = emptyState;
        }
        @out.WriteVInt64(state_.FreqStart - lastState.FreqStart);
        if (state_.SkipOffset != -1)
        {
            Debug.Assert(state_.SkipOffset > 0);
            @out.WriteVInt64(state_.SkipOffset);
        }
        if (indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0)
        {
            @out.WriteVInt64(state_.ProxStart - lastState.ProxStart);
        }
        lastState = state_;
    }

    /// <summary>
    /// Disposes both outputs; the try/finally guarantees proxOut is disposed
    /// even when disposing freqOut throws.
    /// </summary>
    protected override void Dispose(bool disposing)
    {
        if (disposing)
        {
            try
            {
                freqOut.Dispose();
            }
            finally
            {
                if (proxOut != null)
                {
                    proxOut.Dispose();
                }
            }
        }
    }
}
#pragma warning restore 612, 618
}