src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/CapitalizationFilter.cs

﻿using System;
using System.Collections.Generic;
using System.Globalization;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.Util;

namespace Lucene.Net.Analysis.Miscellaneous
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// A filter to apply normal capitalization rules to Tokens.  It will make the first letter
    /// capital and the rest lower case.
    /// <para/>
    /// Within a token, words are delimited by a period or by any character less than or
    /// equal to a space (<c>' '</c>).
    /// <para/>
    /// This filter is particularly useful to build nice looking facet parameters.  This filter
    /// is not appropriate if you intend to use a prefix query.
    /// </summary>
    public sealed class CapitalizationFilter : TokenFilter
    {
        public static readonly int DEFAULT_MAX_WORD_COUNT = int.MaxValue;
        public static readonly int DEFAULT_MAX_TOKEN_LENGTH = int.MaxValue;

        private readonly bool onlyFirstWord;
        private readonly CharArraySet keep;
        private readonly bool forceFirstLetter;
        private readonly ICollection<char[]> okPrefix;

        private readonly int minWordLength;
        private readonly int maxWordCount;
        private readonly int maxTokenLength;

        private readonly ICharTermAttribute termAtt;

        // LUCENENET specific for specifying culture instead of using
        // invariant culture (which makes this class more generally useful).
        // Per MSDN, InvariantCulture shouldn't be used for cases such as this:
        // https://msdn.microsoft.com/en-us/library/dd465121(v=vs.110).aspx
        // May be null, in which case the current thread's culture is used (see Culture).
        private readonly CultureInfo culture;

        /// <summary>
        /// Creates a <see cref="CapitalizationFilter"/> with the default parameters using the culture from the current thread.
        /// <para>
        /// Calls <see cref="CapitalizationFilter.CapitalizationFilter(TokenStream, bool, CharArraySet, bool, ICollection{char[]}, int, int, int, CultureInfo)">
        ///     CapitalizationFilter(in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH, null)
        /// </see>
        /// </para>
        /// </summary>
        /// <param name="in"> input tokenstream </param>
        public CapitalizationFilter(TokenStream @in)
            : this(@in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH, null)
        {
        }

        /// <summary>
        /// Creates a <see cref="CapitalizationFilter"/> with the default parameters and the specified <paramref name="culture"/>.
        /// <para>
        /// Calls <see cref="CapitalizationFilter.CapitalizationFilter(TokenStream, bool, CharArraySet, bool, ICollection{char[]}, int, int, int, CultureInfo)">
        ///     CapitalizationFilter(in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH, culture)
        /// </see>
        /// </para>
        /// </summary>
        /// <param name="in"> input tokenstream </param>
        /// <param name="culture"> The culture to use for the casing operation. If null, the culture of the current thread will be used. </param>
        // LUCENENET specific overload for specifying culture instead of using
        // invariant culture (which makes this class more generally useful).
        public CapitalizationFilter(TokenStream @in, CultureInfo culture)
            : this(@in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH, culture)
        {
        }

        /// <summary>
        /// Creates a <see cref="CapitalizationFilter"/> with the specified parameters using the culture from the current thread.</summary>
        /// <param name="in"> input tokenstream </param>
        /// <param name="onlyFirstWord"> if <c>true</c>, only the first word of a token is capitalized and every
        ///                      following word is lower cased; if <c>false</c>, each word is capitalized. </param>
        /// <param name="keep"> a set of words that are left as they are (may be <c>null</c>). </param>
        /// <param name="forceFirstLetter"> Force the first letter to be capitalized even if it is in the keep list. </param>
        /// <param name="okPrefix"> do not change word capitalization if a word begins with something in this list
        ///                      (may be <c>null</c>). The comparison is case sensitive. </param>
        /// <param name="minWordLength"> how long the word needs to be to get capitalization applied.  If the
        ///                      minWordLength is 3, "and" becomes "And" but "or" stays "or". </param>
        /// <param name="maxWordCount"> if the token contains more than maxWordCount words, the capitalization is
        ///                     assumed to be correct and the token is left as it was. </param>
        /// <param name="maxTokenLength"> The length limit for an individual token. Only tokens that are shorter than
        ///                     this value have the capitalization operation performed. </param>
        /// <exception cref="ArgumentOutOfRangeException"> <paramref name="minWordLength"/> is negative, or
        ///                     <paramref name="maxWordCount"/> or <paramref name="maxTokenLength"/> is less than one. </exception>
        public CapitalizationFilter(TokenStream @in, bool onlyFirstWord, CharArraySet keep, bool forceFirstLetter, ICollection<char[]> okPrefix, int minWordLength, int maxWordCount, int maxTokenLength)
            : this(@in, onlyFirstWord, keep, forceFirstLetter, okPrefix, minWordLength, maxWordCount, maxTokenLength, null)
        {
        }

        /// <summary>
        /// Creates a <see cref="CapitalizationFilter"/> with the specified parameters and the specified <paramref name="culture"/>. </summary>
        /// <param name="in"> input tokenstream </param>
        /// <param name="onlyFirstWord"> if <c>true</c>, only the first word of a token is capitalized and every
        ///                      following word is lower cased; if <c>false</c>, each word is capitalized. </param>
        /// <param name="keep"> a set of words that are left as they are (may be <c>null</c>). </param>
        /// <param name="forceFirstLetter"> Force the first letter to be capitalized even if it is in the keep list. </param>
        /// <param name="okPrefix"> do not change word capitalization if a word begins with something in this list
        ///                      (may be <c>null</c>). The comparison is case sensitive. </param>
        /// <param name="minWordLength"> how long the word needs to be to get capitalization applied.  If the
        ///                      minWordLength is 3, "and" becomes "And" but "or" stays "or". </param>
        /// <param name="maxWordCount"> if the token contains more than maxWordCount words, the capitalization is
        ///                     assumed to be correct and the token is left as it was. </param>
        /// <param name="maxTokenLength"> The length limit for an individual token. Only tokens that are shorter than
        ///                     this value have the capitalization operation performed. </param>
        /// <param name="culture"> The culture to use for the casing operation. If null, the culture of the current thread will be used. </param>
        /// <exception cref="ArgumentOutOfRangeException"> <paramref name="minWordLength"/> is negative, or
        ///                     <paramref name="maxWordCount"/> or <paramref name="maxTokenLength"/> is less than one. </exception>
        // LUCENENET specific overload for specifying culture instead of using
        // invariant culture (which makes this class more generally useful).
        public CapitalizationFilter(TokenStream @in, bool onlyFirstWord, CharArraySet keep, bool forceFirstLetter, ICollection<char[]> okPrefix, int minWordLength, int maxWordCount, int maxTokenLength, CultureInfo culture)
            : base(@in)
        {
            // LUCENENET NOTE: The guard clauses were copied here from a later version of Lucene.
            // Apparently, the tests were not ported from 4.8.0 because they expected this and the
            // original tests did not. Adding them anyway because there is no downside to this.
            //
            // The (paramName, message) overload is used on purpose: the single-string overload of
            // ArgumentOutOfRangeException interprets its argument as the parameter name, not the message.
            if (minWordLength < 0)
            {
                throw new ArgumentOutOfRangeException("minWordLength", "minWordLength must be greater than or equal to zero");
            }
            if (maxWordCount < 1)
            {
                throw new ArgumentOutOfRangeException("maxWordCount", "maxWordCount must be greater than zero");
            }
            if (maxTokenLength < 1)
            {
                throw new ArgumentOutOfRangeException("maxTokenLength", "maxTokenLength must be greater than zero");
            }

            this.onlyFirstWord = onlyFirstWord;
            this.keep = keep;
            this.forceFirstLetter = forceFirstLetter;
            this.okPrefix = okPrefix;
            this.minWordLength = minWordLength;
            this.maxWordCount = maxWordCount;
            this.maxTokenLength = maxTokenLength;
            this.culture = culture;
            termAtt = AddAttribute<ICharTermAttribute>();
        }

        /// <summary>
        /// Advances the input stream to the next token and applies the capitalization rules
        /// to its term buffer in place.
        /// </summary>
        /// <returns> <c>false</c> when the input stream has no more tokens; otherwise <c>true</c>. </returns>
        public override bool IncrementToken()
        {
            if (!m_input.IncrementToken())
            {
                return false;
            }

            char[] termBuffer = termAtt.Buffer;
            int termBufferLength = termAtt.Length;
            char[] backup = null;

            // A backup is only needed when a word limit is actually configured; with the default
            // (int.MaxValue) the word count can never exceed the limit, so the restore below is unreachable.
            if (maxWordCount < DEFAULT_MAX_WORD_COUNT)
            {
                //make a backup in case we exceed the word count
                backup = new char[termBufferLength];
                Array.Copy(termBuffer, 0, backup, 0, termBufferLength);
            }

            // Tokens whose length is maxTokenLength or more are passed through untouched.
            if (termBufferLength < maxTokenLength)
            {
                int wordCount = 0;

                int lastWordStart = 0;
                for (int i = 0; i < termBufferLength; i++)
                {
                    char c = termBuffer[i];
                    if (c <= ' ' || c == '.')
                    {
                        int len = i - lastWordStart;
                        if (len > 0)
                        {
                            ProcessWord(termBuffer, lastWordStart, len, wordCount++);
                            lastWordStart = i + 1;
                            // Together with the loop increment this steps over the character that
                            // follows the delimiter, so that character is never tested as a delimiter itself.
                            i++;
                        }
                    }
                }

                // process the last word
                if (lastWordStart < termBufferLength)
                {
                    ProcessWord(termBuffer, lastWordStart, termBufferLength - lastWordStart, wordCount++);
                }

                // Too many words: undo the in-place edits by restoring the original characters.
                if (wordCount > maxWordCount)
                {
                    termAtt.CopyBuffer(backup, 0, termBufferLength);
                }
            }

            return true;
        }

        /// <summary>
        /// Applies the capitalization rules, in place, to the word stored in
        /// <paramref name="buffer"/> at [<paramref name="offset"/>, <paramref name="offset"/> + <paramref name="length"/>).
        /// </summary>
        /// <param name="buffer"> the term buffer that contains the word </param>
        /// <param name="offset"> index of the first character of the word </param>
        /// <param name="length"> number of characters in the word </param>
        /// <param name="wordCount"> zero-based position of the word within the token </param>
        private void ProcessWord(char[] buffer, int offset, int length, int wordCount)
        {
            if (length < 1)
            {
                return;
            }

            // Resolve the casing rules once per word instead of once per character.
            TextInfo textInfo = Culture.TextInfo;

            // When only the first word may be capitalized, every later word is lower cased.
            // This check runs before the keep list, so the keep list does not protect those words.
            if (onlyFirstWord && wordCount > 0)
            {
                for (int i = 0; i < length; i++)
                {
                    buffer[offset + i] = textInfo.ToLower(buffer[offset + i]);
                }
                return;
            }

            // Kept words stay as they are, except that the first letter of the first word
            // is still upper cased when forceFirstLetter is set.
            if (keep != null && keep.Contains(buffer, offset, length))
            {
                if (wordCount == 0 && forceFirstLetter)
                {
                    buffer[offset] = textInfo.ToUpper(buffer[offset]);
                }
                return;
            }

            if (length < minWordLength)
            {
                return;
            }

            // Words that start with one of the accepted prefixes (exact, case sensitive match) are left alone.
            if (okPrefix != null)
            {
                foreach (char[] prefix in okPrefix)
                {
                    if (length >= prefix.Length) //don't bother checking if the buffer length is less than the prefix
                    {
                        bool match = true;
                        for (int i = 0; i < prefix.Length; i++)
                        {
                            if (prefix[i] != buffer[offset + i])
                            {
                                match = false;
                                break;
                            }
                        }
                        if (match)
                        {
                            return;
                        }
                    }
                }
            }

            // We know it has at least one character: upper case the first, lower case the rest.
            buffer[offset] = textInfo.ToUpper(buffer[offset]);

            for (int i = 1; i < length; i++)
            {
                buffer[offset + i] = textInfo.ToLower(buffer[offset + i]);
            }
        }

        // LUCENENET specific - we get the culture from the current thread if it wasn't
        // provided in the constructor. This allows us to change the culture on the current
        // thread and work like other .NET components. But culture can be overridden by
        // passing it to the constructor.
        private CultureInfo Culture
        {
            get
            {
                return culture ?? CultureInfo.CurrentCulture;
            }
        }
    }
}