src/Lucene.Net.Analysis.Phonetic/Language/Bm/PhoneticEngine.cs

﻿// commons-codec version compatibility level: 1.9
using Lucene.Net.Support;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace Lucene.Net.Analysis.Phonetic.Language.Bm
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Converts words into potential phonetic representations.
    /// </summary>
    /// <remarks>
    /// This is a two-stage process. Firstly, the word is converted into a phonetic representation that takes
    /// into account the likely source language. Next, this phonetic representation is converted into a
    /// pan-European 'average' representation, allowing comparison between different versions of essentially
    /// the same word from different languages.
    /// <para/>
    /// This class is intentionally immutable and thread-safe.
    /// If you wish to alter the settings for a PhoneticEngine, you
    /// must make a new one with the updated settings.
    /// <para/>
    /// Ported from phoneticengine.php
    /// <para/>
    /// since 1.6
    /// </remarks>
    public class PhoneticEngine
    {
        /// <summary>
        /// Matches one or more consecutive whitespace characters; used to split the tidied input into words.
        /// Shared by all instances and never reassigned, so the compiled pattern is built only once.
        /// </summary>
        internal static readonly Regex WHITESPACE = new Regex("\\s+", RegexOptions.Compiled);

        /// <summary>
        /// Utility for manipulating a set of phonemes as they are being built up. Not intended for use outside
        /// this package, and probably not outside the <see cref="PhoneticEngine"/> class.
        /// <para/>
        /// Instances are mutable: the <c>Append</c> overloads and <see cref="Apply(IPhonemeExpr, int)"/>
        /// modify the phoneme list held by this builder in place.
        /// <para/>
        /// since 1.6
        /// </summary>
        internal sealed class PhonemeBuilder
        {
            /// <summary>
            /// An empty builder where all phonemes must come from some set of languages. This will contain a single
            /// phoneme of zero characters. This can then be appended to. This should be the only way to create a new
            /// phoneme from scratch.
            /// </summary>
            /// <param name="languages">The set of languages.</param>
            /// <returns>A new, empty phoneme builder.</returns>
            public static PhonemeBuilder Empty(LanguageSet languages)
            {
                return new PhonemeBuilder(new Phoneme("", languages));
            }

            // Insertion-ordered list of the phonemes built so far.
            private readonly IList<Phoneme> phonemes;

            /// <summary>
            /// Creates a builder holding exactly one phoneme.
            /// </summary>
            /// <param name="phoneme">The initial phoneme.</param>
            private PhonemeBuilder(Phoneme phoneme)
            {
                // LUCENENET NOTE: LinkedHashSet cares about insertion order - in .NET, we can just use List<T> for that
                this.phonemes = new List<Phoneme>();
                this.phonemes.Add(phoneme);
            }

            /// <summary>
            /// Creates a builder around an existing list. The list is stored by reference (not copied) and is
            /// later cleared and refilled by <see cref="Apply(IPhonemeExpr, int)"/>, so it must be mutable.
            /// </summary>
            /// <param name="phonemes">The list of phonemes to wrap.</param>
            internal PhonemeBuilder(IList<Phoneme> phonemes)
            {
                this.phonemes = phonemes;
            }

            /// <summary>
            /// Appends <paramref name="str"/> to every phoneme currently held by this builder.
            /// The builder itself is modified; no new builder is created.
            /// </summary>
            /// <param name="str">The characters to append to the phonemes.</param>
            public void Append(ICharSequence str)
            {
                foreach (Phoneme ph in this.phonemes)
                {
                    ph.Append(str.ToString());
                }
            }

            /// <summary>
            /// Appends <paramref name="str"/> to every phoneme currently held by this builder.
            /// The builder itself is modified; no new builder is created.
            /// </summary>
            /// <param name="str">The characters to append to the phonemes.</param>
            // LUCENENET specific
            public void Append(string str)
            {
                foreach (Phoneme ph in this.phonemes)
                {
                    ph.Append(str);
                }
            }

            /// <summary>
            /// Appends the current content of <paramref name="str"/> to every phoneme currently held by this builder.
            /// The builder itself is modified; no new builder is created.
            /// </summary>
            /// <param name="str">The characters to append to the phonemes.</param>
            // LUCENENET specific
            public void Append(StringBuilder str)
            {
                foreach (Phoneme ph in this.phonemes)
                {
                    ph.Append(str.ToString());
                }
            }

            /// <summary>
            /// Applies the given phoneme expression to all phonemes in this phoneme builder.
            /// <para/>
            /// Every phoneme of this builder is combined with every phoneme of the expression. Combinations whose
            /// restricted language set is empty are dropped; the others replace the current content of this builder.
            /// Candidate generation stops as soon as <paramref name="maxPhonemes"/> candidates have been collected.
            /// </summary>
            /// <param name="phonemeExpr">The expression to apply.</param>
            /// <param name="maxPhonemes">The maximum number of phonemes to build up.</param>
            public void Apply(IPhonemeExpr phonemeExpr, int maxPhonemes)
            {
                // LUCENENET NOTE: LinkedHashSet cares about insertion order - in .NET, we can just use List<T> for that
                IList<Phoneme> newPhonemes = new List<Phoneme>(maxPhonemes);

                //EXPR_continue:
                foreach (Phoneme left in this.phonemes)
                {
                    foreach (Phoneme right in phonemeExpr.Phonemes)
                    {
                        LanguageSet languages = left.Languages.RestrictTo(right.Languages);
                        if (!languages.IsEmpty)
                        {
                            Phoneme join = new Phoneme(left, right, languages);
                            if (newPhonemes.Count < maxPhonemes)
                            {
                                newPhonemes.Add(join);
                                if (newPhonemes.Count >= maxPhonemes)
                                {
                                    // Limit reached: leave both loops at once.
                                    goto EXPR_break;
                                }
                            }
                        }
                    }
                }
                EXPR_break: { }

                this.phonemes.Clear();
                // LUCENENET: We need to filter out any duplicates, since we converted from LinkedHashSet
                // to List. The check is done explicitly, one candidate at a time, so that it does not
                // depend on a deferred LINQ query observing the list while it is being filled.
                foreach (Phoneme candidate in newPhonemes)
                {
                    if (!this.phonemes.Any(existing => existing.Equals(candidate)))
                    {
                        this.phonemes.Add(candidate);
                    }
                }
            }

            /// <summary>
            /// Gets underlying phoneme set. Please don't mutate.
            /// </summary>
            public IList<Phoneme> Phonemes
            {
                get { return this.phonemes; }
            }

            /// <summary>
            /// Stringifies the phoneme set. This produces a single string of the strings of each phoneme,
            /// joined with a pipe. This is explicitly provided in place of <see cref="object.ToString()"/> as it is a potentially
            /// expensive operation, which should be avoided when debugging.
            /// </summary>
            /// <returns>The stringified phoneme set.</returns>
            public string MakeString()
            {
                StringBuilder sb = new StringBuilder();

                foreach (Phoneme ph in this.phonemes)
                {
                    // Separator goes before every phoneme except the first non-empty output.
                    if (sb.Length > 0)
                    {
                        sb.Append("|");
                    }
                    sb.Append(ph.GetPhonemeText());
                }

                return sb.ToString();
            }
        }

        /// <summary>
        /// A function closure capturing the application of a list of rules to an input sequence at a particular offset.
        /// After invocation, the values <c>i</c> and <c>found</c> are updated. <c>i</c> points to the
        /// index of the next char in <c>input</c> that must be processed next (the input up to that index having been
        /// processed already), and <c>found</c> indicates if a matching rule was found or not. In the case where a
        /// matching rule was found, the rule's phoneme expression has been applied to <c>phonemeBuilder</c>
        /// (the builder is modified in place; <see cref="PhonemeBuilder"/> returns the same instance that was passed in).
        /// <para/>
        /// Although this class is not thread-safe (it has mutable unprotected fields), it is not shared between threads
        /// as it is constructed as needed by the calling methods.
        /// <para/>
        /// since 1.6
        /// </summary>
        private sealed class RulesApplication
        {
            private readonly IDictionary<string, IList<Rule>> finalRules;
            private readonly string input;

            private readonly PhonemeBuilder phonemeBuilder;
            private int i;
            private readonly int maxPhonemes;
            private bool found;

            /// <summary>
            /// Captures everything needed to apply <paramref name="finalRules"/> to <paramref name="input"/>
            /// starting at offset <paramref name="i"/>.
            /// </summary>
            /// <param name="finalRules">Rules keyed by a one-character string; must not be <c>null</c>.</param>
            /// <param name="input">The text the rules are matched against.</param>
            /// <param name="phonemeBuilder">The builder that receives the phonemes of a matching rule.</param>
            /// <param name="i">The offset in <paramref name="input"/> at which matching starts.</param>
            /// <param name="maxPhonemes">The limit passed on to <see cref="PhoneticEngine.PhonemeBuilder.Apply(IPhonemeExpr, int)"/>.</param>
            /// <exception cref="ArgumentNullException"><paramref name="finalRules"/> is <c>null</c>.</exception>
            public RulesApplication(IDictionary<string, IList<Rule>> finalRules, string input,
                                    PhonemeBuilder phonemeBuilder, int i, int maxPhonemes)
            {
                if (finalRules == null)
                {
                    // First argument is the parameter name, second the message.
                    throw new ArgumentNullException("finalRules", "The finalRules argument must not be null");
                }
                this.finalRules = finalRules;
                this.phonemeBuilder = phonemeBuilder;
                this.input = input;
                this.i = i;
                this.maxPhonemes = maxPhonemes;
            }

            /// <summary>
            /// Gets the index of the next character of the input that has not been processed yet.
            /// </summary>
            public int I
            {
                get { return this.i; }
            }

            /// <summary>
            /// Gets the phoneme builder this application operates on.
            /// </summary>
            public PhonemeBuilder PhonemeBuilder
            {
                get { return this.phonemeBuilder; }
            }

            /// <summary>
            /// Invokes the rules. Looks up the rule list registered for the single character at the current offset
            /// and loops over it, stopping at the first rule that has a matching context and pattern. That rule is
            /// then applied to the phoneme builder and <c>i</c> is advanced by the length of the rule's pattern.
            /// If there was no match, <c>i</c> is advanced by one and nothing is added to the phoneme builder.
            /// </summary>
            /// <returns><c>this</c></returns>
            public RulesApplication Invoke()
            {
                this.found = false;
                // Advance by a single character unless a rule matches.
                int patternLength = 1;
                IList<Rule> rules;
                if (this.finalRules.TryGetValue(this.input.Substring(this.i, 1), out rules) && rules != null)
                {
                    foreach (Rule rule in rules)
                    {
                        if (rule.PatternAndContextMatches(this.input, this.i))
                        {
                            this.phonemeBuilder.Apply(rule.Phoneme, this.maxPhonemes);
                            patternLength = rule.Pattern.Length;
                            this.found = true;
                            break;
                        }
                    }
                }

                this.i += patternLength;
                return this;
            }

            /// <summary>
            /// Gets whether the last call to <see cref="Invoke()"/> found a matching rule.
            /// </summary>
            public bool IsFound
            {
                get { return this.found; }
            }
        }

        /// <summary>
        /// Name prefixes recognised for each <see cref="Bm.NameType"/>, each wrapped as an unmodifiable set.
        /// </summary>
        private static readonly IDictionary<NameType, ISet<string>> NAME_PREFIXES = LoadNamePrefixes();

        /// <summary>
        /// Builds the table behind <see cref="NAME_PREFIXES"/>.
        /// </summary>
        /// <returns>A dictionary with one prefix set for ASHKENAZI, SEPHARDIC and GENERIC names.</returns>
        private static IDictionary<NameType, ISet<string>> LoadNamePrefixes() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
        {
            ISet<string> ashkenazi = Collections.UnmodifiableSet(
                new HashSet<string>() { "bar", "ben", "da", "de", "van", "von" });

            ISet<string> sephardic = Collections.UnmodifiableSet(
                new HashSet<string>()
                {
                    "al", "el", "da", "dal", "de", "del", "dela", "de la",
                    "della", "des", "di", "do", "dos", "du", "van", "von"
                });

            ISet<string> generic = Collections.UnmodifiableSet(
                new HashSet<string>()
                {
                    "da", "dal", "de", "del", "dela", "de la", "della",
                    "des", "di", "do", "dos", "du", "van", "von"
                });

            return new Dictionary<NameType, ISet<string>>
            {
                { NameType.ASHKENAZI, ashkenazi },
                { NameType.SEPHARDIC, sephardic },
                { NameType.GENERIC, generic }
            };
        }

        /// <summary>
        /// Joins some strings with an internal separator.
        /// </summary>
        /// <param name="strings">Strings to join.</param>
        /// <param name="sep">String to separate them with.</param>
        /// <returns>
        /// A single string consisting of each element of <paramref name="strings"/> interleaved by
        /// <paramref name="sep"/>; the empty string when <paramref name="strings"/> has no elements.
        /// </returns>
        private static string Join(IEnumerable<string> strings, string sep)
        {
            // The framework routine yields the same result as the former hand-written enumerator loop
            // (null elements and a null separator contribute nothing to the output).
            return string.Join(sep, strings);
        }

        // Value of maxPhonemes used by the three-argument constructor.
        private static readonly int DEFAULT_MAX_PHONEMES = 20;

        // Language guesser obtained from Lang.GetInstance(nameType) in the constructor.
        private readonly Lang lang;

        // The type of names this engine was configured for.
        private readonly NameType nameType;

        // The type of final rules this engine applies; the constructor rejects RuleType.RULES.
        private readonly RuleType ruleType;

        // When true, Encode joins the words of a multi-word name with a space and encodes them as one input.
        private readonly bool concat;

        // Limit handed to every RulesApplication created by this engine.
        private readonly int maxPhonemes;

        /// <summary>
        /// Creates a phonetic engine that uses the default phoneme limit.
        /// </summary>
        /// <param name="nameType">The type of names it will use.</param>
        /// <param name="ruleType">The type of rules it will apply.</param>
        /// <param name="concat">If it will concatenate multiple encodings.</param>
        public PhoneticEngine(NameType nameType, RuleType ruleType, bool concat)
            : this(nameType, ruleType, concat, DEFAULT_MAX_PHONEMES)
        {
        }

        /// <summary>
        /// Creates a phonetic engine with an explicit phoneme limit.
        /// <para/>
        /// since 1.7
        /// </summary>
        /// <param name="nameType">The type of names it will use.</param>
        /// <param name="ruleType">The type of rules it will apply.</param>
        /// <param name="concat">If it will concatenate multiple encodings.</param>
        /// <param name="maxPhonemes">The maximum number of phonemes that will be handled.</param>
        /// <exception cref="ArgumentException"><paramref name="ruleType"/> is <see cref="Bm.RuleType.RULES"/>.</exception>
        public PhoneticEngine(NameType nameType, RuleType ruleType, bool concat,
                              int maxPhonemes)
        {
            // RULES selects the first-stage rule set and is not a valid choice for the final rules.
            bool isFirstStageRuleType = ruleType == RuleType.RULES;
            if (isFirstStageRuleType)
            {
                string message = "ruleType must not be " + RuleType.RULES;
                throw new ArgumentException(message);
            }

            this.maxPhonemes = maxPhonemes;
            this.concat = concat;
            this.ruleType = ruleType;
            this.nameType = nameType;
            this.lang = Lang.GetInstance(nameType);
        }

        /// <summary>
        /// Applies the final rules to convert from a language-specific phonetic representation to a
        /// language-independent representation.
        /// </summary>
        /// <param name="phonemeBuilder">The current phonemes.</param>
        /// <param name="finalRules">The final rules to apply.</param>
        /// <returns>
        /// <paramref name="phonemeBuilder"/> itself when <paramref name="finalRules"/> is empty; otherwise a new
        /// builder holding the rewritten phonemes, ordered by <c>Phoneme.COMPARER</c>.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="finalRules"/> is <c>null</c>.</exception>
        private PhonemeBuilder ApplyFinalRules(PhonemeBuilder phonemeBuilder,
                                               IDictionary<string, IList<Rule>> finalRules)
        {
            if (finalRules == null)
            {
                // First argument is the parameter name, second the message.
                throw new ArgumentNullException("finalRules", "finalRules can not be null");
            }
            if (finalRules.Count == 0)
            {
                return phonemeBuilder;
            }

            // Sorted set: results from all input phonemes are merged and ordered by the comparer.
            ISet<Phoneme> phonemes = new SortedSet<Phoneme>(Phoneme.COMPARER);

            foreach (Phoneme phoneme in phonemeBuilder.Phonemes)
            {
                PhonemeBuilder subBuilder = PhonemeBuilder.Empty(phoneme.Languages);
                string phonemeText = phoneme.GetPhonemeText();

                // The increment is handled manually: each rules application reports where to continue.
                for (int i = 0; i < phonemeText.Length;)
                {
                    RulesApplication rulesApplication =
                            new RulesApplication(finalRules, phonemeText, subBuilder, i, maxPhonemes).Invoke();
                    bool found = rulesApplication.IsFound;
                    subBuilder = rulesApplication.PhonemeBuilder;

                    if (!found)
                    {
                        // not found, appending as-is
                        subBuilder.Append(phonemeText.Substring(i, 1));
                    }

                    i = rulesApplication.I;
                }

                phonemes.UnionWith(subBuilder.Phonemes);
            }

            return new PhonemeBuilder(phonemes.ToList());
        }

        /// <summary>
        /// Encodes a string to its phonetic representation, using the languages guessed for
        /// <paramref name="input"/> by this engine's <see cref="Lang"/>.
        /// </summary>
        /// <param name="input">The string to encode.</param>
        /// <returns>The encoding of the input.</returns>
        public virtual string Encode(string input)
        {
            return Encode(input, this.lang.GuessLanguages(input));
        }

        /// <summary>
        /// Encodes an input string into an output phonetic representation, given a set of possible origin languages.
        /// </summary>
        /// <remarks>
        /// The input is lower-cased, dashes are replaced by spaces and the result is trimmed. For GENERIC names a
        /// leading <c>d'</c> or a leading name prefix followed by a space yields two encodings, one without the
        /// prefix and one with the prefix fused to the rest. For SEPHARDIC and ASHKENAZI names the known prefixes
        /// are removed from the word list. When no word is left after prefix removal, the tidied input is encoded
        /// as a whole instead of failing.
        /// </remarks>
        /// <param name="input">String to phoneticise; a string with dashes or spaces separating each word.</param>
        /// <param name="languageSet">The set of possible origin languages used to select the rules.</param>
        /// <returns>A phonetic representation of the input; a string containing '-'-separated phonetic representations of the input.</returns>
        /// <exception cref="InvalidOperationException">The engine's name type is not one of the handled values.</exception>
        public virtual string Encode(string input, LanguageSet languageSet)
        {
            IDictionary<string, IList<Rule>> rules = Rule.GetInstanceMap(this.nameType, RuleType.RULES, languageSet);
            // rules common across many (all) languages
            IDictionary<string, IList<Rule>> finalRules1 = Rule.GetInstanceMap(this.nameType, this.ruleType, "common");
            // rules that apply to a specific language that may be ambiguous or wrong if applied to other languages
            IDictionary<string, IList<Rule>> finalRules2 = Rule.GetInstanceMap(this.nameType, this.ruleType, languageSet);

            // tidy the input
            // lower-casing is done culture-invariantly so the result does not depend on the current culture
            input = input.ToLowerInvariant().Replace('-', ' ').Trim();

            if (this.nameType == NameType.GENERIC)
            {
                if (input.StartsWith("d'", StringComparison.Ordinal))
                { // check for d'
                    string remainder = input.Substring(2);
                    string combined = "d" + remainder;
                    return "(" + Encode(remainder) + ")-(" + Encode(combined) + ")";
                }
                foreach (string l in NAME_PREFIXES[this.nameType])
                {
                    // handle generic prefixes
                    if (input.StartsWith(l + " ", StringComparison.Ordinal))
                    {
                        // check for any prefix in the words list
                        string remainder = input.Substring(l.Length + 1); // input without the prefix
                        string combined = l + remainder; // input with prefix without space
                        return "(" + Encode(remainder) + ")-(" + Encode(combined) + ")";
                    }
                }
            }

            IList<string> words = WHITESPACE.Split(input).TrimEnd().ToList();
            IList<string> words2 = new List<string>();

            // special-case handling of word prefixes based upon the name type
            switch (this.nameType)
            {
                case NameType.SEPHARDIC:
                    foreach (string aWord in words)
                    {
                        // keep only what follows the last apostrophe
                        string[] parts = aWord.Split('\'').TrimEnd();
                        if (parts.Length > 0)
                        {
                            // guard: a word made up solely of apostrophes may leave no part to keep
                            words2.Add(parts[parts.Length - 1]);
                        }
                    }
                    words2.RemoveAll(NAME_PREFIXES[this.nameType]);
                    break;
                case NameType.ASHKENAZI:
                    words2.AddRange(words);
                    words2.RemoveAll(NAME_PREFIXES[this.nameType]);
                    break;
                case NameType.GENERIC:
                    words2.AddRange(words);
                    break;
                default:
                    throw new InvalidOperationException("Unreachable case: " + this.nameType);
            }

            if (this.concat)
            {
                // concat mode enabled
                input = Join(words2, " ");
            }
            else if (words2.Count == 1)
            {
                // not a multi-word name
                // NOTE(review): this reads from 'words' (before prefix/apostrophe handling), not 'words2';
                // kept as-is because the commented-out original line below does the same - confirm intent.
                //input = words.iterator().next();
                input = words.FirstOrDefault();
            }
            else if (words2.Count > 0)
            {
                // encode each word in a multi-word name separately (normally used for approx matches)
                StringBuilder result = new StringBuilder();
                foreach (string word in words2)
                {
                    result.Append("-").Append(Encode(word));
                }
                // return the result without the leading "-"
                return result.ToString(1, result.Length - 1);
            }
            // else: no word survived prefix removal (e.g. the input was only a prefix, or empty).
            // Stripping a leading "-" from an empty result would throw, so encode the tidied input as-is.

            PhonemeBuilder phonemeBuilder = PhonemeBuilder.Empty(languageSet);

            // loop over each char in the input - we will handle the increment manually
            for (int i = 0; i < input.Length;)
            {
                RulesApplication rulesApplication =
                        new RulesApplication(rules, input, phonemeBuilder, i, maxPhonemes).Invoke();
                i = rulesApplication.I;
                phonemeBuilder = rulesApplication.PhonemeBuilder;
            }

            // Apply the general rules
            phonemeBuilder = ApplyFinalRules(phonemeBuilder, finalRules1);
            // Apply the language-specific rules
            phonemeBuilder = ApplyFinalRules(phonemeBuilder, finalRules2);

            return phonemeBuilder.MakeString();
        }

        /// <summary>
        /// Gets the <see cref="Bm.Lang"/> language guessing rules this engine uses.
        /// </summary>
        public virtual Lang Lang { get { return lang; } }

        /// <summary>
        /// Gets the <see cref="Bm.NameType"/> this engine was created with.
        /// </summary>
        public virtual NameType NameType { get { return nameType; } }

        /// <summary>
        /// Gets the <see cref="Bm.RuleType"/> this engine was created with.
        /// </summary>
        public virtual RuleType RuleType { get { return ruleType; } }

        /// <summary>
        /// Gets whether multiple phonetic encodings are concatenated or just the first one is kept.
        /// <c>true</c> if multiple phonetic encodings are returned, <c>false</c> if just the first is.
        /// </summary>
        public virtual bool IsConcat { get { return concat; } }

        /// <summary>
        /// Gets the upper bound on the number of phonemes the engine will calculate for a given input.
        /// <para/>
        /// since 1.7
        /// </summary>
        public virtual int MaxPhonemes { get { return maxPhonemes; } }
    }
}