forked from apache/lucenenet
-
Notifications
You must be signed in to change notification settings - Fork 3
/
CapitalizationFilterFactory.cs
117 lines (108 loc) · 5.39 KB
/
CapitalizationFilterFactory.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
using Lucene.Net.Support;
using Lucene.Net.Analysis.Util;
using System.Collections.Generic;
using System.Globalization;
namespace Lucene.Net.Analysis.Miscellaneous
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Factory for <see cref="CapitalizationFilter"/>.
/// <para/>
/// The factory takes parameters:<para/>
/// "onlyFirstWord" - should only the first word be capitalized, or all of the words?<para/>
/// "keep" - a keep word list. Each word that should be kept separated by whitespace.<para/>
/// "keepIgnoreCase" - true or false. If true, the keep list will be considered case-insensitive.<para/>
/// "forceFirstLetter" - Force the first letter to be capitalized even if it is in the keep list<para/>
/// "okPrefix" - do not change word capitalization if a word begins with something in this list.
/// for example if "McK" is on the okPrefix list, the word "McKinley" should not be changed to
/// "Mckinley"<para/>
/// "minWordLength" - how long the word needs to be to get capitalization applied. If the
/// minWordLength is 3, "and" > "And" but "or" stays "or"<para/>
/// "maxWordCount" - if the token contains more than maxWordCount words, the capitalization is
/// assumed to be correct.<para/>
///
/// <code>
/// &lt;fieldType name="text_cptlztn" class="solr.TextField" positionIncrementGap="100"&gt;
/// &lt;analyzer&gt;
/// &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
/// &lt;filter class="solr.CapitalizationFilterFactory" onlyFirstWord="true"
/// keep="java solr lucene" keepIgnoreCase="false"
/// okPrefix="McK McD McA"/&gt;
/// &lt;/analyzer&gt;
/// &lt;/fieldType&gt;</code>
///
/// @since solr 1.3
/// </summary>
public class CapitalizationFilterFactory : TokenFilterFactory
{
    // Names of the arguments this factory reads from the args dictionary.
    public const string KEEP = "keep";
    public const string KEEP_IGNORE_CASE = "keepIgnoreCase";
    public const string OK_PREFIX = "okPrefix";
    public const string MIN_WORD_LENGTH = "minWordLength";
    public const string MAX_WORD_COUNT = "maxWordCount";
    public const string MAX_TOKEN_LENGTH = "maxTokenLength";
    public const string ONLY_FIRST_WORD = "onlyFirstWord";
    public const string FORCE_FIRST_LETTER = "forceFirstLetter";
    public const string CULTURE = "culture"; // LUCENENET specific

    // Keep-word set built from the "keep" argument; stays null when GetSet returns null for it.
    internal CharArraySet keep;
    // Prefixes built from the "okPrefix" argument (for example: McK); empty when the argument yields no set.
    internal ICollection<char[]> okPrefix = Collections.EmptyList<char[]>();
    internal readonly int minWordLength; // don't modify capitalization for words shorter than this
    internal readonly int maxWordCount;
    internal readonly int maxTokenLength;
    internal readonly bool onlyFirstWord;
    internal readonly bool forceFirstLetter; // make sure the first letter is capital even if it is in the keep list
    private readonly CultureInfo culture; // LUCENENET specific

    /// <summary>
    /// Creates a new <see cref="CapitalizationFilterFactory"/>, reading its settings from
    /// <paramref name="args"/>.
    /// </summary>
    /// <param name="args">
    /// Argument name/value pairs. The recognized names are the string constants declared on this class.
    /// </param>
    /// <exception cref="System.ArgumentException">
    /// Thrown when <paramref name="args"/> still contains entries after every recognized argument
    /// has been read. The message lists the leftover entries.
    /// </exception>
    public CapitalizationFilterFactory(IDictionary<string, string> args)
        : base(args)
    {
        AssureMatchVersion();

        // The case-sensitivity flag has to be known before the keep set is constructed.
        bool ignoreCase = GetBoolean(args, KEEP_IGNORE_CASE, false);
        ICollection<string> k = GetSet(args, KEEP);
        if (k != null)
        {
            keep = new CharArraySet(m_luceneMatchVersion, 10, ignoreCase);
            keep.UnionWith(k);
        }

        k = GetSet(args, OK_PREFIX);
        if (k != null)
        {
            okPrefix = new List<char[]>();
            foreach (string item in k)
            {
                okPrefix.Add(item.ToCharArray());
            }
        }

        minWordLength = GetInt32(args, MIN_WORD_LENGTH, 0);
        maxWordCount = GetInt32(args, MAX_WORD_COUNT, CapitalizationFilter.DEFAULT_MAX_WORD_COUNT);
        maxTokenLength = GetInt32(args, MAX_TOKEN_LENGTH, CapitalizationFilter.DEFAULT_MAX_TOKEN_LENGTH);
        onlyFirstWord = GetBoolean(args, ONLY_FIRST_WORD, true);
        forceFirstLetter = GetBoolean(args, FORCE_FIRST_LETTER, true);
        culture = GetCulture(args, CULTURE, null);

        // NOTE(review): presumably the Get* helpers remove each entry they consume, so anything
        // still present here is an unrecognized argument - confirm against the base class.
        if (args.Count > 0)
        {
            // Concatenating the dictionary itself would only print its type name, so the
            // remaining entries are rendered explicitly.
            throw new System.ArgumentException("Unknown parameters: " + DescribeArguments(args));
        }
    }

    /// <summary>
    /// Wraps <paramref name="input"/> in a <see cref="CapitalizationFilter"/> configured with
    /// the settings read by the constructor.
    /// </summary>
    /// <param name="input">The token stream to wrap.</param>
    /// <returns>A new <see cref="CapitalizationFilter"/> over <paramref name="input"/>.</returns>
    public override TokenStream Create(TokenStream input)
    {
        return new CapitalizationFilter(input, onlyFirstWord, keep, forceFirstLetter, okPrefix, minWordLength, maxWordCount, maxTokenLength, culture);
    }

    /// <summary>
    /// Renders the entries of <paramref name="args"/> as <c>{key=value, key=value}</c> for use
    /// in diagnostic messages.
    /// </summary>
    private static string DescribeArguments(IDictionary<string, string> args)
    {
        List<string> entries = new List<string>(args.Count);
        foreach (KeyValuePair<string, string> entry in args)
        {
            entries.Add(entry.Key + "=" + entry.Value);
        }
        return "{" + string.Join(", ", entries.ToArray()) + "}";
    }
}
}