forked from apache/lucenenet
-
Notifications
You must be signed in to change notification settings - Fork 1
/
SmartChineseAnalyzer.cs
171 lines (158 loc) · 7.03 KB
/
SmartChineseAnalyzer.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
// lucene version compatibility level: 4.8.1
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.En;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;
using System;
using System.IO;
using System.Text;
namespace Lucene.Net.Analysis.Cn.Smart
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// <para>
/// <see cref="SmartChineseAnalyzer"/> is an analyzer for Chinese or mixed Chinese-English text.
/// The analyzer uses probabilistic knowledge to find the optimal word segmentation for Simplified Chinese text.
/// The text is first broken into sentences, then each sentence is segmented into words.
/// </para>
/// <para>
/// Segmentation is based upon the <a href="http://en.wikipedia.org/wiki/Hidden_Markov_Model">Hidden Markov Model</a>.
/// A large training corpus was used to calculate Chinese word frequency probability.
/// </para>
/// <para>
/// This analyzer requires a dictionary to provide statistical data.
/// <see cref="SmartChineseAnalyzer"/> ships with a dictionary that is used out of the box.
/// </para>
/// <para>
/// The included dictionary data is from <a href="http://www.ictclas.org">ICTCLAS1.0</a>.
/// Thanks to ICTCLAS for their hard work, and for contributing the data under the Apache 2 License!
/// </para>
/// @lucene.experimental
/// </summary>
public sealed class SmartChineseAnalyzer : Analyzer
{
    /// <summary>
    /// Stop words removed from the token stream. Every constructor assigns a
    /// non-<c>null</c> set; an empty set disables stop-word filtering.
    /// </summary>
    private readonly CharArraySet stopWords;

    /// <summary>Name of the default stop-word resource, resolved relative to this type.</summary>
    private const string DEFAULT_STOPWORD_FILE = "stopwords.txt";

    /// <summary>Comment marker passed to the word-list loader when reading the stop-word resource.</summary>
    private const string STOPWORD_FILE_COMMENT = "//";

    /// <summary>
    /// Returns an unmodifiable instance of the default stop-words set.
    /// </summary>
    /// <returns>An unmodifiable instance of the default stop-words set.</returns>
    public static CharArraySet GetDefaultStopSet()
    {
        return DefaultSetHolder.DEFAULT_STOP_SET;
    }

    /// <summary>
    /// Holds the default stop-word set in its own type so that the set is loaded
    /// by this holder's static field initializer rather than by the outer class.
    /// The holder has only static members and is never instantiated.
    /// </summary>
    private static class DefaultSetHolder
    {
        internal static readonly CharArraySet DEFAULT_STOP_SET = LoadDefaultStopSet();

        /// <summary>
        /// Loads the default stop-word set, translating an I/O failure into an
        /// <see cref="InvalidOperationException"/> that keeps the original
        /// <see cref="IOException"/> as its inner exception.
        /// </summary>
        /// <remarks>
        /// This runs from a static field initializer, so a failure reaches callers
        /// wrapped in a <see cref="TypeInitializationException"/>.
        /// </remarks>
        private static CharArraySet LoadDefaultStopSet() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
        {
            try
            {
                return LoadDefaultStopWordSet();
            }
            catch (IOException ex)
            {
                // The default set should always be present because it is part of the
                // distribution, so a failure here means the installation is broken.
                // Throw a specific exception type instead of the base System.Exception.
                throw new InvalidOperationException("Unable to load default stopword set", ex);
            }
        }

        /// <summary>
        /// Reads <see cref="DEFAULT_STOPWORD_FILE"/> as UTF-8, skipping lines marked with
        /// <see cref="STOPWORD_FILE_COMMENT"/>, and wraps the result in an unmodifiable set.
        /// </summary>
        /// <returns>The unmodifiable default stop-word set.</returns>
        internal static CharArraySet LoadDefaultStopWordSet()
        {
            // make sure it is unmodifiable as we expose it in the outer class
            return CharArraySet.UnmodifiableSet(WordlistLoader.GetWordSet(IOUtils
                .GetDecodingReader(typeof(SmartChineseAnalyzer), DEFAULT_STOPWORD_FILE,
                    Encoding.UTF8), STOPWORD_FILE_COMMENT,
#pragma warning disable 612, 618
                LuceneVersion.LUCENE_CURRENT));
#pragma warning restore 612, 618
        }
    }

    /// <summary>
    /// Compatibility version supplied at construction. It selects the tokenizer in
    /// <see cref="CreateComponents(string, TextReader)"/> and is passed on to the stop filter.
    /// </summary>
    private readonly LuceneVersion matchVersion;

    /// <summary>
    /// Create a new <see cref="SmartChineseAnalyzer"/>, using the default stopword list.
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version; see
    /// <see cref="SmartChineseAnalyzer(LuceneVersion, bool)"/>.</param>
    public SmartChineseAnalyzer(LuceneVersion matchVersion)
        : this(matchVersion, true)
    {
    }

    /// <summary>
    /// <para>
    /// Create a new <see cref="SmartChineseAnalyzer"/>, optionally using the default stopword list.
    /// </para>
    /// <para>
    /// The included default stopword list is simply a list of punctuation.
    /// If you do not use this list, punctuation will not be removed from the text!
    /// </para>
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version. Versions on or after
    /// <see cref="LuceneVersion.LUCENE_48"/> use the HMM-based tokenizer; earlier versions
    /// use the sentence tokenizer followed by the word token filter.</param>
    /// <param name="useDefaultStopWords"><c>true</c> to use the default stopword list;
    /// <c>false</c> to use an empty stopword set.</param>
    public SmartChineseAnalyzer(LuceneVersion matchVersion, bool useDefaultStopWords)
    {
        stopWords = useDefaultStopWords ? DefaultSetHolder.DEFAULT_STOP_SET
            : CharArraySet.EMPTY_SET;
        this.matchVersion = matchVersion;
    }

    /// <summary>
    /// <para>
    /// Create a new <see cref="SmartChineseAnalyzer"/>, using the provided <see cref="CharArraySet"/> of stopwords.
    /// </para>
    /// <para>
    /// Note: the set should include punctuation, unless you want to index punctuation!
    /// </para>
    /// </summary>
    /// <param name="matchVersion">Lucene compatibility version; see
    /// <see cref="SmartChineseAnalyzer(LuceneVersion, bool)"/>.</param>
    /// <param name="stopWords"><see cref="CharArraySet"/> of stopwords to use.
    /// <c>null</c> is treated as an empty set.</param>
    public SmartChineseAnalyzer(LuceneVersion matchVersion, CharArraySet stopWords)
    {
        this.stopWords = stopWords ?? CharArraySet.EMPTY_SET;
        this.matchVersion = matchVersion;
    }

    /// <summary>
    /// Builds the analysis chain for a field: a version-dependent tokenizer, followed by
    /// a <see cref="PorterStemFilter"/>, followed by a <see cref="StopFilter"/> when the
    /// configured stop-word set is not empty.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed; not used by this implementation.</param>
    /// <param name="reader">Source of the text to tokenize.</param>
    /// <returns>The tokenizer together with the last stream of the filter chain.</returns>
    protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
    {
        Tokenizer tokenizer;
        TokenStream result;
        if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_48))
        {
            tokenizer = new HMMChineseTokenizer(reader);
            result = tokenizer;
        }
        else
        {
            // Legacy pipeline for older match versions. The obsolete-member warnings
            // (CS0612/CS0618) are suppressed around these two types.
#pragma warning disable 612, 618
            tokenizer = new SentenceTokenizer(reader);
            result = new WordTokenFilter(tokenizer);
#pragma warning restore 612, 618
        }

        // result = new LowerCaseFilter(result);
        // LowerCaseFilter is not needed, as SegTokenFilter lowercases Basic Latin text.
        // The porter stemming is too strict, this is not a bug, this is a feature:)
        result = new PorterStemFilter(result);

        // Only add the stop filter when there is something to remove.
        if (stopWords.Count > 0)
        {
            result = new StopFilter(matchVersion, result, stopWords);
        }

        return new TokenStreamComponents(tokenizer, result);
    }
}
}