forked from apache/lucenenet
/
DefaultSimilarity.cs
195 lines (178 loc) · 7.38 KB
/
DefaultSimilarity.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
using System;
namespace Lucene.Net.Search.Similarities
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using BytesRef = Lucene.Net.Util.BytesRef;
using FieldInvertState = Lucene.Net.Index.FieldInvertState;
using SmallSingle = Lucene.Net.Util.SmallSingle;
/// <summary>
/// Expert: Default scoring implementation which encodes (<see cref="EncodeNormValue(float)"/>)
/// norm values as a single byte before being stored. At search time,
/// the norm byte value is read from the index
/// <see cref="Lucene.Net.Store.Directory"/> and
/// decoded (<see cref="DecodeNormValue(long)"/>) back to a float <i>norm</i> value.
/// This encoding/decoding, while reducing index size, comes with the price of
/// precision loss - it is not guaranteed that <i>Decode(Encode(x)) = x</i>. For
/// instance, <i>Decode(Encode(0.89)) = 0.75</i>.
/// <para/>
/// Compression of norm values to a single byte saves memory at search time,
/// because once a field is referenced at search time, its norms - for all
/// documents - are maintained in memory.
/// <para/>
/// The rationale supporting such lossy compression of norm values is that given
/// the difficulty (and inaccuracy) of users to express their true information
/// need by a query, only big differences matter.
/// <para/>
/// Last, note that search time is too late to modify this <i>norm</i> part of
/// scoring, e.g. by using a different <see cref="Similarity"/> for search.
/// </summary>
public class DefaultSimilarity : TFIDFSimilarity
{
    /// <summary>
    /// Cache of decoded bytes: entry <c>i</c> holds the value produced by
    /// <c>SmallSingle.SByte315ToSingle</c> for the byte whose unsigned value is <c>i</c>.
    /// Indexed by <see cref="DecodeNormValue(long)"/>.
    /// </summary>
    private static readonly float[] NORM_TABLE = LoadNormTable();

    /// <summary>
    /// Builds the 256-entry lookup table stored in <see cref="NORM_TABLE"/>.
    /// </summary>
    /// <returns>A new array with one decoded value per possible byte pattern.</returns>
    private static float[] LoadNormTable() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
    {
        float[] normTable = new float[256];
        for (int i = 0; i < normTable.Length; i++)
        {
            // Indices 128..255 do not fit in an sbyte and must wrap around to the
            // negative byte values. The cast is marked unchecked so that building
            // with overflow checking enabled cannot turn it into an OverflowException
            // raised from this static initializer.
            normTable[i] = SmallSingle.SByte315ToSingle(unchecked((sbyte)i));
        }
        return normTable;
    }

    /// <summary>
    /// Sole constructor: parameter-free </summary>
    public DefaultSimilarity()
    {
    }

    /// <summary>
    /// Implemented as <c>overlap / maxOverlap</c>. </summary>
    /// <param name="overlap">Numerator of the ratio.</param>
    /// <param name="maxOverlap">Denominator of the ratio; it is converted to <see cref="float"/>
    /// before dividing and is not checked for zero.</param>
    /// <returns>The floating-point quotient <c>overlap / maxOverlap</c>.</returns>
    public override float Coord(int overlap, int maxOverlap)
    {
        return overlap / (float)maxOverlap;
    }

    /// <summary>
    /// Implemented as <c>1/sqrt(sumOfSquaredWeights)</c>. </summary>
    /// <param name="sumOfSquaredWeights">Value whose inverse square root is returned.</param>
    /// <returns>The inverse square root, computed in double precision and narrowed to <see cref="float"/>.</returns>
    public override float QueryNorm(float sumOfSquaredWeights)
    {
        return (float)(1.0 / Math.Sqrt(sumOfSquaredWeights));
    }

    /// <summary>
    /// Encodes a normalization factor for storage in an index.
    /// <para/>
    /// The encoding uses a three-bit mantissa, a five-bit exponent, and the
    /// zero-exponent point at 15, thus representing values from around 7x10^9 to
    /// 2x10^-9 with about one significant decimal digit of accuracy. Zero is also
    /// represented. Negative numbers are rounded up to zero. Values too large to
    /// represent are rounded down to the largest representable value. Positive
    /// values too small to represent are rounded up to the smallest positive
    /// representable value.
    /// </summary>
    /// <param name="f">The normalization factor to encode.</param>
    /// <returns>The result of <c>SmallSingle.SingleToSByte315</c> for <paramref name="f"/>, returned as a <see cref="long"/>.</returns>
    /// <seealso cref="Lucene.Net.Documents.Field.Boost"/>
    /// <seealso cref="Lucene.Net.Util.SmallSingle"/>
    public override sealed long EncodeNormValue(float f)
    {
        return SmallSingle.SingleToSByte315(f);
    }

    /// <summary>
    /// Decodes the norm value, assuming it is a single byte.
    /// </summary>
    /// <param name="norm">The encoded norm; only its low 8 bits are used.</param>
    /// <returns>The cached decoded value for that byte.</returns>
    /// <seealso cref="EncodeNormValue(float)"/>
    public override sealed float DecodeNormValue(long norm)
    {
        return NORM_TABLE[(int)(norm & 0xFF)]; // & 0xFF maps negative bytes to positive above 127
    }

    /// <summary>
    /// Implemented as
    /// <c>state.Boost * LengthNorm(numTerms)</c>, where
    /// <c>numTerms</c> is <see cref="FieldInvertState.Length"/> if
    /// <see cref="DiscountOverlaps"/> is <c>false</c>, else it's
    /// <see cref="FieldInvertState.Length"/> -
    /// <see cref="FieldInvertState.NumOverlap"/>.
    ///
    /// <para/>
    /// @lucene.experimental
    /// </summary>
    /// <param name="state">Supplies the <c>Length</c>, <c>NumOverlap</c> and <c>Boost</c> values used in the computation.</param>
    /// <returns><c>state.Boost * (1 / sqrt(numTerms))</c>; the term count is not checked for zero.</returns>
    public override float LengthNorm(FieldInvertState state)
    {
        int numTerms;
        if (m_discountOverlaps)
        {
            // Overlap tokens are excluded from the length.
            numTerms = state.Length - state.NumOverlap;
        }
        else
        {
            numTerms = state.Length;
        }
        return state.Boost * ((float)(1.0 / Math.Sqrt(numTerms)));
    }

    /// <summary>
    /// Implemented as <c>Math.Sqrt(freq)</c>. </summary>
    /// <param name="freq">Value whose square root is returned.</param>
    /// <returns>The square root of <paramref name="freq"/>, narrowed to <see cref="float"/>.</returns>
    public override float Tf(float freq)
    {
        return (float)Math.Sqrt(freq);
    }

    /// <summary>
    /// Implemented as <c>1 / (distance + 1)</c>. </summary>
    /// <param name="distance">Value used in the denominator.</param>
    /// <returns><c>1 / (distance + 1)</c> as a <see cref="float"/>.</returns>
    public override float SloppyFreq(int distance)
    {
        return 1.0f / (distance + 1);
    }

    /// <summary>
    /// The default implementation returns <c>1</c> </summary>
    /// <param name="doc">Ignored by this implementation.</param>
    /// <param name="start">Ignored by this implementation.</param>
    /// <param name="end">Ignored by this implementation.</param>
    /// <param name="payload">Ignored by this implementation.</param>
    /// <returns>Always <c>1</c>.</returns>
    public override float ScorePayload(int doc, int start, int end, BytesRef payload)
    {
        return 1;
    }

    /// <summary>
    /// Implemented as <c>log(numDocs/(docFreq+1)) + 1</c>. </summary>
    /// <param name="docFreq">Document frequency; one is added to it before dividing.</param>
    /// <param name="numDocs">Document count used as the numerator.</param>
    /// <returns>The natural logarithm of the ratio plus one, narrowed to <see cref="float"/>.</returns>
    public override float Idf(long docFreq, long numDocs)
    {
        return (float)(Math.Log(numDocs / (double)(docFreq + 1)) + 1.0);
    }

    /// <summary>
    /// <c>True</c> if overlap tokens (tokens with a position increment of zero) are
    /// discounted from the document's length.
    /// </summary>
    protected bool m_discountOverlaps = true;

    /// <summary>
    /// Determines whether overlap tokens (Tokens with
    /// 0 position increment) are ignored when computing
    /// norm. By default this is true, meaning overlap
    /// tokens do not count when computing norms.
    /// <para/>
    /// @lucene.experimental
    /// </summary>
    /// <seealso cref="TFIDFSimilarity.ComputeNorm(FieldInvertState)"/>
    public virtual bool DiscountOverlaps
    {
        get
        {
            return m_discountOverlaps;
        }
        set
        {
            m_discountOverlaps = value;
        }
    }

    /// <summary>
    /// Returns the fixed name of this similarity.
    /// </summary>
    /// <returns>The string <c>"DefaultSimilarity"</c>.</returns>
    public override string ToString()
    {
        return "DefaultSimilarity";
    }
}
}