Skip to content

Commit

Permalink
Warn when BM25.average_idf < 0
Browse files (browse the repository at this point in the history)
  • Loading branch information
Witiko committed Nov 26, 2019
1 parent e391f0c commit 3c4ac0a
Showing 1 changed file with 10 additions and 0 deletions.
10 changes: 10 additions & 0 deletions gensim/summarization/bm25.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -37,6 +37,7 @@
"""


import logging
import math
from six import iteritems
from six.moves import range
Expand All @@ -48,6 +49,8 @@
PARAM_B = 0.75
EPSILON = 0.25

logger = logging.getLogger(__name__)


class BM25(object):
"""Implementation of Best Matching 25 ranking function.
Expand Down Expand Up @@ -116,6 +119,13 @@ def _initialize(self, corpus):
negative_idfs.append(word)
self.average_idf = float(idf_sum) / len(self.idf)

if self.average_idf < 0:
logger.warning(
'Average inverse document frequency is less than zero. Your corpus of {} documents'
' is either too small or it does not originate from actual text documents. BM25'
' will likely produce "wrong" results.'.format(self.corpus_size)
)

eps = EPSILON * self.average_idf
for word in negative_idfs:
self.idf[word] = eps
Expand Down

0 comments on commit 3c4ac0a

Please sign in to comment.