Skip to content

Commit

Permalink
Filtering words using a dictionary (#84)
Browse files Browse the repository at this point in the history
  • Loading branch information
aberanger committed May 21, 2024
1 parent d51bb71 commit 2e538dc
Showing 1 changed file with 8 additions and 2 deletions.
10 changes: 8 additions & 2 deletions sinr/text/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def do_txt_to_vrt(self, separator='sentence'):
corpus_opened.close()
logger.info(f"VRT-style file written in {self.corpus_output.absolute()}")

def extract_text(corpus_path, exceptions_path=None, lemmatize=True, stop_words=False, lower_words=True, number=False, punct=False, exclude_pos=[], en="chunking", min_freq=50, alpha=True, exclude_en=[], min_length_word=3, min_length_doc=2):
def extract_text(corpus_path, exceptions_path=None, lemmatize=True, stop_words=False, lower_words=True, number=False, punct=False, exclude_pos=[], en="chunking", min_freq=50, alpha=True, exclude_en=[], min_length_word=3, min_length_doc=2, dict_filt=[]):
"""Extracts the text from a VRT corpus file.
:param corpus_path: str
Expand All @@ -176,6 +176,8 @@ def extract_text(corpus_path, exceptions_path=None, lemmatize=True, stop_words=F
:param min_length_word: (Default value = 3)
:param min_length_doc: The minimum number of tokens for a document (or sentence) to be kept (Default value = 2)
:type min_length_doc: int
:param dict_filt: List of words to keep; when non-empty, only lemmas found in this list are added to a document (Default value = [])
:type dict_filt: list
:returns: text (list(list(str))): A list of documents containing words
"""
Expand Down Expand Up @@ -236,7 +238,11 @@ def extract_text(corpus_path, exceptions_path=None, lemmatize=True, stop_words=F
elif en == "deleting" :
pass
elif len(lemma) > min_length_word:
document.append(lemma_)
if len(dict_filt) > 0:
if lemma_ in dict_filt:
document.append(lemma_)
else:
document.append(lemma_)
else:
pass
else:
Expand Down

0 comments on commit 2e538dc

Please sign in to comment.