From 2e538dc6c1f116a81677ee1f168ddcec119ee289 Mon Sep 17 00:00:00 2001 From: Anna B <72624798+aberanger@users.noreply.github.com> Date: Tue, 21 May 2024 17:39:17 +0200 Subject: [PATCH] Filtering words using a dictionary (#84) --- sinr/text/preprocess.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sinr/text/preprocess.py b/sinr/text/preprocess.py index d5dcae3..e88a2de 100644 --- a/sinr/text/preprocess.py +++ b/sinr/text/preprocess.py @@ -158,7 +158,7 @@ def do_txt_to_vrt(self, separator='sentence'): corpus_opened.close() logger.info(f"VRT-style file written in {self.corpus_output.absolute()}") -def extract_text(corpus_path, exceptions_path=None, lemmatize=True, stop_words=False, lower_words=True, number=False, punct=False, exclude_pos=[], en="chunking", min_freq=50, alpha=True, exclude_en=[], min_length_word=3, min_length_doc=2): +def extract_text(corpus_path, exceptions_path=None, lemmatize=True, stop_words=False, lower_words=True, number=False, punct=False, exclude_pos=[], en="chunking", min_freq=50, alpha=True, exclude_en=[], min_length_word=3, min_length_doc=2, dict_filt=[]): """Extracts the text from a VRT corpus file.

 :param corpus_path: str
@@ -176,6 +176,8 @@ def extract_text(corpus_path, exceptions_path=None, lemmatize=True, stop_words=F
 :param min_length_word: (Default value = 3)
 :param min_length_doc: The minimal number of token for a document (or sentence) to be kept (Default value = 2)
 :type min_length_doc: int
+ :param dict_filt: List of words to keep only specific vocabulary
+ :type dict_filt: list
 :returns: text (list(list(str))): A list of documents containing words

 """
@@ -236,7 +238,11 @@ def extract_text(corpus_path, exceptions_path=None, lemmatize=True, stop_words=F
 elif en == "deleting" :
 pass
 elif len(lemma) > min_length_word:
- document.append(lemma_)
+ if len(dict_filt) > 0:
+ if lemma_ in dict_filt:
+ document.append(lemma_)
+ else:
+ document.append(lemma_)
 else:
 pass
 else: