From 2e538dc6c1f116a81677ee1f168ddcec119ee289 Mon Sep 17 00:00:00 2001 From: Anna B <72624798+aberanger@users.noreply.github.com> Date: Tue, 21 May 2024 17:39:17 +0200 Subject: [PATCH] Filtering words using a dictionary (#84) --- sinr/text/preprocess.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sinr/text/preprocess.py b/sinr/text/preprocess.py index d5dcae3..e88a2de 100644 --- a/sinr/text/preprocess.py +++ b/sinr/text/preprocess.py @@ -158,7 +158,7 @@ def do_txt_to_vrt(self, separator='sentence'): corpus_opened.close() logger.info(f"VRT-style file written in {self.corpus_output.absolute()}") -def extract_text(corpus_path, exceptions_path=None, lemmatize=True, stop_words=False, lower_words=True, number=False, punct=False, exclude_pos=[], en="chunking", min_freq=50, alpha=True, exclude_en=[], min_length_word=3, min_length_doc=2): +def extract_text(corpus_path, exceptions_path=None, lemmatize=True, stop_words=False, lower_words=True, number=False, punct=False, exclude_pos=[], en="chunking", min_freq=50, alpha=True, exclude_en=[], min_length_word=3, min_length_doc=2, dict_filt=[]): """Extracts the text from a VRT corpus file.

 :param corpus_path: str
@@ -176,6 +176,8 @@ def extract_text(corpus_path, exceptions_path=None, lemmatize=True, stop_words=F
 :param min_length_word: (Default value = 3)
 :param min_length_doc: The minimal number of token for a document (or sentence) to be kept (Default value = 2)
 :type min_length_doc: int
+ :param dict_filt: List of words to keep only specific vocabulary
+ :type dict_filt: list
 :returns: text (list(list(str))): A list of documents containing words

 """
@@ -236,7 +238,11 @@ def extract_text(corpus_path, exceptions_path=None, lemmatize=True, stop_words=F
 elif en == "deleting" :
 pass
 elif len(lemma) > min_length_word:
- document.append(lemma_)
+ if len(dict_filt) > 0:
+ if lemma_ in dict_filt:
+ document.append(lemma_)
+ else:
+ document.append(lemma_)
 else:
 pass
 else: