Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
R1j1t committed Oct 25, 2020
2 parents f4a84db + b4c2b2e commit 128a6f8
Show file tree
Hide file tree
Showing 6 changed files with 104 additions and 7 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[flake8]
ignore = W503
ignore = W503, E203
exclude = .git,__pycache__,build,peters_code,.ipynb_checkpoints,setup.py
max-complexity = 15
per-file-ignores =
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,10 @@ jobs:
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: install spacy model
run: |
# Download base English language model
python -m spacy download en_core_web_sm
# Download large english language model
python -m spacy download en_core_web_lg
- name: Black Code Formatter
run: black . --check
- name: Flake Code Checker
Expand Down
13 changes: 10 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Contextual word checker for better suggestions
[![license](https://img.shields.io/github/license/r1j1t/contextualSpellCheck)](https://github.com/R1j1t/contextualSpellCheck/blob/master/LICENSE)
[![PyPI](https://img.shields.io/pypi/v/contextualSpellCheck?color=green)](https://pypi.org/project/contextualSpellCheck/)
[![Python-Version](https://img.shields.io/badge/Python-3.6+-green)](https://github.com/R1j1t/contextualSpellCheck#install)
[![Downloads](https://pepy.tech/badge/contextualspellcheck/week)](https://pepy.tech/project/contextualspellcheck/week)
[![Downloads](https://pepy.tech/badge/contextualspellcheck/week)](https://pepy.tech/project/contextualspellcheck)
[![GitHub contributors](https://img.shields.io/github/contributors/r1j1t/contextualSpellCheck)](https://github.com/R1j1t/contextualSpellCheck/graphs/contributors)
[![Help Wanted](https://img.shields.io/badge/Help%20Wanted-Task%20List-violet)](https://github.com/R1j1t/contextualSpellCheck#task-list)

Expand Down Expand Up @@ -182,8 +182,7 @@ Response:
## Task List

- [ ] Add support for Real Word Error (RWE) (Big Task)
- [x] specify maximum edit distance for `candidateRanking`
- [x] allow user to specify bert model
- [ ] Include transformers deTokenizer to get better suggestions
- [ ] edit distance code optimisation
- [ ] add multi mask out capability
- [ ] better candidate generation (maybe by fine tuning the model?)
Expand All @@ -192,6 +191,14 @@ Response:
- [ ] Add examples for other languages
- [ ] use piece wise tokeniser when identifying the misspell

<details><summary>Completed Tasks</summary>
<p>

- [x] specify maximum edit distance for `candidateRanking`
- [x] allow user to specify bert model
</p>
</details>

## Support and contribution

If you like the project, please ⭑ the project and show your support! Also, if you feel, the current behaviour is not as expected, please feel free to raise an [issue](https://github.com/R1j1t/contextualSpellCheck/issues). If you can help with any of the above tasks, please open a [PR](https://github.com/R1j1t/contextualSpellCheck/pulls) with necessary changes to documentation and tests.
Expand Down
57 changes: 55 additions & 2 deletions contextualSpellCheck/contextualSpellCheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import warnings
from datetime import datetime
import unicodedata

import editdistance
import spacy
Expand Down Expand Up @@ -251,8 +252,8 @@ def misspell_identify(self, doc, query=""):
and (token.ent_type_ != "GPE")
and (token.ent_type_ != "ORG")
):
misspell.append(token)

if self.deep_tokenize_in_vocab(token.text):
misspell.append(token)
if self.debug:
print("misspell identified: ", misspell)
return misspell, doc
Expand Down Expand Up @@ -574,6 +575,58 @@ def doc_outcome_spell_check(self, doc):

return update_query

def deep_tokenize_in_vocab(self, text):
    """Re-tokenize ``text`` on punctuation and vocab-check every piece.

    Splits the token at each unicode punctuation character (category
    ``P*``) and collects the punctuation characters plus the word
    fragments between them, then checks each piece against
    ``self.vocab``.

    Args:
        text (str): Token text to re-tokenize on punctuation.

    Returns:
        bool: True if the token should be treated as a possible
        misspell — i.e. it contains no punctuation at all, or at least
        one piece is not in the vocab. False only when every piece
        (punctuation and word fragments) is present in the vocab.
    """
    text_len = len(text)
    sub_tokens = []
    pre_puct_position = -1
    for char_position in range(text_len):
        if unicodedata.category(text[char_position]).startswith("P"):
            sub_tokens.append(text[char_position])
            # Bug fix: also collect the fragment BEFORE the first
            # punctuation char. The previous guard
            # `pre_puct_position >= 0` skipped it, so e.g. for
            # "Norvig’s" the piece "Norvig" was never vocab-checked.
            # With pre_puct_position == -1 the slice below is
            # text[0:char_position], exactly that leading fragment.
            segment = text[pre_puct_position + 1 : char_position]
            if segment:
                sub_tokens.append(segment)
            pre_puct_position = char_position

        # On the last character, collect any trailing fragment after
        # the final punctuation char (e.g. the "s" of "Norvig’s").
        if (
            sub_tokens
            and char_position + 1 == text_len
            and text[pre_puct_position + 1 :]
        ):
            sub_tokens.append(text[pre_puct_position + 1 :])

    if sub_tokens:
        # Token contained punctuation: misspell only if some piece is
        # unknown to the vocab.
        for sub_token in sub_tokens:
            if sub_token not in self.vocab:
                return True
    else:
        # No punctuation found: defer to the normal misspell path.
        return True

    return False


if __name__ == "__main__":
print("Code running...")
Expand Down
34 changes: 34 additions & 0 deletions contextualSpellCheck/tests/test_contextualSpellCheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -640,3 +640,37 @@ def test_max_edit_dist(max_edit_distance, expected_spell_check_flag):
assert doc._.outcome_spellCheck == gold_outcome

nlp.remove_pipe("contextual spellchecker")


@pytest.mark.parametrize(
    "input_sentence,expected_outcome,expected_score_doc,\
expected_suggestion_doc,possible_misspel_index",
    [
        (
            "This is not a pure Python Spell Checking based on Peter Norvig’s \
blog post on setting up a simple spell checking algorithm.",
            "",
            None,
            {},
            8,
        )
    ],
)
def test_deep_tokenization(
    input_sentence,
    expected_outcome,
    expected_score_doc,
    expected_suggestion_doc,
    possible_misspel_index,
):
    """Deep tokenization must not flag punctuated in-vocab tokens.

    "Norvig’s" (token index 8) splits into pieces that are all in the
    vocab, so the checker should perform no spell correction at all on
    this sentence.
    """
    nlp_lg = spacy.load("en_core_web_lg")
    checker_deep_tokenize = ContextualSpellCheck(max_edit_dist=4)
    nlp_lg.add_pipe(checker_deep_tokenize)
    # Bug fix: run the pipeline the checker was actually added to
    # (nlp_lg), not the module-level `nlp` pipeline, which does not
    # contain this spellchecker component.
    doc = nlp_lg(input_sentence)

    # To check the status of `performed_spell_check` flag
    assert doc._.outcome_spellCheck == expected_outcome
    assert doc._.score_spellCheck == expected_score_doc
    assert doc._.suggestions_spellCheck == expected_suggestion_doc

    assert doc[possible_misspel_index]._.get_suggestion_spellCheck == ""
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name="contextualSpellCheck",
version="0.3.0",
version="0.3.2",
author="R1j1t",
author_email="r1j1t@protonmail.com",
description="Contextual spell correction using BERT (bidirectional representations)",
Expand Down

0 comments on commit 128a6f8

Please sign in to comment.