Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
R1j1t committed Oct 25, 2020
2 parents f4a84db + b4c2b2e commit 128a6f8
Show file tree
Hide file tree
Showing 6 changed files with 104 additions and 7 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[flake8]
ignore = W503
ignore = W503, E203
exclude = .git,__pycache__,build,peters_code,.ipynb_checkpoints,setup.py
max-complexity = 15
per-file-ignores =
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,10 @@ jobs:
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: install spacy model
run: |
# Download base English language model
python -m spacy download en_core_web_sm
# Download large english language model
python -m spacy download en_core_web_lg
- name: Black Code Formatter
run: black . --check
- name: Flake Code Checker
Expand Down
13 changes: 10 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Contextual word checker for better suggestions
[![license](https://img.shields.io/github/license/r1j1t/contextualSpellCheck)](https://github.com/R1j1t/contextualSpellCheck/blob/master/LICENSE)
[![PyPI](https://img.shields.io/pypi/v/contextualSpellCheck?color=green)](https://pypi.org/project/contextualSpellCheck/)
[![Python-Version](https://img.shields.io/badge/Python-3.6+-green)](https://github.com/R1j1t/contextualSpellCheck#install)
[![Downloads](https://pepy.tech/badge/contextualspellcheck/week)](https://pepy.tech/project/contextualspellcheck/week)
[![Downloads](https://pepy.tech/badge/contextualspellcheck/week)](https://pepy.tech/project/contextualspellcheck)
[![GitHub contributors](https://img.shields.io/github/contributors/r1j1t/contextualSpellCheck)](https://github.com/R1j1t/contextualSpellCheck/graphs/contributors)
[![Help Wanted](https://img.shields.io/badge/Help%20Wanted-Task%20List-violet)](https://github.com/R1j1t/contextualSpellCheck#task-list)

Expand Down Expand Up @@ -182,8 +182,7 @@ Response:
## Task List

- [ ] Add support for Real Word Error (RWE) (Big Task)
- [x] specify maximum edit distance for `candidateRanking`
- [x] allow user to specify bert model
- [ ] Include transformers deTokenizer to get better suggestions
- [ ] edit distance code optimisation
- [ ] add multi mask out capability
- [ ] better candidate generation (maybe by fine tuning the model?)
Expand All @@ -192,6 +191,14 @@ Response:
- [ ] Add examples for other languages
- [ ] use piece wise tokeniser when identifying the misspell

<details><summary>Completed Tasks</summary>
<p>

- [x] specify maximum edit distance for `candidateRanking`
- [x] allow user to specify bert model
</p>
</details>

## Support and contribution

If you like the project, please ⭑ the project and show your support! Also, if you feel, the current behaviour is not as expected, please feel free to raise an [issue](https://github.com/R1j1t/contextualSpellCheck/issues). If you can help with any of the above tasks, please open a [PR](https://github.com/R1j1t/contextualSpellCheck/pulls) with necessary changes to documentation and tests.
Expand Down
57 changes: 55 additions & 2 deletions contextualSpellCheck/contextualSpellCheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import warnings
from datetime import datetime
import unicodedata

import editdistance
import spacy
Expand Down Expand Up @@ -251,8 +252,8 @@ def misspell_identify(self, doc, query=""):
and (token.ent_type_ != "GPE")
and (token.ent_type_ != "ORG")
):
misspell.append(token)

if self.deep_tokenize_in_vocab(token.text):
misspell.append(token)
if self.debug:
print("misspell identified: ", misspell)
return misspell, doc
Expand Down Expand Up @@ -574,6 +575,58 @@ def doc_outcome_spell_check(self, doc):

return update_query

def deep_tokenize_in_vocab(self, text):
    """Re-tokenize ``text`` on punctuation and vocab-check every piece.

    Splits the token at each unicode punctuation character (category
    ``P*``) and collects the punctuation characters plus the word
    fragments between them, then checks each piece against
    ``self.vocab``.

    Args:
        text (str): Token text to re-tokenize on punctuation.

    Returns:
        bool: True if the token should be treated as a possible
        misspell — i.e. it contains no punctuation at all, or at least
        one piece is not in the vocab. False only when every piece
        (punctuation and word fragments) is present in the vocab.
    """
    text_len = len(text)
    sub_tokens = []
    pre_puct_position = -1
    for char_position in range(text_len):
        if unicodedata.category(text[char_position]).startswith("P"):
            sub_tokens.append(text[char_position])
            # Bug fix: also collect the fragment BEFORE the first
            # punctuation char. The previous guard
            # `pre_puct_position >= 0` skipped it, so e.g. for
            # "Norvig’s" the piece "Norvig" was never vocab-checked.
            # With pre_puct_position == -1 the slice below is
            # text[0:char_position], exactly that leading fragment.
            segment = text[pre_puct_position + 1 : char_position]
            if segment:
                sub_tokens.append(segment)
            pre_puct_position = char_position

        # On the last character, collect any trailing fragment after
        # the final punctuation char (e.g. the "s" of "Norvig’s").
        if (
            sub_tokens
            and char_position + 1 == text_len
            and text[pre_puct_position + 1 :]
        ):
            sub_tokens.append(text[pre_puct_position + 1 :])

    if sub_tokens:
        # Token contained punctuation: misspell only if some piece is
        # unknown to the vocab.
        for sub_token in sub_tokens:
            if sub_token not in self.vocab:
                return True
    else:
        # No punctuation found: defer to the normal misspell path.
        return True

    return False


if __name__ == "__main__":
print("Code running...")
Expand Down
34 changes: 34 additions & 0 deletions contextualSpellCheck/tests/test_contextualSpellCheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -640,3 +640,37 @@ def test_max_edit_dist(max_edit_distance, expected_spell_check_flag):
assert doc._.outcome_spellCheck == gold_outcome

nlp.remove_pipe("contextual spellchecker")


@pytest.mark.parametrize(
    "input_sentence,expected_outcome,expected_score_doc,\
expected_suggestion_doc,possible_misspel_index",
    [
        (
            "This is not a pure Python Spell Checking based on Peter Norvig’s \
blog post on setting up a simple spell checking algorithm.",
            "",
            None,
            {},
            8,
        )
    ],
)
def test_deep_tokenization(
    input_sentence,
    expected_outcome,
    expected_score_doc,
    expected_suggestion_doc,
    possible_misspel_index,
):
    """Deep tokenization must not flag punctuated in-vocab tokens.

    "Norvig’s" (token index 8) splits into pieces that are all in the
    vocab, so the checker should perform no spell correction at all on
    this sentence.
    """
    nlp_lg = spacy.load("en_core_web_lg")
    checker_deep_tokenize = ContextualSpellCheck(max_edit_dist=4)
    nlp_lg.add_pipe(checker_deep_tokenize)
    # Bug fix: run the pipeline the checker was actually added to
    # (nlp_lg), not the module-level `nlp` pipeline, which does not
    # contain this spellchecker component.
    doc = nlp_lg(input_sentence)

    # To check the status of `performed_spell_check` flag
    assert doc._.outcome_spellCheck == expected_outcome
    assert doc._.score_spellCheck == expected_score_doc
    assert doc._.suggestions_spellCheck == expected_suggestion_doc

    assert doc[possible_misspel_index]._.get_suggestion_spellCheck == ""
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name="contextualSpellCheck",
version="0.3.0",
version="0.3.2",
author="R1j1t",
author_email="r1j1t@protonmail.com",
description="Contextual spell correction using BERT (bidirectional representations)",
Expand Down

0 comments on commit 128a6f8

Please sign in to comment.