# Multilingual spell-checker

*Nicolò Cosimo Albanese*

## 1. Environment Set-up

### 1.1 Install Enchant

For Google Colab:

In [1]:
!sudo apt-get install libenchant-dev

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libenchant-dev is already the newest version (1.6.0-11.1).
0 upgraded, 0 newly installed, 0 to remove and 11 not upgraded.


### 1.2 Install Dictionaries

For Google Colab:

In [2]:
!sudo apt-get install hunspell-it hunspell-es hunspell-de-de hunspell-fr

Reading package lists... Done
Building dependency tree       
Reading state information... Done
hunspell-de-de is already the newest version (20161207-4).
hunspell-es is already the newest version (1:6.0.3-3).
hunspell-fr is already the newest version (1:6.2-1).
hunspell-it is already the newest version (1:6.0.3-3).
0 upgraded, 0 newly installed, 0 to remove and 11 not upgraded.


### 1.3 Install Python dependencies

- PyEnchant
- langdetect

For Google Colab:

In [3]:
!pip install pyenchant langdetect



## 2. Implementation

In [4]:
import json


class Language:
    """Result of language detection for a piece of text.

    Attributes:
        - lang [string]: language identifier (e.g.: en)
        - prob [double]: probability attached to that language (e.g. 0.999)
    """

    def __init__(self, lang, prob):
        # Store both values unchanged; no validation or conversion is done.
        self.lang, self.prob = lang, prob


class SpellCheck:
    """Result of spell-checking a piece of text.

    Attributes:
        - suggestion [string]: recommended sentence
        - similarity [double]: measure of similarity between the input and
          the recommendation
    """

    def __init__(self, suggestion, similarity):
        # Store both values unchanged; no validation or conversion is done.
        self.suggestion, self.similarity = suggestion, similarity


class Response:
    """Bundle of a language-detection result and a spell-check result.

    Attributes:
        - Language: object exposing
            - lang [string]: language identifier (e.g.: en)
            - prob [double]: probability (e.g. 0.999)
        - SpellCheck: object exposing
            - suggestion [string]: recommended sentence
            - similarity [double]: measure of similarity between input and
              recommendation
    """

    def __init__(self, Language, SpellCheck):
        # The attribute names double as the top-level keys emitted by to_json().
        self.Language = Language
        self.SpellCheck = SpellCheck

    def __repr__(self):
        """Return a multi-line, tab-indented dump of the four wrapped values."""
        template = (
            'Response: (\n'
            '\tLanguage:\n'
            '\t\tlang : "%s" \n'
            '\t\tprob : "%s" \n'
            '\tSpellCheck:\n'
            '\t\tsuggestion : "%s" \n'
            '\t\tsimilarity : "%s" \n'
            ')'
        )
        values = (
            self.Language.lang,
            self.Language.prob,
            self.SpellCheck.suggestion,
            self.SpellCheck.similarity,
        )
        return template % values

    def to_json(self):
        """Serialize this object, and the objects it wraps, to a JSON string.

        Objects that json cannot encode natively are replaced by their
        attribute dictionary; keys are sorted and the output is indented by
        four spaces.
        """
        def attributes_of(obj):
            return obj.__dict__

        return json.dumps(self, default=attributes_of, sort_keys=True, indent=4)


In [5]:
import string
import enchant
from difflib import SequenceMatcher
from langdetect import detect_langs

# Maps a detected language code (as returned by detect_language) to the
# dictionary tag handed to enchant.Dict. Every tag uses the same
# "language_COUNTRY" underscore form.
lang_to_dict = {
  "it": "it_IT",
  "fr": "fr_FR",
  "es": "es_ES",
  "en": "en_GB",
  "de": "de_DE",
}

def map_language_to_dict(lang):
  """Look up the hunspell dictionary to be used for spell-check

  :param lang: language identifier (e.g. "en")
  :return the matching entry of lang_to_dict, or the string
  "Unknown dictionary" when the language has no entry
  """
  return lang_to_dict.get(lang, "Unknown dictionary")


def detect_language(input, min_prob=0.9, get_prob=True):
  """Detect most likely language of input text

  Can specify 'min_prob' (default 0.9) as the minimum probability: a guess
  is accepted only when its probability is strictly greater than it.

  :param input: text to analyse
  :param min_prob: rejection threshold for the best guess
  :param get_prob: when True return a (language, probability) tuple,
  when False return only the language string
  :return detected language (and its probability if 'get_prob' is True);
  when no language can be determined the language is "Unknown language"
  (and the probability is 0)
  """
  # Keep the return shape consistent with 'get_prob' on every failure path.
  unknown = ("Unknown language", 0) if get_prob else "Unknown language"

  if not input:
    return unknown

  # Imported locally so the exception type is in scope without relying on
  # the file-level import list.
  from langdetect.lang_detect_exception import LangDetectException

  try:
    guesses = detect_langs(input)
  except LangDetectException:
    # langdetect raises when the text offers nothing to analyse
    # (e.g. only digits, punctuation or whitespace)
    return unknown

  if not guesses:
    return unknown

  first_guess = guesses[0]
  if first_guess.prob > min_prob:
    if get_prob:
      return (first_guess.lang, first_guess.prob)
    return first_guess.lang

  return unknown


def spellcheck(text, hunspell_dict):
  """Identify misspellings and correct input text

  Punctuation (every character in string.punctuation, which includes
  apostrophes and hyphens) is removed before checking, so the returned
  suggestion never contains punctuation.

  :param text: text to check
  :param hunspell_dict: dictionary tag for enchant.Dict, or the string
  "Unknown dictionary"
  :return suggestion and a similarity measure with the input text;
  the similarity is the mean of the per-word SequenceMatcher ratios.
  If no misspelling is found (or there is no word to check), a message is
  returned instead of the suggestion. If the dictionary is unknown or not
  available, a message and a similarity of 0 are returned.
  """
  no_dictionary = ("No language detected or no dictionary available.", 0)
  no_misspelling = "No misspelling detected. Input text is correct."

  if hunspell_dict == "Unknown dictionary":
    return no_dictionary

  # remove punctuation and split into words
  tokens = text.translate(str.maketrans('', '', string.punctuation)).split()

  # nothing left to check (empty or punctuation-only text): avoid dividing
  # by zero when averaging the similarities below
  if not tokens:
    return no_misspelling, 1.0

  # the tag is mapped but the dictionary may not be installed
  if not enchant.dict_exists(hunspell_dict):
    return no_dictionary

  d = enchant.Dict(hunspell_dict)

  suggestion = []
  suggestion_similarity = []
  replaced_terms = 0

  # for each word in the input text
  for token in tokens:
    # only ask for suggestions when the word is misspelled
    candidates = [] if d.check(token) else d.suggest(token)

    # word is correct, or the dictionary has nothing to propose:
    # keep it as-is, so its similarity is 1
    if not candidates:
      suggestion.append(token)
      suggestion_similarity.append(1.0)
      continue

    # replace the word with the first spell-check suggestion
    new_token = candidates[0]
    suggestion.append(new_token)
    # similarity between original word and replacement
    suggestion_similarity.append(SequenceMatcher(None, new_token, token).ratio())
    replaced_terms += 1

  # calculate suggested sentence and overall similarity
  suggested_text = " ".join(suggestion)
  overall_similarity = sum(suggestion_similarity) / len(suggestion_similarity)

  # if no misspelling was found, return message
  if replaced_terms == 0:
    return no_misspelling, overall_similarity

  return suggested_text, overall_similarity


def process_input_text(text, min_prob = 0.9):
  """Detect the language of a text and propose a spelling correction

  :param text: input text
  :param min_prob: minimum desired probability for language detection
  :return Response object holding the detected language, its probability,
  the suggestion and the similarity
  """
  lang, prob = detect_language(text, min_prob)
  suggestion, similarity = spellcheck(text, map_language_to_dict(lang))

  return Response(Language(lang, prob), SpellCheck(suggestion, similarity))

### 2.1 Testing with some sentences

In [12]:
process_input_text("This englisch sentencee has some mistackes", 0.99)

Response: (
	Language:
		lang : "en" 
		prob : "0.9999944325753912" 
	SpellCheck:
		suggestion : "This English sentence has some mistakes" 
		similarity : "0.9470588235294118" 
)

In [7]:
process_input_text("La France métropoltaine posède une grande variété de payssages", 0.99)

Response: (
	Language:
		lang : "fr" 
		prob : "0.9999964482516039" 
	SpellCheck:
		suggestion : "La France métropolitaine possède une grande variété de paysages" 
		similarity : "0.9808018174031246" 
)

In [8]:
process_input_text("La profesoresa insenga la matematica a scquola", 0.99)

Response: (
	Language:
		lang : "it" 
		prob : "0.9999934470729319" 
	SpellCheck:
		suggestion : "La professoressa insegna la matematica a scuola" 
		similarity : "0.956698063840921" 
)

In [9]:
process_input_text("La historia comienza una mañana de octubre.", 0.99)

Response: (
	Language:
		lang : "es" 
		prob : "0.9999942235541004" 
	SpellCheck:
		suggestion : "No misspelling detected. Input text is correct." 
		similarity : "1.0" 
)

In [10]:
process_input_text("englasdfcsd", 0.99)

Response: (
	Language:
		lang : "Unknown language" 
		prob : "0" 
	SpellCheck:
		suggestion : "No language detected or no dictionary available." 
		similarity : "0" 
)

The following is an example of the "Cupertino effect": the word "Huawei" is correct, but it is replaced anyway because it does not belong to the dictionary/knowledge base.

In [11]:
process_input_text("Do you know any colleagues in Huawei?", 0.99)

Response: (
	Language:
		lang : "en" 
		prob : "0.999997049545999" 
	SpellCheck:
		suggestion : "Do you know any colleagues in Hawaii" 
		similarity : "0.9523809523809524" 
)