In [1]:
from spellchecker import SpellChecker
from difflib import SequenceMatcher, get_close_matches, Differ
import json
import os
import math
import pickle

In [2]:
#Pickle functions

In [3]:
def get_letters_pickle(address):
    """
    Unpickles the file at the given address and returns the stored object.
    Arguments:
        address(String) - The address of the pickle.
    Returns:
        The object stored in the pickle (this notebook uses it for the
        letters dictionary and for the predictions data frame).
    """
    # NOTE(review): pickle.load can run arbitrary code - only open trusted files.
    with open(address, mode="rb") as pickle_file:
        return pickle.load(pickle_file)

In [4]:
#Json and Dictionary Functions

In [5]:
def parse_letter_dictionary(letters_dictionary):
    """
    Flattens the 'replaces' entry of the letters dictionary into a list of tuples.
    Arguments:
        letters_dictionary(Dictionary) - the letters dictionary; its 'replaces'
            value maps an indexable pair of letters to a number.
    Returns:
        List(tuples) - one ((first letter, second letter), number) tuple per
        'replaces' entry, in the order the entries are stored.
    """
    return [
        ((pair[0], pair[1]), amount)
        for pair, amount in letters_dictionary['replaces'].items()
    ]

In [6]:
#Reading json file
def json_loads_file(file):
    """Read a UTF-8 encoded json file and return its parsed content.

    Args:
        file(string): path to the json file.

    Returns:
        The Python object that json.load builds from the file.
    """
    with open(file, mode='r', encoding="utf8") as json_file:
        return json.load(json_file)

In [7]:
def list_of_jsons_to_letters_dictionary(list_of_jsons):
    """
    Re-keys a list of per-letter dictionaries into one dictionary keyed by letter.
    Arguments:
        list_of_jsons(List) - dictionaries that each hold the keys 'letter',
            'subs', 'add' and 'replace'.
    Returns:
        Dictionary. Maps every 'letter' value to a new dictionary holding that
        entry's 'subs', 'add' and 'replace' values (the 'replace' object itself
        is reused, not copied). A repeated letter keeps its last entry.
    """
    letters = {}
    for entry in list_of_jsons:
        description = {
            'subs': entry['subs'],
            'add': entry['add'],
            'replace': entry['replace'],
        }
        letters[entry['letter']] = description
    return letters

In [8]:
def get_candidate(candidates_scores):
    """
    Picks the candidate whose score is the lowest.
    Arguments:
        candidates_scores(Dictionary) - maps each candidate word to its score.
    Returns:
        String. The key with the minimal value; on a tie the key stored first wins.
    """
    best_candidate, _ = min(candidates_scores.items(), key=lambda entry: entry[1])
    return best_candidate

In [9]:
#Building the letters dictionary scores

In [10]:
def switches_confusion_matrix_scoring(letters_description,common_switches):
    """
    Writes the confusion-matrix values into the 'replace' scores of the letters.
    Arguments:
        letters_description(Dictionary) - per-letter scores; updated in place.
        common_switches(List) - ((origin letter, model letter), value) tuples.
    Returns:
        Dictionary. The same letters_description object after the update.
    """
    # Step 1: every listed switch overrides the matching 'replace' entry
    for switch in common_switches:
        letters, value = switch[0], switch[1]
        letters_description[letters[0]]['replace'][letters[1]] = value

    # Step 2: whatever is still strictly between 0 and 1 is reset to 0
    for description in letters_description.values():
        replacements = description['replace']
        for model_letter, score in replacements.items():
            if 0 < score < 1:
                replacements[model_letter] = 0
    return letters_description

In [11]:
def insertion_and_remove_commonly_score(letters_description_dictionary,letters_dictionary):
    """
    Copies the insertion and deletion values into the per-letter scores.
    Arguments:
        letters_description_dictionary(dictionary) - per-letter scores; updated in place.
        letters_dictionary(dictionary) - holds an 'inserts' and a 'deletes'
            mapping from letter to value.
    Returns:
        dictionary. The same letters_description_dictionary, where 'add' holds
        the 'inserts' value and 'subs' holds the 'deletes' value of each letter.
    """
    for letter, insert_value in letters_dictionary['inserts'].items():
        letters_description_dictionary[letter]['add'] = insert_value
    for letter, delete_value in letters_dictionary['deletes'].items():
        letters_description_dictionary[letter]['subs'] = delete_value
    return letters_description_dictionary

In [52]:
def sum_of_scores(letters_description):
    """
    Adds up every score stored in the letters dictionary.
    Arguments:
        letters_description(dictionary) - Letters dictionary.
    Returns:
        The sum of all 'subs', 'add' and 'replace' scores (0 for an empty dictionary).
    """
    total_score = 0
    for description in letters_description.values():
        total_score += description['subs']
        total_score += description['add']
        for replace_score in description['replace'].values():
            total_score += replace_score
    return total_score

In [65]:
def letters_scores_division(letters_description, total_score):
    """
    Divides every score in the letters dictionary by the same number.
    Arguments:
        letters_description(dictionary) - the dictionary of the letters scores; updated in place.
        total_score(int) - the divisor (the total sum of the scores in the dictionary).
    Returns:
        Dictionary. The same dictionary after every 'subs', 'add' and 'replace'
        score was divided by total_score.
    """
    for description in letters_description.values():
        description['subs'] = description['subs'] / total_score
        description['add'] = description['add'] / total_score
        replacements = description['replace']
        for replaced_letter in replacements:
            replacements[replaced_letter] = replacements[replaced_letter] / total_score
    return letters_description

In [12]:
def find_max_score(letters_description):
    """
    Finds the largest score stored in a letters dictionary.
    Arguments:
        letters_description(dictionary) -Letters dictionary.
    Returns:
        The largest 'subs', 'add' or 'replace' score, or 0 when no score is above 0.
    """
    maximum_score = 0
    for description in letters_description.values():
        # Same inspection order as the stored layout: subs, add, then every replace score
        scores = [description['subs'], description['add'], *description['replace'].values()]
        for score in scores:
            if score > maximum_score:
                maximum_score = score
    return maximum_score

In [69]:
def compare_letters_scoring(letter_description_after_scoring):
    """
    Caps every replacement score at the cost of removing the original letter
    and inserting the model letter instead.
    Arguments:
        letter_description_after_scoring(dictionary) - the score of the letters different cases; updated in place.
    Returns:
        Dictionary. The same dictionary, where each 'replace' score is at most
        the original letter's 'subs' plus the model letter's 'add'.
    """
    for description in letter_description_after_scoring.values():
        replacements = description['replace']
        for model_letter in replacements:
            remove_then_insert = description['subs'] + letter_description_after_scoring[model_letter]['add']
            if replacements[model_letter] > remove_then_insert:
                replacements[model_letter] = remove_then_insert
    return letter_description_after_scoring

In [13]:
def letters_calculation_scores(letters_description,max_score):
    """
    Replaces every score with log(max_score - score + 2) / max_score, so that a
    higher original score ends up as a lower final score.
    Arguments:
        letters_description(Dictionary) - the dictionary contains the whole probabilities; updated in place.
        max_score(int) - the maximum value in the dictionary.
    Returns:
        Dictionary. The same dictionary with the transformed 'subs', 'add' and
        'replace' scores.
    """
    for description in letters_description.values():
        replacements = description['replace']
        for model_letter, score in replacements.items():
            replacements[model_letter] = math.log(max_score - score + 2) / max_score
        description['subs'] = math.log(max_score - description['subs'] + 2) / max_score
        description['add'] = math.log(max_score - description['add'] + 2) / max_score
    return letters_description

In [14]:
#Filter candidates functions

In [15]:
def filter_candidates(candidates):
    """
    Finds the words we should remove from the candidates list.

    A candidate is marked for removal when any of the following holds:
      * it is an empty string,
      * its last letter is one of the regular letters 'מנפצכ' (letters that
        have a dedicated final form),
      * it contains the '"' character more than once,
      * the first occurrence of a final-form letter is not the last character,
      * the first occurrence of '"' is not the one-before-last character.

    Arguments:
        candidates(List) - the candidates list.
    Returns:
        List. The words we should remove from the candidates list.
    """
    words_to_remove = []
    for candidate in candidates:
        if not candidate:
            # candidate[-1] would raise IndexError; an empty string is never a valid word
            words_to_remove.append(candidate)
        elif candidate[-1] in 'מנפצכ' or candidate.count('"') > 1:
            words_to_remove.append(candidate)
        elif any(
            (letter in 'ןםךץף' and candidate.find(letter) != len(candidate) - 1)
            or (letter == '"' and candidate.find(letter) != len(candidate) - 2)
            for letter in candidate
        ):
            words_to_remove.append(candidate)
    return words_to_remove

In [16]:
def remove_words_from_list(original_list, words_to_remove):
    """
    Deletes, in place, one occurrence of every word of words_to_remove from original_list.
    Arguments:
        original_list(List) - the list to filter; it is modified.
        words_to_remove(List) - the words to delete.
    Returns:
        List. The same original_list object after the deletions.
    """
    for unwanted_word in words_to_remove:
        # list.remove deletes the first match only and raises ValueError when there is none
        original_list.remove(unwanted_word)
    return original_list

In [17]:
def remove_duplicates_from_list(my_list):
    """
    Builds a copy of the list in which every element appears only once.
    Arguments:
        my_list(List) - the original list (hashable elements); not modified.
    Returns:
        List. The elements in first-appearance order, without duplicates.
    """
    first_appearances = {}
    for element in my_list:
        first_appearances.setdefault(element, None)
    return list(first_appearances)

In [18]:
#Converting suffix function

In [19]:
def converting_suffix(candidate):
    """
    Convert back the final letter from regular letter to suffix letter.
    Arguments:
        candidate(String) - the candidate word; may be empty.
    Returns:
        String - the candidate whose last letter was replaced by its suffix
        (final) form when it is one of 'מנצפכ'; any other word, including the
        empty string, is returned unchanged.
    """
    if not candidate:
        # candidate[-1] would raise IndexError (e.g. a candidate made only of '"'
        # becomes empty after sort_candidates strips that character)
        return candidate
    suffix_map = str.maketrans("מנצפכ","םןץףך")
    # translate leaves letters outside the map untouched, so no membership test is needed
    return candidate[:-1] + candidate[-1].translate(suffix_map)

In [20]:
#Candidates scoring functions

In [21]:
#Omission and insertion scores

In [22]:
def letter_omission_score(letter_description_after_scoring, candidate_letter):
    """
    Looks up the score of omitting a candidate letter.
    Arguments:
        letter_description_after_scoring(dictionary) - the score of the letters different cases.
        candidate_letter(char) - the candidate letter whose omission is scored.
    Returns:
        Double. The 'subs' score stored for candidate_letter.
    """
    letter_scores = letter_description_after_scoring[candidate_letter]
    return letter_scores['subs']

In [23]:
def letter_addition_score(letter_description_after_scoring, model_letter):
    """
    Looks up the score of a letter that the misspelled (model) word inserted.
    Arguments:
        letter_description_after_scoring(dictionary) - the score of the letters different cases.
        model_letter(char) - the model letter whose insertion is scored.
    Returns:
        Double. The 'add' score stored for model_letter.
    """
    letter_scores = letter_description_after_scoring[model_letter]
    return letter_scores['add']

In [24]:
#Replacement scores functions

In [25]:
#Letter vs letter

In [26]:
#model letter vs candidate letters

In [27]:
def candidate_letters_by_model_letter(letter_description_after_scoring, model_letter, candidate_letters):
    """
    Scores two candidate letters that were replaced by a single model letter.

    One of the two candidate letters is treated as replaced by the model letter
    and the other one as omitted; the cheaper of the two assignments is used.
    Only the first two letters of candidate_letters are read.
    Arguments:
        letter_description_after_scoring(dictionary) - the score of the letters different cases.
        model_letter(String) - letter of the model.
        candidate_letters(String) - letters of the candidate.
    Returns:
        Double. The lower replacement-plus-omission score.
    """
    scores = letter_description_after_scoring
    first_letter, second_letter = candidate_letters[0], candidate_letters[1]
    replace_first = scores[first_letter]['replace'][model_letter] + scores[second_letter]['subs']
    replace_second = scores[second_letter]['replace'][model_letter] + scores[first_letter]['subs']
    return replace_first if replace_first <= replace_second else replace_second


In [28]:
def candidate_letter_by_model_letters(letter_description_after_scoring, model_letters, candidate_letter):
    """
    Scores a single candidate letter that was replaced by two model letters.

    The candidate letter is treated as replaced by one of the model letters
    while the other model letter counts as an insertion; the cheaper of the two
    assignments is used. Only the first two letters of model_letters are read.
    Arguments:
        letter_description_after_scoring(dictionary) - the score of the letters different cases.
        model_letters(String) - letters of the model.
        candidate_letter(String) - letter of the candidate.
    Returns:
        Double. The lower replacement-plus-insertion score.
    """
    scores = letter_description_after_scoring
    replacements = scores[candidate_letter]['replace']
    first_letter, second_letter = model_letters[0], model_letters[1]
    match_first = replacements[first_letter] + scores[second_letter]['add']
    match_second = replacements[second_letter] + scores[first_letter]['add']
    return match_first if match_first <= match_second else match_second
     

In [29]:
def candidate_letters_by_model_letters(letter_description_after_scoring, model_letters, candidate_letters):
    """
    Calculates the score of two candidate letters replaced by two model letters.

    Two alignments are compared and the cheaper one is returned:
      * pairwise - first candidate letter replaced by the first model letter
        and second candidate letter replaced by the second model letter;
      * crossed - first candidate letter replaced by the second model letter,
        second candidate letter omitted, first model letter inserted.
    Only the first two letters of each argument are read.

    Arguments:
        letter_description_after_scoring(dictionary) - the score of the letters different cases.
        model_letters(String) - the letters of the model we should consider its score to replace.
        candidate_letters(String) - the letters of the candidate we should consider its score to replace.
    Returns:
        Double. The score of the letter replacement.
    """
    scores = letter_description_after_scoring
    first_candidate, second_candidate = candidate_letters[0], candidate_letters[1]
    first_model, second_model = model_letters[0], model_letters[1]

    pairwise_score = (scores[first_candidate]['replace'][first_model]
                      + scores[second_candidate]['replace'][second_model])
    # The insertion term belongs to the model letter left without a partner.
    # The previous code charged the 'add' score of the first candidate letter,
    # which is already consumed by the replacement; everywhere else in this
    # notebook 'add' is looked up for a model letter.
    crossed_score = (scores[first_candidate]['replace'][second_model]
                     + scores[second_candidate]['subs']
                     + scores[first_model]['add'])
    return pairwise_score if pairwise_score < crossed_score else crossed_score

In [30]:
def letters_replacement_score(letter_description_after_scoring, model_letters, candidate_letters):
    """
    Calculates the score of a 'replace' step between the model and the candidate.

    The helper used depends on how many letters each side of the step holds
    (one or more than one); when either side is empty the score is 0.
    Arguments:
        letter_description_after_scoring(dictionary) - the score of the letters different cases.
        model_letters(String) - the letters of the model we should consider its score to replace.
        candidate_letters(String) - the letters of the candidate we should consider its score to replace.
    Returns:
        Double. The score of the letter replacement.
    """
    model_count = len(model_letters)
    candidate_count = len(candidate_letters)

    if model_count == 0 or candidate_count == 0:
        return 0
    if model_count == 1 and candidate_count == 1:
        # letter versus letter: a direct lookup
        return letter_description_after_scoring[candidate_letters]['replace'][model_letters]
    if model_count == 1:
        return candidate_letters_by_model_letter(letter_description_after_scoring, model_letters, candidate_letters)
    if candidate_count == 1:
        return candidate_letter_by_model_letters(letter_description_after_scoring, model_letters, candidate_letters)
    return candidate_letters_by_model_letters(letter_description_after_scoring, model_letters, candidate_letters)

In [31]:
def sort_candidates(model_word, candidates,letter_description_after_scoring):
    """
    Scores every correction candidate against the misspelled word and prints
    the edit steps that were scored.

    Before the comparison both words are normalised: final letters are mapped
    to their regular forms and the '"' character is removed. The steps come
    from difflib.SequenceMatcher (model word -> candidate):
      * 'insert'  - letters found only in the candidate, scored with letter_omission_score;
      * 'delete'  - letters found only in the model word, scored with letter_addition_score;
      * 'replace' - scored with letters_replacement_score.
    Arguments:
        model_word(str) - The misspelled word
        candidates(List) - The correction candidates
        letter_description_after_scoring(dictionary) - the probability of the letters.
    Returns:
        Dictionary. Maps each normalised candidate (with its final letter
        restored by converting_suffix) to its total score.
    """
    normalisation_map = str.maketrans("םןץףך", "מנצפכ",'"')
    scores_by_candidate = {}
    for raw_candidate in candidates:
        print(f'The actions needed to go from {model_word} to {raw_candidate} are:')
        print('\n')
        normalised_candidate = raw_candidate.translate(normalisation_map)
        # The model word is normalised here, after the header print, so the
        # first header shows the word exactly as it was passed in
        model_word = model_word.translate(normalisation_map)

        total_score = 0
        matcher = SequenceMatcher(None, model_word, normalised_candidate)
        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
            model_part = model_word[i1:i2]
            candidate_part = normalised_candidate[j1:j2]
            print('{:7}   model_word[{}:{}] --> candidate[{}:{}] {!r:>8} --> {!r}'.format(tag, i1, i2, j1, j2, model_part, candidate_part))
            if tag == 'replace':
                total_score += letters_replacement_score(letter_description_after_scoring, model_part, candidate_part)
            elif tag == 'insert':
                for candidate_letter in candidate_part:
                    total_score += letter_omission_score(letter_description_after_scoring,candidate_letter)
            elif tag == 'delete':
                for model_letter in model_part:
                    total_score += letter_addition_score(letter_description_after_scoring, model_letter)

        scores_by_candidate[converting_suffix(normalised_candidate)] = total_score
        print(f'The candidate total score is: {total_score}')
        print('\n')
    return scores_by_candidate

In [32]:
#filtering scores

In [33]:
def filter_scores(candidates_scores):
    """
    Builds a copy of the scores dictionary without the zero-score words.
    Arguments:
        candidates_scores(Dictionary) - Words with their score; not modified.
    Returns:
        Dictionary. Only the words whose score differs from 0, in their original order.
    """
    return {
        word: score
        for word, score in candidates_scores.items()
        if score != 0
    }

In [34]:
def sort_dict_by_minimum(candidates_scores):
    """
    Prints the candidates ordered from the lowest score to the highest.

    The previous version sorted the keys and the values independently and then
    paired them by position, so a word could be shown with another word's
    score. Here each word keeps its own score.
    Arguments:
        candidates_scores(Dictionary) - Words with their score; not modified.
    Returns:
        Dictionary. A new dictionary with the same items, ordered by ascending
        score (words with equal scores keep their original relative order).
        The same dictionary is also printed.
    """
    candidates_sorted = dict(sorted(candidates_scores.items(), key=lambda item: item[1]))
    print(candidates_sorted)
    return candidates_sorted

In [77]:
#Dataframe functions

In [78]:
def word_spell(word):
    """
    Replaces a regular last letter with its final (suffix) form.
    Arguments:
        word(String) - the word to repair; may be empty.
    Returns:
        String. The word whose last letter was converted when it is one of
        'מנצפכ'; any other word, including the empty string, is returned unchanged.
    """
    if not word:
        # word[-1] would raise IndexError on an empty string
        return word
    final_forms = str.maketrans('מנצפכ', 'םןץףך')
    # translate leaves letters outside the map untouched
    return word[:-1] + word[-1].translate(final_forms)

In [79]:
def sentence_correction(sentence):
    """
    Repairs each word to finish with a correct suffix and joins the words back.
    Arguments:
        sentence(List) - the words of the sentence we should repair (the code
            iterates the argument item by item and passes every non-empty item
            to word_spell).
    Return:
        String. The repaired words joined with single spaces; empty items are kept as they are.
    """
    return ' '.join(
        word_spell(word) if word != '' else word
        for word in sentence
    )

In [80]:
def dataframe_sentences_correction(demo_data):
    """
    Repairs the word suffixes of both text columns in every row of the data frame.
    Arguments:
        demo_data(Pandas Data frame) - the demo data frame with the columns
            'label_str' and 'pred_str'; it is updated in place. Rows are
            addressed with .loc[i, ...] for i in range(len(demo_data)).
    Returns:
        Pandas Dataframe. The same data frame after the repair.
    """
    for row in range(len(demo_data)):
        # split(' ') keeps the empty strings produced by consecutive spaces,
        # so the spacing of the sentence survives the round trip
        repaired_label = sentence_correction(demo_data.loc[row, 'label_str'].split(' '))
        repaired_pred = sentence_correction(demo_data.loc[row, 'pred_str'].split(' '))

        demo_data.loc[row, 'label_str'] = repaired_label
        demo_data.loc[row, 'pred_str'] = repaired_pred
    return demo_data

In [136]:
def data_frame_comparison(demo_data, dictionary_path='./pyspellchecker/spellchecker/resources/he.json.gz'):
    """
    Prints a word-level comparison between the prediction and the label of every row.

    For each row the differences between the two word lists are printed. For
    every step in which exactly one predicted word was replaced by exactly one
    label word, the spell checker's correction of the predicted word is printed
    together with the character-level steps from the predicted word to that
    correction. A summary of the counters is printed at the end.
    Arguments:
        demo_data(Pandas Data frame) - the demo data frame with the columns
            'label_str' and 'pred_str'. Rows are addressed with .loc[i, ...]
            for i in range(len(demo_data)).
        dictionary_path(String) - the word-frequency dictionary loaded into the
            spell checker. Defaults to the path that used to be hard-coded.
    Returns:
        None. Everything is reported with print.
    """
    #Initiating the hebrew dictionary
    spell = SpellChecker(language=None, case_sensitive=True, distance=2)
    spell.word_frequency.load_dictionary(dictionary_path)

    # Only wrong_words is updated by the code below; the remaining counters are
    # kept because the summary at the end still prints them (they stay 0).
    correct_words = 0
    wrong_words = 0
    correct_correct_words = 0
    correct_wrong_words = 0
    wrong_correct_words = 0
    wrong_wrong_words = 0
    wrong_wrong_correct_words = 0
    wrong_wrong_wrong_words = 0

    df_rows = len(demo_data)
    for i in range(df_rows):
        #Taking the sentences from the dataframe and parse them to list of words.
        label_str_sentence = demo_data.loc[i,'label_str'].split()
        pred_str_sentence = demo_data.loc[i,'pred_str'].split()
        print(f'Original sentence: {label_str_sentence}')
        print(f'Prediction sentence: {pred_str_sentence} \n')

        sentence_matcher = SequenceMatcher(None, pred_str_sentence, label_str_sentence)
        for tag, i1, i2, j1, j2 in sentence_matcher.get_opcodes():
            print('{:7}   prediction[{}:{}] --> label[{}:{}] {!r:>8} --> {!r}'.format(
                     tag, i1, i2, j1, j2, pred_str_sentence[i1:i2], label_str_sentence[j1:j2]))
            print('\n')
            # Only a single predicted word replaced by a single label word is analysed
            if tag != 'replace' or i2 - i1 != 1 or j2 - j1 != 1:
                continue

            wrong_words += 1 #finds the all unknown words
            predicted_word = pred_str_sentence[i1]
            word_after_classic_spellcheck = spell.correction(predicted_word)
            print(f'The repair of the original word {predicted_word} is {word_after_classic_spellcheck}')
            if word_after_classic_spellcheck is None:
                # correction() can return None when it has no suggestion;
                # SequenceMatcher cannot compare a string with None
                continue

            word_matcher = SequenceMatcher(None, predicted_word, word_after_classic_spellcheck)
            print(f'Now it is the actions needed to be operated from the prediction {predicted_word} to the word after the spellcheck {word_after_classic_spellcheck}: \n ')
            for tag2, is1, is2, js1, js2 in word_matcher.get_opcodes():
                print('{:7}   prediction[{}:{}] --> label[{}:{}] {!r:>8} --> {!r}'.format(tag2, is1, is2, js1, js2, predicted_word[is1:is2], word_after_classic_spellcheck[js1:js2]))
                print('\n')
        print('----------------------------------------------------------')

    print(f'There are {correct_words} correct words')
    print(f'There are {wrong_words} wrong words')
    print(f'There are {correct_words + wrong_words} total words')
    print(f'The number of correct-correct words is {correct_correct_words}')
    print(f'The number of correct-wrong words is {correct_wrong_words}')
    print(f'The number of wrong-correct words is {wrong_correct_words}')
    print(f'The number of wrong-wrong words is {wrong_wrong_words}')
    print(f'The number of wrong-wrong-correct words is {wrong_wrong_correct_words}')
    print(f'The number of wrong-wrong-wrong words is {wrong_wrong_wrong_words}')
    print('\n')

In [81]:
# Load the predictions data frame from its pickle (get_letters_pickle simply
# unpickles whatever object the file holds).
df = get_letters_pickle('pred_res.pkl')

In [82]:
# Show the frame as it was loaded from the pickle.
df

Unnamed: 0,pred_str,label_str
0,זה נורא ברור לו,זה נורא ברור לא
1,אז ז זהו שלא,אז זהו שלא
2,אנחנו נמצאימ עכשיו,אנחנו נמצאימ עכשיו
3,היא נוצרה הלחה מאוד מאוד ברורה שאמרה,נוצרה הלכה מאוד מאוד ברורה שאמרה
4,מה בדיוק הקריטיונימ למוות,מה בדיוק הקריטריונימ למוות
...,...,...
2270,בכל מה שנוגע לפעילות הכוללת של המנהל הציבורי,בכל מה שנוגע לפעילות הכוללת של המנהל הציבורי
2271,א לאותו פיקוח,לאותו פיקוח
2272,ובכלל לכל מה שנוגע להגדרה החוקית,ובכלל בכל מה שנוגע לההגדרה החוקית
2273,אז אני מקווה שתהנו מהסרטונ,אז אני מקווה שתהנו מהסרטונ


In [137]:
# Repair the word suffixes in both text columns.
# NOTE(review): the helper updates df in place, and this cell's execution count
# (137) is out of sequence with its neighbours - confirm the notebook still
# runs top to bottom after a kernel restart.
df = dataframe_sentences_correction(df)

In [84]:
# Show the frame again after the suffix repair.
df

Unnamed: 0,pred_str,label_str
0,זה נורא ברור לו,זה נורא ברור לא
1,אז ז זהו שלא,אז זהו שלא
2,אנחנו נמצאים עכשיו,אנחנו נמצאים עכשיו
3,היא נוצרה הלחה מאוד מאוד ברורה שאמרה,נוצרה הלכה מאוד מאוד ברורה שאמרה
4,מה בדיוק הקריטיונים למוות,מה בדיוק הקריטריונים למוות
...,...,...
2270,בכל מה שנוגע לפעילות הכוללת של המנהל הציבורי,בכל מה שנוגע לפעילות הכוללת של המנהל הציבורי
2271,א לאותו פיקוח,לאותו פיקוח
2272,ובכלל לכל מה שנוגע להגדרה החוקית,ובכלל בכל מה שנוגע לההגדרה החוקית
2273,אז אני מקווה שתהנו מהסרטון,אז אני מקווה שתהנו מהסרטון


In [35]:
#Letters switches list based on Dina's confusion matrix

In [55]:
# Character-level mistake data; the helpers above read its 'replaces',
# 'inserts' and 'deletes' entries.
letters_dictionary = get_letters_pickle('char_mistakes.pkl')

In [56]:
# "((supposed to be,model's output),number describe the frequency of substitution)"
# Flatten the 'replaces' entry into a list of ((letter, letter), number) tuples.
letters_switches = parse_letter_dictionary(letters_dictionary)

In [57]:
# Show the parsed list of switches.
letters_switches

[(('ו', 'א'), 11),
 (('ח', 'כ'), 17),
 (('ט', 'נ'), 1),
 (('י', 'ט'), 2),
 (('ב', 'ד'), 1),
 (('ה', 'א'), 48),
 (('ט', 'ת'), 16),
 (('ו', 'ב'), 11),
 (('ב', 'ו'), 22),
 (('ה', 'ד'), 3),
 (('כ', 'ח'), 27),
 (('ה', 'ת'), 12),
 (('ע', 'א'), 35),
 (('ה', 'נ'), 8),
 (('ו', 'ה'), 17),
 (('ה', 'מ'), 11),
 (('ת', 'א'), 4),
 (('ש', 'ג'), 1),
 (('ת', 'ט'), 48),
 (('ד', 'א'), 2),
 (('ר', 'א'), 7),
 (('ח', 'ה'), 2),
 (('ד', 'ת'), 5),
 (('ל', 'נ'), 7),
 (('מ', 'נ'), 24),
 (('ת', 'ו'), 2),
 (('ב', 'פ'), 2),
 (('ר', 'ת'), 1),
 (('ס', 'מ'), 1),
 (('מ', 'א'), 7),
 (('א', 'ט'), 2),
 (('ר', 'ג'), 2),
 (('ע', 'ה'), 27),
 (('מ', 'ר'), 1),
 (('ג', 'מ'), 1),
 (('ד', 'ש'), 1),
 (('ש', 'ל'), 1),
 (('כ', 'ל'), 1),
 (('נ', 'ת'), 1),
 (('ז', 'ש'), 1),
 (('ק', 'כ'), 27),
 (('א', 'ע'), 64),
 (('א', 'ה'), 25),
 (('י', 'ו'), 4),
 (('מ', 'ת'), 3),
 (('ה', 'י'), 25),
 (('ה', 'ל'), 5),
 (('ל', 'ה'), 9),
 (('ת', 'צ'), 3),
 (('ר', 'נ'), 1),
 (('כ', 'א'), 2),
 (('ש', 'ס'), 9),
 (('ר', 'מ'), 1),
 (('ע', 'י'), 4),
 (('י', 'נ

In [58]:
# The letter description - a list of dictionaries, one per letter; each one is
# read below through its 'letter', 'subs', 'add' and 'replace' keys.
letters_description = json_loads_file('./letters_probs.json')

In [59]:
# Re-key the list of per-letter dictionaries into a single dictionary whose
# keys are the letters themselves, then print it for inspection.
letters_description_dictionary = list_of_jsons_to_letters_dictionary(letters_description)
print(letters_description_dictionary)

{'א': {'subs': 0, 'add': 0, 'replace': {'ב': 0.05, 'ג': 0.006, 'ד': 0.08, 'ה': 0.05, 'ו': 0.006, 'ז': 0.08, 'ח': 0.05, 'ט': 0.006, 'י': 0.08, 'כ': 0.05, 'ל': 0.006, 'מ': 0.08, 'נ': 0.05, 'ס': 0.006, 'ע': 0.08, 'פ': 0.05, 'צ': 0.006, 'ק': 0.08, 'ר': 0.05, 'ש': 0.006, 'ת': 0.08}}, 'ב': {'subs': 0, 'add': 0, 'replace': {'א': 0.05, 'ג': 0.006, 'ד': 0.08, 'ה': 0.05, 'ו': 0.006, 'ז': 0.08, 'ח': 0.05, 'ט': 0.006, 'י': 0.08, 'כ': 0.05, 'ל': 0.006, 'מ': 0.08, 'נ': 0.05, 'ס': 0.006, 'ע': 0.08, 'פ': 0.05, 'צ': 0.006, 'ק': 0.08, 'ר': 0.05, 'ש': 0.006, 'ת': 0.08}}, 'ג': {'subs': 0, 'add': 0, 'replace': {'א': 0.05, 'ב': 0.006, 'ד': 0.08, 'ה': 0.05, 'ו': 0.006, 'ז': 0.08, 'ח': 0.05, 'ט': 0.006, 'י': 0.08, 'כ': 0.05, 'ל': 0.006, 'מ': 0.08, 'נ': 0.05, 'ס': 0.006, 'ע': 0.08, 'פ': 0.05, 'צ': 0.006, 'ק': 0.08, 'ר': 0.05, 'ש': 0.006, 'ת': 0.08}}, 'ד': {'subs': 0, 'add': 0, 'replace': {'א': 0.08, 'ב': 0.05, 'ג': 0.006, 'ה': 0.05, 'ו': 0.006, 'ז': 0.08, 'ח': 0.05, 'ט': 0.006, 'י': 0.08, 'כ': 0.05, 'ל': 0.006

In [60]:
# Fill in the 'replace' scores from the letters_switches list of tuples (based on the confusion matrix).
# Note: this rebinds letters_description, which previously held the list loaded from the JSON file,
# to the scored dictionary printed below.
letters_description = switches_confusion_matrix_scoring(letters_description_dictionary,letters_switches)
print(letters_description)

{'א': {'subs': 0, 'add': 0, 'replace': {'ב': 1, 'ג': 0, 'ד': 1, 'ה': 25, 'ו': 11, 'ז': 1, 'ח': 0, 'ט': 2, 'י': 19, 'כ': 0, 'ל': 2, 'מ': 1, 'נ': 1, 'ס': 0, 'ע': 64, 'פ': 0, 'צ': 4, 'ק': 1, 'ר': 1, 'ש': 2, 'ת': 0}}, 'ב': {'subs': 0, 'add': 0, 'replace': {'א': 3, 'ג': 0, 'ד': 1, 'ה': 4, 'ו': 22, 'ז': 0, 'ח': 1, 'ט': 0, 'י': 1, 'כ': 1, 'ל': 1, 'מ': 7, 'נ': 0, 'ס': 0, 'ע': 0, 'פ': 2, 'צ': 0, 'ק': 1, 'ר': 0, 'ש': 0, 'ת': 0}}, 'ג': {'subs': 0, 'add': 0, 'replace': {'א': 0, 'ב': 1, 'ד': 0, 'ה': 0, 'ו': 0, 'ז': 0, 'ח': 0, 'ט': 0, 'י': 0, 'כ': 0, 'ל': 0, 'מ': 1, 'נ': 0, 'ס': 0, 'ע': 0, 'פ': 0, 'צ': 0, 'ק': 1, 'ר': 0, 'ש': 0, 'ת': 0}}, 'ד': {'subs': 0, 'add': 0, 'replace': {'א': 2, 'ב': 0, 'ג': 0, 'ה': 1, 'ו': 0, 'ז': 0, 'ח': 0, 'ט': 4, 'י': 3, 'כ': 0, 'ל': 0, 'מ': 0, 'נ': 1, 'ס': 0, 'ע': 0, 'פ': 0, 'צ': 0, 'ק': 0, 'ר': 0, 'ש': 1, 'ת': 5}}, 'ה': {'subs': 0, 'add': 0, 'replace': {'א': 48, 'ב': 3, 'ג': 0, 'ד': 3, 'ו': 20, 'ז': 1, 'ח': 0, 'ט': 3, 'י': 25, 'כ': 2, 'ל': 5, 'מ': 11, 'נ': 8, 'ס': 0, 'ע'

In [61]:
# Fill in the per-letter 'subs' and 'add' scores (they are 0 in the previous output and non-zero in the next one).
# NOTE(review): letters_dictionary is not defined in this part of the notebook - presumably the
# dictionary loaded with get_letters_pickle; confirm it exists before this cell runs.
letters_description = insertion_and_remove_commonly_score(letters_description,letters_dictionary)

In [62]:
# Show the dictionary after the 'subs' and 'add' scores were filled in
print(letters_description)

{'א': {'subs': 94, 'add': 189, 'replace': {'ב': 1, 'ג': 0, 'ד': 1, 'ה': 25, 'ו': 11, 'ז': 1, 'ח': 0, 'ט': 2, 'י': 19, 'כ': 0, 'ל': 2, 'מ': 1, 'נ': 1, 'ס': 0, 'ע': 64, 'פ': 0, 'צ': 4, 'ק': 1, 'ר': 1, 'ש': 2, 'ת': 0}}, 'ב': {'subs': 21, 'add': 13, 'replace': {'א': 3, 'ג': 0, 'ד': 1, 'ה': 4, 'ו': 22, 'ז': 0, 'ח': 1, 'ט': 0, 'י': 1, 'כ': 1, 'ל': 1, 'מ': 7, 'נ': 0, 'ס': 0, 'ע': 0, 'פ': 2, 'צ': 0, 'ק': 1, 'ר': 0, 'ש': 0, 'ת': 0}}, 'ג': {'subs': 7, 'add': 1, 'replace': {'א': 0, 'ב': 1, 'ד': 0, 'ה': 0, 'ו': 0, 'ז': 0, 'ח': 0, 'ט': 0, 'י': 0, 'כ': 0, 'ל': 0, 'מ': 1, 'נ': 0, 'ס': 0, 'ע': 0, 'פ': 0, 'צ': 0, 'ק': 1, 'ר': 0, 'ש': 0, 'ת': 0}}, 'ד': {'subs': 7, 'add': 5, 'replace': {'א': 2, 'ב': 0, 'ג': 0, 'ה': 1, 'ו': 0, 'ז': 0, 'ח': 0, 'ט': 4, 'י': 3, 'כ': 0, 'ל': 0, 'מ': 0, 'נ': 1, 'ס': 0, 'ע': 0, 'פ': 0, 'צ': 0, 'ק': 0, 'ר': 0, 'ש': 1, 'ת': 5}}, 'ה': {'subs': 167, 'add': 197, 'replace': {'א': 48, 'ב': 3, 'ג': 0, 'ד': 3, 'ו': 20, 'ז': 1, 'ח': 0, 'ט': 3, 'י': 25, 'כ': 2, 'ל': 5, 'מ': 11, 'נ': 8, 'ס

In [63]:
# Total score computed by sum_of_scores over letters_description (2837 in the recorded output);
# it is the divisor used for the relative scores below.
total_score = sum_of_scores(letters_description)

In [68]:
# Display the total score
total_score

2837

In [66]:
# Convert the raw scores to relative scores using total_score; the recorded output matches
# score / total_score (e.g. 94 / 2837 ≈ 0.0331 for the 'subs' score of 'א').
relative_letters_description = letters_scores_division(letters_description, total_score)

In [67]:
# Display the relative scores
relative_letters_description

{'א': {'subs': 0.03313359182234755,
  'add': 0.06661966866408178,
  'replace': {'ב': 0.00035248501938667606,
   'ג': 0.0,
   'ד': 0.00035248501938667606,
   'ה': 0.008812125484666901,
   'ו': 0.0038773352132534366,
   'ז': 0.00035248501938667606,
   'ח': 0.0,
   'ט': 0.0007049700387733521,
   'י': 0.0066972153683468455,
   'כ': 0.0,
   'ל': 0.0007049700387733521,
   'מ': 0.00035248501938667606,
   'נ': 0.00035248501938667606,
   'ס': 0.0,
   'ע': 0.022559041240747268,
   'פ': 0.0,
   'צ': 0.0014099400775467042,
   'ק': 0.00035248501938667606,
   'ר': 0.00035248501938667606,
   'ש': 0.0007049700387733521,
   'ת': 0.0}},
 'ב': {'subs': 0.007402185407120197,
  'add': 0.004582305252026789,
  'replace': {'א': 0.0010574550581600281,
   'ג': 0.0,
   'ד': 0.00035248501938667606,
   'ה': 0.0014099400775467042,
   'ו': 0.007754670426506873,
   'ז': 0.0,
   'ח': 0.00035248501938667606,
   'ט': 0.0,
   'י': 0.00035248501938667606,
   'כ': 0.00035248501938667606,
   'ל': 0.00035248501938667606,
   

In [74]:
# NOTE(review): compare_letters_scoring is defined elsewhere; the visible part of the output below is
# identical to the output of In [67], so confirm what this step changes. The name is rebound to the
# result, so re-running this cell feeds the previous result back into the function.
relative_letters_description = compare_letters_scoring(relative_letters_description)

In [75]:
# Display the relative scores after compare_letters_scoring
relative_letters_description

{'א': {'subs': 0.03313359182234755,
  'add': 0.06661966866408178,
  'replace': {'ב': 0.00035248501938667606,
   'ג': 0.0,
   'ד': 0.00035248501938667606,
   'ה': 0.008812125484666901,
   'ו': 0.0038773352132534366,
   'ז': 0.00035248501938667606,
   'ח': 0.0,
   'ט': 0.0007049700387733521,
   'י': 0.0066972153683468455,
   'כ': 0.0,
   'ל': 0.0007049700387733521,
   'מ': 0.00035248501938667606,
   'נ': 0.00035248501938667606,
   'ס': 0.0,
   'ע': 0.022559041240747268,
   'פ': 0.0,
   'צ': 0.0014099400775467042,
   'ק': 0.00035248501938667606,
   'ר': 0.00035248501938667606,
   'ש': 0.0007049700387733521,
   'ת': 0.0}},
 'ב': {'subs': 0.007402185407120197,
  'add': 0.004582305252026789,
  'replace': {'א': 0.0010574550581600281,
   'ג': 0.0,
   'ד': 0.00035248501938667606,
   'ה': 0.0014099400775467042,
   'ו': 0.007754670426506873,
   'ז': 0.0,
   'ח': 0.00035248501938667606,
   'ט': 0.0,
   'י': 0.00035248501938667606,
   'כ': 0.00035248501938667606,
   'ל': 0.00035248501938667606,
   

In [70]:
# print(f'the total sum is {total_score}')

In [71]:
# maximum_score = find_max_score(letters_description)
# print(f'The maximum score is {maximum_score}')

In [72]:
# #Getting the different scores to each letter - score of substitution, insertion and remove
# letter_description_after_scoring = letters_calculation_scores(letters_description, maximum_score)

In [73]:
# letter_description_after_scoring 

In [138]:
# Print, for each row, the original and predicted sentences and the operations between them (see the output below).
# NOTE(review): df is not defined in this part of the notebook; confirm it is created before this cell runs.
data_frame_comparison(df)

Original sentence: ['זה', 'נורא', 'ברור', 'לא']
Prediction sentence: ['זה', 'נורא', 'ברור', 'לו'] 

equal     prediction[0:3] --> label[0:3] ['זה', 'נורא', 'ברור'] --> ['זה', 'נורא', 'ברור']


replace   prediction[3:4] --> label[3:4]   ['לו'] --> ['לא']


The repair of the original word לו is לו
Now it is the actions needed to be operated from the prediction לו to the word after the spellcheck לו: 
 
equal     prediction[0:2] --> label[0:2]     'לו' --> 'לו'


----------------------------------------------------------
Original sentence: ['אז', 'זהו', 'שלא']
Prediction sentence: ['אז', 'ז', 'זהו', 'שלא'] 

equal     prediction[0:1] --> label[0:1]   ['אז'] --> ['אז']


delete    prediction[1:2] --> label[1:1]    ['ז'] --> []


equal     prediction[2:4] --> label[1:3] ['זהו', 'שלא'] --> ['זהו', 'שלא']


----------------------------------------------------------
Original sentence: ['אנחנו', 'נמצאים', 'עכשיו']
Prediction sentence: ['אנחנו', 'נמצאים', 'עכשיו'] 

equal     prediction[0:3] -->

In [49]:
# Intermediate level - getting the list of candidate words

In [50]:
# Set up the Hebrew spell checker: create it without a bundled language,
# then load the Hebrew word-frequency file into it.
hebrew_dictionary_path = './pyspellchecker/spellchecker/resources/he.json.gz'
spell = SpellChecker(language=None, case_sensitive=True, distance=2)
spell.word_frequency.load_dictionary(hebrew_dictionary_path)

# Collect the words that the loaded dictionary does not know
sample_words = ['חטול','אוהב','אבא','גדול']
misspelled = spell.unknown(sample_words)
print(misspelled)

{'חטול'}


In [51]:
# For every misspelled word: build the candidate list, clean it, score it and print the chosen candidate.
# NOTE(review): letter_description_after_scoring is only assigned in a cell that is commented out in the
# visible part of this notebook (In [72]); confirm it is defined before this cell runs on a fresh kernel.
for word in misspelled:
    # Collect the raw candidates at edit distance 1 and 2 from the misspelled word
    lev_dist1 = list(spell.edit_distance_1(word))
    lev_dist2 = list(spell.edit_distance_2(word))
    total_candidates = lev_dist1 + lev_dist2

    # Drop the candidates that filter_candidates marks for removal
    words_to_remove = filter_candidates(total_candidates)
    total_candidates = remove_words_from_list(total_candidates, words_to_remove)

    # Remove the duplicate words
    total_candidates = remove_duplicates_from_list(total_candidates)

    # Remove the misspelled word itself from the candidates. The filtering above may already
    # have dropped it, so guard the removal instead of letting remove() raise ValueError.
    if word in total_candidates:
        total_candidates.remove(word)

    # Score each candidate, filter the scores and pick the final candidate
    candidates_scores = sort_candidates(word, total_candidates, letter_description_after_scoring)
    candidates_scores = filter_scores(candidates_scores)
    candidate = get_candidate(candidates_scores)
    print(f'The final candidate is {candidate}')

The actions needed to go from חטול to חסול are:


equal     model_word[0:1] --> candidate[0:1]      'ח' --> 'ח'
replace   model_word[1:2] --> candidate[1:2]      'ט' --> 'ס'
equal     model_word[2:4] --> candidate[2:4]     'ול' --> 'ול'
The candidate total score is: 0.026869567638195393


The actions needed to go from חטול to חטונל are:


equal     model_word[0:3] --> candidate[0:3]    'חטו' --> 'חטו'
insert    model_word[3:3] --> candidate[3:4]       '' --> 'נ'
equal     model_word[3:4] --> candidate[4:5]      'ל' --> 'ל'
The candidate total score is: 0.02604009499960951


The actions needed to go from חטול to חאטול are:


equal     model_word[0:1] --> candidate[0:1]      'ח' --> 'ח'
insert    model_word[1:1] --> candidate[1:2]       '' --> 'א'
equal     model_word[1:4] --> candidate[2:5]    'טול' --> 'טול'
The candidate total score is: 0.02362416421399758


The actions needed to go from חטול to חטוו are:


equal     model_word[0:3] --> candidate[0:3]    'חטו' --> 'חטו'
replace   mode