# OCR Correction - Gensim

In [204]:
import io
from collections import defaultdict
import os, os.path
import sys
sys.path.insert(0, "..")
from string import punctuation
import pprint
import re
import collections

from src import iterators

import pandas as pd
# NLP
import spacy
import enchant
import fasttext

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import nltk

import gensim
from gensim import corpora
from gensim import parsing

In [205]:
# Load csv
csv = iterators.iterate_directory("../data/processed/selected_articles/", ".csv")
df = pd.concat([pd.read_csv(c["article_path"]) for c in csv],ignore_index=True)
df.sort_values(by=["count"], ascending=False, inplace=True)

### Pre-process to clean up documents

In [215]:
punctuation = ",/<>;':\"[]\\{}|`~@#$%^&*()_+-="

def remove_punctuation(text):
    """Remove punctuation"""
    no_punct = "".join([c for c in text if c not in punctuation])
    return(no_punct)

In [216]:
df["text_clean"] = df["text"].apply(lambda x: remove_punctuation(x))

In [217]:
tokenizer = RegexpTokenizer(r'[a-zA-Z]+') # No punctuation and no numbers
df["text_clean"] = df["text_clean"].apply(lambda x: tokenizer.tokenize(x.lower()))

In [218]:
def remove_stopwords(text):
    """Remove stopwords as defined by nltk"""
    words = [w for w in text if w not in stopword_list]
    return words

In [219]:
df["text_clean"] = df["text_clean"].apply(lambda x: remove_stopwords(x))

In [220]:
def remove_one_char(text):
    """Eliminates all one char words"""
    words = [w for w in text if len(w)>1]
    return words

In [221]:
df["text_clean"] = df["text_clean"].apply(lambda x: remove_one_char(x))

### Create the corpus

In [45]:
# Extract the column as list of documents
texts = df['text_clean'].tolist()

In [46]:
# Count word frequencies
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Only keep words that appear more than once
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
# processed_corpus is a list of words for each doc in the documents

In [67]:
# Flatten the litst of words into a single list to count frequencies
from itertools import chain
chain_list = list(chain.from_iterable(processed_corpus))
vocab = collections.Counter(chain_list)
vocab.most_common(10)

[('uur', 1836),
 ('stg', 1772),
 ('grote', 1731),
 ('wij', 1678),
 ('no', 1646),
 ('tel', 1615),
 ('koop', 1534),
 ('onze', 1528),
 ('wel', 1505),
 ('ca', 1482)]

In [50]:
# Save file
with open('corpus.txt', 'w') as f:
    for doc in processed_corpus:
        for word in doc:
            f.write("%s\n" % word)

In [53]:
# associate each word in the corpus with a unique integer ID
dictionary = corpora.Dictionary(processed_corpus)
dictionary.save("dictionary.cor")
print(dictionary)

Dictionary(52733 unique tokens: ['aanraking', 'aanstonds', 'aantrekken', 'aanwezig', 'aanwezigheid']...)


In [29]:
# convert our entire original corpus to a list of vectors:
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
#pprint.pprint(bow_corpus)

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 2),
  (5, 3),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 2),
  (21, 1),
  (22, 2),
  (23, 1),
  (24, 2),
  (25, 2),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 2),
  (30, 2),
  (31, 1),
  (32, 3),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 3),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 2),
  (50, 2),
  (51, 1),
  (52, 3),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 6),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 2),
  (69, 1),
  (70, 1),
  (71, 9),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 1),
  (87, 1),
  (88, 1),
  (89, 1),
  (90, 1),
  (91, 1)

KeyboardInterrupt: 

### FastText in Gensim

In [60]:
# We could do this for every 10 years so that we have a model to substitute similar error words
# whitin a span of 10 years
from gensim.models.fasttext import FastText as FT_gensim

corpus_file = ("corpus.txt")

model_gensim = FT_gensim(
    size=100,
    iter=50,
    word_ngrams=3,
    min_count=1, # maybe increase this?
    window=6,
    negative=20)

# build the vocabulary
model_gensim.build_vocab(corpus_file=corpus_file)

# train the model
model_gensim.train(
    corpus_file=corpus_file, epochs=model_gensim.epochs,
    total_examples=model_gensim.corpus_count, total_words=model_gensim.corpus_total_words
)

print(model_gensim)

FastText(vocab=52733, size=100, alpha=0.025)


In [62]:
model_gensim.save("model_gensim.bin")

#### Test out similarity of mistakes

In [143]:
spell_mistake_min_frequency = 1
fasttext_min_similarity = 0.91
def include_spell_mistake(word, similar_word, score):
    """
    Check if similar word passes some rules to be considered a spelling mistake
    
    Rules:
       1. Similarity score should be greater than a threshold
       2. Length of the word with spelling error should be greater than 3.
       3. spelling mistake must occur at least some N times in the corpus
       4. Must not be a correct Dutch word.
       -. Has edit distance less than 2
    """
    return (score > fasttext_min_similarity
            and len(similar_word) > 1
            and vocab[similar_word] >= spell_mistake_min_frequency)
#            and not enchant_nl.check(similar_word))

In [144]:
word_to_mistakes = collections.defaultdict(list)
 
for word, freq in vocab.items():
    # Query the fasttext model for 50 closest neighbors to the word
    results = model_gensim.wv.most_similar(word, topn=50)
    for s_w in results:
        if include_spell_mistake(word, s_w[0], s_w[1]):
            word_to_mistakes[word].append(s_w)

In [149]:
word_to_mistakes

defaultdict(list,
            {'buitenstaander': [('buitenstaanders', 0.9139913320541382)],
             'vakorganisaties': [('vakorganisatie', 0.9088793992996216)],
             'andersdenkenden': [('andersdenkende', 0.9110281467437744)],
             'andersdenkende': [('andersdenkenden', 0.9110281467437744)],
             'buitenstaanders': [('buitenstaander', 0.9139913320541382)],
             'volksvertegenwoordiger': [('volksvertegenwoordigers',
               0.9309079647064209)],
             'standsorganisaties': [('standsorganisatie', 0.907096803188324)],
             'eestvergadering': [('feestvergadering', 0.9085502624511719)],
             'hoogwaardigheidsbekleder': [('hoogwaardigheidsbekleders',
               0.922699511051178)],
             'belangstellenden': [('belangstellende', 0.9099758863449097)],
             'verslaggever': [('verslaggevers', 0.9125968813896179)],
             'vakorganisatie': [('vakorganisaties', 0.9088793992996216)],
             'administra

In [146]:
inverted_index = {}
for word, mistakes in word_to_mistakes.items():
    for mistake in mistakes:
        if mistake != word:
            inverted_index[mistake] = word