# Integrating cleanup code

In [1]:
import sys;sys.path.append('..')
from ppanlp.cleanup import *
from ppanlp import *

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ryanheuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def tokenize_agnostic(txt):
    """Split text into tokens without discarding any character.

    A token is either a maximal run of word characters and apostrophes, or
    exactly one character of any other kind (space, newline, punctuation,
    symbol, ...). Every character of the input therefore lands in exactly one
    token, so ``''.join(tokenize_agnostic(txt)) == txt`` for any string.

    The second alternative is written as a negated class on purpose: listing
    punctuation explicitly is error-prone, because a hyphen between two
    characters inside ``[...]`` is read as a range, and any character that
    falls outside the listed set is silently dropped by ``re.findall``.

    Args:
        txt: the string to tokenize.

    Returns:
        list of str tokens, in input order.
    """
    return re.findall(r"[\w']+|[^\w']", txt)

def untokenize_agnostic(l):
    """Concatenate a sequence of string tokens back into one string.

    No separator is inserted between tokens.

    Args:
        l: iterable of str tokens.

    Returns:
        The tokens joined end to end.
    """
    no_separator = ''
    return no_separator.join(l)

In [3]:
# Sanity check: tokenizing and then untokenizing must reproduce the sample exactly.
s = "Oh no I can't poss-ibly———---- ! ££3eee-e"
assert s == untokenize_agnostic(tokenize_agnostic(s))

In [4]:
def cleanup_str(txt, use_nltk_tokenizer=False, **page_attrs):
    """Clean one page of text and record every correction that was applied.

    Steps, in order:
      1. ``rejoin_linebreaks`` on the raw text;
      2. ``replace_historic_long_s`` on the result;
      3. tokenization, with ``word_tokenize`` or ``tokenize_agnostic``;
      4. per token: replace it if it is a key of the OCR correction rules,
         then replace the outcome again if it is a key of the f/long-s/s rules;
      5. join the tokens back into text with ``untokenize`` or
         ``untokenize_agnostic``.

    Args:
        txt: the page text to clean.
        use_nltk_tokenizer: if true, use ``word_tokenize`` / ``untokenize``;
            otherwise use the agnostic tokenizer pair.
        **page_attrs: extra page fields, copied into the result. Because they
            are unpacked after ``'page_text'`` and before the remaining keys,
            a ``'page_text'`` entry here replaces the value computed by this
            function, while ``'page_text_clean'``, ``'page_tokens'`` and
            ``'corrections'`` entries are always replaced by this function's
            values. Only ``page_attrs['corrections']['headers']`` is carried
            over into the new ``'corrections'`` dict.

    Returns:
        dict with ``page_attrs`` plus:
          * ``'page_text'``: the text after steps 1-2 (see note above);
          * ``'page_text_clean'``: the text after step 5;
          * ``'page_tokens'``: corrected tokens containing at least one
            alphabetic character;
          * ``'corrections'``: dict of lists under ``'headers'``, ``'ocr'``,
            ``'linebreaks'``, ``'long_s'`` and ``'f_s'``. The ``'ocr'`` and
            ``'f_s'`` lists hold ``(original, corrected)`` pairs in token
            order and are not aggregated, so repeats appear repeatedly.
    """
    # Correction logs; the linebreak and long-s lists are handed to the helpers
    # below, which are expected to append to them.
    specific_ocr_corrections = []
    specific_linebreak_corrections = []
    specific_long_s_corrections = []
    specific_f_s_hack_corrections = []

    # NOTE(review): the rules are loaded on every call. If these loaders are
    # not cached inside ppanlp, loading once per batch would be cheaper.
    correction_rules = load_correction_rules()
    clever_f_s_hack_rules = load_f_s_hack_corrections()

    # Both helpers return (text, count); the count is discarded because
    # nothing in this function reads it.
    page_text, _ = rejoin_linebreaks(txt, specific_linebreak_corrections)
    page_text, _ = replace_historic_long_s(page_text, specific_long_s_corrections)

    if use_nltk_tokenizer:
        tokens = word_tokenize(page_text)
    else:
        tokens = tokenize_agnostic(page_text)

    # Single pass: OCR rule first, then the f/long-s/s rule on the outcome, so
    # a token fixed by the first rule set can still be fixed by the second.
    corrected_tokens = []
    for token in tokens:
        corrected_token = token
        if corrected_token in correction_rules:
            corrected_token = correction_rules[token]
            specific_ocr_corrections.append((token, corrected_token))
        if corrected_token in clever_f_s_hack_rules:
            replacement = clever_f_s_hack_rules[corrected_token]
            specific_f_s_hack_corrections.append((corrected_token, replacement))
            corrected_token = replacement
        corrected_tokens.append(corrected_token)

    # convert corrected tokens back to text
    if use_nltk_tokenizer:
        corrected_text = untokenize(corrected_tokens)
    else:
        corrected_text = untokenize_agnostic(corrected_tokens)

    # keep only tokens with at least one letter (drops whitespace, punctuation, digits)
    corrected_tokens_real = [
        tok for tok in corrected_tokens if any(ch.isalpha() for ch in tok)
    ]

    # An incoming 'corrections' entry may be absent or None; treat both as empty.
    incoming_corrections = page_attrs.get('corrections') or {}

    return {
        'page_text': page_text,
        **page_attrs,
        'page_text_clean': corrected_text,
        'page_tokens': corrected_tokens_real,
        'corrections': {
            'headers': incoming_corrections.get('headers', []),
            'ocr': specific_ocr_corrections,
            'linebreaks': specific_linebreak_corrections,
            'long_s': specific_long_s_corrections,
            'f_s': specific_f_s_hack_corrections,
        }
    }

In [5]:
# Demo on a short sample; the output below shows one linebreak, one long-s,
# two OCR and one f/s correction being logged.
cleanup_str('Paradife loſt de-\nfigned a——___---_-- paradife')

{'page_text': 'Paradife lost defigned a——___---_-- paradife',
 'page_text_clean': 'Paradise lost designed a——___---_-- paradise',
 'page_tokens': ['Paradise', 'lost', 'designed', 'a', 'paradise'],
 'corrections': {'headers': [],
  'ocr': [('defigned', 'designed'), ('paradife', 'paradise')],
  'linebreaks': [('de-\nfigned', 'defigned')],
  'long_s': [('loſt', 'lost')],
  'f_s': [('Paradife', 'Paradise')]}}

In [6]:
# Instantiate the corpus object (PPACorpus is not defined in this notebook;
# presumably it comes from the ppanlp star import in the first cell).
corpus = PPACorpus()

In [7]:
# corpus.meta

In [8]:
# pages = corpus.text('mdp.39015019158776').pages

In [9]:
def cleanup_page(page_d):
    """Run ``cleanup_str`` on a single page record.

    The text to clean is ``page_d['page_text_clean']`` when that key is
    present, otherwise ``page_d['page_text']``, otherwise the empty string.
    Every entry of ``page_d`` is also forwarded to ``cleanup_str`` as a
    keyword argument.

    Args:
        page_d: dict describing one page.

    Returns:
        The dict returned by ``cleanup_str``.
    """
    if 'page_text_clean' in page_d:
        source_text = page_d['page_text_clean']
    else:
        source_text = page_d.get('page_text', '')
    return cleanup_str(source_text, **page_d)

def cleanup_pages(pages_ld, remove_headers=True):
    """Process headers for a batch of pages, then clean each page.

    Args:
        pages_ld: a list of page dicts, or a pandas DataFrame (including
            DataFrame subclasses) with one row per page; a DataFrame is
            converted to a list of row dicts first.
        remove_headers: forwarded to ``process_headers`` as its
            ``remove_headers`` argument. Defaults to True, the value that
            was previously hard-coded.

    Returns:
        list of dicts, one per page, as returned by ``cleanup_page``.
    """
    # isinstance (rather than an exact type comparison) also accepts DataFrame subclasses
    if isinstance(pages_ld, pd.DataFrame):
        pages_ld = pages_ld.to_dict('records')
    pages_ld = process_headers(pages_ld, remove_headers=remove_headers)
    return [cleanup_page(page_d) for page_d in pages_ld]


In [10]:
# cleanup_pages(pages)

In [11]:
# Long-running batch step: the captured progress bar below reports 1,893,144
# items at roughly 78 it/s, i.e. several hours for a full run.
# NOTE(review): presumably writes the plain-text mini corpus as JSON files — confirm in ppanlp.
save_plaintext_mini_corpus_jsons()

  1%|          | 11493/1893144 [03:03<6:39:05, 78.58it/s]