<a href="https://colab.research.google.com/github/Princeton-CDH/ppa-nlp/blob/develop/notebooks/008_OCR_cleanup_of_PPA_text_corpus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cleaning up OCR-related artifacts in the PPA text corpus (applied to the text in the JSON-formatted corpus)


In [28]:
# imports needed for notebook
import os,json,random
from tqdm import tqdm
import numpy as np
import pandas as pd
tqdm.pandas()

## Getting corpus

### Corpus metadata

In [5]:
# Mount google drive and set path to corpus
from google.colab import drive
drive.mount('/content/drive/')
path_corpus='/content/drive/Shareddrives/PPA/2023 Full-Text Data Work/2023 Full-Text Corpus/corpus_json'

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [6]:
# Set path to metadata and texts
path_metadata = os.path.join(path_corpus, 'metadata.csv')
path_texts = os.path.join(path_corpus, 'texts')

In [7]:
# Read metadata
df_metadata = pd.read_csv(path_metadata).fillna('')
df_metadata.dtypes

sourcepage_id    object
id                int64
source_id        object
record_id        object
title            object
subtitle         object
sort_title       object
author           object
item_type        object
book_journal     object
pub_date         object
pub_place        object
publisher        object
enumcron         object
collections      object
cluster          object
public_notes     object
notes            object
pages_orig       object
pages_digital    object
page_count       object
status           object
source           object
added            object
updated          object
source_url       object
dtype: object

In [8]:
# add filename
df_metadata['filename'] = df_metadata['sourcepage_id'].apply(lambda id: os.path.join(path_texts,id+'.json'))
df_metadata['filename_exists'] = df_metadata['filename'].progress_apply(os.path.exists)
df_metadata[['filename','filename_exists']]

100%|██████████| 6752/6752 [00:38<00:00, 177.63it/s] 


Unnamed: 0,filename,filename_exists
0,/content/drive/Shareddrives/PPA/2023 Full-Text...,True
1,/content/drive/Shareddrives/PPA/2023 Full-Text...,True
2,/content/drive/Shareddrives/PPA/2023 Full-Text...,True
3,/content/drive/Shareddrives/PPA/2023 Full-Text...,True
4,/content/drive/Shareddrives/PPA/2023 Full-Text...,True
...,...,...
6747,/content/drive/Shareddrives/PPA/2023 Full-Text...,True
6748,/content/drive/Shareddrives/PPA/2023 Full-Text...,True
6749,/content/drive/Shareddrives/PPA/2023 Full-Text...,True
6750,/content/drive/Shareddrives/PPA/2023 Full-Text...,True


In [9]:
# missing?
df_metadata_with_existing_files = df_metadata[df_metadata['filename_exists']]
df_metadata_without_existing_files = df_metadata[~df_metadata['filename_exists']]

len(df_metadata_with_existing_files), len(df_metadata_without_existing_files)

(6411, 341)

In [10]:
# show missing files' metadata
cols_to_show = ['sourcepage_id', 'filename', 'source', 'source_id', 'pages_orig', 'pages_digital']
df_metadata_without_existing_files[cols_to_show]

Unnamed: 0,sourcepage_id,filename,source,source_id,pages_orig,pages_digital
75,coo.31924031321189,/content/drive/Shareddrives/PPA/2023 Full-Text...,HathiTrust,coo.31924031321189,,
146,umn.31951002122743m,/content/drive/Shareddrives/PPA/2023 Full-Text...,HathiTrust,umn.31951002122743m,,
147,mdp.39015058601207,/content/drive/Shareddrives/PPA/2023 Full-Text...,HathiTrust,mdp.39015058601207,,
157,mdp.39015010719410,/content/drive/Shareddrives/PPA/2023 Full-Text...,HathiTrust,mdp.39015010719410,,
221,mdp.39015040120563,/content/drive/Shareddrives/PPA/2023 Full-Text...,HathiTrust,mdp.39015040120563,,
...,...,...,...,...,...,...
6509,mdp.39015010702457,/content/drive/Shareddrives/PPA/2023 Full-Text...,HathiTrust,mdp.39015010702457,,
6513,wu.89002287233,/content/drive/Shareddrives/PPA/2023 Full-Text...,HathiTrust,wu.89002287233,,
6584,pst.000006208776,/content/drive/Shareddrives/PPA/2023 Full-Text...,HathiTrust,pst.000006208776,,
6585,inu.30000121122794,/content/drive/Shareddrives/PPA/2023 Full-Text...,HathiTrust,inu.30000121122794,,


### Corpus texts (JSON files)

In [23]:
# get a random text row
row = df_metadata_with_existing_files.sample(2).iloc[0]
row

sourcepage_id                                             hvd.hn1szn
id                                                               397
source_id                                                 hvd.hn1szn
record_id                                                  008991897
title                                         Grammatical analysis :
subtitle                                  with progressive exercises
sort_title         Grammatical analysis : with progressive exerci...
author                            Dalgleish, Walter Scott, 1834-1897
item_type                                                  Full work
book_journal                                                        
pub_date                                                      1871.0
pub_place                                                  New York 
publisher                                                C. Scribner
enumcron                                                            
collections                       

In [24]:
# get its pages as ld
with open(row.filename) as f: pages_ld = json.load(f)

# 2 random pages
random.choices(pages_ld, k=2)

[{'page_id': '00000065',
  'page_i': 65,
  'page_text': "THE COMPOUND SENTENCE.\n59\n11.\nThis Duncan\nHath borne his faculties so meek, hath been\nSo clear in his great office, that his virtues\nWill plead like angels, trumpet-tongued, against\nThe deep damnation of his taking off;\nAnd pity, like a naked, new-born babe,\nStriding the blast, or heaven's cherubiin, horsed\nUpon the sightless couriers of the air,\nShall blow the horrid deed in every eye,\nThat tears shall drown the wind.-Shakespeare.\n12. Seeing that truth consisteth in the right ordering of names in our affirmations,\na man that seeketh precise truth bad need to remember what every name he\nuseth stands for, and to place it accordingly, or else he will find himself\nentangled in words, as a bird in lime-twigs, the more he struggles, the more\nbeliined ; and, therefore, in geometry, which is the only science that it hath\npleased God hitherto to bestow on mankind, men begin at settling the signifi-\ncations of their wor

In [25]:
# all this text's pages as df
pages_df = pd.read_json(row.filename)
pages_df

Unnamed: 0,page_id,page_i,page_text,page_types
0,1,1,\nNEDL TRANSFER\nHN 1SZN +\n,"[FRONT_COVER, IMAGE_ON_PAGE, UNTYPICAL_PAGE, I..."
1,2,2,"K) 30447\nNo.......\nWESTBROOK SEMINARY,\nFROS...","[IMAGE_ON_PAGE, UNTYPICAL_PAGE, IMPLICIT_PAGE_..."
2,3,3,"Exch. Aftuator, July 31,1926.\n","[UNTYPICAL_PAGE, IMPLICIT_PAGE_NUMBER]"
3,4,4,,[IMPLICIT_PAGE_NUMBER]
4,5,5,,[IMPLICIT_PAGE_NUMBER]
...,...,...,...,...
75,76,76,,"[BLANK, IMPLICIT_PAGE_NUMBER]"
76,77,77,,[IMPLICIT_PAGE_NUMBER]
77,78,78,,[IMPLICIT_PAGE_NUMBER]
78,79,79,\n\n,"[IMAGE_ON_PAGE, IMPLICIT_PAGE_NUMBER]"


In [26]:
# 10 random pages
pages_df.sample(10)

Unnamed: 0,page_id,page_i,page_text,page_types
33,34,34,GRAMMATICAL ANALYSIS.\n6. When I look upon the...,[UNTYPICAL_PAGE]
66,67,67,THE COMPOUND SENTENCE.\n27. I am the more at c...,[UNTYPICAL_PAGE]
44,45,45,"THE ""COMPLEX SENTENCE.\n39\n7. As my heart was...",[UNTYPICAL_PAGE]
53,54,54,"48\nGRAMMATICAL ANALYSIS.\nThe only point, the...",[UNTYPICAL_PAGE]
63,64,64,"58\nGRAMMATICAL ANALYSIS.\n1. That he is mad, ...",[UNTYPICAL_PAGE]
38,39,39,THE COMPLEX SENTENCE.\n33\nin these cases the ...,[UNTYPICAL_PAGE]
50,51,51,THE COMPOUND SENTENCE.\n45\nCHAPTER IV.—THE CO...,[UNTYPICAL_PAGE]
18,19,19,"THE SENTENCE, AND ITS PARTS.\n13\n9. The quali...",[UNTYPICAL_PAGE]
54,55,55,THE COMPOUND SENTENCE.\n40\n81. II. ALTERNATIV...,[UNTYPICAL_PAGE]
8,9,9,PREFACE TO AMERICAN EDITION.\nThe introduction...,"[UNTYPICAL_PAGE, IMPLICIT_PAGE_NUMBER]"


# Cleaning OCR-text in the Princeton Prosody Archive

---

## Process Flow (see also [here](https://drive.google.com/file/d/1fUR3hrZksZNKmQa3CwQwWsFjvLUSVLG-/view?usp=sharing))

1. **Process Headers**
    - **Aim:** Identifies and optionally removes running headers.
    - **How:** Identifies similar lines at the top of adjacent pages using fuzzy matching to account for minor variation.
        - **Original:**
        ```
        CHAPTER 2 -- GRAMMATICAL ANALYSIS
        Grammar is the study of...
        ```
        - **Corrected:**
        ```
        Grammar is the study of...
        ```

2. **Rejoin Linebreaks**
    - **Aim:** Correct words broken up by line breaks.
    - **How:** Searches the text for `-\\n`, which represents a hyphenated line break, and rejoins words affected by it.
        - **Original:** `Thi-\\nngs`
        - **Corrected:** `Things`

3. **Correct Historic Long 's'**
    - **Aim:** Replaces the historic long 's' (ſ) with the regular modern 's'.
    - **How:** Uses a simple find-and-replace operation at the character-level.
      - **Original:** `Cloſe`
      - **Corrected:** `Close`

4. **Clever f-s Hack**
    - **Aim:** Identifies words where the long 's' (ſ) might be erroneously OCR'd as 'f'.
    - **How:** Generates a list of word pairs from the long-s replacement logfile, skipping f-words that exist in the English language.
        - Generated `f to s-word` pairs:
        - `fhould` --> `should`
        - `dictuf` --> `dictus`
        - ~~`found` --> `sound`~~ (we don't want to replace minimal pairs, so these are filtered out, using `wordfreq`)
        - `fcript` --> `script`

5. **Ted's Correction Rules**
    - **Aim:** Corrects typical OCR artifacts.
    - **How:** Replaces the token if it matches an entry in Ted Underwood's list of corrections.
        - **Original:** `wiil`
        - **Corrected:** `will`

6. **f-s Hack**
    - **Aim:** Corrects words that have been written with a long 's' (ſ), which may be incorrectly OCR'd as 'f'.
    - **How:** Replaces the token if it matches an entry in our list of corrections.
        - **Original:** `meſs`
        - **Erroneously OCR'd:** `mefs`
        - **Corrected:** `mess`

---

**Output:**

- The cleaned and tokenized text is then saved into a JSON file for further processing. This file also contains information per page on the number of corrections made, e.g. `"corrections": {"ocr_corrections": 2, "linebreak_corrections": 2, "long_s_corrections": 0, "f_s_word_replacements": 0}}`

- A txt log file for each processed JSON-file, containing all the tokens that are affeced by the correction operations.

- Several log files, generated automatically during processing:
  1. `all_long_s_corrections_log.txt`, e.g.:
    ```
    ſo      so 	 491
    moſt    most    422
    muſt    must    391
    ſome    some    384
    theſe   these   362
    ...     ...   ...
    ```
  2. `clever_f_ſ_hack.txt`, e.g.:
    ```
    moft	  most
    muft	  must
    fome	  some
    fhould    should
    thefe	 these
    fuch	  such
    ...       ...
    ```
  3. `disregard_fſs_replacements.txt`, e.g.:
    ```
    fo	    so
    fame	  same
    fee	   see
    found	 sound
    fort	  sort
    wife	  wise
    cafe	  case
    ...     ...
    ```

In [27]:
!pip install wordfreq

Collecting wordfreq
  Downloading wordfreq-3.0.3-py3-none-any.whl (56.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ftfy>=6.1 (from wordfreq)
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy, wordfreq
Successfully installed ftfy-6.1.1 wordfreq-3.0.3


In [29]:
import re

def remove_trailing_punctuation(word):
    # remove trailing punctuation and spaces
    # don't remove the dash '-', as this might interfere with the function to repair broken words!
    # Question: should we also remove punct at the beginning of the token...? Not doing that now.
    return re.sub(r'[\.,\?!"\')(:;`]+\s*$', '', word)

# small test
word = "...example.,...! "
clean_word = remove_trailing_punctuation(word)
print(clean_word)

...example


In [None]:
import wordfreq

# process a list of word pairs, where each pair consists of an 'incorrect' word with a historic long 's' (ſ) and its 'correct' modern equivalent
# the script then replaces the historic long 's' (ſ) words with 'f', generates new word pairs
# ONLY if the newly generated f-word does NOT exist in the English language, we retain the word!! For this, we use language stats provided by wordfreq
# the resulting pairs are then written to the outfile, while pairs that exists -- with high frequency in English -- are written to a separate disregard_file
# i think this is clever, so i named the function accordingly :-)

def generate_clever_f_s_hack(source_file, output_file, disregard_file, skip_words=None, frequency_threshold=1e-6):
    if skip_words is None:
        skip_words = {'ſlip'}  # add specific words to skip here -- dunno if this is still useful, the file will capture most of these words

    unique_pairs = set()  # set to keep track of unique (incorrect f-word, correct s-word) pairs

    with open(source_file, 'r') as infile, open(output_file, 'w') as outfile, open(disregard_file, 'w') as disregard:
        # skip the title line of the infile
        next(infile)

        for line in infile:
            parts = line.strip().split('\t')
            if len(parts) < 3:
                continue

            incorrect, correct = parts[:2]
            # e.g.:
            # incorrect correct
            # moſt 	    most
            # muſt 	    must
            # ſo 	      so
            # ſome 	    some
            # ſee       see   etc.

            # strip leading/trailing spaces
            incorrect = incorrect.strip()
            correct = correct.strip()

            # remove trailing punctuation
            incorrect = remove_trailing_punctuation(incorrect)
            correct = remove_trailing_punctuation(correct)

            # replace 'ſ' with 'f' in the incorrect word
            f_incorrect = incorrect.replace('ſ', 'f')
            # e.g.:
            # incorrect correct
            # moft 	    most
            # muft 	    must
            # fo 	      so
            # fome 	    some
            # fee       see   etc.

            # skip if the incorrect word is in skip_words or already in pairs
            if f_incorrect in skip_words or (f_incorrect, correct) in unique_pairs:
                continue

            # check the frequency of the word
            word_frequency = wordfreq.word_frequency(f_incorrect.lower(), 'en')

            # skip if the word exists and its frequency is above the threshold
            if word_frequency > frequency_threshold:
                disregard.write(f"{f_incorrect}\t{correct}\n")
                #print(f'Word that exist with the f-spelling and we don\'t want to include: {f_incorrect}')
                # e.g.
                # Words that exist with the f-spelling and we don't want to include: fame
                # Words that exist with the f-spelling and we don't want to include: found    etc.
                continue

            # check if the generated word exists in English
            if word_frequency <= frequency_threshold:
                outfile.write(f"{f_incorrect}\t{correct}\n")
                unique_pairs.add((f_incorrect, correct))
                # e.g.
                # moft 	    most
                # muft 	    must
                # fo 	      so
                # fome 	    some    etc.

# apply
generate_clever_f_s_hack(
    source_file="/content/drive/Shareddrives/PPA/2023 Full-Text Data Work/2023 Full-Text Corpus/notebooks/ocr_cleanup_rulesets/all_long_s_corrections_log.txt",
    output_file="/content/drive/Shareddrives/PPA/2023 Full-Text Data Work/2023 Full-Text Corpus/notebooks/ocr_cleanup_rulesets/clever_f_ſ_hack.txt",
    disregard_file="/content/drive/Shareddrives/PPA/2023 Full-Text Data Work/2023 Full-Text Corpus/notebooks/ocr_cleanup_rulesets/disregard_fſs_replacements.txt"
)

## Helper functions

In [None]:
import os
import json
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
from difflib import SequenceMatcher

nltk.download('punkt')

def load_correction_rules(file_path):
    correction_rules = {}
    with open(file_path, 'r') as file:
        for line in file:
            parts = line.strip().split('\t')
            if len(parts) >= 2:
                incorrect, correct = parts[:2]
                correction_rules[incorrect] = correct
    return correction_rules

def correct_ocr_errors(text, correction_rules):
    corrections = 0
    for incorrect, correct in correction_rules.items():
        if incorrect in text:
            text = text.replace(incorrect, correct)
            corrections += 1
    return text, corrections

def rejoin_linebreaks(text, specific_linebreak_corrections):
    """
    function to addresses the issue of words that are split between two lines due to a line break, typically indicated by a hyphen
    the function rejoins such words
    """
    corrections = 0
    parts = text.split('-\n')
    corrected_text = parts[0]
    for part in parts[1:]:
        corrected_text_words = corrected_text.split()
        part_words = part.split()

        if corrected_text_words and part_words:  # check if both lists are not empty
            last_word_before_break = corrected_text_words[-1]
            first_word_after_break = part_words[0]

            # form the broken word and the corrected word
            broken_word = last_word_before_break + '-\n' + first_word_after_break
            corrected_word = last_word_before_break + first_word_after_break

            # log the correction (gets later written to the txt file)
            specific_linebreak_corrections[broken_word + " \t " + corrected_word] += 1

            corrected_text += part
            corrections += 1
        else:
            # if either part is empty or doesn't contain words, simply append a hyphen
            corrected_text += '-' + part

    return corrected_text, corrections

def replace_historic_long_s(text, long_s_corrections):
    """
    function to replaces the historic long 's' (ſ) with the regular 's'

    :text: text to be processed
    :long_s_corrections: dictionary to log specific corrections and their counts
    :return: tuple of processed text with long 's' replaced, and the number of corrections made
    """
    corrected_text = text.replace('ſ', 's')
    corrections = 0
    if corrected_text != text:
        words_with_long_s = set(text.split()) - set(corrected_text.split())
        for word in words_with_long_s:
            corrected_word = word.replace('ſ', 's')
            long_s_corrections[f"{word} \t {corrected_word}"] += 1
            corrections += 1
    return corrected_text, corrections

def load_f_s_hack_corrections(file_path):
    """
    little helper script to load the f-->s words (from generate_clever_f_s_hack) into a dict, for convenient lookup
    """
    correction_rules = {}
    with open(file_path, 'r') as file:
        for line in file:
            parts = line.strip().split()
            if len(parts) >= 2:
                incorrect, correct = parts[:2]
                correction_rules[incorrect] = correct
    return correction_rules

def process_headers(pages, remove_headers=True, similarity_threshold=80):
    """
    function to identifies and optionally removes running headers
    inspired by Ted Underwood's GREAT headerfinder script: https://github.com/tedunderwood/DataMunging/blob/master/runningheaders/HeaderFinder.py
    some changes made:
      - flexibility to remove headers or just identify them (just by setting the boolean value)
      - we don't explicitly handle roman numerals, the line comparison logic (combining str.isalpha and a threshold for fuzzy matching) should take care of it

    :pages: list of dicts, each representing a page with 'page_text'
    :remove_headers: bool, if set to True --> removes identified headers, otherwise just identifies them and wirtes them to the log
    :similarity_threshold: int, threshold for fuzzy matching to consider lines as similar (default 80 seems to work well)
    :return: list of pages with headers
    """
    identified_headers = []
    headers_set = set()

    def get_substantial_lines(page_text):
        """
        helper function: if the processed line contains less than 5 characters, or if the line consists solely of digits
        it is considered insubstantial and is skipped
        """
        lines = page_text.split('\n')
        substantial_lines = []
        for line in lines:
            if len(line.strip()) < 5 or line.strip().isdigit():
                continue
            substantial_lines.append(line)
            if len(substantial_lines) == 2:
                break
        return substantial_lines

    for i in range(len(pages)):
        current_page_text = pages[i]['page_text']
        current_substantial_lines = get_substantial_lines(current_page_text)

        header_found = False

        # determine the range of pages to compare with
        start_index = max(0, i - 2)
        end_index = min(len(pages), i + 3)
        if i == len(pages) - 1:  # Special handling for the last page
            start_index = max(0, i - 2)  # Compare with pages before

        for j in range(start_index, end_index):
            if i == j:
                continue

            comparison_page_text = pages[j]['page_text']
            comparison_substantial_lines = get_substantial_lines(comparison_page_text)

            for current_line in current_substantial_lines:
                for comparison_line in comparison_substantial_lines:
                    # line comparison logic, considering possible page numbers
                    cleaned_current_line = ''.join(filter(str.isalpha, current_line))
                    cleaned_comparison_line = ''.join(filter(str.isalpha, comparison_line))

                    s = SequenceMatcher(None, cleaned_current_line, cleaned_comparison_line)
                    similarity = s.ratio() * 100

                    if similarity > similarity_threshold:
                        header_key = (i, current_line)
                        if header_key not in headers_set:
                            identified_headers.append(header_key)
                            headers_set.add(header_key)
                        if remove_headers:
                            header_found = True
                        break

                if header_found:
                    lines_of_page = current_page_text.split('\n')
                    for idx, line in enumerate(lines_of_page):
                        if line.strip() == current_line.strip():
                            pages[i]['page_text'] = '\n'.join(lines_of_page[idx+1:])
                            break
                    break

    return pages, identified_headers

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## The actual cleaning!

In [None]:
def tokenize_text_per_page(filename, use_nltk_tokenizer=False, output_folder='/content/drive/Shareddrives/PPA/2023 Full-Text Data Work/2023 Full-Text Corpus/corpus_json_tokenized'):
    """
    this is the main course!
    """
    with open(filename, 'r') as f:
        pages = json.load(f)

    total_token_count = 0

    # dicts to store specific corrections and their counts
    specific_ocr_corrections = defaultdict(int)
    specific_linebreak_corrections = defaultdict(int)
    specific_long_s_corrections = defaultdict(int)

    correction_rules_path = "/content/drive/Shareddrives/PPA/2023 Full-Text Data Work/2023 Full-Text Corpus/notebooks/ocr_cleanup_rulesets/CorrectionRules.txt"
    correction_rules = load_correction_rules(correction_rules_path)

    clever_f_s_hack_path = "/content/drive/Shareddrives/PPA/2023 Full-Text Data Work/2023 Full-Text Corpus/notebooks/ocr_cleanup_rulesets/clever_f_ſ_hack.txt"
    clever_f_s_hack_rules = load_f_s_hack_corrections(clever_f_s_hack_path)

    # add a dictionary for specific f ſ hack corrections
    specific_f_s_hack_corrections = defaultdict(int)

    # handle the headers
    pages, identified_headers = process_headers(pages, remove_headers=True) # ideally, we want to set this later when calling the function

    for page in pages:
        page_text = page['page_text']

        # counters for corrections
        linebreak_corrections = 0
        ocr_corrections = 0
        long_s_corrections = 0
        f_s_word_replacements = 0

        # rejoin line breaks before tokenization and log corrections
        page_text, corrections = rejoin_linebreaks(page_text, specific_linebreak_corrections)
        linebreak_corrections += corrections

        # apply correction for long 's'
        corrected_text, corrections = replace_historic_long_s(page_text, specific_long_s_corrections)
        long_s_corrections += corrections
        page_text = corrected_text

        # tokenization
        tokens = word_tokenize(page_text) if use_nltk_tokenizer else page_text.split()

        # apply OCR corrections on tokens and log corrections
        corrected_tokens = []
        for token in tokens:
            if token in correction_rules:
                corrected_token = correction_rules[token]
                ocr_corrections += 1
                specific_ocr_corrections[f"{token} \t {corrected_token}"] += 1
            else:
                corrected_token = token
            corrected_tokens.append(corrected_token)

        # apply f-ſ-s hack corrections on tokens and log corrections
        for i, token in enumerate(corrected_tokens):
            if token in clever_f_s_hack_rules:
                corrected_token = clever_f_s_hack_rules[token]
                f_s_word_replacements += 1
                specific_f_s_hack_corrections[f"{token} \t {corrected_token}"] += 1
                corrected_tokens[i] = corrected_token

        token_count = len(corrected_tokens)
        total_token_count += token_count

        # convert corrected tokens back to text for further processing
        corrected_text = ' '.join(corrected_tokens)

        # store the final corrected and tokenized text
        page['tokenized_text'] = corrected_tokens

        # store correction counts in the page dictionary
        page['corrections'] = {
            'ocr_corrections': ocr_corrections,
            'linebreak_corrections': linebreak_corrections,
            'long_s_corrections': long_s_corrections,
            'f_s_word_replacements': f_s_word_replacements
        }

    new_filename = os.path.join(output_folder, os.path.basename(filename).replace('.json', '_tokenized.json'))
    os.makedirs(output_folder, exist_ok=True)
    with open(new_filename, 'w') as f:
        json.dump(pages, f)

    # writing specific corrections to a txt file (same name as jsson file)
    log_filename = os.path.join(output_folder, os.path.basename(filename).replace('.json', '_corrections_log.txt'))
    with open(log_filename, 'w') as log_file:
        log_file.write(f"File: {filename}\n")

        # corrections made by Ted's correction rules file
        log_file.write("Corrections made by Ted Underwood's rules\n")
        for correction, count in specific_ocr_corrections.items():
            log_file.write(f"{correction} \t {count}\n")

        # rejoined broken words
        log_file.write("Linebreak corrections\n")
        for correction, count in specific_linebreak_corrections.items():
            log_file.write(f"{correction} \t {count}\n")

        # long s replacements
        log_file.write("Long 's' corrections\n")
        for correction, count in specific_long_s_corrections.items():
            log_file.write(f"{correction} \t {count}\n")

        # identified/removed headers
        log_file.write("Identified and removed headers\n")
        for page_number, header in identified_headers:
          log_file.write(f"Page {page_number}: {header}\n")

        # clever f-ſ-s hack corrections
        log_file.write("Clever f ſ hack corrections\n")
        for correction, count in specific_f_s_hack_corrections.items():
            log_file.write(f"{correction} \t {count}\n")


    return {
        'filename': filename,
        'token_count': total_token_count,
        'new_filename': new_filename,
        'corrections_log_filename': log_filename,
        'long_s_corrections': dict(specific_long_s_corrections),
        'f_s_hack_corrections': dict(specific_f_s_hack_corrections)
    }


def process_files(df, number_of_files=None, use_nltk_tokenizer=True):
    """
    processes a number of files or all files from the df with the json-pathnames

    :df: df containing file metadata and filenames
    :number_of_files: Number of files to process. If None, processes all files
    :use_nltk_tokenizer: bool value, if True uses NLTK's tokenizer, otherwise uses simple split on whitespace
    :returns df with tokenization results
    """
    results = []
    if number_of_files is None:
        number_of_files = df.shape[0]

    # init a dictionary to collect all long 's' corrections across files
    all_long_s_corrections = defaultdict(int)

    for index, row in tqdm(df.iterrows(), total=number_of_files):
        if index >= number_of_files:
            break
        tokenization_result = tokenize_text_per_page(row['filename'], use_nltk_tokenizer)
        results.append(tokenization_result)

        # aggregate long 's' corrections across all pages and files
        for correction, count in tokenization_result['long_s_corrections'].items():
            all_long_s_corrections[correction] += count

    # sorting the all_long_s_corrections dict by count in descending order
    sorted_long_s_corrections = sorted(all_long_s_corrections.items(), key=lambda item: item[1], reverse=True)

    results_df = pd.DataFrame(results)

    # writing all long 's' corrections to a txt file
    all_long_s_corrections_filename = "/content/drive/Shareddrives/PPA/2023 Full-Text Data Work/2023 Full-Text Corpus/notebooks/ocr_cleanup_rulesets/all_long_s_corrections_log.txt"
    with open(all_long_s_corrections_filename, 'w') as all_long_s_file:
        all_long_s_file.write("All long 's' corrections (sorted by count)\n")
        for correction, count in sorted_long_s_corrections:
            all_long_s_file.write(f"{correction} \t {count}\n")

    return results_df

# process only first 100 files
df_tokenized = process_files(df_metadata_with_existing_files, number_of_files=100, use_nltk_tokenizer=True)

df_tokenized.iloc[1].new_filename

 99%|█████████▉| 99/100 [05:09<00:03,  3.13s/it]


'/content/drive/Shareddrives/PPA/2023 Full-Text Data Work/2023 Full-Text Corpus/corpus_json_tokenized/mdp.39015003633594_tokenized.json'

In [None]:
df_tokenized

Unnamed: 0,filename,token_count,new_filename,corrections_log_filename,long_s_corrections
0,/content/drive/Shareddrives/PPA/2023 Full-Text...,507417,/content/drive/Shareddrives/PPA/2023 Full-Text...,/content/drive/Shareddrives/PPA/2023 Full-Text...,"{'ſi si': 1, '(ä’ſä (ä’sä': 1, 'ā'rſ). ā..."
1,/content/drive/Shareddrives/PPA/2023 Full-Text...,37720,/content/drive/Shareddrives/PPA/2023 Full-Text...,/content/drive/Shareddrives/PPA/2023 Full-Text...,"{'Inſtruction Instruction': 1, 'Engliſh En..."
2,/content/drive/Shareddrives/PPA/2023 Full-Text...,21332,/content/drive/Shareddrives/PPA/2023 Full-Text...,/content/drive/Shareddrives/PPA/2023 Full-Text...,"{'soundshiſting, soundshisting,': 1, 'tueiſl..."
3,/content/drive/Shareddrives/PPA/2023 Full-Text...,6806,/content/drive/Shareddrives/PPA/2023 Full-Text...,/content/drive/Shareddrives/PPA/2023 Full-Text...,"{'ſ s': 1, 'dictuſ dictus': 1, 'caſite c..."
4,/content/drive/Shareddrives/PPA/2023 Full-Text...,3555,/content/drive/Shareddrives/PPA/2023 Full-Text...,/content/drive/Shareddrives/PPA/2023 Full-Text...,{}
5,/content/drive/Shareddrives/PPA/2023 Full-Text...,83002,/content/drive/Shareddrives/PPA/2023 Full-Text...,/content/drive/Shareddrives/PPA/2023 Full-Text...,{'liſe. lise.': 1}
6,/content/drive/Shareddrives/PPA/2023 Full-Text...,84297,/content/drive/Shareddrives/PPA/2023 Full-Text...,/content/drive/Shareddrives/PPA/2023 Full-Text...,"{'ſyſtem system': 3, 'purſuit pursuit': 3,..."
7,/content/drive/Shareddrives/PPA/2023 Full-Text...,89422,/content/drive/Shareddrives/PPA/2023 Full-Text...,/content/drive/Shareddrives/PPA/2023 Full-Text...,"{'Suſficient Susficient': 1, 'Swiſt Swist'..."
8,/content/drive/Shareddrives/PPA/2023 Full-Text...,108174,/content/drive/Shareddrives/PPA/2023 Full-Text...,/content/drive/Shareddrives/PPA/2023 Full-Text...,"{'preſa presa': 2, 'Oſt Ost': 1, 'ſeeble ..."
9,/content/drive/Shareddrives/PPA/2023 Full-Text...,90335,/content/drive/Shareddrives/PPA/2023 Full-Text...,/content/drive/Shareddrives/PPA/2023 Full-Text...,"{'ſoundation. soundation.': 1, 'ſundamental ..."
