# Cleaning metadata
### To be used with entire metadata or a subset

In [None]:
import re
import pandas as pd
from utils.constants import Paths
from spellchecker import SpellChecker
from collections import Counter

In [15]:
# Load subset
df = pd.read_excel(Paths.mccray_1940s_cleaner)
df[['Transcript']]

Unnamed: 0,Transcript
0,York Holly Hill Holly Hill City A i k en ...
1,Subscriptions -2 2.80 1.75 1.05 1.05 2.80...
2,"APRlL, 1946 Richard Corley New Haven, Co..."
3,"Reeves' Barber Shop 1 Street Whiteville, Nor..."
4,"Benedict College Columbia, S. C. Lighthouse ..."
...,...
569,MEMORANDUM Abram Arthur Golden Counsellor ...
570,CONDITIONS UNDER WHICH ORDER IS GIVEN AND AGRE...
571,"Septembex 24, 1 46 ""x. McKeever 1835 Fac..."
572,Charleston 16. S. C. OFFICE OF THE DIRECTOR...


### PRE-CLEANING

In [16]:
# Condense white space > 1
df['Transcript'] = df['Transcript'].apply(lambda x: re.sub(r"\s+"," ", x))
df[['Transcript']]


Unnamed: 0,Transcript
0,York Holly Hill Holly Hill City A i k en Gresh...
1,Subscriptions -2 2.80 1.75 1.05 1.05 2.80 2.80...
2,"APRlL, 1946 Richard Corley New Haven, Conn. W...."
3,"Reeves' Barber Shop 1 Street Whiteville, North..."
4,"Benedict College Columbia, S. C. Lighthouse In..."
...,...
569,MEMORANDUM Abram Arthur Golden Counsellor At L...
570,CONDITIONS UNDER WHICH ORDER IS GIVEN AND AGRE...
571,"Septembex 24, 1 46 ""x. McKeever 1835 Face Path..."
572,Charleston 16. S. C. OFFICE OF THE DIRECTOR ^i...


### Split off / additional filtering (to eliminate currently un-needed transcripts to create a small subset)

In [None]:
sc = SpellChecker()

def is_high_quality_text(text, spell_checker=None,
                         min_words=20,
                         min_avg_word_length=3,
                         min_common_word_ratio=0.1,
                         max_single_char_ratio=0.3,
                         max_non_alpha_ratio=0.5,
                         max_misspell_ratio=0.5):
    """
    Returns True if transcript appears to contain meaningful, readable content.
    Can optionally use a spell checker for extra filtering.
    """

    if not text or not text.strip():
        return False

    # Tokenize into words
    words = text.split()
    alpha_words = re.findall(r'\b[a-zA-Z]+\b', text.lower())

    # Too few total words
    if len(words) < min_words:
        return False

    # Too many single-character words (OCR artifacts)
    single_chars = sum(1 for w in words if len(w) == 1)
    if single_chars / len(words) > max_single_char_ratio:
        return False

    # Too many numbers/symbols
    non_alpha = sum(1 for w in words if not w.isalpha())
    if non_alpha / len(words) > max_non_alpha_ratio:
        return False

    # Average word length too short
    avg_length = sum(len(w) for w in alpha_words) / max(1, len(alpha_words))
    if avg_length < min_avg_word_length:
        return False

    # Low variety of vocabulary
    unique_words = set(alpha_words)
    if len(unique_words) < len(alpha_words) * 0.3:
        return False

    # Common English word presence
    common_words = {
        'the', 'and', 'to', 'of', 'a', 'in', 'is', 'it', 'you', 'that',
        'he', 'was', 'for', 'on', 'are', 'as', 'with', 'his', 'they',
        'i', 'at', 'be', 'this', 'have', 'from', 'or', 'one', 'had',
        'by', 'word', 'but', 'not', 'what', 'all', 'were', 'we', 'when'
    }
    common_count = sum(1 for w in alpha_words if w in common_words)
    if common_count / len(alpha_words) < min_common_word_ratio:
        return False

    # Optional spell-check pass
    if spell_checker:
        # Filter words for spell check
        check_words = [w for w in alpha_words if len(w) > 2]
        if check_words:
            misspelled = spell_checker.unknown(check_words)
            if len(misspelled) / len(check_words) > max_misspell_ratio:
                return False

    return True

df['quality_transcript'] = df['Transcript'].apply(lambda x: is_high_quality_text(x, sc))

print(f"Quality: {df['quality_transcript'].sum()}")
print(f"Not up to par: {len(df) - df['quality_transcript'].sum()}")

Quality: 403
Not up to par: 171


In [26]:
# df.head(40)

good_transcripts = df[df['quality_transcript'] == True]

good_transcripts

Unnamed: 0.1,Unnamed: 0,Title,Creator,Contributors,Date,Approximate Date,Source,Subject,Local Subject,S.C. County,...,total_messy_chars,special_messy_sequences,special_messy_count,has_special_messy,total_special_messy_chars,repeat_sequences,repeat_sequence_count,has_repeat_sequence,any_messy,quality_transcript
0,746,"New Subscriptions, 1964, Page 1",,"McCray, John Henry, 1910-1987",1946,,Manuscripts; Accession 11294.,The Lighthouse and Informer,,,...,0,[],0,False,0,[],0,False,False,True
3,751,Letter to Rev. James Hinton from Asa Reeves,,"McCray, John Henry, 1910-1987",1946,,Manuscripts; Accession 11294.,The Lighthouse and Informer,,,...,0,[],0,False,0,[],0,False,False,True
4,753,Letter to the Editor of the Lighthouse and Inf...,,"McCray, John Henry, 1910-1987",1946,,Manuscripts; Accession 11294.,The Lighthouse and Informer,,,...,0,[],0,False,0,[],0,False,False,True
5,763,"The Lighthouse and Informer Press, No. 188",,"McCray, John Henry, 1910-1987",1946,,Manuscripts; Accession 11294.,The Lighthouse and Informer,,,...,0,[],0,False,0,[],0,False,False,True
6,764,"The Lighthouse and Informer Press, No. 186",,"McCray, John Henry, 1910-1987",1946,,Manuscripts; Accession 11294.,The Lighthouse and Informer,,,...,0,[],0,False,0,[],0,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
569,14121,"Memo from Abram A. Golden, Counselor at Law, t...","Golden, Abram A. (Counsellor at Law)","McCray, John Henry, 1910-1987",1946-09-04 00:00:00,,Manuscripts; Accession 11294.,Chain gangs,,,...,0,[],0,False,0,[],0,False,False,True
570,14129,Purchase order from the South Carolina Electri...,,"McCray, John Henry, 1910-1987",1946-09-19 00:00:00,,Manuscripts; Accession 11294.,South Carolina Electric & Gas Company;Lighthou...,,,...,0,[],0,False,0,[],0,False,False,True
571,14136,"Letter from Mildred Chestnut, Secretary of the...","Chestnut, Mildred (Secretary for Lighthouse an...","McCray, John Henry, 1910-1987",1946-09-24 00:00:00,,Manuscripts; Accession 11294.,Lighthouse and Informer,,,...,0,[],0,False,0,[],0,False,False,True
572,14139,"Letter from John F. Potts, Director of Avery I...","Potts, John F. (Director of Avery Institute)","McCray, John Henry, 1910-1987",1946-09-26 00:00:00,,Manuscripts; Accession 11294.,Avery Normal Institute;Lighthouse and Informer,,,...,0,[],0,False,0,[],0,False,False,True


### Spellcheck


In [28]:
def correct_spelling_efficient(text):
    """Efficiently correct spelling in a text string"""
    if not isinstance(text, str) or not text.strip():
        return text
    
    # Pre-compile regex patterns for better performance
    word_pattern = re.compile(r'(\W*)([\w\']+)(\W*)')
    
    # Process in one pass with regex
    def replace_word(match):
        leading_punct = match.group(1)
        word = match.group(2)
        trailing_punct = match.group(3)
        
        # Skip correction for numbers or short terms
        if not word or word.isdigit() or len(word) <= 2:
            return leading_punct + word + trailing_punct
        
        # Check if word needs correction
        lowercase_word = word.lower()
        if lowercase_word in sc:
            # Word is correct
            return leading_punct + word + trailing_punct
        else:
            # Get correction while preserving case
            correction = sc.correction(lowercase_word)
            if correction:
                # Preserve original capitalization
                if word.isupper():
                    correction = correction.upper()
                elif word[0].isupper():
                    correction = correction.capitalize()
                return leading_punct + correction + trailing_punct
            else:
                # If no correction found, keep original
                return leading_punct + word + trailing_punct
    
    # Apply the correction with regex substitution
    corrected_text = word_pattern.sub(replace_word, text)
    return corrected_text

# For even more efficiency, process in batches
# This helps reduce the overhead of function calls in apply()
def batch_correct(df, column, batch_size=1000):
    """Process spellchecking in efficient batches"""
    result = []
    for i in range(0, len(df), batch_size):
        batch = df.iloc[i:i+batch_size]
        corrected = batch[column].apply(correct_spelling_efficient)
        result.append(corrected)
    return pd.concat(result)

# Apply more efficient spellchecking
good_transcripts['corrected_transcript'] = batch_correct(good_transcripts, 'Transcript')

# Compare a sample
sample = good_transcripts[['Transcript', 'corrected_transcript']].head(3)
for i, row in sample.iterrows():
    print(f"Original: {row['Transcript'][:100]}...")
    print(f"Corrected: {row['corrected_transcript'][:100]}...")
    print("-" * 80)

Original: York Holly Hill Holly Hill City A i k en Gresham Gresham City Aiken Charleston 1.75 2.80 2.80 2.10 1...
Corrected: York Holly Hill Holly Hill City A i k en Gresham Gresham City Aiken Charleston 1.75 2.80 2.80 2.10 1...
--------------------------------------------------------------------------------
Original: Reeves' Barber Shop 1 Street Whiteville, North Car. Rev. James Hinton c/o Lighthouse & Informer News...
Corrected: Reeves Barber Shop 1 Street Whiteville, North Car. Rev. James Hinton c/o Lighthouse & Informer Newsp...
--------------------------------------------------------------------------------
Original: Benedict College Columbia, S. C. Lighthouse Informer Columbia, S. C. Dear Editor, Bill you please re...
Corrected: Benedict College Columbia, S. C. Lighthouse Informer Columbia, S. C. Dear Editor, Bill you please re...
--------------------------------------------------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  good_transcripts['corrected_transcript'] = batch_correct(good_transcripts, 'Transcript')


In [None]:
output = Paths.mccray_folder + r"changed_data/decade_subsets/McCray (1940s, 100 rows, spellcheck).xlsx"

good_transcripts.to_excel(output)

In [30]:
print(output)

../../data/mccray/changed_data/decade_subsets/McCray (1940s, 100 rows, spellcheck).xlsx
