# Cleaning metadata
### To be used with entire metadata or a subset

In [16]:
import re
import pandas as pd
from utils.constants import Paths
from spellchecker import SpellChecker
from collections import Counter

In [17]:
# Load subset
# Read the spreadsheet at Paths.mccray_1940s_cleaner into `df`, then preview
# only the Transcript column (the column every later cell works on).
df = pd.read_excel(Paths.mccray_1940s_cleaner)
df[['Transcript']]

Unnamed: 0,Transcript
0,York Holly Hill Holly Hill City A i k en ...
1,Subscriptions -2 2.80 1.75 1.05 1.05 2.80...
2,"APRlL, 1946 Richard Corley New Haven, Co..."
3,"Reeves' Barber Shop 1 Street Whiteville, Nor..."
4,"Benedict College Columbia, S. C. Lighthouse ..."
...,...
569,MEMORANDUM Abram Arthur Golden Counsellor ...
570,CONDITIONS UNDER WHICH ORDER IS GIVEN AND AGRE...
571,"Septembex 24, 1 46 ""x. McKeever 1835 Fac..."
572,Charleston 16. S. C. OFFICE OF THE DIRECTOR...


### PRE-CLEANING

In [18]:
# Condense white space > 1
# Coerce to str first: empty cells arrive as NaN and purely numeric cells can
# arrive as numbers, and either would make a regex substitution raise.
# Missing transcripts become "" so every later cell can call str methods on them.
df['Transcript'] = (
    df['Transcript']
    .fillna("")
    .astype(str)
    .str.replace(r"\s+", " ", regex=True)
)

# Collection-specific vocabulary that should not be reported as misspelled
domain_words = [
    "Columbia", "SC"
]

# Spelling err check
sc = SpellChecker()
# Register the domain vocabulary BEFORE counting; previously the list was
# defined after the count and never handed to the checker, so it had no effect.
sc.word_frequency.load_words(domain_words)

# Number of distinct unknown tokens per transcript (sc.unknown returns a set).
# NOTE(review): tokens come from a plain whitespace split, so a word with
# punctuation attached (e.g. "Columbia,") is presumably still reported as
# unknown — confirm whether punctuation should be stripped here.
df['Spelling Err Count'] = df['Transcript'].apply(lambda x: len(sc.unknown(x.split())))
df['Spelling Err Count'].head(50)

### Split off: drop transcripts that are not currently needed, leaving a smaller subset

In [None]:
# Structural filter: too few words (< 20 by default), too many single chars, too many num/symbols
def is_useless_transcript(text, min_words=20, max_single_char_ratio=0.3, max_non_alpha_ratio=0.5):
    """
    Decide whether a transcript is too short or too noisy to be worth keeping.

    Args:
        text: Transcript text. Anything that is not a string (None, NaN from an
            empty cell, a number) is treated as useless instead of raising.
        min_words: Transcripts with fewer whitespace-separated tokens than this
            are useless.
        max_single_char_ratio: Useless when the share of one-character tokens
            is strictly greater than this.
        max_non_alpha_ratio: Useless when the share of tokens that are not
            purely alphabetic is strictly greater than this.

    Returns:
        True if the transcript should be discarded, False otherwise.
    """
    # Non-string input has no usable text
    if not isinstance(text, str):
        return True

    words = text.split()
    total = len(words)

    # Too short (the emptiness test also protects the divisions below when
    # a caller passes min_words=0)
    if total == 0 or total < min_words:
        return True

    # Too many single characters (OCR artifacts)
    single_chars = sum(1 for word in words if len(word) == 1)
    if single_chars / total > max_single_char_ratio:
        return True

    # Too many numbers/symbols. Tokens come from a whitespace split, so a word
    # with punctuation attached (e.g. "Columbia,") fails str.isalpha() and is
    # counted here as well.
    non_alpha = sum(1 for word in words if not word.isalpha())
    if non_alpha / total > max_non_alpha_ratio:
        return True

    return False

# Spell-check gate: share of misspelled word occurrences must stay below a threshold
def quick_quality_check(text, sc, min_words=5, max_error_rate=0.5):
    """
    Check whether a transcript is mostly made of words the spell checker knows.

    Args:
        text: Transcript text. Non-string input fails the check.
        sc: Spell checker exposing ``unknown(words)``, which returns the
            collection of words it does not recognise.
        min_words: Minimum number of candidate words (alphabetic, longer than
            two characters) needed to judge the transcript at all.
        max_error_rate: The transcript passes only when the share of misspelled
            candidate words is strictly below this value.

    Returns:
        True if the transcript passes, False if it is too short to judge or
        looks like bad OCR.
    """
    if not isinstance(text, str):
        return False

    # Filter out short words and numbers
    alpha_words = [w for w in text.lower().split() if w.isalpha() and len(w) > 2]

    if len(alpha_words) < min_words:
        return False  # Useless

    # unknown() yields each unrecognised word once, however often it occurs.
    # Count occurrences so numerator and denominator use the same unit;
    # otherwise repeated garbage tokens are under-reported.
    misspelled = set(sc.unknown(alpha_words))
    misspelled_count = sum(1 for w in alpha_words if w in misspelled)
    error_rate = misspelled_count / len(alpha_words)

    # If more than 50% (by default) are "misspelled", probably bad OCR
    return error_rate < max_error_rate

# Apply functions (method 1 and 2)

# Step 1: flag every transcript that fails the structural checks
df['is_useless'] = df['Transcript'].map(is_useless_transcript)
print(f"Useless transcripts: {df['is_useless'].sum()}")

# Step 2: carry forward only the rows that were not flagged
df_filtered = df[~df['is_useless']].copy()

# Step 3: run the spell-check gate on the survivors, passing the shared checker `sc`
df_filtered['passes_spell_check'] = df_filtered['Transcript'].apply(quick_quality_check, args=(sc,))
print(f"Passed spell check: {df_filtered['passes_spell_check'].sum()}")

# Result: rows that cleared both gates
good_transcripts = df_filtered[df_filtered['passes_spell_check']].copy()

Useless transcripts: 148
Passed spell check: 426


In [20]:
# Display the rows that passed both is_useless_transcript and quick_quality_check
good_transcripts

Unnamed: 0.1,Unnamed: 0,Title,Creator,Contributors,Date,Approximate Date,Source,Subject,Local Subject,S.C. County,...,special_messy_count,has_special_messy,total_special_messy_chars,repeat_sequences,repeat_sequence_count,has_repeat_sequence,any_messy,Spelling Err Count,is_useless,passes_spell_check
0,746,"New Subscriptions, 1964, Page 1",,"McCray, John Henry, 1910-1987",1946,,Manuscripts; Accession 11294.,The Lighthouse and Informer,,,...,0,False,0,[],0,False,False,0,False,True
3,751,Letter to Rev. James Hinton from Asa Reeves,,"McCray, John Henry, 1910-1987",1946,,Manuscripts; Accession 11294.,The Lighthouse and Informer,,,...,0,False,0,[],0,False,False,22,False,True
4,753,Letter to the Editor of the Lighthouse and Inf...,,"McCray, John Henry, 1910-1987",1946,,Manuscripts; Accession 11294.,The Lighthouse and Informer,,,...,0,False,0,[],0,False,False,16,False,True
5,763,"The Lighthouse and Informer Press, No. 188",,"McCray, John Henry, 1910-1987",1946,,Manuscripts; Accession 11294.,The Lighthouse and Informer,,,...,0,False,0,[],0,False,False,12,False,True
6,764,"The Lighthouse and Informer Press, No. 186",,"McCray, John Henry, 1910-1987",1946,,Manuscripts; Accession 11294.,The Lighthouse and Informer,,,...,0,False,0,[],0,False,False,11,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
569,14121,"Memo from Abram A. Golden, Counselor at Law, t...","Golden, Abram A. (Counsellor at Law)","McCray, John Henry, 1910-1987",1946-09-04 00:00:00,,Manuscripts; Accession 11294.,Chain gangs,,,...,0,False,0,[],0,False,False,47,False,True
570,14129,Purchase order from the South Carolina Electri...,,"McCray, John Henry, 1910-1987",1946-09-19 00:00:00,,Manuscripts; Accession 11294.,South Carolina Electric & Gas Company;Lighthou...,,,...,0,False,0,[],0,False,False,19,False,True
571,14136,"Letter from Mildred Chestnut, Secretary of the...","Chestnut, Mildred (Secretary for Lighthouse an...","McCray, John Henry, 1910-1987",1946-09-24 00:00:00,,Manuscripts; Accession 11294.,Lighthouse and Informer,,,...,0,False,0,[],0,False,False,23,False,True
572,14139,"Letter from John F. Potts, Director of Avery I...","Potts, John F. (Director of Avery Institute)","McCray, John Henry, 1910-1987",1946-09-26 00:00:00,,Manuscripts; Accession 11294.,Avery Normal Institute;Lighthouse and Informer,,,...,0,False,0,[],0,False,False,20,False,True


In [None]:
# Eliminate any that don't have a matched percentage of common english words
def has_good_content(text, min_words=15, min_avg_word_length=3):
    """
    Heuristic gate: does this transcript look like meaningful English text?

    The text passes (True) only if it clears every check in turn: enough
    words, a long-enough average word, enough distinct words, and enough
    very common English words. Any failed check returns False.
    """
    # Nothing to inspect: empty, None, or whitespace-only input
    if not (text and text.strip()):
        return False

    # Lower-cased runs of ASCII letters; digits and punctuation are dropped
    tokens = re.findall(r'\b[a-zA-Z]+\b', text.lower())
    token_total = len(tokens)

    # Check 1: minimum number of words
    if token_total < min_words:
        return False

    # Check 2: mean word length (rejects OCR artifacts like "a b c d e f")
    if sum(map(len, tokens)) / token_total < min_avg_word_length:
        return False

    # Check 3: variety, at least 30% of the words must be distinct
    if len(set(tokens)) < token_total * 0.3:
        return False

    # Check 4: at least 10% of the words must be very common English words
    frequent_words = {
        'the', 'and', 'to', 'of', 'a', 'in', 'is', 'it', 'you', 'that',
        'he', 'was', 'for', 'on', 'are', 'as', 'with', 'his', 'they',
        'i', 'at', 'be', 'this', 'have', 'from', 'or', 'one', 'had',
        'by', 'word', 'but', 'not', 'what', 'all', 'were', 'we', 'when',
    }
    frequent_hits = sum(token in frequent_words for token in tokens)
    return frequent_hits >= token_total * 0.1

# Score every remaining transcript with has_good_content and store the flag as a column,
# then rebind good_transcripts to a copy holding only the rows where the flag is True.
good_transcripts['has_good_content'] = good_transcripts['Transcript'].apply(has_good_content)
good_transcripts = good_transcripts[good_transcripts['has_good_content']].copy()

# Perform spell check with pyspellchecker (sc) on all remaining Transcripts


In [29]:
# Display the transcripts that remain after the content filter.
# NOTE(review): the saved output for this cell shows columns (misspelled_words,
# misspelled_percentage, spell_corrections, needs_spelling_review, ...) that no
# cell in this view creates, and the execution counts are out of order.
# Confirm the notebook still reproduces this output under Restart & Run All.
good_transcripts

Unnamed: 0.1,Unnamed: 0,Title,Creator,Contributors,Date,Approximate Date,Source,Subject,Local Subject,S.C. County,...,Spelling Err Count,is_useless,passes_spell_check,has_good_content,misspelled_words,misspelled_percentage,spell_corrections,needs_spelling_review,misspelled_words_str,spell_corrections_str
0,746,"New Subscriptions, 1964, Page 1",,"McCray, John Henry, 1910-1987",1946,,Manuscripts; Accession 11294.,The Lighthouse and Informer,,,...,0,False,True,True,{},0.000000,{},False,{},{}
3,751,Letter to Rev. James Hinton from Asa Reeves,,"McCray, John Henry, 1910-1987",1946,,Manuscripts; Accession 11294.,The Lighthouse and Informer,,,...,22,False,True,True,"{'whiteville': 3, 'subgiect': 1}",1.169591,"{'whiteville': None, 'subgiect': 'subject'}",False,"{'whiteville': 3, 'subgiect': 1}","{'whiteville': None, 'subgiect': 'subject'}"
4,753,Letter to the Editor of the Lighthouse and Inf...,,"McCray, John Henry, 1910-1987",1946,,Manuscripts; Accession 11294.,The Lighthouse and Informer,,,...,16,False,True,True,"{'horteu': 1, 'pp': 1, 'jms': 1, 'ko': 1, 'enc...",8.333333,"{'mmt': 'met', 'horteu': 'hotel', 'ko': 'to', ...",False,"{'horteu': 1, 'pp': 1, 'jms': 1, 'ko': 1, 'enc...","{'mmt': 'met', 'horteu': 'hotel', 'ko': 'to', ..."
5,763,"The Lighthouse and Informer Press, No. 188",,"McCray, John Henry, 1910-1987",1946,,Manuscripts; Accession 11294.,The Lighthouse and Informer,,,...,12,False,True,True,"{'publica': 1, 'tion': 1}",6.250000,"{'publica': 'public', 'tion': 'ton'}",False,"{'publica': 1, 'tion': 1}","{'publica': 'public', 'tion': 'ton'}"
6,764,"The Lighthouse and Informer Press, No. 186",,"McCray, John Henry, 1910-1987",1946,,Manuscripts; Accession 11294.,The Lighthouse and Informer,,,...,11,False,True,True,"{'publica': 1, 'tion': 1}",6.250000,"{'publica': 'public', 'tion': 'ton'}",False,"{'publica': 1, 'tion': 1}","{'publica': 'public', 'tion': 'ton'}"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
569,14121,"Memo from Abram A. Golden, Counselor at Law, t...","Golden, Abram A. (Counsellor at Law)","McCray, John Henry, 1910-1987",1946-09-04 00:00:00,,Manuscripts; Accession 11294.,Chain gangs,,,...,47,False,True,True,"{'colier': 2, 'attor': 1, 'ney': 1, 'habeas': ...",5.429864,"{'ney': 'new', 'colier': 'colder', 'boulware':...",False,"{'colier': 2, 'attor': 1, 'ney': 1, 'habeas': ...","{'ney': 'new', 'colier': 'colder', 'boulware':..."
570,14129,Purchase order from the South Carolina Electri...,,"McCray, John Henry, 1910-1987",1946-09-19 00:00:00,,Manuscripts; Accession 11294.,South Carolina Electric & Gas Company;Lighthou...,,,...,19,False,True,True,{},0.000000,{},False,{},{}
571,14136,"Letter from Mildred Chestnut, Secretary of the...","Chestnut, Mildred (Secretary for Lighthouse an...","McCray, John Henry, 1910-1987",1946-09-24 00:00:00,,Manuscripts; Accession 11294.,Lighthouse and Informer,,,...,23,False,True,True,"{'septembex': 1, 'mckeever': 2, 'gt': 1, 'vill...",12.307692,"{'mckeever': 'makeover', 'gt': 'it', 'vill': '...",False,"{'septembex': 1, 'mckeever': 2, 'gt': 1, 'vill...","{'mckeever': 'makeover', 'gt': 'it', 'vill': '..."
572,14139,"Letter from John F. Potts, Director of Avery I...","Potts, John F. (Director of Avery Institute)","McCray, John Henry, 1910-1987",1946-09-26 00:00:00,,Manuscripts; Accession 11294.,Avery Normal Institute;Lighthouse and Informer,,,...,20,False,True,True,"{'isaiuitarg': 1, 'ssutiniian': 1, 'rijnixl': ...",12.280702,"{'jfpjdemcd': None, 'gt': 'it', 'hn': 'in', 's...",False,"{'isaiuitarg': 1, 'ssutiniian': 1, 'rijnixl': ...","{'jfpjdemcd': None, 'gt': 'it', 'hn': 'in', 's..."
