# Transcript Cleaning Pipeline 
## Isolated into one notebook
### Built based upon McCray Metadata
This version does not use spello; the spell-correction cells are left commented out.


In [16]:
import pandas as pd
import numpy as np
import re, ast, json, csv
from utils.constants import Paths
from utils.ocr_cleaning import OCR_Clean, OCR_Check
# from utils.spello_functions import spello_build
# from spello.model import SpellCorrectionModel

### Get original metadata

In [17]:
# Load the original McCray metadata export and list its column names
mccray = pd.read_csv(Paths.mccray_original_metadata)
print(mccray.columns.tolist())

def save_csv(df, title):
    """Write ``df`` to ``title`` as a UTF-8 CSV without the index column.

    Every non-numeric field is quoted, backslash is the escape character and
    rows are terminated with a single line feed.

    Args:
        df: DataFrame to export.
        title: Destination handed straight to ``DataFrame.to_csv``.
    """
    export_options = {
        'index': False,
        'encoding': 'utf-8',
        'quoting': csv.QUOTE_NONNUMERIC,
        'escapechar': '\\',
        'lineterminator': '\n',
    }
    df.to_csv(title, **export_options)

['Title', 'Creator', 'Contributors', 'Date', 'Approximate Date', 'Source', 'Subject', 'Local Subject', 'S.C. County', 'Description', 'Extent', 'Digital Collection', 'Website', 'Contributing Institution', 'Rights', 'Time Period', 'Geographic Location', 'Language', 'Digitization Specifications', 'Date Digital', 'Type', 'Format', 'Media Type', 'Identifier', 'Note', 'Digital Assistant', 'Transcript', 'OCLC number', 'Date created', 'Date modified', 'Reference URL', 'CONTENTdm number', 'CONTENTdm file name', 'CONTENTdm file path']


### Handle the fields / basic standardization

In [18]:
# Add 'Year' field: stringify 'Date', drop every non-digit character, keep the first four digits
# NOTE(review): digits are concatenated across separators, so a day-first value such as
# "5 Jan 1950" would yield "5195" - confirm that 'Date' always begins with the year
def _leading_year_digits(date_value):
    digits_only = re.sub(r'[^0-9]+', "", str(date_value))
    return digits_only[:4]

mccray['Year'] = mccray['Date'].map(_leading_year_digits)

# Move "Transcript" into a new trailing "Original Transcript" column
# (pop removes the old column and hands its values to the new one)
mccray['Original Transcript'] = mccray.pop('Transcript')

# Replace missing / non-string transcripts with '' and trim whitespace around real text
def _stripped_or_blank(value):
    return value.strip() if isinstance(value, str) else ''

mccray['Original Transcript'] = mccray['Original Transcript'].fillna('').map(_stripped_or_blank)

# Add "Original Len" column: character count with plain spaces excluded
mccray['Original Len'] = mccray['Original Transcript'].str.replace(' ', '', regex=False).str.len()


### Determining messiness and formatting into semi-clean transcripts

In [19]:
# Helper to find sequences using a regex pattern, allowing for exceptions (created for some patterns in the OCR_Clean class)
def find_sequences(pattern, text, exceptions=None):
    """Return every substring of ``text`` matched by ``pattern`` that is not excused.

    Args:
        pattern: Regular expression (string or compiled) to search for.
        text: Value to scan; it is coerced with ``str()`` so non-string cells are accepted.
        exceptions: Optional iterable of regular expressions. A match is dropped
            when any of them matches at the start of the matched substring.

    Returns:
        list[str]: The full matched substrings, in order of appearance.
    """
    # None instead of a shared mutable [] default; materialize once so any iterable works
    exceptions = tuple(exceptions) if exceptions else ()
    flagged = []
    # finditer + group(0) always yields the whole matched sequence. re.findall returns
    # only the capturing groups when the pattern has any (e.g. one letter of a repeated
    # run), and tuples when there are several groups, which would break re.match below.
    for match in re.finditer(pattern, str(text)):
        candidate = match.group(0)
        if not any(re.match(exc, candidate) for exc in exceptions):
            flagged.append(candidate)
    return flagged

# Data cleaning metrics: column holding the raw text that the checks below inspect
transcript_col = 'Original Transcript'

# Individual pattern detection (Boolean columns): True when at least one match is found
mccray['Special Pattern'] = mccray[transcript_col].map(
    lambda text: bool(find_sequences(OCR_Clean.special_pattern, text))
)
mccray['General Pattern'] = mccray[transcript_col].map(
    lambda text: bool(find_sequences(OCR_Clean.general_pattern, text, OCR_Clean.general_exceptions))
)
mccray['Repeat Chars'] = mccray[transcript_col].map(
    lambda text: bool(find_sequences(OCR_Clean.letter_pattern, text))
)
# Fewer than 20 non-space characters counts as a short (or missing) transcript
mccray['Short/No Transcript'] = mccray['Original Len'].lt(20)

# Overall messiness/quality indicator: True only when none of the four flags is raised
issue_flags = mccray[['Special Pattern', 'General Pattern', 'Repeat Chars', 'Short/No Transcript']]
mccray['Quality'] = ~issue_flags.any(axis=1)

# Issue Types (List of Strings)
def get_issue_types(row):
    """Translate a row's boolean flag columns into a list of issue labels.

    Args:
        row: Mapping-like row exposing 'Special Pattern', 'General Pattern',
            'Repeat Chars', 'Original Len' and 'Short/No Transcript'.

    Returns:
        list[str]: Labels for the pattern flags that are set, followed by
        'Empty Transcript' when the length is 0, or 'Low Char Count' when the
        transcript is flagged as short but not empty.
    """
    flag_labels = (
        ('Special Pattern', 'Special Char Artifacts'),
        ('General Pattern', 'General Artifacts'),
        ('Repeat Chars', 'Repeat Chars'),
    )
    types = [label for column, label in flag_labels if row[column]]

    # An empty transcript is reported on its own rather than as a low character count
    if row['Original Len'] == 0:
        types.append('Empty Transcript')
    elif row['Short/No Transcript']:
        types.append('Low Char Count')
    return types

mccray['Issue Types'] = mccray.apply(get_issue_types, axis='columns')

# Detected Artifacts (List of Strings)
def get_detected_artifacts(text):
    """Collect everything find_sequences reports for ``text``.

    Results are concatenated in a fixed order: special-pattern hits, then
    general-pattern hits (minus OCR_Clean.general_exceptions), then
    letter-pattern (repeat character) hits.
    """
    return [
        *find_sequences(OCR_Clean.special_pattern, text),
        *find_sequences(OCR_Clean.general_pattern, text, OCR_Clean.general_exceptions),
        *find_sequences(OCR_Clean.letter_pattern, text),
    ]

mccray['Detected Artifacts'] = mccray[transcript_col].map(get_detected_artifacts)

In [20]:
# Seed 'Semi-clean Transcript' with the original text of every row whose Quality is True;
# all other rows start out as "".
# (Spacing was initially normalized here, but that now waits until the transcripts with issues are combined.)
sc_col = 'Semi-clean Transcript'

passed_checks = mccray['Quality'].eq(True)
mccray[sc_col] = mccray['Original Transcript'].where(passed_checks, "")

display(mccray)


Unnamed: 0,Title,Creator,Contributors,Date,Approximate Date,Source,Subject,Local Subject,S.C. County,Description,...,Original Transcript,Original Len,Special Pattern,General Pattern,Repeat Chars,Short/No Transcript,Quality,Issue Types,Detected Artifacts,Semi-clean Transcript
0,Afro-American Newsboy Application signed by Mr...,,"McCray, John Henry, 1910-1987",,,Manuscripts; Accession 11294.,Lighthouse and Informer,,,An application to be a Newsboy for the Afro-Am...,...,-5491 AFRO-AMERICAN NEWSBOY'S APPLICATION ...,213,False,False,False,False,True,[],[],-5491 AFRO-AMERICAN NEWSBOY'S APPLICATION ...
1,Lighthouse Informer receipt,,"McCray, John Henry, 1910-1987",,,Manuscripts; Accession 11294.,Lighthouse and Informer,,,A blank Lighthouse and Informer receipt.,...,"19 Received from RECEIVED OF ""Shedding Ligh...",121,False,False,False,False,True,[],[],"19 Received from RECEIVED OF ""Shedding Ligh..."
2,"The Lighthouse Solicitor's Record, Home Office...",,"McCray, John Henry, 1910-1987",,,Manuscripts; Accession 11294.,Lighthouse and Informer,,,"A solicitor's record, home office order, and s...",...,SOLICITOR'S RECORD Name Address City St...,392,False,False,False,False,True,[],[],SOLICITOR'S RECORD Name Address City St...
3,The Lighthouse Remittance Envelope(Back),,"McCray, John Henry, 1910-1987",,,Manuscripts; Accession 11294.,Lighthouse and Informer,,,The back of a remittance envelope for the Ligh...,...,,0,False,False,False,True,False,[Empty Transcript],[],
4,The Lighthouse Remittance Envelope(Front),,"McCray, John Henry, 1910-1987",,,Manuscripts; Accession 11294.,Lighthouse and Informer,,,The front of a Lighthouse remittance envelope.,...,Sender's Address Shedding Light For A Growing...,72,False,False,False,False,True,[],[],Sender's Address Shedding Light For A Growing...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19324,Arthur Clement Family,,"John Henry McCray, 1910-1987",,,"Manuscripts Annex, Letter size papers, 1932 to...","McCray, John Henry, 1910-1987;Pictoral works",,,A photograph of Arthur Clement Family celebrat...,...,,0,False,False,False,True,False,[Empty Transcript],[],
19325,Law School Graduation Group Photograph,,"John Henry McCray, 1910-1987",,,"Manuscripts Annex, Letter size papers, 1932 to...","McCray, John Henry, 1910-1987;Pictoral works",,,A group photograph possible on a graduation da...,...,,0,False,False,False,True,False,[Empty Transcript],[],
19326,Arthur Clement Family celebrating Law School g...,,"John Henry McCray, 1910-1987",,,"Manuscripts Annex, Letter size papers, 1932 to...","McCray, John Henry, 1910-1987;Pictoral works",,,A second photograph of the Arthur Clement Fami...,...,,0,False,False,False,True,False,[Empty Transcript],[],
19327,"Arthur Clement Family posing in Columbia, Sout...",,"John Henry McCray, 1910-1987",,,"Manuscripts Annex, Letter size papers, 1932 to...","McCray, John Henry, 1910-1987;Pictoral works",,,Three men celebating a recent graduation from ...,...,,0,False,False,False,True,False,[Empty Transcript],[],


### Preprocessing to turn original transcripts with issues into semi-clean transcripts

In [21]:
# Pattern removal and condensing (condensing may eventually replace pattern removal)
# Only rows that failed the quality check but are not short/empty get cleaned; every other row is "".
# NOTE(review): detection passes OCR_Clean.general_exceptions to find_sequences, but the removal
# below does not consult them - confirm that excepted sequences are meant to be deleted here
needs_cleaning = mccray['Quality'].eq(False) & mccray['Short/No Transcript'].eq(False)

def _remove_artifacts(text):
    """Replace letter_pattern matches with their first group, then delete special and general pattern matches."""
    text = re.sub(OCR_Clean.letter_pattern, r'\1', text)
    text = re.sub(OCR_Clean.special_pattern, "", text)
    return re.sub(OCR_Clean.general_pattern, "", text)

mccray['Cleaning'] = mccray['Original Transcript'].where(needs_cleaning, "").map(_remove_artifacts)

def combine_transcripts(row):
    """Pick the text to keep for one row.

    The existing semi-clean transcript wins when it is not the empty string;
    otherwise the 'Cleaning' value is used when it is not the empty string;
    otherwise '' is returned.
    """
    semi_clean = row[sc_col]
    if semi_clean != '':
        return semi_clean

    cleaned = row['Cleaning']
    return cleaned if cleaned != '' else ''

mccray[sc_col] = mccray.apply(combine_transcripts, axis='columns')

# 'Cleaning' was only a scratch column for the merge above
mccray = mccray.drop(columns='Cleaning')

### Basic Cleaning / standardization / Preprocessing of Semi-clean

In [22]:
# Pattern p1 removes "^" artifacts (with a short attached phrase) and standalone punctuation
# that is not attached to words/numbers:
#
#   \^(\w{0,4})   a "^" followed by 0-4 word characters (e.g. ^, ^a, ^ok, ^xyz, ^fele)
#   |             OR
#   (?<!\w)       negative lookbehind: the previous character is NOT a word character
#   [^\w\s]       one character that is neither a word character nor whitespace (i.e. punctuation)
#   (?!\w)        negative lookahead: the next character is NOT a word character
#
# NOTE(review): for a longer word such as "^hello" only "^hell" is removed, leaving "o" behind -
# confirm that this is the intended behaviour
p1 = r"\^(\w{0,4})|(?<!\w)[^\w\s](?!\w)"

def _strip_p1(text):
    return re.sub(p1, "", text)

mccray[sc_col] = mccray[sc_col].map(_strip_p1)

# Remove an unclosed "(" together with the letters/word directly attached to it.
# A bracket counts as unclosed when no ")" appears anywhere after it.
# The earlier pattern r'\([^)]*$' consumed everything up to the end of the string, so a single
# stray "(" erased the whole remainder of the transcript instead of just the attached word.
#
#   \(             a literal opening bracket
#   (?![^)]*\))    negative lookahead: no closing bracket later in the text
#   \w*            the word characters directly attached to the bracket
p2 = r'\((?![^)]*\))\w*'

mccray[sc_col] = mccray[sc_col].apply(lambda x: re.sub(p2, "", x))

# Clean spacing (with auto newlining)
def normalize_spaces(text):
    """Collapse runs of spaces: 2-3 spaces become one space, 4 or more become a newline."""
    def _replacement(match):
        run_length = match.end() - match.start()
        if run_length > 3:
            return "\n"
        return " "

    return re.compile(r" {2,}").sub(_replacement, text)

# Run the space normalization over every semi-clean transcript
mccray[sc_col] = mccray[sc_col].map(normalize_spaces)

### Final Cleaning of 'Semi-Clean Transcript' Column
- ##### Create set of all words from 'Title' Column
- ##### Use this set for spello dict
- ##### Apply spello
- ##### Find a more advanced method for normalizing spacing for readability (less important)

In [23]:
# # Spello is a spell correcting tool that accounts for domain specific language

# # init model
# sp = SpellCorrectionModel(language='en')

# model_loc = r"C:/Users/zagsk/code/AspireAI_LLM_Project/venv/spello_model/"

# # # create spello dict & train spello model (only once)
# # # Build dict from 'Title' column (and Description for second test)
# spello_build.extract_words(mccray, ["Title", "Description"], Paths.mccray_spello_dictionary)
# training_dict = spello_build.load_spello_dictionary(Paths.mccray_spello_dictionary)
# sp.train(training_dict)
# sp.save(model_loc)

# # model load and settings
# sp.load(model_loc + "model.pkl")

# # confidence needed by model to confirm a change
# sp.config.confidence_threshold = 0.90

# # max edit distance (e.g. For words of length 6, the model can correct them with an edit distance of up to 3)
# sp.config.symspell_allowed_distance_map = {
#     2: 1,
#     3: 1,
#     4: 2,
#     5: 3,
#     6: 3,
#     7: 4,
#     8: 4,
#     9: 5,
#     10: 5,
#     11: 5,
#     12: 5,
#     13: 6,
#     14: 6,
#     15: 6,
#     16: 6,
#     17: 6,
#     18: 6,
#     19: 6,
#     20: 6,
# }

In [24]:
# def correct_text_columns(mccray, model, column_names, suffix=' Spello Output'):
#         """
#         Apply spell correction to specified columns in a DataFrame.
        
#         Args:
#             mccray (DataFrame): Input DataFrame
#             model (SpellCorrector): Configured spello model
#             column_names (list): List of column names to correct
#             suffix (str): Suffix to add to corrected column names
            
#         Returns:
#             DataFrame: DataFrame with corrected columns added
#         """
#         if model is None:
#             print("Error: No valid model provided")
#             return mccray
        
#         df_corrected = mccray.copy()
        
#         for col in column_names:
#             if col not in mccray.columns:
#                 print(f"Warning: Column '{col}' not found in DataFrame")
#                 continue
            
#             print(f"Correcting spelling in column: {col}")
#             corrected_col_name = f"{col}{suffix}"
            
#             # Apply correction to each cell
#             corrected_values = []
#             for idx, cell_value in enumerate(mccray[col]):
#                 if pd.notna(cell_value) and str(cell_value).strip():
#                     try:
#                         # Convert to string and correct
#                         text = str(cell_value)
#                         corrected_text = model.spell_correct(text)
#                         corrected_values.append(corrected_text)
#                     except Exception as e:
#                         print(f"Warning: Error correcting row {idx} in column {col}: {e}")
#                         corrected_values.append(text)  # Keep original if correction fails
#                 else:
#                     corrected_values.append(cell_value)  # Keep NaN/empty values as-is
            
#             df_corrected[corrected_col_name] = corrected_values
#             print(f"    Created column: {corrected_col_name}")
        
#         return df_corrected

In [25]:
# # Apply spello
# # df_corrected = correct_text_columns(mccray, sp, [sc_col])

# # Rename and drop
# # mccray = df_corrected
# cor_col = sc_col + '  Spello Output'


In [26]:
# print(cor_col)
# print(list(mccray))

In [27]:
# # Improve output of spello 
# corrected_texts = []
# correction_dicts = []

# for spello_value in mccray[cor_col]:
#     try:
#         record = ast.literal_eval(spello_value)
#         corrected_texts.append(record.get('spell_corrected_text', ''))
#         correction_dicts.append(record.get('correction_dict', {}))
#     except:
#         corrected_texts.append('')
#         correction_dicts.append({})

# # Add new columns to your dataframe
# mccray['Corrections'] = correction_dicts
# mccray['Clean Transcript'] = corrected_texts

In [28]:
# Show the frame as it stands after the semi-clean preprocessing cells
display(mccray)

Unnamed: 0,Title,Creator,Contributors,Date,Approximate Date,Source,Subject,Local Subject,S.C. County,Description,...,Original Transcript,Original Len,Special Pattern,General Pattern,Repeat Chars,Short/No Transcript,Quality,Issue Types,Detected Artifacts,Semi-clean Transcript
0,Afro-American Newsboy Application signed by Mr...,,"McCray, John Henry, 1910-1987",,,Manuscripts; Accession 11294.,Lighthouse and Informer,,,An application to be a Newsboy for the Afro-Am...,...,-5491 AFRO-AMERICAN NEWSBOY'S APPLICATION ...,213,False,False,False,False,True,[],[],-5491 AFRO-AMERICAN NEWSBOY'S APPLICATION I he...
1,Lighthouse Informer receipt,,"McCray, John Henry, 1910-1987",,,Manuscripts; Accession 11294.,Lighthouse and Informer,,,A blank Lighthouse and Informer receipt.,...,"19 Received from RECEIVED OF ""Shedding Ligh...",121,False,False,False,False,True,[],[],"19 Received from RECEIVED OF ""Shedding Light F..."
2,"The Lighthouse Solicitor's Record, Home Office...",,"McCray, John Henry, 1910-1987",,,Manuscripts; Accession 11294.,Lighthouse and Informer,,,"A solicitor's record, home office order, and s...",...,SOLICITOR'S RECORD Name Address City St...,392,False,False,False,False,True,[],[],SOLICITOR'S RECORD Name Address City State Amo...
3,The Lighthouse Remittance Envelope(Back),,"McCray, John Henry, 1910-1987",,,Manuscripts; Accession 11294.,Lighthouse and Informer,,,The back of a remittance envelope for the Ligh...,...,,0,False,False,False,True,False,[Empty Transcript],[],
4,The Lighthouse Remittance Envelope(Front),,"McCray, John Henry, 1910-1987",,,Manuscripts; Accession 11294.,Lighthouse and Informer,,,The front of a Lighthouse remittance envelope.,...,Sender's Address Shedding Light For A Growing...,72,False,False,False,False,True,[],[],Sender's Address Shedding Light For A Growing ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19324,Arthur Clement Family,,"John Henry McCray, 1910-1987",,,"Manuscripts Annex, Letter size papers, 1932 to...","McCray, John Henry, 1910-1987;Pictoral works",,,A photograph of Arthur Clement Family celebrat...,...,,0,False,False,False,True,False,[Empty Transcript],[],
19325,Law School Graduation Group Photograph,,"John Henry McCray, 1910-1987",,,"Manuscripts Annex, Letter size papers, 1932 to...","McCray, John Henry, 1910-1987;Pictoral works",,,A group photograph possible on a graduation da...,...,,0,False,False,False,True,False,[Empty Transcript],[],
19326,Arthur Clement Family celebrating Law School g...,,"John Henry McCray, 1910-1987",,,"Manuscripts Annex, Letter size papers, 1932 to...","McCray, John Henry, 1910-1987;Pictoral works",,,A second photograph of the Arthur Clement Fami...,...,,0,False,False,False,True,False,[Empty Transcript],[],
19327,"Arthur Clement Family posing in Columbia, Sout...",,"John Henry McCray, 1910-1987",,,"Manuscripts Annex, Letter size papers, 1932 to...","McCray, John Henry, 1910-1987;Pictoral works",,,Three men celebating a recent graduation from ...,...,,0,False,False,False,True,False,[Empty Transcript],[],


In [29]:
# Column groups that can optionally be removed
# (the drop calls at the bottom of this cell are currently commented out)

# Spello output column - empty while spello is disabled
# l0 = [cor_col]
l0 = []

# Option to remove artifact / issue counts
l1 = [
    'Original Len',
    'Special Pattern',
    'General Pattern',
    'Repeat Chars',
    'Short/No Transcript',
    'Quality',
    'Issue Types',
    'Detected Artifacts',
]

# Option to remove other base (original) columns that are not really relevant for our use cases
l2 = [
    "Contributors",
    "Date",
    "Approximate Date",
    "Source",
    "Subject",
    "Local Subject",
    "S.C. County",
    # "Description",
    "Extent",
    "Digital Collection",
    "Website",
    "Contributing Institution",
    "Rights",
    "Time Period",
    "Geographic Location",
    "Language",
    "Digitization Specifications",
    "Date Digital",
    "Type",
    "Format",
    "Media Type",
    "Identifier",
    "Note",
    "Digital Assistant",
    "OCLC number",
    "Date created",
    "Date modified",
    # "Reference URL",
    # "CONTENTdm number",
    "CONTENTdm file name",
    "CONTENTdm file path",
]

# Save before any removal. The index is renumbered to start at 2 so it lines up with the
# row numbers seen when the file is viewed in Excel.
# NOTE(review): save_csv writes with index=False, so this index only shows up in the
# display below, not in the CSV - confirm that is intended
mccray.index = pd.RangeIndex(start=2, stop=2 + len(mccray))
save_csv(mccray, Paths.mccray_modified_metadata)


# Optional column removal (uncomment the groups to drop)
# # l0
# mccray = mccray.drop(columns=l0, errors='ignore')

# # l1
# mccray = mccray.drop(columns=l1, errors='ignore')

# # l2
# mccray = mccray.drop(columns=l2, errors='ignore')

display(mccray)

Unnamed: 0,Title,Creator,Contributors,Date,Approximate Date,Source,Subject,Local Subject,S.C. County,Description,...,Original Transcript,Original Len,Special Pattern,General Pattern,Repeat Chars,Short/No Transcript,Quality,Issue Types,Detected Artifacts,Semi-clean Transcript
2,Afro-American Newsboy Application signed by Mr...,,"McCray, John Henry, 1910-1987",,,Manuscripts; Accession 11294.,Lighthouse and Informer,,,An application to be a Newsboy for the Afro-Am...,...,-5491 AFRO-AMERICAN NEWSBOY'S APPLICATION ...,213,False,False,False,False,True,[],[],-5491 AFRO-AMERICAN NEWSBOY'S APPLICATION I he...
3,Lighthouse Informer receipt,,"McCray, John Henry, 1910-1987",,,Manuscripts; Accession 11294.,Lighthouse and Informer,,,A blank Lighthouse and Informer receipt.,...,"19 Received from RECEIVED OF ""Shedding Ligh...",121,False,False,False,False,True,[],[],"19 Received from RECEIVED OF ""Shedding Light F..."
4,"The Lighthouse Solicitor's Record, Home Office...",,"McCray, John Henry, 1910-1987",,,Manuscripts; Accession 11294.,Lighthouse and Informer,,,"A solicitor's record, home office order, and s...",...,SOLICITOR'S RECORD Name Address City St...,392,False,False,False,False,True,[],[],SOLICITOR'S RECORD Name Address City State Amo...
5,The Lighthouse Remittance Envelope(Back),,"McCray, John Henry, 1910-1987",,,Manuscripts; Accession 11294.,Lighthouse and Informer,,,The back of a remittance envelope for the Ligh...,...,,0,False,False,False,True,False,[Empty Transcript],[],
6,The Lighthouse Remittance Envelope(Front),,"McCray, John Henry, 1910-1987",,,Manuscripts; Accession 11294.,Lighthouse and Informer,,,The front of a Lighthouse remittance envelope.,...,Sender's Address Shedding Light For A Growing...,72,False,False,False,False,True,[],[],Sender's Address Shedding Light For A Growing ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19326,Arthur Clement Family,,"John Henry McCray, 1910-1987",,,"Manuscripts Annex, Letter size papers, 1932 to...","McCray, John Henry, 1910-1987;Pictoral works",,,A photograph of Arthur Clement Family celebrat...,...,,0,False,False,False,True,False,[Empty Transcript],[],
19327,Law School Graduation Group Photograph,,"John Henry McCray, 1910-1987",,,"Manuscripts Annex, Letter size papers, 1932 to...","McCray, John Henry, 1910-1987;Pictoral works",,,A group photograph possible on a graduation da...,...,,0,False,False,False,True,False,[Empty Transcript],[],
19328,Arthur Clement Family celebrating Law School g...,,"John Henry McCray, 1910-1987",,,"Manuscripts Annex, Letter size papers, 1932 to...","McCray, John Henry, 1910-1987;Pictoral works",,,A second photograph of the Arthur Clement Fami...,...,,0,False,False,False,True,False,[Empty Transcript],[],
19329,"Arthur Clement Family posing in Columbia, Sout...",,"John Henry McCray, 1910-1987",,,"Manuscripts Annex, Letter size papers, 1932 to...","McCray, John Henry, 1910-1987;Pictoral works",,,Three men celebating a recent graduation from ...,...,,0,False,False,False,True,False,[Empty Transcript],[],


In [30]:
# # Save of the data (w/ index starting from 2 to align with actual row # when viewing in excel)
# mccray.index = range(2, 2 + len(mccray))
# mccray.to_csv(Paths.mccray_modified_metadata)