# Removal of Messy Chars for transcript tasking

In [9]:
import pandas as pd
import re
from utils.constants import Paths
from utils.ocr_cleaning import OCR_Clean

In [10]:
df = pd.read_excel(Paths.mccray_folder + r'changed_data/decade_subsets/McCray (1940s, any_messy = True).xlsx')
df

Unnamed: 0.1,Unnamed: 0,Title,Creator,Contributors,Date,Approximate Date,Source,Subject,Local Subject,S.C. County,...,has_messy,total_messy_chars,special_messy_sequences,special_messy_count,has_special_messy,total_special_messy_chars,repeat_sequences,repeat_sequence_count,has_repeat_sequence,any_messy
0,720,Certificate that Mr. Henry Smith is a member o...,,"McCray, John Henry, 1910-1987","February, 1942",,Manuscripts; Accession 11294.,NAACP,,,...,True,70,"['ΓÇó$', '%\\', '^^*^^^', '^^^', '^^', '^┬╗', ...",7,True,25,['l'],1,True,True
1,735,"The Lighthouse and Informer paper notes, Octob...",,"McCray, John Henry, 1910-1987",1941-10-19 00:00:00,,Manuscripts; Accession 11294.,The Lighthouse & Informer,,,...,True,65,"['┬ú', '^^', '┬ú', '&*~', '┬ú', '┬ú*']",6,True,14,['j'],1,True,True
2,736,"The Lighthouse and Informer paper notes, Octob...",,"McCray, John Henry, 1910-1987",1941-10-19 00:00:00,,Manuscripts; Accession 11294.,The Lighthouse & Informer,,,...,True,89,"['┬ú&', '┬ú', '^^', '^^^^^', '^^', '&^&']",6,True,17,[],0,False,True
3,737,"The Lighthouse and Informer paper notes, Febru...",,"McCray, John Henry, 1910-1987",1944-02-06 00:00:00,,Manuscripts; Accession 11294.,The Lighthouse & Informer,,,...,True,107,"['^~', '┬░', '^^', '&┬ú#', 'Γûá', 'Γûá']",6,True,16,[],0,False,True
4,738,"The Lighthouse and Informer paper notes, Febru...",,"McCray, John Henry, 1910-1987",1944-02-06 00:00:00,,Manuscripts; Accession 11294.,The Lighthouse & Informer,,,...,True,153,"['Γûá', '^*^', '^&', '┬ú', '~=', '┬ú', '┬ú']",7,True,16,[],0,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3379,14138,"Letter from Mildred Chestnut, Secretary at the...","Chestnut, Mildred (Secretary for Lighthouse an...","McCray, John Henry, 1910-1987",1946-09-25 00:00:00,,Manuscripts; Accession 11294.,Lighthouse and Informer,,,...,True,5,"['┬░', '┬«*']",2,True,5,[],0,False,True
3380,14140,"Letter from Mildred Chestnut, Secretary at the...","Chestnut, Mildred (Secretary for Lighthouse an...","McCray, John Henry, 1910-1987",1946-09-27 00:00:00,,Manuscripts; Accession 11294.,Lighthouse and Informer,,,...,False,0,['┬Ñ'],1,True,2,[],0,False,True
3381,14142,"Letter from John H. McCray to Dr. E. J. Cling,...","McCray, John Henry, 1910-1987","McCray, John Henry, 1910-1987",1946-09-28 00:00:00,,Manuscripts; Accession 11294.,Lighthouse and Informer,,,...,True,5,"['┬ú&ΓÇó', '┬úΓÇó', 'ΓÇó', '┬½']",4,True,16,[],0,False,True
3382,14143,Letter from Mrs. E.M. Parker to Mildred Chestn...,"McCray, John Henry, 1910-1987","McCray, John Henry, 1910-1987",1946-09-30 00:00:00,,Manuscripts; Accession 11294.,Lighthouse and Informer,,,...,False,0,['ΓÇÖ'],1,True,3,[],0,False,True


In [None]:
# Special pattern removal
df['semiclean'] = df['Transcript'].apply(lambda x: re.sub(OCR_Clean.special_pattern, "", x))

In [19]:
# General pattern removal
df['semiclean'] = df['semiclean'].apply(lambda x: re.sub(OCR_Clean.general_pattern, "", x))

In [20]:
# Repeated char condenser
df['semiclean'] = df['semiclean'].apply(lambda x: re.sub(OCR_Clean.letter_pattern, r'\1', x))

In [None]:
# Remove other unclean

# Regex pattern for this:
# To remove ^ and ^shortphrases, and remove standalone punctuation not attached to words/numbers.

# \^(\w{0,4})   matches ^ followed by 0–4 word characters (short phrases like ^a, ^ok, ^xyz, ,^fele or just ^)
# |             OR
# (?<!\w)       negative lookbehind: ensures the char before is NOT a word character
# [^\w\s]       matches any single character that is NOT a word char and NOT whitespace (i.e., punctuation)
# (?!\w)        negative lookahead: ensures the char after is NOT a word character


p1 = r"\^(\w{0,4})|(?<!\w)[^\w\s](?!\w)"

df['semiclean'] = df['semiclean'].apply(lambda x: re.sub(p1, "", x))
df['semiclean']

0         i Class) $1.00      (2    thai  .r.Hpnry  Sm...
1        laa-Ji          Bryb &lt; jg  I   l-O:  f  \s...
2       e-   3    m                     pirn  }ZT:  &l...
3       top  qqu\ UJachdsooih     l     -fLtuLojr   CA...
4       p I  rlyAjxJi CUx Uaama   .fat, H CwMvvu.  PVL...
                              ...                        
3379    September 25, 1946    ?R!V3 Newspaper Syndicat...
3380    September 27, W46    fthelyn  Psxkex  163a Lin...
3381    Sept. 28, 1946      J. Cling     583 King St. ...
3382    loja Line, St.    Charleston, S.C    Sept. 30,...
3383    1. I. Bash    Dalle's B. Salon    Conaway Drug...
Name: semiclean, Length: 3384, dtype: object

In [24]:
# Remove ( and connected letter/word if there is no closing bracket
p2 = r'\([^)]*$'

df['semiclean'] = df['semiclean'].apply(lambda x: re.sub(p2, "", x))
df['semiclean']

0                                    i Class) $1.00      
1        laa-Ji          Bryb &lt; jg  I   l-O:  f  \s...
2       e-   3    m                     pirn  }ZT:  &l...
3       top  qqu\ UJachdsooih     l     -fLtuLojr   CA...
4       p I  rlyAjxJi CUx Uaama   .fat, H CwMvvu.  PVL...
                              ...                        
3379    September 25, 1946    ?R!V3 Newspaper Syndicat...
3380    September 27, W46    fthelyn  Psxkex  163a Lin...
3381    Sept. 28, 1946      J. Cling     583 King St. ...
3382    loja Line, St.    Charleston, S.C    Sept. 30,...
3383    1. I. Bash    Dalle's B. Salon    Conaway Drug...
Name: semiclean, Length: 3384, dtype: object

In [26]:
# Clean spacing (with auto newlining)

def normalize_spaces(text):
    return re.sub(r" {2,}", lambda m: "\n" if len(m.group(0)) > 3 else " ", text)

df['semiclean'] = df['semiclean'].apply(normalize_spaces)
df['semiclean']

0                                        i Class) $1.00\n
1        laa-Ji\nBryb &lt; jg I l-O: f \s\tf\jhL taw c...
2       e- 3\nm\npirn }ZT: &lt;\n'as\nSTL\ny\nCuaiup y...
3       top qqu\ UJachdsooih\nl\n-fLtuLojr CAjjtLcm A4...
4       p I rlyAjxJi CUx Uaama .fat, H CwMvvu. PVLtTL ...
                              ...                        
3379    September 25, 1946\n?R!V3 Newspaper Syndicate ...
3380    September 27, W46\nfthelyn Psxkex 163a Line St...
3381    Sept. 28, 1946\nJ. Cling\n583 King St.\nCharle...
3382    loja Line, St.\nCharleston, S.C\nSept. 30,1946...
3383    1. I. Bash\nDalle's B. Salon\nConaway Drugs\nO...
Name: semiclean, Length: 3384, dtype: object

In [28]:
# Save
df.to_excel(Paths.mccray_folder + r'changed_data/decade_subsets/McCray (1940s, semi-cleaned).xlsx', index=0)