# 

# Metadata split (transcript length)

In [1]:
import pandas as pd
import re
from utils.nb_metrics import Metrics
from utils.constants import Paths

In [2]:
# Read the modified McCray metadata workbook, list its column names,
# and preview the 'Title' column.
mccray = pd.read_excel(Paths.mccray_modified_metadata)
print(mccray.columns.tolist())
mccray.loc[:, ['Title']]

['Title', 'Creator', 'Contributors', 'Date', 'Approximate Date', 'Source', 'Subject', 'Local Subject', 'S.C. County', 'Description', 'Extent', 'Digital Collection', 'Website', 'Contributing Institution', 'Rights', 'Time Period', 'Geographic Location', 'Language', 'Digitization Specifications', 'Date Digital', 'Type', 'Format', 'Media Type', 'Identifier', 'Note', 'Digital Assistant', 'Transcript', 'OCLC number', 'Date created', 'Date modified', 'Reference URL', 'CONTENTdm number', 'CONTENTdm file name', 'CONTENTdm file path', 'Year', 'messy_sequences', 'messy_count', 'has_messy', 'total_messy_chars']


Unnamed: 0,Title
0,Afro-American Newsboy Application signed by Mr...
1,Lighthouse Informer receipt
2,"The Lighthouse Solicitor's Record, Home Office..."
3,The Lighthouse Remittance Envelope(Back)
4,The Lighthouse Remittance Envelope(Front)
...,...
19324,Arthur Clement Family
19325,Law School Graduation Group Photograph
19326,Arthur Clement Family celebrating Law School g...
19327,"Arthur Clement Family posing in Columbia, Sout..."


In [7]:
# Normalise 'Transcript': missing values (NaN) and any other non-string cell
# become the empty string '', and real strings are stripped of surrounding
# whitespace. After this step an empty string means "no transcript".
# NOTE(review): non-string cells (e.g. a purely numeric transcript) are blanked
# rather than converted to text -- confirm that this is intended.
sample_row = 512  # positional index of a row printed before/after to show the effect
print(mccray.iloc[sample_row]['Transcript'])
# The isinstance check already maps NaN to '', so no separate fillna is needed.
mccray['Transcript'] = mccray['Transcript'].apply(
    lambda x: x.strip() if isinstance(x, str) else ''
)
print(mccray.iloc[sample_row]['Transcript'])
mccray[['Transcript']]

nan



Unnamed: 0,Transcript
0,-5491 AFRO-AMERICAN NEWSBOY'S APPLICATION ...
1,"19 Received from RECEIVED OF ""Shedding Ligh..."
2,SOLICITOR'S RECORD Name Address City St...
3,
4,Sender's Address Shedding Light For A Growing...
...,...
19324,
19325,
19326,
19327,


In [8]:
# Partition the rows by transcript length. Every row is expected to land in
# exactly one of the four groups:
#   mccray_no_transcripts : 0 characters (empty string)
#   mccray_five_char      : 1-5 characters
#   mccray_twenty_char    : 6-20 characters
#   mccray                : more than 20 characters (rebound below)
# NOTE: `mccray` is rebound to the >20-character subset, so this cell is not
# idempotent -- re-run the notebook from the load cell before re-running it.
total_rows = len(mccray)
transcript_len = mccray['Transcript'].str.len()  # computed once, reused by every mask

# Create 'no_transcripts'
mccray_no_transcripts = mccray[transcript_len == 0]

# Create mccray subsets with transcript length <=5 char and 6-20 char
mccray_five_char = mccray[transcript_len.between(1, 5)]
mccray_twenty_char = mccray[transcript_len.between(6, 20)]

# Update mccray to only have transcripts >20 char
mccray = mccray[transcript_len > 20]

# Guard against rows silently falling out of every group (a non-string
# transcript has length NaN and would match none of the masks above).
assert (
    len(mccray_no_transcripts) + len(mccray_five_char)
    + len(mccray_twenty_char) + len(mccray) == total_rows
), "transcript-length groups do not partition the input rows"

# Print <= 5 char transcripts
mccray_five_char[['Transcript']]

Unnamed: 0,Transcript
953,far
2029,I
2046,Γûá
2818,T-i
2821,from.
...,...
17781,0 I
17785,.
18015,V
19267,B


In [9]:
# Metric collection for each split group via the project helper
# Metrics.df_type_counts.
# Judging by the cell output, the helper prints a breakdown by 'Type' and
# returns the group's row count -- TODO confirm against utils.nb_metrics.
labelled_groups = [
    ("with transcript", mccray),
    ("transcript <= five chars", mccray_five_char),
    ("transcript <= twenty chars", mccray_twenty_char),
    ("without transcript (0 char)", mccray_no_transcripts),
]

group_sums = []
for label, frame in labelled_groups:
    print(label)
    group_sums.append(Metrics.df_type_counts(frame))

# Keep the individual totals available under their original names.
sum1, sum2, sum3, sum4 = group_sums

print(f"Total row count: {sum1+sum2+sum3+sum4}")


with transcript
16 rows Type = "still image", 14045 rows Type = "text", 82 rows "still image; text", 2 rows of other types
Sum = 14145

transcript <= five chars
3 rows Type = "still image", 187 rows Type = "text", 0 rows "still image; text", 0 rows of other types
Sum = 190

transcript <= twenty chars
7 rows Type = "still image", 264 rows Type = "text", 0 rows "still image; text", 0 rows of other types
Sum = 271

without transcript (0 char)
1330 rows Type = "still image", 3371 rows Type = "text", 19 rows "still image; text", 3 rows of other types
Sum = 4723

Total row count: 19329


In [10]:
# Display the transcripts of the short-transcript subset `mccray_twenty_char`
mccray_twenty_char.loc[:, ['Transcript']]

Unnamed: 0,Transcript
434,POST CARD
954,G. HIT ΓÇó
1147,JOURNAL No. 1020
1809,* . ...
1884,I /U4- ~2g /
...,...
17756,Γûá┬╗ - -
17787,- I - 4 Γûá -
18004,r 7
18010,WSl EOl'( Γûá


In [11]:
# Print >20 char transcripts (the rows still held in `mccray`)
mccray[['Transcript']]

Unnamed: 0,Transcript
0,-5491 AFRO-AMERICAN NEWSBOY'S APPLICATION ...
1,"19 Received from RECEIVED OF ""Shedding Ligh..."
2,SOLICITOR'S RECORD Name Address City St...
4,Sender's Address Shedding Light For A Growing...
5,"Just Telephone Us, Place your order. It'll be..."
...,...
19316,"September 27, W46 fthelyn ┬Ñ. Psxkex 163a ..."
19317,"Sept. 28, 1946 MX* Ait hoi W* Aiken 429 ..."
19318,"Sept. 28, 1946 ┬ú&ΓÇó ┬úΓÇó J. Cling \ 5..."
19319,"loja Line, St., Charleston, S.C Sept. 30..."


In [None]:
# Write each transcript-length subset to its own Excel workbook, without the
# row index.
# NOTE(review): the path is built by plain string concatenation, so this assumes
# Paths.mccray_folder is a string ending in a path separator -- TODO confirm.
folder = Paths.mccray_folder + r'changed_data/trscp_subsets/'
exports = {
    r'McCray (with transcripts).xlsx': mccray,
    r'McCray (no transcripts).xlsx': mccray_no_transcripts,
    r'McCray (less than 5 letter transcripts).xlsx': mccray_five_char,
    r'McCray (less than 20 letter transcripts).xlsx': mccray_twenty_char,
}
for file_name, subset in exports.items():
    subset.to_excel(folder + file_name, index=False)