# MNBlackNewspaperIndex Regex Parse

In [33]:
import re #regex
import PyPDF2 #read in pdf
import shutil # copy working file
import pandas as pd


In [24]:
# Load the Acrobat-OCR'd PDF. Per the original note, the PDF text keeps the
# newline formatting of the source, which is easier to parse than the .txt.
with open('MNBlackNewspaperIndex.pdf', 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    # Build the list in one pass: one extracted-text entry per page, in page order
    ocr_text = [page.extract_text() for page in reader.pages]


In [20]:
# Sanity check: ocr_text gets one entry appended per PDF page, so its length
# should match the number of pages in the PDF (expected: 129)
len(ocr_text)

129

In [58]:
# Save the extracted text to a raw text file; a single newline is used as the
# break between consecutive entries of ocr_text
with open('MNBlackNewspaperIndexRaw.txt', 'w') as outfile:
    page_break = "\n"
    outfile.write(page_break.join(ocr_text))

In [59]:
# # write out a copy of the text file to edit and work on
# # Do this once and comment it out to prevent overwrite

# source_file = 'MNBlackNewspaperIndexRaw.txt'  # Path to the source file
# destination_file = 'MNBlackNewspaperIndexWorking.txt'  # Path to the destination file (renamed copy)

# # Make a copy of the file with the new name
# shutil.copyfile(source_file, destination_file)


'MNBlackNewspaperIndexWorking.txt'

In [79]:
# Read the working copy and key every whitespace-stripped line by its
# 0-based position in the file.
# NOTE: the name `dict` shadows the Python builtin; it is kept because the
# DataFrame cell reads this variable by that name.
with open('MNBlackNewspaperIndexWorking.txt', 'r') as file:
    lines = file.readlines()
    dict = {i: line.strip() for i, line in enumerate(lines)}

In [80]:
# Build a one-column DataFrame from the {line position: text} mapping:
# each key becomes a row label and the text goes into the 'blob' column
df = pd.DataFrame.from_dict(
    data=dict,
    orient='index',
    columns=['blob'],
)

In [81]:
# Display the frame built from the working file (single 'blob' column at this point)
df


Unnamed: 0,blob
0,Advertisement Rural real estate offered for sa...
1,"Albright, Joseph Duluthian gets WPA post TCH 1..."
2,"Albright, Joseph Open letter calls for statewi..."
3,"Albright, Joseph Part of Black democrats worki..."
4,"Albright, Joseph Addresses FL convention in Du..."
...,...
5896,YMCA/YWCA Report on bias at U of M NWM 6-10-19...
5897,"Young, Jeremiah U of M professor refuses seat ..."
5898,"Young, Jeremiah Prof apparently reprimanded fo..."
5899,"Young, Joe Former TC real estate agent kills h..."


In [82]:
# Create new columns for the regex groups: run a string extract on the 'blob'
# column of the DataFrame and store each named group in its own column.
# Rows that do not match the pattern get NaN in every new column.
#   blob1          - everything before the newspaper code
#   publication    - newspaper code: two or more capital letters
#   month/day/year - date written as M-D-YYYY
#   page           - the whole token after "p"; \w (not \d) because values such
#                    as "l" show up in the data (presumably OCR for "1")
# The pattern is a raw string so \d, \s and \w are passed to the regex engine
# instead of being treated as (invalid) string escapes, and page uses \w+ so a
# multi-character page number is not truncated to its first character.
entry_pattern = (
    r'^(?P<blob1>.+) '
    r'(?P<publication>[A-Z]{2,}) '
    r'(?P<month>\d+)-(?P<day>\d+)-(?P<year>\d{4}) '
    r'p\s?(?P<page>\w+)'
)
df[['blob1', 'publication', 'month', 'day', 'year', 'page']] = df['blob'].str.extract(entry_pattern)

In [83]:
# Display the frame again, now including the columns filled by the regex extract
df

Unnamed: 0,blob,blob1,publication,month,day,year,page
0,Advertisement Rural real estate offered for sa...,,,,,,
1,"Albright, Joseph Duluthian gets WPA post TCH 1...",,,,,,
2,"Albright, Joseph Open letter calls for statewi...","Albright, Joseph Open letter calls for statewi...",MSP,5,1,1936,1
3,"Albright, Joseph Part of Black democrats worki...","Albright, Joseph Part of Black democrats worki...",MSP,9,4,1936,1
4,"Albright, Joseph Addresses FL convention in Du...","Albright, Joseph Addresses FL convention in Du...",MSP,4,1,1938,l
...,...,...,...,...,...,...,...
5896,YMCA/YWCA Report on bias at U of M NWM 6-10-19...,YMCA/YWCA Report on bias at U of M,NWM,6,10,1930,1
5897,"Young, Jeremiah U of M professor refuses seat ...","Young, Jeremiah U of M professor refuses seat ...",,4,22,1921,l
5898,"Young, Jeremiah Prof apparently reprimanded fo...","Young, Jeremiah Prof apparently reprimanded fo...",MM,5,7,1921,4
5899,"Young, Joe Former TC real estate agent kills h...","Young, Joe Former TC real estate agent kills h...",MSP,1,15,1943,l


In [84]:
# Select the rows the regex failed on (no 'blob1' captured) so the errors can
# be found and manually cleaned/corrected
df.loc[df['blob1'].isna()]

Unnamed: 0,blob,blob1,publication,month,day,year,page
0,Advertisement Rural real estate offered for sa...,,,,,,
1,"Albright, Joseph Duluthian gets WPA post TCH 1...",,,,,,
8,"Albright, Joseph Gives strong address at Eucha...",,,,,,
11,Baseball Bertha John Donaldson again playing f...,,,,,,
12,Baseball Calgary Black Sox beat white Stillwat...,,,,,,
...,...,...,...,...,...,...,...
5697,"Wigington, Clarence Black architect ad for hom...",,,,,,
5794,"Wilkins, Roy To speak at local NAACP gathering...",,,,,,
5800,"Williams, Billy Photo, St. Paul Amateur Baseba...",,,,,,
5803,"Williams, Billy Article with photo TCS 7-22-1911",,,,,,


## Desired Output
- spreadsheet
    - columns
        - subj_name
        - description
        - newspaper
        - year
        - date
        - page

## Notes on text
- the first few lines are the key to the newspaper abbreviations
    - remove
- if name
    - Last, First
- if subject
    - SubjectA SubjectB
        - how to handle this? Only capture first word? exceptions? if else?

## Approach

- newspaper
    - do a find and replace to get full titles based on key

In [19]:
# subject_names = []  # List to store the extracted subject names

# pattern = r'^(\w+\s+\w+)'

# for line in ocr_text:
#     match = re.match(pattern, line)
#     if match:
#         subject_name = match.group(1)
#         subject_names.append(subject_name)

# # Print the extracted subject names
# for subject_name in subject_names:
#     print(subject_name)

Bias Schools
Duluth Lynchings
Minnesota Black
Baseball Bertha
Baseball Miami
Baseball St
Basketball Harlem
Bias Defense
Bias Department
Bias Hotels
Bias Religion
Bias U
Birth of
Black Press
Black Press
Black Press
Black Press
Black Press
Black Press
Black Press
Black Press
Black Press
Black Press
Boxing Black
Communists Black
Credjafawn Club
Crime 6
Crime Trial
Crime Newman
Duluth Lynchings
Elks Ames
Federation of
Film Further
Football William
Football U
Gambling Morals
Green Pastures
Housing Bias
Iowa Motion
Jewish Relations
KKK Mpls
Labor 11
Labor Elevator
Labor Editorial
Labor 1st
Labor Black
Restaurant bias
Restaurant bias
State Fair
Sumner Field
Theater Traveling
Urban League
Urban League
West Hotel
