# MNBlackNewspaperIndex Regex Parse

In [85]:
import re #regex
import PyPDF2 #read in pdf
import shutil # copy working file
import pandas as pd


In [24]:
# Load the Acrobat-OCR'd PDF instead of the .txt export: the PDF keeps the
# newline formatting of the original, which is easier to parse.
with open('MNBlackNewspaperIndex.pdf', 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    # one extracted-text entry per page, in page order
    ocr_text = [page.extract_text() for page in reader.pages]


In [20]:
# Sanity check: ocr_text holds one entry per PDF page, so its length should
# match the number of pages in the source PDF (expected: 129).
len(ocr_text)

129

In [58]:
# Dump the extracted pages to a raw text file; pages are joined with a
# newline so the line ending acts as the break between them.
raw_dump = "\n".join(ocr_text)
with open('MNBlackNewspaperIndexRaw.txt', 'w') as outfile:
    outfile.write(raw_dump)

In [59]:
# # write out a copy of the text file to edit and work on
# # Do this once and comment it out to prevent overwrite

# source_file = 'MNBlackNewspaperIndexRaw.txt'  # Path to the source file
# destination_file = 'MNBlackNewspaperIndexWorking.txt'  # Path to the destination file (renamed copy)

# # Make a copy of the file with the new name
# shutil.copyfile(source_file, destination_file)


'MNBlackNewspaperIndexWorking.txt'

In [86]:
# read in the working file 
with open('MNBlackNewspaperIndexWorking.txt', 'r') as file:
    lines = file.readlines()
    dict = {}
    for i, line in enumerate(lines):
        dict[i] = line.strip()

In [87]:
# Convert the dictionary to a DataFrame
df = pd.DataFrame.from_dict(dict, orient='index', columns=['blob'])

In [88]:
# Preview the frame: one row per line of the working file, text in 'blob'.
df


Unnamed: 0,blob
0,Advertisement Rural real estate offered for sa...
1,"Albright, Joseph Duluthian gets WPA post TCH 1..."
2,"Albright, Joseph Open letter calls for statewi..."
3,"Albright, Joseph Part of Black democrats worki..."
4,"Albright, Joseph Addresses FL convention in Du..."
...,...
5853,YMCA/YWCA Report on bias at U of M NWM 6-10-19...
5854,"Young, Jeremiah U of M professor refuses seat ..."
5855,"Young, Jeremiah Prof apparently reprimanded fo..."
5856,"Young, Joe Former TC real estate agent kills h..."


In [96]:
# Create new columns for the regex groups by running a string extract on the
# 'blob' column of the DataFrame.
#
# The pattern is written as raw-string pieces so that \d, \s and \w reach the
# regex engine untouched (in a plain string literal they are invalid escape
# sequences and trigger a warning). The pieces concatenate to a single pattern:
#   blob1        everything before the citation (greedy, so it runs up to the
#                last place the rest of the pattern can still match)
#   publication  two or more capital letters
#   month-day-year   digits, digits, exactly four digits
#   page / column    a single word character after "p" / "c", optional space
# Rows that do not match the whole pattern get NaN in every new column.
citation_pattern = (
    r'^(?P<blob1>.+) '
    r'(?P<publication>[A-Z]{2,}) '
    r'(?P<month>\d+)-(?P<day>\d+)-(?P<year>\d{4}) '
    r'p\s?(?P<page>\w) '
    r'c\s?(?P<column>\w)'
)
df[['blob1', 'publication', 'month', 'day', 'year', 'page', 'column']] = df['blob'].str.extract(citation_pattern)

In [97]:
# Spot-check the first ten rows; a row the pattern did not match shows NaN in
# every extracted column.
df.head(10)

Unnamed: 0,blob,blob1,publication,month,day,year,page,column
0,Advertisement Rural real estate offered for sa...,Advertisement Rural real estate offered for sale,AP,11.0,27.0,1915.0,4.0,2.0
1,"Albright, Joseph Duluthian gets WPA post TCH 1...","Albright, Joseph Duluthian gets WPA post",TCH,12.0,21.0,1935.0,1.0,2.0
2,"Albright, Joseph Open letter calls for statewi...","Albright, Joseph Open letter calls for statewi...",MSP,5.0,1.0,1936.0,1.0,6.0
3,"Albright, Joseph Part of Black democrats worki...","Albright, Joseph Part of Black democrats worki...",MSP,9.0,4.0,1936.0,1.0,6.0
4,"Albright, Joseph Addresses FL convention in Du...","Albright, Joseph Addresses FL convention in Du...",MSP,4.0,1.0,1938.0,1.0,4.0
5,"Albright, Joseph Loses his state accounting jo...","Albright, Joseph Loses his state accounting jo...",MSP,8.0,4.0,1939.0,1.0,5.0
6,"Albright, Joseph Involved in messy divorce fro...","Albright, Joseph Involved in messy divorce fro...",MSP,10.0,10.0,1939.0,1.0,4.0
7,"Albright, Joseph Gets local job with food stam...","Albright, Joseph Gets local job with food stam...",MSP,12.0,1.0,1939.0,1.0,6.0
8,"Albright, Joseph Gives strong address at Eucha...",,,,,,,
9,"Albright, Joseph Update on his California mili...","Albright, Joseph Update on his California mili...",MSP,2.0,19.0,1943.0,3.0,4.0


In [98]:
# Pull out the rows the regex could not parse (blob1 is NaN) so the errors can
# be found and manually cleaned / corrected.

unparsed_mask = df['blob1'].isna()
df.loc[unparsed_mask]

Unnamed: 0,blob,blob1,publication,month,day,year,page,column
8,"Albright, Joseph Gives strong address at Eucha...",,,,,,,
14,Baseball Douglass Base Ball Club AP 6-12-1897 p3,,,,,,,
190,Real estate Big real estate ad luring Blacks t...,,,,,,,
223,"Withers, Rev. M. W. Photo of Fergus Falls Bapt...",,,,,,,
259,"Adams, J. Q. 25th anniversary of Appeal souven...",,,,,,,
...,...,...,...,...,...,...,...,...
5793,"Wilson, Charles Photographic assistant to Harr...",,,,,,,
5803,"Wilson, Woodrow William Trotter speaks against...",,,,,,,
5806,"Winston, Eliza Obit for white man who played r...",,,,,,,
5825,World War I (see also Home Guard),,,,,,,


In [93]:
# Inspect a single parsed row (position 9) field by field.
# NOTE(review): the saved output below has no 'column' field, and this cell's
# execution count (93) is lower than the extract cell's (96) — the output looks
# stale; re-run after the extract cell to confirm.
df.iloc[9]

blob           Albright, Joseph Update on his California mili...
blob1          Albright, Joseph Update on his California mili...
publication                                                  MSP
month                                                          2
day                                                           19
year                                                        1943
page                                                           3
Name: 9, dtype: object

## Desired Output
- spreadsheet
    - columns
        - subj_name
        - description
        - newspaper
        - year
        - date
        - page

## Notes on text
- the first few lines of the text are the key to the newspaper abbreviations
    - remove them before parsing
- if name
    - Last, First
- if subject
    - SubjectA SubjectB
        - how to handle this? Only capture first word? exceptions? if else?

## Approach

- newspaper
    - do a find and replace to get full titles based on key