# PLSE ETL - Part 2: Scraping PDFs to DB
Note: Using Sample Data Set (count=199)

In [6]:
from PyPDF2 import PdfFileReader

In [7]:
from os import listdir
from os.path import isfile, join

In [8]:
import re

In [9]:
path_to_dockets = '../tmp/sample_dockets'

In [10]:
dockets = [f for f in listdir(path_to_dockets) if isfile(join(path_to_dockets, f))]

In [11]:
def compiled_pattern(pattern):
    return re.compile(r'{}'.format(pattern))

In [12]:
dn = '(?<=Docket\sNumber\:\s)\w{2}\-\d{2}\-\w{2}\-\d{7}\-\d{4}'

In [13]:
fd = '(?<=Date\sFiled\:\s{2})\d{2}\/\d{2}\/\d{4}'

In [53]:
gen_regex = lambda x: re.compile(r'%s' % x)

In [54]:
keys = {
    'DocketNumber': (gen_regex(dn), None),
    'FiledDate': (gen_regex(fd), None),
    'DisposingJudge': (gen_regex(ja), None)
}

In [157]:
ja = '(?<=Judge\sAssigned\n\:\s{2})\w+\,\s\w+\s\w\.(?=Initiation\sDate\:)'

ja_test_strings = ['01/25/1989Judge Assigned\n:  Kafrissen, Arthur S.Initiation Date:']

In [None]:
fml_names_test_strings = ['Commonwealth of Pennsylvaniav.Carl R BurchCASE INFORMATION']

In [152]:
def gen_keys():
    r = {
        'DocketNumber': (gen_regex(dn), None),
        'FiledDate': (gen_regex(fd), None),
        'DisposingJudge': (gen_regex(ja), None)
    }
    
    return r

In [158]:
def scrape_pdf(filename):
    path = '{}/{}'.format(path_to_dockets, filename)
    data_keys = gen_keys()
    
    with open(path, 'rb') as f:
        pdf = PdfFileReader(f)
        #info = pdf.getDocumentInfo()
        page_count = pdf.getNumPages()
        
        print('File Path: {}'.format(f.name))

        for p in range(page_count):
            print('\tPage', p)
            
            page_data = pdf.getPage(p).extractText()
            # @note may have to replace `\n` with `\s`
            print(page_data)
            
            for k in data_keys:
                d = data_keys
                
                if d[k][1] is None:
                    m = re.search(d[k][0], page_data)

                    if m and m.group():
                        print('\t\t{}: {}'.format(k, m.group()))
                        d[k] = (d[k][0], m.group())
            

In [159]:
f = scrape_pdf(dockets[83])

File Path: ../tmp/sample_dockets/MC-51-CR-0129461-1989.pdf
	Page 0
MUNICIPAL COURT OF PHILADELPHIA COUNTY  
DOCKETDocket Number: MC-51-CR-0129461-1989Court CaseCRIMINAL DOCKET
Page 1 of 2Commonwealth of Pennsylvaniav.Carl R BurchCASE INFORMATION
Cross Court Docket Nos:  CP-51-CR-0217891-1989Date Filed:  01/25/1989Judge Assigned
:  Kafrissen, Arthur S.Initiation Date: 01/25/1989 OTN:  M 382476-3Originating Docket No:  LOTN:  Initial Issuing Authority
:  Final Issuing Authority
:  Arthur S. KafrissenArresting Agency
:  Philadelphia PdArresting Officer
:  Affiant
Complaint/Incident #:  Case Local Number Type
(s)Case Local Number(s)M8901294611
Legacy Docket Number8902005732District Control Number8902005732Police Incident NumberSTATUS INFORMATION
Case Status:ClosedArrest Date:01/25/1989Processing StatusStatus Date02/01/1989Completed01/25/1989Migrated Case (Active)01/25/1989Complaint Date:DEFENDANT INFORMATION
Date Of Birth:02/01/1968City/State/Zip:  Phila, PA  
19104CASE PARTICIPANTS
NamePa

In [155]:
for docket in dockets:
    try:
        scrape_pdf(docket)
    except:
      print("Something else went wrong") 

File Path: ../tmp/sample_dockets/MC-51-CR-0307801-2003.pdf
	Page 0
		DocketNumber: MC-51-CR-0307801-2003
		FiledDate: 03/07/2003
	Page 1
	Page 2
	Page 3
	Page 4
File Path: ../tmp/sample_dockets/MC-51-CR-0108981-1996.pdf
	Page 0
		DocketNumber: MC-51-CR-0108981-1996
		FiledDate: 01/12/1996
	Page 1
	Page 2
File Path: ../tmp/sample_dockets/MC-51-CR-0224291-2002.pdf
	Page 0
		DocketNumber: MC-51-CR-0224291-2002
		FiledDate: 02/20/2002
		DisposingJudge: Anderson, Linda F.
	Page 1
File Path: ../tmp/sample_dockets/CP-51-CR-0100651-2003.pdf
	Page 0
		DocketNumber: CP-51-CR-0100651-2003
		FiledDate: 01/10/2003
	Page 1
	Page 2
	Page 3
	Page 4
	Page 5
	Page 6
File Path: ../tmp/sample_dockets/MC-51-CR-1205031-1994.pdf
	Page 0
		DocketNumber: MC-51-CR-1205031-1994
		FiledDate: 12/08/1994
		DisposingJudge: Silberstein, Alan K.
	Page 1
	Page 2
File Path: ../tmp/sample_dockets/CP-51-MD-9992548-2002.pdf
	Page 0
		DocketNumber: CP-51-MD-9992548-2002
		FiledDate: 04/30/2002
		DisposingJudge: Kean, Joyce 