# PLSE ETL - Part 2: Scraping PDFs to DB
Note: Using Sample Data Set (count=199)

In [48]:
from PyPDF2 import PdfFileReader

In [49]:
from os import listdir
from os.path import isfile, join

In [50]:
import re

In [51]:
path_to_dockets = '../tmp/sample_dockets'

In [52]:
dockets = [f for f in listdir(path_to_dockets) if isfile(join(path_to_dockets, f))]

In [53]:
def compiled_pattern(pattern):
    return re.compile(r'{}'.format(pattern))

In [54]:
dn = '(?<=Docket\sNumber\:\s)\w{2}\-\d{2}\-\w{2}\-\d{7}\-\d{4}'
dn_t = ['DOCKETDocket Number: MC-51-CR-0129461-1989Court Case']

In [55]:
fd = '(?<=Date\sFiled\:\s{2})\d{2}\/\d{2}\/\d{4}'
fd_t = ['1989Date Filed:  01/25/1989Judge']

In [56]:
ja = '(?<=Judge\sAssigned\n\:\s{2})\w+\,\s\w+(\s\w\.)?(?=Initiation\sDate\:)'
ja_t = ['01/25/1989Judge Assigned\n:  Kafrissen, Arthur S.Initiation Date:', '01/25/1989Judge Assigned\n:  Kafrissen, ArthurInitiation Date:']

In [57]:
fml = '(?<=Pennsylvaniav\.)(\w|\s)+(?=CASE)'
fml_t = ['Commonwealth of Pennsylvaniav.Carl R BurchCASE INFORMATION', 'Pennsylvaniav.Raymond BlizzardCASE', 'Pennsylvaniav.Shakina Thompson\nCASE INFORMATION']

In [None]:
otn = '(?<=OTN\:\s{2})\w\s\d{6}\-\d+(?=Originating)'
otn_t = ['2004 OTN:  N 247812-5Originating',
        '1985 OTN:  Originating', # expected to "fail"
        '2001 OTN:  N 069615-0Originating',
        '1985 OTN:  M 226837-2Originating']

In [58]:
gen_regex = lambda x: re.compile(r'%s' % x)

In [72]:
def gen_keys():
    r = {
        'DocketNumber': (gen_regex(dn), None),
        'FiledDate': (gen_regex(fd), None),
        'DisposingJudge': (gen_regex(ja), None),
        'FullName': (gen_regex(fml), None), # needs to be split FirstName LastName MiddleName
        'OffenseTrackingNumber': (gen_regex(otn), None)
    }
    
    return r

In [None]:
result_stats = {
    # 
}

In [77]:
def scrape_pdf(filename):
    path = '{}/{}'.format(path_to_dockets, filename)
    data_keys = gen_keys()
    
    with open(path, 'rb') as f:
        pdf = PdfFileReader(f)
        #info = pdf.getDocumentInfo()
        page_count = pdf.getNumPages()
        
        print('File Path: {}'.format(f.name))

        for p in range(page_count):
            print('\tPage', p)
            
            page_data = pdf.getPage(p).extractText()
            # @note may have to replace `\n` with `\s`
            print(page_data)
            
            for k in data_keys:
                d = data_keys
                
                if d[k][1] is None:
                    m = re.search(d[k][0], page_data)

                    if m and m.group():
                        print('\t\t{}: {}'.format(k, m.group()))
                        d[k] = (d[k][0], m.group())
            

In [89]:
f = scrape_pdf(dockets[43])

File Path: ../tmp/sample_dockets/MC-51-SU-0000543-1991.pdf
	Page 0
MUNICIPAL COURT OF PHILADELPHIA COUNTY  
DOCKETDocket Number: MC-51-SU-0000543-1991Non-Traffic
SUMMARY DOCKET
Page 1 of 2Commonwealth of Pennsylvaniav.Kevin JonesCASE INFORMATION
Date Filed:  02/03/1991Judge Assigned
:  Silberstein, Alan K.Initiation Date: 02/03/1991 OTN:  Originating Docket No:  LOTN:  Initial Issuing Authority
:  Final Issuing Authority
:  Arresting Agency
:  Philadelphia PdArresting Officer
:  Affiant
Complaint/Incident #:  910300356101Case Local Number Type
(s)Case Local Number(s)9103003561District Control Number910300356101Legacy Docket NumberSTATUS INFORMATION
Case Status:ClosedArrest Date:02/03/1991Processing StatusStatus Date08/19/1993Completed02/03/1991Complaint Date:DEFENDANT INFORMATION
Date Of Birth:12/26/1962City/State/Zip:  PHILA, PA  
19134Alias NameBristow, LesterJones, GeraldThompson, JerryCASE PARTICIPANTS
NameParticipant Type
DefendantBristow, LusterCHARGESSeq.Statute DescriptionGrade

In [76]:
for docket in dockets:
    try:
        scrape_pdf(docket)
    except:
      print("Something else went wrong") 

File Path: ../tmp/sample_dockets/MC-51-CR-0307801-2003.pdf
	Page 0
		DocketNumber: MC-51-CR-0307801-2003
		FiledDate: 03/07/2003
		FullName: Timothy N Moore

		OffenseTrackingNumber: N 196760-4
	Page 1
	Page 2
	Page 3
	Page 4
File Path: ../tmp/sample_dockets/MC-51-CR-0108981-1996.pdf
	Page 0
		DocketNumber: MC-51-CR-0108981-1996
		FiledDate: 01/12/1996
		DisposingJudge: Krase, Morton
		FullName: Sean Terrell

		OffenseTrackingNumber: M 685610-2
	Page 1
	Page 2
File Path: ../tmp/sample_dockets/MC-51-CR-0224291-2002.pdf
	Page 0
		DocketNumber: MC-51-CR-0224291-2002
		FiledDate: 02/20/2002
		DisposingJudge: Anderson, Linda F.
		FullName: Mary Hydock
		OffenseTrackingNumber: N 130155-4
	Page 1
File Path: ../tmp/sample_dockets/CP-51-CR-0100651-2003.pdf
	Page 0
		DocketNumber: CP-51-CR-0100651-2003
		FiledDate: 01/10/2003
		FullName: Raymond Blizzard
		OffenseTrackingNumber: N 185749-4
	Page 1
	Page 2
	Page 3
	Page 4
	Page 5
	Page 6
File Path: ../tmp/sample_dockets/MC-51-CR-1205031-1994.pdf
