In [1]:
import pandas as pd
import urllib.request, json 
from urllib.request import urlopen
import re
from bs4 import BeautifulSoup
import PyPDF2
import os
from os import walk
import os.path
from os import path
import numpy as np
#!pip install py-readability-metrics
from readability import Readability
#!python -m nltk.downloader punkt

import time
from datetime import date

import spacy
from sklearn.preprocessing import normalize

from joblib import dump, load

In [2]:
nlp = spacy.load('en_core_web_lg')

In [3]:
def vectorize(text):
    output = nlp(str(text)).vector   
    return output

def norm(row):
    try:
        matrix = row.reshape(1,-1).astype(np.float64)
        return normalize(matrix, axis=1, norm='l1')[0]
    except Exception as e: 
        print("===================")
        print(row)
        print("-------------------")
        print(e)
        print("===================")
        #return np.NaN

In [33]:
included_fields = load('../data/processed/ML/norm_fields/included_fields.joblib') 
jurisdictions = load('../data/processed/ML/norm_fields/jurisdictions.joblib') 
groups = load('../data/processed/ML/norm_fields/groups.joblib') 
clf_field_names = load('../data/processed/ML/norm_fields/clf_field_names.joblib') 

def normalize_name(jur,group,n,per,last_field,this_field):
    
    # Add hard coded conversions maybe by calling a function
    # if returns 0 then fail over to ML or otherway around poor prob -> check hard-coded

    if this_field not in included_fields:
        this_field = reCase(this_field)

        out_put = re.sub("^(My\s)?Name$","users1_name",this_field, flags=re.IGNORECASE)
        out_put = re.sub("^Printed Name\s?\d*$","users1_name",out_put, flags=re.IGNORECASE)  

        out_put = re.sub("^Address$","users1_address_line_one",out_put, flags=re.IGNORECASE)
        out_put = re.sub("^City State Zip$","users1_address_line_two",out_put, flags=re.IGNORECASE)
        out_put = re.sub("^Phone$","users1_phone_number",out_put, flags=re.IGNORECASE)
        out_put = re.sub("^Email$","users1_email",out_put, flags=re.IGNORECASE)

        out_put = re.sub("^DOB$","users1_birthdate",out_put, flags=re.IGNORECASE)

        out_put = re.sub("^Case (No\s?A?|Number)$","docket_number",out_put, flags=re.IGNORECASE)
        out_put = re.sub("^Date\s?\d*$","signature_date",out_put, flags=re.IGNORECASE)

        conf = 1.0

        if out_put==this_field:
            params = []
            for item in jurisdictions:
                if jur== item:
                    params.append(1)
                else:
                    params.append(0)
            for item in groups:
                if group== item:
                    params.append(1)
                else:
                    params.append(0)
            params.append(n)
            params.append(per)
            for vec in norm(vectorize(this_field)):
                params.append(vec)
            #for vec in norm(vectorize(last_field)):
            #    params.append(vec)

            for item in included_fields:
                if last_field==item:
                    params.append(1)
                else:
                    params.append(0)

            pred = clf_field_names.predict([params])
            prob = clf_field_names.predict_proba([params])

            conf = prob[0].tolist()[prob[0].tolist().index(max(prob[0].tolist()))]
            out_put = pred[0]
            
    else:
        out_put = this_field
        conf = 1
            
    if out_put in included_fields:
        if conf >= 0:
            return "*"+out_put+" (%s, %s)"%(conf,this_field) #, conf
        else:
            return this_field+" (%s, %s)"%(out_put,conf)
    else:
        return this_field+" (%s, %s)"%(out_put,conf)


In [25]:
def reCase(text):
    output = re.sub("(\w|\d)(_|-)(\w|\d)","\\1 \\3",text.strip())
    output = re.sub("([a-z])([A-Z]|\d)","\\1 \\2",output)
    output = re.sub("(\d)([A-Z]|[a-z])","\\1 \\2",output)
    return output

In [26]:
def read_pdf (file):
    try:
        pdfFile = PyPDF2.PdfFileReader(open(file, "rb"))
        if pdfFile.isEncrypted:
            try:
                pdfFile.decrypt('')
                #print ('File Decrypted (PyPDF2)')
            except:
                #
                #
                # This didn't go so well on my Windows box so I just ran this in the pdf folder's cmd:
                # for %f in (*.*) do copy %f temp.pdf /Y && "C:\Program Files (x86)\qpdf-8.0.2\bin\qpdf.exe" --password="" --decrypt temp.pdf %f
                #
                #
                #
                
                command="cp "+file+" tmp/temp.pdf; qpdf --password='' --decrypt tmp/temp.pdf "+file
                os.system(command)
                #print ('File Decrypted (qpdf)')
                #re-open the decrypted file
                pdfFile = PyPDF2.PdfFileReader(open(file, "rb"))
        text = ""
        for page in pdfFile.pages:
            text = text + " " + page.extractText()
        return text
    except:
        return ""


In [27]:
def parse_form(fileloc,title=None,jur=None,cat=None,normalize=1):
    f = PyPDF2.PdfFileReader(fileloc)
    npages = f.getNumPages()
    ff = f.getFields()
    if ff:
        fields = list(ff.keys())
    else:
        fields = []
    f_per_page = len(fields)/npages
    text = read_pdf(fileloc)
    try:
        readbility = int(Readability(text).flesch_kincaid().grade_level)
    except:
        readbility = None
    
    if title is None:
        title = reCase(re.search("(.*)\n",text).group(1).strip())
        
    if normalize==1:
        i = 0 
        length = len(fields)
        last = "null"
        new_fields = []
        for field in fields:
            #print(jur,cat,i,i/length,last,field)
            new_fields.append(normalize_name(jur,cat,i,i/length,last,field))
            last = field
    else:
        new_fields = fields
    
    stats = {
            "title":title,
            "category":cat,
            "pages":npages,
            "reading grade level": readbility,
            "avg fields per page": f_per_page,
            "fields":new_fields
            }    
    return stats

In [28]:
parse_form("../data/processed/www.utcourts.gov/forms/b8dc2bf7b60946f2bbbf45cc3967f78f.pdf",title=None,jur="UT",cat=None,normalize=1)

{'title': 'Notice of Right to Object',
 'category': None,
 'pages': 2,
 'reading grade level': 10,
 'avg fields per page': 15.0,
 'fields': ['*users1_name (1.0, Name)',
  '*users1_address_line_one (1.0, Address)',
  '*users1_address_line_two (1.0, City State Zip)',
  '*users1_phone_number (1.0, Phone)',
  '*users1_email (1.0, Email)',
  'Attorney for the Guardian Conservator and my Utah Bar number is (_nonstandard_, 0.9999421107549978)',
  'undefined (_nonstandard_, 0.9999232446025053)',
  '*docket_number (1.0, Case Number)',
  'a minor (_nonstandard_, 0.9052558432983975)',
  'Judge (_nonstandard_, 0.9999162805800589)',
  'Name and Address (_nonstandard_, 0.9986338747035546)',
  '*signature_date (1.0, Date)',
  'Signature (_nonstandard_, 0.9999492319215736)',
  '*users1_name (1.0, Printed Name)',
  'Notice of Right to Object (_nonstandard_, 0.999958920838529)',
  'Persons Name Row 1 (_nonstandard_, 0.8911365269685986)',
  'Served at this Address    Mail     Hand Delivery    Efiled    E

In [29]:
files_df_ut = pd.read_csv("../data/raw/www.utcourts.gov/form_data.csv")
files_df_mi = pd.read_csv("../data/raw/www.courts.michigan.gov/form_data.csv")

df = pd.concat([files_df_ut,files_df_mi],ignore_index=True)

In [30]:
df["pages"] = ""
df["fields"] = ""
df["f_per_p"] = ""
df["reading"] = ""

for index,row in df.iterrows():
    if row["pages"] == "":
        try:
            stats = parse_form("../data/processed/"+row["source"]+"/forms/"+row["id"]+".pdf",row["title"],row["jurisdiction"],row["group"],1)
            df.at[index, 'pages'] = stats["pages"]
            df.at[index, 'fields'] = stats["fields"]
            df.at[index, 'f_per_p'] = stats["avg fields per page"]
            df.at[index, 'reading'] = stats["reading grade level"]
            #print(index)
        except:
            print("error: "+"../data/raw/"+row["source"]+"/forms/"+row["id"]+".pdf")
        
df



Unnamed: 0,id,jurisdiction,source,title,group,url,filename,downloaded,pages,fields,f_per_p,reading
0,8c0acf55825b4add9983d712091f37e9,UT,www.utcourts.gov,Community Service Worksheet Third District Juv...,3rd District Juvenile Court: Forms and Pamphlets,https://www.utcourts.gov/courts/juv/juvsites/3...,COMMUNITY%20SERVICE%20WORKSHEET-FRONT%20AND%20...,2021-11-08,2,"[NAME 1 (_nonstandard_, 0.9999160153710195), N...",7.5,10
1,ecd85cba539c44e3b46b26ecc6ad676a,UT,www.utcourts.gov,Third District Juvenile Court Work Program Ref...,3rd District Juvenile Court: Forms and Pamphlets,https://www.utcourts.gov/courts/juv/juvsites/3...,Work_Crew_Application-2007.pdf,2021-11-08,2,"[Court Case (_nonstandard_, 0.932392068274583)...",19.0,11
2,dc740a9058834b14aa5ec135f397f4e4,UT,www.utcourts.gov,Utah State District Juvenile Court Probation O...,3rd District Juvenile Court: Forms and Pamphlets,https://www.utcourts.gov/courts/juv/juvsites/3...,Probation%20Order%20revised.050502.pdf,2021-11-08,5,[of the probation department or by order of th...,4.8,15
3,56118b9bfab54487ac73b6d7493c35f2,UT,www.utcourts.gov,Adoptee's Consent to Adoption and Waiver of Ri...,Adopting a Minor Stepchild,https://www.utcourts.gov/howto/family/adoption...,02_Consent_Adoptee.pdf,2021-11-08,3,"[*users1_name (1.0, Name), *users1_address_lin...",14.666667,10
4,1f66a584995e4045a06b3931330aa389,UT,www.utcourts.gov,Adoption Agreement,Adopting a Minor Stepchild,https://www.utcourts.gov/howto/family/adoption...,08_Agreement.pdf,2021-11-08,1,"[*users1_name (1.0, Name), *users1_address_lin...",19.0,11
...,...,...,...,...,...,...,...,...,...,...,...,...
1523,94be7e4a467c46c5b8f18a81df4c3371,MI,www.courts.michigan.gov,Claim of Appeal of Right/Request to Withdraw A...,civil infraction,https://www.courts.michigan.gov/4a2ca8/siteass...,cia05.pdf,2021-11-03,1,"[disno (_nonstandard_, 0.9999987058713511), ca...",40.0,20
1524,9d46a79a162844ebb78fc592873c027b,MI,www.courts.michigan.gov,"Default Judgment, Civil Infraction",civil infraction,https://www.courts.michigan.gov/4a7328/siteass...,cia07.pdf,2021-11-03,1,[],0.0,14
1525,6d97d3ca4103413b8e161013c26943d6,MI,www.courts.michigan.gov,"Judgment, Civil Infraction",civil infraction,https://www.courts.michigan.gov/4a7336/siteass...,cia02.pdf,2021-11-03,2,[STATE OF MICHIGAN JUDICIAL DISTRICT COUNTY (_...,20.5,9
1526,480c76eb519d4242a7f44722bb7fbea6,MI,www.courts.michigan.gov,Motion to Set Aside Default Judgment and Order...,civil infraction,https://www.courts.michigan.gov/4a2cb4/siteass...,cia04.pdf,2021-11-03,1,"[disno (_nonstandard_, 0.9999987058713511), ca...",22.0,17


In [34]:
df.to_csv("../data/processed/form_data.csv", index=False, encoding="utf-8")    

In [35]:
os.system('cp ../data/processed/form_data.csv docs/forms/form_data.csv')
os.system('cp ../data/processed/www.utcourts.gov/forms/* docs/forms/')
os.system('cp ../data/processed/www.courts.michigan.gov/forms/* docs/forms/')

0