To Do:
- create downloadable csv for each jurisdiction
- deal with duble field entries esp from the simple normalizing

In [1]:
jurs = [
        "CA",
        "MI",
        "UT",
        "ME"
        ]

In [2]:
import pandas as pd
import urllib.request, json 
from urllib.request import urlopen
import re
from bs4 import BeautifulSoup
import PyPDF2
from PyPDF2 import PdfFileWriter

import pikepdf

import os
from os import walk
import os.path
from os import path
import numpy as np
#!pip install py-readability-metrics
#from readability import Readability

import textstat

#!python -m nltk.downloader punkt

import time
from datetime import date

import spacy
from sklearn.preprocessing import normalize

from joblib import dump, load


import networkx as nx

import math

from sklearn.metrics.pairwise import cosine_similarity# define matrix with all zero values

#import nltk
#nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [3]:
nlp = spacy.load('en_core_web_lg')

In [4]:
with open('../data/keys/spot_token.txt', 'r') as file:
    token = file.read().rstrip()

In [5]:
def vectorize(text):
    output = nlp(str(text)).vector   
    return output

def norm(row):
    try:
        matrix = row.reshape(1,-1).astype(np.float64)
        return normalize(matrix, axis=1, norm='l1')[0]
    except Exception as e: 
        print("===================")
        print(row)
        print("-------------------")
        print(e)
        print("===================")
        #return np.NaN

In [6]:
def regex_norm_field(text):
    # Takes an auto-generated form field name and uses
    # regex to convert it into an Assembly Line standard field.
    # See https://suffolklitlab.org/docassemble-AssemblyLine-documentation/docs/label_variables/

    regex_list = [

        # Personal info
        ## Name & Bio
        ["^((My|Full( legal)?) )?Name$","users1_name"],
        ["^(Typed or )?Printed Name\s?\d*$","users1_name"],
        ["^(DOB|Date of Birth|Birthday)$","users1_birthdate"],
        ## Address
        ["^(Street )?Address$","users1_address_line_one"],
        ["^City State Zip$","users1_address_line_two"],
        ["^City$","users1_address_city"],
        ["^State$","users1_address_state"],
        ["^Zip( Code)?$","users1_address_zip"],
        ## Contact
        ["^(Phone|Telephone)$","users1_phone_number"],
        ["^Email( Adress)$","users1_email"],
        
        # Parties
        ["plaintiff","plantiff1_name"],
        ["defendant","defendant1_name"],
        ["petitioners","petitioners1_name"],
        ["respondents","respondents1_name"],

        # Court info
        ["^(Court\s)?Case\s?(No|Number)?\s?A?$","docket_number"],
        ["^File\s?(No|Number)?\s?A?$","docket_number"],

        # Form info
        ["^(Signature|Sign( here)?)\s?\d*$","users1_signature"],
        ["^Date\s?\d*$","signature_date"],
    ]

    for regex in regex_list:
        text = re.sub(regex[0],regex[1],text, flags=re.IGNORECASE)
    return text

included_fields = load('../data/processed/ML/norm_fields/included_fields.joblib') 
jurisdictions = load('../data/processed/ML/norm_fields/jurisdictions.joblib') 
groups = load('../data/processed/ML/norm_fields/groups.joblib') 
clf_field_names = load('../data/processed/ML/norm_fields/clf_field_names.joblib') 

stop_words = set(stopwords.words('english'))

def reformat_field(text,max_length=30):
    # h/t https://towardsdatascience.com/nlp-building-a-summariser-68e0c19e3a93
    
    #print(text)
    
    orig_title = text.lower()
    orig_title = re.sub("[^a-zA-Z]+"," ",orig_title)
    orig_title_words = orig_title.split()
   
    deduped_sentence = []
    for word in orig_title_words:
        if word not in deduped_sentence:
            deduped_sentence.append(word)
            
    filtered_sentence = [w for w in deduped_sentence if not w.lower() in stop_words]   

    filtered_title_words = filtered_sentence
    
    characters = len(' '.join(filtered_title_words))
    
    if characters > 0:

        words = len(filtered_title_words)
        av_word_len = math.ceil(len(' '.join(filtered_title_words))/len(filtered_title_words))
        x_words = math.floor((max_length)/av_word_len)


        sim_mat = np.zeros([len(filtered_title_words),len(filtered_title_words)])# will populate it with cosine_similarity values 
        # for each word compared to other
        for i in range(len(filtered_title_words)):
            for j in range(len(filtered_title_words)):
                if i != j:
                    sim_mat[i][j] = cosine_similarity(nlp(filtered_title_words[i]).vector.reshape(1,300), nlp(filtered_title_words[j]).vector.reshape(1,300))[0,0]

        try:
            nx_graph = nx.from_numpy_array(sim_mat)
            scores = nx.pagerank(nx_graph)# print final values of words
            sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)

            if x_words > len(scores):
                x_words=len(scores)

            i = 0
            new_title = ""
            for x in filtered_title_words:
                #print(scores[i],sorted_scores[x_words][1])
                if scores[i] >= sorted_scores[x_words-1][1]: 
                    if len(new_title)>0: new_title+="_"
                    new_title += x
                i+=1

            return new_title
        except:
            return '_'.join(filtered_title_words)
    else:
        if re.search("^(\d+)$", text):
            return "unknown"
        else:
            return re.sub("\s+","_",text.lower())
    

def normalize_name(jur,group,n,per,last_field,this_field):
    
    # Add hard coded conversions maybe by calling a function
    # if returns 0 then fail over to ML or otherway around poor prob -> check hard-coded

    if this_field not in included_fields:
        this_field = reCase(this_field)

        out_put = regex_norm_field(this_field)
        conf = 1.0

        if out_put==this_field:
            params = []
            for item in jurisdictions:
                if jur== item:
                    params.append(1)
                else:
                    params.append(0)
            for item in groups:
                if group== item:
                    params.append(1)
                else:
                    params.append(0)
            params.append(n)
            params.append(per)
            for vec in norm(vectorize(this_field)):
                params.append(vec)
            #for vec in norm(vectorize(last_field)):
            #    params.append(vec)

            for item in included_fields:
                if last_field==item:
                    params.append(1)
                else:
                    params.append(0)

            pred = clf_field_names.predict([params])
            prob = clf_field_names.predict_proba([params])

            conf = prob[0].tolist()[prob[0].tolist().index(max(prob[0].tolist()))]
            out_put = pred[0]
            
    else:
        out_put = this_field
        conf = 1
            
    if out_put in included_fields:
        if conf >= 0:
            return "*"+out_put,conf #+"| was <i>%s</i> (%.2f conf)"%(this_field,conf) #, conf
        else:
            return reformat_field(this_field),conf #+"| was <i>%s</i> (%.2f conf)"%(this_field,conf) #, conf
    else:
        return reformat_field(this_field),conf #+"| was <i>%s</i> (%.2f conf)"%(this_field,conf) #, conf


In [7]:
def reCase(text):
    output = re.sub("(\w|\d)(_|-)(\w|\d)","\\1 \\3",text.strip())
    output = re.sub("([a-z])([A-Z]|\d)","\\1 \\2",output)
    output = re.sub("(\d)([A-Z]|[a-z])","\\1 \\2",output)
    return output

In [8]:
def read_pdf (file):
    try:
        pdfFile = PyPDF2.PdfFileReader(open(file, "rb"))
        if pdfFile.isEncrypted:
            try:
                pdfFile.decrypt('')
                #print ('File Decrypted (PyPDF2)')
            except:
                #
                #
                # This didn't go so well on my Windows box so I just ran this in the pdf folder's cmd:
                # for %f in (*.*) do copy %f temp.pdf /Y && "C:\Program Files (x86)\qpdf-8.0.2\bin\qpdf.exe" --password="" --decrypt temp.pdf %f
                #
                #
                #
                
                command="cp "+file+" tmp/temp.pdf; qpdf --password='' --decrypt tmp/temp.pdf "+file
                os.system(command)
                #print ('File Decrypted (qpdf)')
                #re-open the decrypted file
                pdfFile = PyPDF2.PdfFileReader(open(file, "rb"))
        text = ""
        for page in pdfFile.pages:
            text = text + " " + page.extractText()
        text = re.sub("(\.|,|;|:|!|\?|\n)","\\1 ",text)
        return text
    except:
        return ""


In [9]:
import signal
from contextlib import contextmanager

class TimeoutException(Exception): pass

@contextmanager
def time_limit(seconds):
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")
    signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)

#try:
#    with time_limit(10):
#        [code here]
#except TimeoutException as e:
#    print("Timed out!")

In [10]:
import requests
import json

def recursive_get_id(values_to_unpack, tmpl=None):
    # thanks Quinten and Bryce for this code ;)
    if not tmpl:
        tmpl = set()
    if isinstance(values_to_unpack, dict):
        tmpl.add(values_to_unpack.get('id'))
        if values_to_unpack.get('children'):
            tmpl.update(recursive_get_id(values_to_unpack.get('children'), tmpl))
        return tmpl
    elif isinstance(values_to_unpack, list):
        for item in values_to_unpack:
            tmpl.update(recursive_get_id(item, tmpl))
        return tmpl
    else:
        return set()
    
def spot(text):

    headers = { "Authorization": "Bearer " + token, "Content-Type":"application/json" }

    body = {
      "text": text,
      "save-text": 0,
      "cutoff-lower": 0.25,
      "cutoff-pred": 0.5,
      "cutoff-upper": 0.6
    }

    r = requests.post('https://spot.suffolklitlab.org/v0/entities-nested/', headers=headers, data=json.dumps(body))
    output_ = r.json()

    #return output_
    try:
        return list(recursive_get_id(output_["labels"]))
    except:
        return []


In [11]:
def parse_form(fileloc,title=None,jur=None,cat=None,normalize=1,rewrite=1):
    f = PyPDF2.PdfFileReader(fileloc)

    if f.isEncrypted:
        pdf = pikepdf.open(fileloc, allow_overwriting_input=True)
        pdf.save(fileloc)
        f = PyPDF2.PdfFileReader(fileloc)
        
    npages = f.getNumPages()
  
    try:
        with time_limit(15):
            ff = f.getFields()
    except TimeoutException as e:
        print("Timed out!")
        ff = None   
    
    if ff:
        fields = list(ff.keys())
    else:
        fields = []
    f_per_page = len(fields)/npages
    text = read_pdf(fileloc)
    
    try:
        #readbility = int(Readability(text).flesch_kincaid().grade_level)
        consensus = textstat.text_standard(text)
        readbility = eval(re.sub("^(\d+)[^0-9]+(\d+)\w*.*","(\\1+\\2)/2",consensus))
    except:
        readbility = None

    if title is None:
        title = reCase(re.search("(.*)\n",text).group(1).strip())

    nmsi = spot(title + ". " +text)      
        
    if normalize==1:
        i = 0 
        length = len(fields)
        last = "null"
        new_fields = []
        new_fields_conf = []
        for field in fields:
            #print(jur,cat,i,i/length,last,field)
            this_field,this_conf = normalize_name(jur,cat,i,i/length,last,field)
            new_fields.append(this_field)
            new_fields_conf.append(this_conf)
            last = field
        
        new_fields = [v + "__" + str(new_fields[:i].count(v) + 1) if new_fields.count(v) > 1 else v for i, v in enumerate(new_fields)]
    else:
        new_fields = fields
    
    stats = {
            "title":title,
            "category":cat,
            "pages":npages,
            "reading grade level": readbility,
            "list":nmsi,
            "avg fields per page": f_per_page,
            "fields":new_fields,
            "fields_conf":new_fields_conf,
            "fields_old":fields
            }    
    
    if rewrite==1:
        try:
            if 1==1:
                my_pdf = pikepdf.Pdf.open(fileloc)
                fields_too = my_pdf.Root.AcroForm.Fields #[0]["/Kids"][0]["/Kids"][0]["/Kids"][0]["/Kids"]

                #print(repr(fields_too))
                k =0
                for field in new_fields:
                    #print(k,field)
                    fields_too[k].T = re.sub("^\*","",field)
                    k+=1

                #f2.T = 'new_hospital_name'
                filename = re.search("\/(\w*\.pdf)$",fileloc).groups()[0]
                #my_pdf.save('../data/processed/forms/%s'%(filename))
                my_pdf.save('%s'%(filename))
            else:
                file = PdfFileWriter()

                first_page = f.getPage(0)

                file.cloneDocumentFromReader(f)
                #file.appendPagesFromReader(f)

                x ={}
                for y in ff:
                    x[y]=""

                #print(x)

                file.updatePageFormFieldValues(first_page,x)

                output = open('blankPdf.pdf', 'wb')
                file.write(output)  
        except:
            error = "could not change form fields"
    
    return stats

In [12]:
#parse_form("../data/processed/www.utcourts.gov/forms/898269a99ff1c65be10b1ae35bb34ba469fc14b7301b7ed7b126d195.pdf",title=None,jur="UT",cat=None,normalize=1)
#parse_form("../data/processed/www.utcourts.gov/forms/2532cd2b6d3aaff8c47726a0abd168fb4e5cdb4977c065cd27bde8c7.pdf",title=None,jur="UT",cat=None,normalize=1)
#parse_form("../data/processed/www.utcourts.gov/forms/6ec7576210513907e699b5adf3397639507c688801a60bc34c201984.pdf",title=None,jur="UT",cat=None,normalize=1)
#parse_form("../data/processed/mjbportal.courts.maine.gov/forms/1519fe450d870a36a428a0b006c0665a.pdf",title=None,jur="UT",cat=None,normalize=1)
#parse_form("../data/processed/www.courts.ca.gov/forms/3979f1c1c9f165ccac026b26cf20252c.pdf",title=None,jur="UT",cat=None,normalize=1)

#parse_form("../data/processed/www.courts.michigan.gov/forms/52b2bf502a4bd8bc3a39a494a0ea5b0f491552e4d2da2ebe82beba3d.pdf",title=None,jur="UT",cat=None,normalize=1)
parse_form("../data/processed/www.courts.michigan.gov/forms/147d1063a642a9f94693331190cc14599152610dc5cd489b5d17e46d.pdf",title=None,jur="UT",cat=None,normalize=1)



{'title': 'APPLICATION1. Onjudgment was entered against the defendant(s) and the plaintiff was awarded',
 'category': None,
 'pages': 2,
 'reading grade level': 14.5,
 'list': ['CO-07-00-00-00',
  'HO-09-00-00-00',
  'CO-00-00-00-00',
  'HO-06-00-00-00',
  'HO-00-00-00-00'],
 'avg fields per page': 13.5,
 'fields': ['state_judicial_district',
  'text__1',
  'text__2',
  'text__3',
  'text__4',
  'defendant_names_addresses',
  '1_on',
  'judgment_defendants_plaintiff',
  'following_described_property__1',
  'following_described_property__2',
  'payment_made_received_except',
  'received_following_conditions',
  'unknown__1',
  'unknown__2',
  'true_knowledge_belief',
  'text__5',
  '*signature_date__1',
  'text__6',
  '*signature_date__2',
  'service_fee',
  'text__7',
  'text__8',
  'text__9',
  'text__10',
  'text__11',
  'total_fee',
  '*docket_number'],
 'fields_conf': [0.57,
  0.6,
  0.66,
  0.65,
  0.62,
  1.0,
  0.67,
  1.0,
  0.61,
  0.65,
  0.68,
  0.67,
  0.57,
  0.61,
  0.65,

In [106]:
parse_form("../data/processed/www.courts.ca.gov/forms/e2c17a8503879d28d12932434d7c755b.pdf",title=None,jur="UT",cat=None,normalize=1,rewrite=0)

{'title': 'Form Adopted for Mandatory Use Judicial Council of California',
 'category': None,
 'pages': 5,
 'reading grade level': 38.5,
 'list': [],
 'avg fields per page': 27.4,
 'fields': ['case_number',
  'atty_bar',
  'name',
  'atty_firm',
  'street',
  'city',
  'state',
  'zip',
  'phone',
  'fax',
  'email',
  'atty',
  'atty_party_info',
  'crt_county',
  'crt_street',
  'crt_mailing_add',
  'crt_city_zip',
  'crt_branch',
  'court_info',
  'party__1',
  'party__2',
  'title_party_name',
  'p_caption',
  'fill_text__1',
  'fill_text__2',
  'item__1',
  'list__1',
  'ch__1',
  'ch__2',
  'lia',
  'ch__3',
  'ch__4',
  'lib',
  'list__2',
  'ch__5',
  'ch__6',
  'ch__7',
  'ch__8',
  'list__3',
  'page__1',
  'px_caption',
  'ch__9',
  'li__1',
  'ch__10',
  'ch__11',
  'sub_li',
  'li__2',
  'sub_lic',
  'lic',
  'ch__12',
  'ch__13',
  'text_field__1',
  'ch__14',
  'ch__15',
  'text_field__2',
  'list__4',
  'check_box__1',
  'item__2',
  'list__5',
  'check_box__2',
  'chec

In [15]:
text = read_pdf("../data/processed/www.courts.ca.gov/forms/e2c17a8503879d28d12932434d7c755b.pdf")
print(text)
consensus = textstat.text_standard(text)
readbility = eval(re.sub("^(\d+)[^0-9]+(\d+)\w*.*","(\\1+\\2)/2",consensus))
readbility

 Form Adopted for Mandatory Use Judicial Council of California 
 UD-101 [Rev.  October 1,  2021]PLAINTIFF'S MANDATORY COVER SHEET AND SUPPLEMENTAL ALLEGATIONSŠUNLAWFUL DETAINERCode of Civil Procedure,  § 1179. 01 et seq. www. courts. ca. govUD-101FOR COURT USE ONLYCASE NUMBER: ATTORNEY OR PARTY WITHOUT ATTORNEYSTATE BAR NUMBER: NAME: FIRM NAME: STREET ADDRESS: CITY: STATE: ZIP CODE: TELEPHONE NO. : FAX NO. : EMAIL ADDRESS: ATTORNEY FOR (name): SUPERIOR COURT OF CALIFORNIA,  COUNTY OFSTREET ADDRESS: MAILING ADDRESS: CITY AND ZIP CODE: BRANCH NAME: PLAINTIFF: DEFENDANT: PLAINTIFF'S MANDATORY COVER SHEET AND  SUPPLEMENTAL ALLEGATIONSŠUNLAWFUL DETAINERAll plaintiffs in unlawful detainer proceedings must file and serve this form.   Filing this form complies with the requirement in Code of Civil Procedure section 1179. 01. 5(c).        Ł Serve this form and any attachments to it with the summons.         Ł If a summons has already been served without this form,  then serve it by mail or any 

38.5

In [28]:
textstat.text_standard("""
corresponding to the amount demanded in the notice underlying the complaint?  
""")

'13th and 14th grade'

In [86]:
stats[0]["/Kids"][0]["/Kids"][0]["/Kids"][0]["/Kids"][0].T

pikepdf.String("Button1[0]")

In [91]:
stats[0]["/Kids"][0]["/Kids"][0]["/Kids"][0].T

pikepdf.String("P1Footer[0]")

In [96]:
stats[0]["/Kids"][0]["/Kids"][1]["/Kids"][0].T

pikepdf.String("FooterButtons[0]")

In [94]:
stats[0]["/Kids"][0]["/Kids"][1].T

pikepdf.String("Page3[0]")

In [68]:
my_pdf = pikepdf.Pdf.open("../data/processed/www.courts.ca.gov/forms/3979f1c1c9f165ccac026b26cf20252c.pdf")
fields_too = my_pdf.Root.AcroForm.Fields#[0]["/Kids"][0]["/Kids"][0]["/Kids"][0]["/Kids"]

In [77]:
fields_too[0]["/Kids"][0]["/Kids"][0]["/Kids"][0]["/Kids"][0].T

pikepdf.String("Button1[0]")

In [13]:
files_df_ut = pd.read_csv("../data/raw/www.utcourts.gov/form_data.csv")
files_df_mi = pd.read_csv("../data/raw/www.courts.michigan.gov/form_data.csv")
files_df_me = pd.read_csv("../data/raw/mjbportal.courts.maine.gov/form_data.csv")
files_df_ca = pd.read_csv("../data/raw/www.courts.ca.gov/form_data.csv")

df = pd.concat([files_df_ut,files_df_mi,files_df_me,files_df_ca],ignore_index=True)
#df = pd.read_csv("../data/raw/www.courts.ca.gov/form_data.csv")
df.head()

Unnamed: 0,id,jurisdiction,source,title,group,url,filename,downloaded
0,04b3a0734774c02edf8eb9056d23954aa38e96c77c3392...,UT,www.utcourts.gov,Community Service Worksheet Third District Juv...,3rd District Juvenile Court: Forms and Pamphlets,https://www.utcourts.gov/courts/juv/juvsites/3...,COMMUNITY%20SERVICE%20WORKSHEET-FRONT%20AND%20...,2021-11-11
1,6e420f1b3575cfd8ef94b71977da9e38252e3395a78439...,UT,www.utcourts.gov,Third District Juvenile Court Work Program Ref...,3rd District Juvenile Court: Forms and Pamphlets,https://www.utcourts.gov/courts/juv/juvsites/3...,Work_Crew_Application-2007.pdf,2021-11-11
2,2532cd2b6d3aaff8c47726a0abd168fb4e5cdb4977c065...,UT,www.utcourts.gov,Utah State District Juvenile Court Probation O...,3rd District Juvenile Court: Forms and Pamphlets,https://www.utcourts.gov/courts/juv/juvsites/3...,Probation%20Order%20revised.050502.pdf,2021-11-11
3,f6a6814890f21c11524d5785d772272916ff95909b7dba...,UT,www.utcourts.gov,Adoptee's Consent to Adoption and Waiver of Ri...,Adopting a Minor Stepchild,https://www.utcourts.gov/howto/family/adoption...,02_Consent_Adoptee.pdf,2021-11-11
4,be656a9a361db7c3532026b0bae372e704beb160fce37e...,UT,www.utcourts.gov,Adoption Agreement,Adopting a Minor Stepchild,https://www.utcourts.gov/howto/family/adoption...,08_Agreement.pdf,2021-11-11


In [14]:
df["pages"] = ""
df["fields"] = ""
df["fields_conf"] = ""
df["fields_old"] = ""
df["f_per_p"] = ""
df["reading"] = ""
df["list"] = ""

#for x in range(100):
#    print("#", end='')
#print("\n")
print(len(df))
i=0
for index,row in df.iterrows():
    if (row["pages"] == "") & (row["jurisdiction"] in jurs):
        print("%s*, "%i, end='')
        try:
            stats = parse_form("../data/processed/"+row["source"]+"/forms/"+row["id"]+".pdf",row["title"],row["jurisdiction"],row["group"],1,0)
            df.at[index, 'pages'] = stats["pages"]
            df.at[index, 'fields'] = stats["fields"]
            df.at[index, 'fields_conf'] = stats["fields_conf"]
            df.at[index, 'fields_old'] = stats["fields_old"]
            df.at[index, 'f_per_p'] = stats["avg fields per page"]
            df.at[index, 'reading'] = stats["reading grade level"]
            df.at[index, 'list'] = stats["list"]
            #print(index)
        except:
            print("error: "+"../data/raw/"+row["source"]+"/forms/"+row["id"]+".pdf")
    else:
        print("%s, "%i, end='')
        
    i+=1

df

3224
0*, 1*, 2*, 3*, 4*, 5*, 6*, 7*, 8*, 9*, 10*, 11*, 12*, 13*, 14*, 15*, 16*, 17*, 18*, 19*, 20*, 21*, 22*, 23*, 24*, 25*, 26*, 27*, 28*, 29*, 30*, 31*, 32*, 33*, 34*, 35*, 36*, 37*, 38*, 39*, 40*, 41*, 42*, 43*, 44*, 45*, 46*, 47*, 48*, 49*, 50*, 51*, 52*, 53*, 54*, 55*, 56*, 57*, 58*, 59*, 60*, 61*, 62*, 63*, 64*, 65*, 66*, 67*, 68*, 69*, 70*, 71*, 72*, 73*, 74*, 75*, 76*, 77*, 78*, 79*, 80*, 81*, 82*, 83*, 84*, 85*, 86*, 87*, 88*, 89*, 90*, 91*, 92*, 93*, 94*, 95*, 96*, 97*, 98*, 99*, 100*, 101*, 102*, 103*, 104*, 105*, 106*, 107*, 108*, 109*, 110*, 111*, 112*, 113*, 114*, 115*, 116*, 117*, 118*, 119*, 120*, 121*, 122*, 123*, 124*, 125*, 126*, 127*, 128*, 129*, 130*, 131*, 132*, 133*, 134*, 135*, 136*, 137*, 138*, 139*, 140*, 141*, 142*, 143*, 144*, 145*, 146*, 147*, 148*, 149*, 150*, 151*, 152*, 153*, 154*, 155*, 156*, 157*, 158*, 159*, 160*, 161*, 162*, 163*, 164*, 165*, 166*, 167*, 168*, 169*, 170*, 171*, 172*, 173*, 174*, 175*, 176*, 177*, 178*, 179*, 180*, 181*, 182*, 183*, 1



700*, 701*, 702*, 703*, 704*, 705*, 706*, 707*, 708*, 709*, 710*, 711*, 712*, 



713*, 714*, 715*, 



716*, 717*, 718*, 719*, 720*, 721*, 722*, 723*, 724*, 725*, 726*, 727*, 728*, 729*, 730*, 731*, 732*, 733*, 734*, 735*, 736*, 737*, 738*, 739*, 740*, 741*, 742*, 



743*, 744*, 745*, 746*, 747*, 748*, 749*, 750*, 751*, 752*, 753*, 754*, 755*, 756*, 757*, 758*, 759*, 760*, 



761*, 762*, 



763*, 764*, 765*, 766*, 767*, 768*, 769*, 770*, 771*, 772*, 773*, 774*, 775*, 776*, 777*, 778*, 779*, 780*, 781*, 782*, 783*, 784*, 785*, 786*, 787*, 788*, 789*, 



790*, 791*, 792*, 793*, 794*, 



795*, 796*, 797*, 798*, 799*, 800*, 801*, 802*, 803*, 



804*, 



805*, 806*, 807*, 808*, 809*, 810*, 811*, 812*, 813*, 814*, 815*, 816*, 817*, 818*, 819*, 820*, 821*, 822*, 823*, 824*, 825*, 826*, 827*, 828*, 829*, 830*, 831*, 832*, 833*, 834*, 835*, 



836*, 837*, 838*, 839*, 840*, 841*, 842*, 843*, 844*, 845*, 846*, 847*, 848*, 849*, 850*, 851*, 852*, 853*, 



854*, 855*, 856*, 857*, 858*, 859*, 860*, 861*, 862*, 863*, 864*, 865*, 866*, 867*, 868*, 869*, 870*, 871*, 872*, 873*, 874*, 875*, 876*, 877*, 878*, 879*, 880*, 881*, 882*, 883*, 884*, 885*, 886*, 887*, 888*, 889*, 890*, 891*, 892*, 893*, 894*, 895*, 896*, 897*, 898*, 899*, 900*, 901*, 902*, 903*, 904*, 905*, 906*, 907*, 908*, 909*, 



910*, 911*, 912*, 



913*, 



914*, 915*, 916*, 917*, 



918*, 919*, 920*, 921*, 



922*, 923*, 924*, 925*, 926*, 



927*, 



928*, 929*, 930*, 931*, 932*, 933*, 934*, 935*, 



936*, 937*, 938*, 



939*, 



940*, 941*, 942*, 943*, 944*, 945*, 946*, 947*, 948*, 949*, 950*, 951*, 952*, 953*, 954*, 955*, 956*, 957*, 



958*, 959*, 960*, 961*, 962*, 



963*, 964*, 965*, 966*, 967*, 968*, 969*, 970*, 971*, 972*, 



973*, 974*, 975*, 976*, 977*, 978*, 979*, 980*, 



981*, 982*, 983*, 984*, 985*, 986*, 987*, 988*, 989*, 990*, 991*, 



992*, 993*, 994*, 995*, 996*, 997*, 998*, 999*, 1000*, 1001*, 1002*, 1003*, 1004*, 1005*, 1006*, 1007*, 1008*, 1009*, 1010*, 1011*, 1012*, 1013*, 1014*, 1015*, 1016*, 1017*, 1018*, 1019*, 1020*, 1021*, 1022*, 1023*, 1024*, 1025*, 1026*, 1027*, 1028*, 1029*, 1030*, 1031*, 1032*, 1033*, 1034*, 1035*, 1036*, 1037*, 1038*, 1039*, 1040*, 1041*, 1042*, 1043*, 1044*, 1045*, 



1046*, 1047*, 1048*, 1049*, 1050*, 1051*, 1052*, 1053*, 1054*, 1055*, 1056*, 1057*, 1058*, 1059*, 1060*, 1061*, 1062*, 1063*, 1064*, 1065*, 1066*, 1067*, 1068*, 1069*, 1070*, 1071*, 1072*, 1073*, 1074*, 1075*, 1076*, 1077*, 1078*, 1079*, 1080*, 1081*, 1082*, 1083*, 1084*, 1085*, 1086*, 1087*, 1088*, 1089*, 1090*, 1091*, 1092*, 1093*, 1094*, 1095*, 1096*, 1097*, 1098*, 1099*, 1100*, 1101*, 1102*, 1103*, 1104*, 1105*, 1106*, 1107*, 1108*, 1109*, 1110*, 1111*, 1112*, 1113*, 1114*, 1115*, 1116*, 1117*, 1118*, 1119*, 1120*, 1121*, 1122*, 1123*, 1124*, 1125*, 1126*, 1127*, 1128*, 1129*, 1130*, 1131*, 1132*, 1133*, 1134*, 1135*, 1136*, 1137*, 1138*, 1139*, 1140*, 1141*, 1142*, 1143*, 1144*, 1145*, 1146*, 1147*, 1148*, 1149*, 1150*, 1151*, 1152*, 1153*, 1154*, 1155*, 1156*, 1157*, 1158*, 1159*, 1160*, 1161*, 1162*, 1163*, 1164*, 1165*, 1166*, 1167*, 1168*, 1169*, 1170*, 1171*, 1172*, 1173*, 1174*, 1175*, 1176*, 



1177*, 1178*, 1179*, 1180*, 



1181*, 1182*, 1183*, 1184*, 1185*, 



1186*, 



1187*, 1188*, 1189*, 1190*, 1191*, 1192*, 1193*, 1194*, 1195*, 1196*, 1197*, 1198*, 1199*, 1200*, 1201*, 1202*, 1203*, 1204*, 1205*, 1206*, 1207*, 1208*, 1209*, 1210*, 1211*, 1212*, 1213*, 1214*, 1215*, 1216*, 1217*, 1218*, 1219*, 1220*, 1221*, 1222*, 1223*, 1224*, 1225*, 1226*, 1227*, 1228*, 1229*, 1230*, 1231*, 1232*, 1233*, 1234*, 1235*, 1236*, 1237*, 1238*, 1239*, 1240*, 1241*, 1242*, 1243*, 1244*, 1245*, 1246*, 1247*, 1248*, 1249*, 1250*, 1251*, 1252*, 1253*, 1254*, 1255*, 1256*, 1257*, 1258*, 1259*, 1260*, 1261*, 1262*, 1263*, 1264*, 1265*, 1266*, 1267*, 1268*, 1269*, 1270*, 1271*, 1272*, 1273*, 1274*, 1275*, 1276*, 1277*, 1278*, 1279*, 1280*, 1281*, 1282*, 



1283*, 1284*, 1285*, 1286*, 1287*, 1288*, 1289*, 1290*, 



1291*, 1292*, 1293*, 1294*, 1295*, 1296*, 1297*, 1298*, 1299*, 1300*, 1301*, 1302*, 1303*, 1304*, 1305*, 1306*, 1307*, 1308*, 1309*, 1310*, 1311*, 1312*, 1313*, 



1314*, 1315*, 1316*, 1317*, 1318*, 1319*, 1320*, 



1321*, 1322*, 1323*, 



1324*, 1325*, 1326*, 1327*, 1328*, 1329*, 1330*, 



1331*, 1332*, 1333*, 1334*, 1335*, 1336*, 1337*, 1338*, 1339*, 1340*, 1341*, 1342*, 1343*, 1344*, 1345*, 1346*, 1347*, 1348*, 1349*, 1350*, 1351*, 1352*, 1353*, 1354*, 1355*, 1356*, 1357*, 1358*, 1359*, 1360*, 1361*, 1362*, 1363*, 1364*, 1365*, 1366*, 1367*, 1368*, 1369*, 1370*, 1371*, 1372*, 1373*, 1374*, 1375*, 1376*, 1377*, 1378*, 1379*, 1380*, 1381*, 1382*, 1383*, 1384*, 1385*, 1386*, 1387*, 1388*, 1389*, 1390*, 1391*, 



1392*, 



1393*, 1394*, 



1395*, 1396*, 



1397*, 1398*, 1399*, 1400*, 1401*, 1402*, 1403*, 1404*, 1405*, 1406*, 1407*, 1408*, 1409*, 1410*, 1411*, 1412*, 1413*, 1414*, 



1415*, 1416*, 1417*, 1418*, 1419*, 1420*, 1421*, 1422*, 1423*, 1424*, 1425*, 1426*, 1427*, 1428*, 1429*, 1430*, 1431*, 1432*, 1433*, 1434*, 1435*, 1436*, 1437*, 1438*, 1439*, 1440*, 1441*, 1442*, 1443*, 1444*, 1445*, 1446*, 1447*, 1448*, 1449*, 1450*, 1451*, 1452*, 1453*, 1454*, 1455*, 1456*, 1457*, 1458*, 1459*, 1460*, 



1461*, 1462*, 1463*, 1464*, 1465*, 1466*, 1467*, 1468*, 



1469*, 1470*, 1471*, 1472*, 1473*, 1474*, 1475*, 1476*, 1477*, 1478*, 1479*, 1480*, 1481*, 1482*, 1483*, 1484*, 1485*, 1486*, 1487*, 1488*, 1489*, 



1490*, 



1491*, 1492*, 1493*, 1494*, 1495*, 1496*, 1497*, 1498*, 1499*, 1500*, 1501*, 1502*, 1503*, 1504*, 1505*, 1506*, 1507*, 1508*, 



1509*, 1510*, 1511*, 1512*, 1513*, 1514*, 1515*, 1516*, 



1517*, 1518*, 1519*, 1520*, 1521*, 1522*, 



1523*, 1524*, 1525*, 1526*, 1527*, 



1528*, 1529*, 1530*, 1531*, 1532*, 1533*, 1534*, 1535*, 1536*, 1537*, 1538*, 1539*, 1540*, 1541*, 1542*, 1543*, 1544*, 1545*, 1546*, 1547*, 1548*, 1549*, 1550*, 1551*, 1552*, 1553*, 1554*, 1555*, 1556*, 1557*, 1558*, 1559*, 1560*, 1561*, 1562*, 1563*, 1564*, 1565*, 1566*, 1567*, 1568*, 1569*, 1570*, 1571*, 



1572*, 1573*, 1574*, 1575*, 1576*, 1577*, 1578*, 1579*, 1580*, 1581*, 1582*, error: ../data/raw/mjbportal.courts.maine.gov/forms/8634548e69040b8d837714f03aacfc40.pdf
1583*, 1584*, 1585*, 1586*, 1587*, 1588*, 1589*, 1590*, 1591*, 1592*, 1593*, 1594*, 1595*, 1596*, 1597*, 



1598*, 1599*, 1600*, 1601*, 1602*, 1603*, 1604*, 1605*, 1606*, 1607*, 1608*, 1609*, 1610*, 1611*, 1612*, 1613*, 1614*, 1615*, 1616*, 1617*, 1618*, 1619*, 1620*, 1621*, 1622*, 1623*, 1624*, 1625*, 1626*, 1627*, 1628*, 1629*, 1630*, 1631*, 1632*, 1633*, 1634*, 1635*, 1636*, 1637*, 1638*, 1639*, 1640*, 1641*, 1642*, 1643*, 1644*, 1645*, 1646*, 1647*, 1648*, 1649*, 1650*, 1651*, 1652*, 1653*, 1654*, 1655*, 1656*, 1657*, 1658*, 1659*, 1660*, 1661*, 1662*, 1663*, 1664*, 1665*, 



1666*, 1667*, 1668*, 1669*, 1670*, 1671*, 1672*, 1673*, 1674*, 1675*, 1676*, 1677*, 1678*, 1679*, 1680*, 1681*, 1682*, 1683*, 1684*, 1685*, 1686*, 1687*, 1688*, 1689*, 1690*, 1691*, 1692*, 1693*, 1694*, 1695*, 1696*, 1697*, 1698*, 1699*, 1700*, 1701*, 1702*, 1703*, 1704*, 1705*, 1706*, 1707*, 1708*, 1709*, 1710*, 1711*, 



1712*, 1713*, 1714*, 1715*, 1716*, 1717*, 1718*, 1719*, 1720*, 1721*, 1722*, 1723*, 1724*, 1725*, 1726*, 1727*, 1728*, 1729*, 1730*, 1731*, 1732*, 



1733*, 1734*, 1735*, 1736*, 1737*, 1738*, 1739*, 1740*, 1741*, 1742*, 1743*, 1744*, 1745*, 1746*, 1747*, 1748*, 1749*, 1750*, 1751*, 1752*, 1753*, Timed out!
1754*, 1755*, 1756*, 1757*, 1758*, 1759*, 1760*, 1761*, 1762*, 1763*, 1764*, 1765*, 1766*, Timed out!
1767*, 1768*, Timed out!
1769*, 1770*, 1771*, 1772*, 1773*, 1774*, error: ../data/raw/www.courts.ca.gov/forms/0779d1041707e4ff0ec111426f11a971.pdf
1775*, 1776*, 1777*, 1778*, 1779*, 1780*, 1781*, 1782*, 1783*, 1784*, 1785*, 1786*, 1787*, 1788*, 1789*, 1790*, 1791*, 1792*, 1793*, 1794*, 1795*, 1796*, 1797*, 1798*, 1799*, 1800*, 1801*, 1802*, 1803*, 1804*, 1805*, 1806*, 1807*, 1808*, 1809*, 1810*, 1811*, 1812*, 1813*, 1814*, 1815*, 1816*, 1817*, 1818*, 1819*, 1820*, 1821*, 1822*, 1823*, Timed out!
1824*, 1825*, Timed out!
1826*, 1827*, 1828*, 1829*, 1830*, 1831*, 1832*, 1833*, 1834*, 1835*, 1836*, 1837*, 1838*, 1839*, 1840*, 1841*, 1842*, 1843*, 1844*, 1845*, 1846*, 1847*, 1848*, 1849*, 1850*, 1851*, 1852*, 1853*, 1854*, 1855*, 1856

2543*, 2544*, 2545*, 2546*, 2547*, 2548*, 2549*, 2550*, 2551*, 2552*, 2553*, 2554*, 2555*, 2556*, 2557*, 2558*, 2559*, 2560*, 2561*, 2562*, 2563*, 2564*, 2565*, 2566*, 2567*, 2568*, 2569*, 2570*, 2571*, 2572*, 2573*, 2574*, 2575*, 2576*, 2577*, 2578*, 2579*, 2580*, 2581*, 2582*, 2583*, 2584*, 2585*, 2586*, 2587*, 2588*, 2589*, 2590*, 2591*, 2592*, 2593*, 2594*, 2595*, 2596*, 2597*, 2598*, 2599*, 2600*, 2601*, 2602*, 2603*, 2604*, 2605*, 2606*, 2607*, 2608*, 2609*, 2610*, 2611*, 2612*, 2613*, 2614*, 2615*, 2616*, 2617*, 2618*, 2619*, 2620*, 2621*, 2622*, 2623*, 2624*, 2625*, 2626*, 2627*, 2628*, 2629*, 2630*, 2631*, 2632*, 2633*, 2634*, 2635*, 2636*, 2637*, 2638*, 2639*, 2640*, 2641*, 2642*, 2643*, 2644*, 2645*, 2646*, 2647*, 2648*, 2649*, 2650*, 2651*, 2652*, 2653*, 2654*, 2655*, 2656*, 2657*, 2658*, 2659*, 2660*, 2661*, 2662*, 2663*, 2664*, 2665*, 2666*, 2667*, 2668*, 2669*, 2670*, 2671*, 2672*, 2673*, 2674*, 2675*, 2676*, 2677*, 2678*, 2679*, 2680*, 2681*, 2682*, 2683*, 2684*, 2685*,

Unnamed: 0,id,jurisdiction,source,title,group,url,filename,downloaded,pages,fields,fields_conf,fields_old,f_per_p,reading,list
0,04b3a0734774c02edf8eb9056d23954aa38e96c77c3392...,UT,www.utcourts.gov,Community Service Worksheet Third District Juv...,3rd District Juvenile Court: Forms and Pamphlets,https://www.utcourts.gov/courts/juv/juvsites/3...,COMMUNITY%20SERVICE%20WORKSHEET-FRONT%20AND%20...,2021-11-11,2,"[name__1, name__2, *docket_number, *users1_add...","[0.61, 0.65, 1.0, 1.0, 1.0, 0.61, 0.53, 0.57, ...","[NAME 1, NAME 2, CASE NUMBER, ADDRESS, DATE OF...",7.5,,[]
1,6e420f1b3575cfd8ef94b71977da9e38252e3395a78439...,UT,www.utcourts.gov,Third District Juvenile Court Work Program Ref...,3rd District Juvenile Court: Forms and Pamphlets,https://www.utcourts.gov/courts/juv/juvsites/3...,Work_Crew_Application-2007.pdf,2021-11-11,2,"[*docket_number, *users1_birthdate, male, juve...","[1.0, 1.0, 0.7, 0.61, 1.0, 1.0, 1.0, 1.0, 0.71...","[Court Case, Date of Birth, Male, Juveniles Na...",19.0,,[]
2,2532cd2b6d3aaff8c47726a0abd168fb4e5cdb4977c065...,UT,www.utcourts.gov,Utah State District Juvenile Court Probation O...,3rd District Juvenile Court: Forms and Pamphlets,https://www.utcourts.gov/courts/juv/juvsites/3...,Probation%20Order%20revised.050502.pdf,2021-11-11,5,"[probation_department_court, special_condition...","[0.73, 0.65, 0.7, 0.54, 0.68, 0.67, 0.62, 0.59...",[of the probation department or by order of th...,4.8,20.5,[]
3,f6a6814890f21c11524d5785d772272916ff95909b7dba...,UT,www.utcourts.gov,Adoptee's Consent to Adoption and Waiver of Ri...,Adopting a Minor Stepchild,https://www.utcourts.gov/howto/family/adoption...,02_Consent_Adoptee.pdf,2021-11-11,3,"[*users1_name, *users1_address_line_one, *user...","[1.0, 1.0, 1.0, 1.0, 0.61, 0.7, 0.67, 1.0, 0.6...","[Name, Address, City State Zip, Phone, Email, ...",14.666667,12.5,[]
4,be656a9a361db7c3532026b0bae372e704beb160fce37e...,UT,www.utcourts.gov,Adoption Agreement,Adopting a Minor Stepchild,https://www.utcourts.gov/howto/family/adoption...,08_Agreement.pdf,2021-11-11,1,"[*users1_name, *users1_address_line_one, *user...","[1.0, 1.0, 1.0, 1.0, 0.61, 0.7, 0.67, 1.0, 0.6...","[Name, Address, City State Zip, Phone, Email, ...",19.0,18.5,[GO-00-00-00-00]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3219,6efc9088604befed5fb59fc42aeee0f5,CA,www.courts.ca.gov,Notice of Hearing to Renew Restraining Order,WV,https://www.courts.ca.gov/documents/wv710.pdf,wv710.pdf,2021-11-17,2,"[court_info, case_number, right_caption, full_...","[0.69, 0.56, 0.64, 0.55, 0.55, 0.69, 0.66, 0.6...","[CourtInfo[0], CaseNumber[0], RightCaption[0],...",19.0,12.5,"[CO-07-00-00-00, CO-00-00-00-00]"
3220,740afc14f63fc6bf594ee344126ddb54,CA,www.courts.ca.gov,Response to Request to Renew Restraining Order,WV,https://www.courts.ca.gov/documents/wv720.pdf,wv720.pdf,2021-11-17,2,"[court_info, case_number, right_caption, full_...","[0.69, 0.56, 0.64, 0.55, 0.55, 0.69, 0.66, 0.6...","[CourtInfo[0], CaseNumber[0], RightCaption[0],...",22.0,16.5,"[CO-07-00-00-00, CO-00-00-00-00]"
3221,c3642c835a4c4c0a167fdcc4758b68d9,CA,www.courts.ca.gov,Order Renewing Workplace Violence Restraining ...,WV,https://www.courts.ca.gov/documents/wv730.pdf,wv730.pdf,2021-11-17,1,"[court_info, case_number, right_caption, full_...","[0.69, 0.56, 0.64, 0.55, 0.55, 0.69, 0.66, 0.6...","[CourtInfo[0], CaseNumber[0], RightCaption[0],...",50.0,13.5,[CO-00-00-00-00]
3222,e2917df3dd9a497daf3a29ca49782672,CA,www.courts.ca.gov,"Proof of Firearms Turned In, Sold, or Stored",WV,https://www.courts.ca.gov/documents/wv800.pdf,wv800.pdf,2021-11-17,2,"[turned_agent_ft, turned_date_dt, turned_time_...","[0.68, 0.68, 0.8, 0.68, 0.69, 0.78, 0.68, 0.7,...","[TurnedInAgent_ft[0], TurnedInDate_dt[0], Turn...",32.0,18.5,[]


In [15]:
#df1 = pd.read_csv("../data/processed/form_data.csv")
#df1

In [16]:
#df2 = df[(df["jurisdiction"]=="CA") & (df["pages"]!="")]
#df2

In [17]:
#dfall = pd.concat([df1,df2],ignore_index=True)
#dfall

In [18]:
#dfall.to_csv("../data/processed/form_data.csv", index=False, encoding="utf-8")    
df.to_csv("../data/processed/form_data.csv", index=False, encoding="utf-8")    

In [None]:
#os.system('cp ../data/processed/www.utcourts.gov/forms/* ../data/processed/forms/')
#os.system('cp ../data/processed/www.courts.michigan.gov/forms/* ../data/processed/forms/')