In [4]:
import os
import re
import spacy
import PyPDF2
import pikepdf
import textstat
import requests
import json
import numpy as np
from numpy import unique
from numpy import where
from sklearn.cluster import AffinityPropagation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from joblib import load
try:
    from nltk.corpus import stopwords
except:
    print("Downloading stopwords")
    nltk.download('stopwords')
    from nltk.corpus import stopwords
import math
import signal
from contextlib import contextmanager
import threading
import _thread

In [5]:
stop_words = set(stopwords.words('english'))

try:
    nlp = spacy.load('en_core_web_lg') # this takes a while to loadimport os
except:
    print("Downloading word2vec model en_core_web_lg")
    import subprocess
    bashCommand = "python -m spacy download en_core_web_lg"
    process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
    output, error = process.communicate()
    nlp = spacy.load('en_core_web_lg') # this takes a while to loadimport os

Load local variables, models, and API key(s).

In [6]:
# load local stuff
#########################
#      Start Test
#########################

included_fields = load('../formfyxer/data/included_fields.joblib')
jurisdictions = load('../formfyxer/data/jurisdictions.joblib')
groups = load('../formfyxer/data/groups.joblib')
clf_field_names = load('../formfyxer/data/clf_field_names.joblib')
with open('../../keys/spot_token.txt', 'r') as file:
    spot_token = file.read().rstrip()
    
print(included_fields,jurisdictions,groups)
    
#########################
#       End Test
#########################

['users1_name', 'users1_birthdate', 'users1_address_line_one', 'users1_address_line_two', 'users1_address_city', 'users1_address_state', 'users1_address_zip', 'users1_phone_number', 'users1_email', 'plantiffs1_name', 'defendants1_name', 'petitioners1_name', 'respondents1_name', 'docket_number', 'trial_court_county', 'users1_signature', 'signature_date'] ['state'] ['state']


This creates a timeout exception that can be triggered when something hangs too long. 

In [7]:
class TimeoutException(Exception): pass
@contextmanager
def time_limit(seconds):
    timer = threading.Timer(seconds, lambda: _thread.interrupt_main())
    timer.start()
    try:
        yield
    except KeyboardInterrupt:
        raise TimeoutException("Timed out.")
    finally:
        # if the action ends in specified time, timer is canceled
        timer.cancel()
#    def signal_handler(signum, frame):
#        raise TimeoutException("Timed out!")
#    signal.signal(signal.SIGALRM, signal_handler)
#    signal.alarm(seconds)
#    try:
#        yield
#    finally:
#        signal.alarm(0)

In [8]:
#########################
#      Start Test
#########################

import time
try:
    with time_limit(1):
        time.sleep(3)
except TimeoutException as e:
    print("Timed out!")
    
#########################
#       End Test
#########################

Timed out!


Pull ID values out of the LIST/NSMI results from Spot.

In [9]:
def recursive_get_id(values_to_unpack, tmpl=None):
    # h/t to Quinten and Bryce for this code ;)
    if not tmpl:
        tmpl = set()
    if isinstance(values_to_unpack, dict):
        tmpl.add(values_to_unpack.get('id'))
        if values_to_unpack.get('children'):
            tmpl.update(recursive_get_id(values_to_unpack.get('children'), tmpl))
        return tmpl
    elif isinstance(values_to_unpack, list):
        for item in values_to_unpack:
            tmpl.update(recursive_get_id(item, tmpl))
        return tmpl
    else:
        return set()

In [10]:
#########################
#      Start Test
#########################

spot_output = {  'build': 9,
                 'query-id': '0dd2c6502bd64c76ae70b18d1c33029f',
                 'text': 'My landlord is kicking me out of my home!',
                 'save-text': 0,
                 'cutoff-lower': 0.25,
                 'cutoff-pred': 0.5,
                 'cutoff-upper': 0.6,
                 'labels': [{  
                               'id': 'HO-00-00-00-00',
                               'name': 'Housing',
                               'lower': 0.6576830054321086,
                               'pred': 0.6982554666277648,
                               'upper': 0.7171144999635295,
                               'children': [{
                                             'id': 'HO-06-00-00-00',
                                             'name': 'Renting or leasing a home',
                                             'lower': 0.6705320866392293,
                                             'pred': 0.8859675570562203,
                                             'upper': 0.9113575931804385
                                            }]
                            }]
               }
recursive_get_id(spot_output["labels"])

#########################
#       End Test
#########################

{'HO-00-00-00-00', 'HO-06-00-00-00'}

Call the Spot API, but return only the IDs of issues found in the text.

In [11]:
def spot(text,lower=0.25,pred=0.5,upper=0.6,verbose=0):

    headers = { "Authorization": "Bearer " + spot_token, "Content-Type":"application/json" }

    body = {
      "text": text,
      "save-text": 0,
      "cutoff-lower": lower,
      "cutoff-pred": pred,
      "cutoff-upper": upper
    }

    r = requests.post('https://spot.suffolklitlab.org/v0/entities-nested/', headers=headers, data=json.dumps(body))
    output_ = r.json()
    
    try:
        output_["build"]
        if verbose!=1:
            try:
                return list(recursive_get_id(output_["labels"]))
            except:
                return []
        else:
            return output_
    except:
        return output_

In [12]:
#########################
#      Start Test
#########################

spot("My landlord is kicking me out of my home!",verbose=1)

#########################
#       End Test
#########################

{'build': 9,
 'query-id': '53a58bd5b5ad4ccfb683eb66386444ac',
 'text': 'My landlord is kicking me out of my home!',
 'save-text': 0,
 'cutoff-lower': 0.25,
 'cutoff-pred': 0.5,
 'cutoff-upper': 0.6,
 'labels': [{'id': 'HO-00-00-00-00',
   'name': 'Housing',
   'lower': 0.6576830054321086,
   'pred': 0.6982554666277648,
   'upper': 0.7171144999635295,
   'children': [{'id': 'HO-06-00-00-00',
     'name': 'Renting or leasing a home',
     'lower': 0.6705320866392293,
     'pred': 0.8859675570562203,
     'upper': 0.9113575931804385}]}]}

A function to pull words out of snake_case, camelCase and the like.

In [13]:
def reCase(text):
    output = re.sub("(\w|\d)(_|-)(\w|\d)","\\1 \\3",text.strip())
    output = re.sub("([a-z])([A-Z]|\d)","\\1 \\2",output)
    output = re.sub("(\d)([A-Z]|[a-z])","\\1 \\2",output)
    output = re.sub("([A-Z]|[a-z])(\d)","\\1 \\2",output)
    return output

In [14]:
#########################
#      Start Test
#########################

reCase("Deal with snake_case, camelCase, and similarly-formated text.")

#########################
#       End Test
#########################

'Deal with snake case, camel Case, and similarly formated text.'

Takes text from an auto-generated field name and uses regex to convert it into an Assembly Line standard field.
See https://suffolklitlab.org/docassemble-AssemblyLine-documentation/docs/label_variables/

In [15]:
def regex_norm_field(text):
    
    regex_list = [

        # Personal info
        ## Name & Bio
        ["^((My|Your|Full( legal)?) )?Name$","users1_name"],
        ["^(Typed or )?Printed Name\s?\d*$","users1_name"],
        ["^(DOB|Date of Birth|Birthday)$","users1_birthdate"],
        ## Address
        ["^(Street )?Address$","users1_address_line_one"],
        ["^City State Zip$","users1_address_line_two"],
        ["^City$","users1_address_city"],
        ["^State$","users1_address_state"],
        ["^Zip( Code)?$","users1_address_zip"],
        ## Contact
        ["^(Phone|Telephone)$","users1_phone_number"],
        ["^Email( Adress)$","users1_email"],

        # Parties
        ["^plaintiff\(?s?\)?$","plantiff1_name"],
        ["^defendant\(?s?\)?$","defendant1_name"],
        ["^petitioner\(?s?\)?$","petitioners1_name"],
        ["^respondent\(?s?\)?$","respondents1_name"],

        # Court info
        ["^(Court\s)?Case\s?(No|Number)?\s?A?$","docket_number"],
        ["^File\s?(No|Number)?\s?A?$","docket_number"],

        # Form info
        ["^(Signature|Sign( here)?)\s?\d*$","users1_signature"],
        ["^Date\s?\d*$","signature_date"],
    ]

    for regex in regex_list:
        text = re.sub(regex[0],regex[1],text, flags=re.IGNORECASE)
    return text

In [16]:
#########################
#      Start Test
#########################

regex_norm_field("Name")

#########################
#       End Test
#########################

'users1_name'

Tranforms a string of text into a snake_case variable close in length to `max_length` name by summarizing the string and stiching the summary together in snake_case. h/t h/t https://towardsdatascience.com/nlp-building-a-summariser-68e0c19e3a93

In [17]:
def reformat_field(text,max_length=30):
    orig_title = text.lower()
    orig_title = re.sub("[^a-zA-Z]+"," ",orig_title)
    orig_title_words = orig_title.split()

    deduped_sentence = []
    for word in orig_title_words:
        if word not in deduped_sentence:
            deduped_sentence.append(word)

    filtered_sentence = [w for w in deduped_sentence if not w.lower() in stop_words]

    filtered_title_words = filtered_sentence

    characters = len(' '.join(filtered_title_words))
    
    if characters > 0:

        words = len(filtered_title_words)
        av_word_len = math.ceil(len(' '.join(filtered_title_words))/len(filtered_title_words))
        x_words = math.floor((max_length)/av_word_len)

        sim_mat = np.zeros([len(filtered_title_words),len(filtered_title_words)])
        # for each word compared to other
        for i in range(len(filtered_title_words)):
            for j in range(len(filtered_title_words)):
                if i != j:
                    sim_mat[i][j] = cosine_similarity(nlp(filtered_title_words[i]).vector.reshape(1,300), nlp(filtered_title_words[j]).vector.reshape(1,300))[0,0]

        try:
            nx_graph = nx.from_numpy_array(sim_mat)
            scores = nx.pagerank(nx_graph)
            sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)

            if x_words > len(scores):
                x_words=len(scores)

            i = 0
            new_title = ""
            for x in filtered_title_words:
                if scores[i] >= sorted_scores[x_words-1][1]:
                    if len(new_title)>0: new_title+="_"
                    new_title += x
                i+=1

            return new_title
        except:
            return '_'.join(filtered_title_words)
    else:
        if re.search("^(\d+)$", text):
            return "unknown"
        else:
            return re.sub("\s+","_",text.lower())

In [18]:
#########################
#      Start Test
#########################

reformat_field("this is a name field where you fill out your name")

#########################
#       End Test
#########################

'name_field_fill'

Normalize a word vector.

In [19]:
def norm(row):
    try:
        matrix = row.reshape(1,-1).astype(np.float64)
        return normalize(matrix, axis=1, norm='l1')[0]
    except Exception as e:
        print("===================")
        print("Error: ",e)
        print("===================")
        return np.NaN

In [20]:
#########################
#      Start Test
#########################

word_vector = np.array([-2.13013589e-01,  3.12421650e-01, -1.20467708e-01, -3.12428959e-02,
        3.62497084e-02,  1.28887519e-01,  5.64192720e-02, -2.78512329e-01,
       -3.36407930e-01,  1.55933702e+00, -1.72489524e-01, -1.16480077e-02,
       -2.68756121e-01, -2.70458981e-02, -3.18187058e-01, -2.45410472e-01,
        5.21429814e-02,  1.54589176e+00, -1.12968512e-01,  1.06057107e-01,
        8.91231745e-02,  3.39188278e-02, -4.11080495e-02, -4.37215306e-02,
       -7.70686269e-02, -4.02998850e-02, -2.89326757e-01, -3.38935503e-03,
       -1.47982361e-02, -1.34107858e-01, -1.00483648e-01,  1.41670063e-01,
       -1.06036467e-02, -1.25555053e-01,  2.97597766e-01,  3.82642411e-02,
        7.61021525e-02,  1.06348701e-01, -1.63999036e-01, -2.03580678e-01,
       -8.27748924e-02,  2.44145423e-01, -2.77349427e-02, -1.78780317e-01,
        1.18556768e-01,  1.16618676e-02, -3.60679418e-01, -3.39524031e-01,
       -6.04647063e-02,  3.09849262e-01, -4.76841219e-02,  1.36718765e-01,
        1.21821702e-01, -9.25804079e-02,  8.70111734e-02, -1.52639061e-01,
       -7.90559500e-02,  9.54141170e-02,  1.68904290e-01,  3.97630036e-02,
       -2.11293235e-01,  1.45039514e-01,  3.15100588e-02,  2.40374088e-01,
        7.17499405e-02, -1.18390419e-01, -1.81074649e-01, -3.58754098e-02,
        2.25285348e-02,  2.77734697e-02,  3.02323312e-01, -9.28211138e-02,
        7.65662342e-02,  1.88735843e-01, -1.68899714e-03,  2.14816511e-01,
        8.44987035e-02, -6.52168840e-02,  5.20729385e-02,  7.60016739e-02,
        9.39117000e-02,  1.09185288e-02, -3.11423540e-02, -5.33087254e-02,
       -6.13311753e-02, -4.95994017e-02,  5.93046546e-01,  1.85197070e-01,
        2.77876616e-01,  6.86455891e-02,  2.50347108e-02, -4.20601144e-02,
       -8.79584923e-02, -7.20472410e-02,  1.62315980e-01, -1.38627421e-02,
        1.91538319e-01,  1.21159710e-01, -1.06149234e-01,  9.70576610e-03,
       -3.58194076e-02,  6.16732985e-02, -1.64352193e-01,  1.01822123e-01,
       -2.37698719e-01, -8.40747774e-01,  5.41184768e-02,  2.35891771e-02,
        9.67000872e-02, -8.21658745e-02,  7.28224739e-02,  1.89202391e-02,
        4.26500067e-02, -3.47012818e-01,  6.10405281e-02,  2.06719086e-01,
        1.27913013e-01, -1.03432693e-01,  1.77932382e-02,  1.33496165e-01,
       -3.47520933e-02,  9.39188059e-03,  4.44273576e-02,  8.23991820e-02,
        2.70191506e-02,  4.90782270e-03,  3.91958794e-03, -1.88633695e-01,
       -4.46445867e-02,  4.89131846e-02, -1.09635480e-01, -6.73763920e-04,
       -6.61776261e-03, -2.47771200e-02, -5.22734001e-02, -1.63720530e-02,
       -6.74265251e-02, -1.09030060e-01,  1.85679402e-02, -2.34510377e-01,
       -1.79144228e+00,  2.23494042e-02, -5.80148846e-02,  5.55704013e-02,
       -1.16520695e-01,  2.05797702e-02, -7.28819296e-02, -3.87611636e-03,
        2.59244412e-01, -1.23371825e-01, -8.16638917e-02, -2.48691179e-02,
       -1.02678597e-01,  1.45087942e-01,  1.00950181e-01, -5.60914055e-02,
       -1.03730531e-02, -9.74975824e-02,  1.15013108e-01,  1.27673715e-01,
        1.10676512e-01,  1.40324058e-02, -4.92751062e-01,  1.73023537e-01,
       -2.87811807e-03, -3.77760604e-02,  8.23729411e-02, -1.60091534e-01,
        3.33402939e-02,  1.51509978e-02, -3.67459655e-02, -4.89723310e-02,
        7.74993524e-02,  4.90004718e-02, -2.54121333e-01,  2.84605585e-02,
        6.42405301e-02,  7.97412395e-02,  3.11562061e-01, -1.66010365e-01,
       -5.73812351e-02,  1.46178296e-02,  8.05270206e-03, -1.81335919e-02,
       -7.72526562e-02,  3.38562243e-02,  1.64881200e-02, -6.56182319e-02,
        6.28871769e-02,  9.94776487e-02, -1.25716060e-01,  6.70209378e-02,
       -1.79496646e-01,  4.09843512e-02, -9.67781842e-02,  1.44944191e-01,
       -1.56434000e-01, -1.21532470e-01,  8.18147659e-02, -1.49101183e-01,
       -2.74047069e-02, -6.11314848e-02,  1.78608179e-01, -9.15855095e-02,
        2.79406458e-01,  1.27912417e-01, -4.35665883e-02, -1.05978232e-02,
        1.81453362e-01,  1.07644172e-02, -8.09091777e-02,  1.60250574e-01,
        4.79899859e-03, -3.13255899e-02,  2.76214033e-01,  2.60181010e-01,
       -2.59297676e-02,  2.29535148e-01, -1.49327949e-01, -2.23262887e-02,
        8.88084620e-02,  5.86275943e-02,  8.89510661e-02,  6.69021392e-03,
        7.57751837e-02,  1.94170550e-02, -2.98427671e-01,  1.51288763e-01,
       -1.29115418e-01,  1.92619577e-01,  7.42087066e-02,  2.51058280e-03,
        2.58868188e-02,  1.03090875e-01, -1.14060774e-01, -2.13483363e-01,
       -3.89549397e-02, -8.34967047e-02, -1.63730040e-01,  2.38725409e-01,
        2.16264129e-01,  2.45557595e-02,  1.08124174e-01, -5.15342280e-02,
        1.58610597e-01, -1.34693861e-01, -3.76047045e-02, -3.42159599e-01,
       -1.18749730e-01,  1.06453717e-01,  3.35164189e-01, -2.83597976e-01,
       -1.43357873e-01, -5.87592982e-02, -1.30963936e-01,  2.30296150e-01,
        1.45184398e-01,  9.03599337e-03,  1.93905517e-01,  1.09368414e-01,
        1.44885898e-01,  1.38844848e-01, -1.26822963e-01,  1.93229869e-01,
        4.46362421e-02,  3.87822315e-02, -9.51401070e-02, -8.87758583e-02,
       -4.60669361e-02,  3.68877321e-01,  2.39974946e-01, -3.79055925e-02,
        9.47600007e-02, -2.42426455e-01, -2.48907149e-01,  7.64537752e-02,
        7.63775334e-02,  1.27537757e-01, -9.76211056e-02,  1.27716690e-01,
        1.07440069e-01,  1.70593366e-01, -1.14196517e-01, -1.49709731e-01,
        1.88920572e-02, -2.33909085e-01,  7.79872984e-02,  4.88006091e-03,
       -1.52774289e-01, -1.95967734e-01,  1.88715328e-02, -1.20045125e-01,
       -8.56644586e-02, -2.28846353e-02,  9.67323482e-02,  4.18395996e-02,
       -1.66458189e-01, -2.40994707e-01, -8.85597616e-02,  1.80236936e-01])

norm(word_vector)

#########################
#       End Test
#########################

array([-5.50101741e-03,  8.06820327e-03, -3.11104546e-03, -8.06839202e-04,
        9.36138759e-04,  3.32848476e-03,  1.45701220e-03, -7.19250437e-03,
       -8.68764236e-03,  4.02694501e-02, -4.45449456e-03, -3.00806598e-04,
       -6.94055298e-03, -6.98452887e-04, -8.21709334e-03, -6.33765801e-03,
        1.34657817e-03,  3.99222299e-02, -2.91738078e-03,  2.73889565e-03,
        2.30158149e-03,  8.75944408e-04, -1.06160408e-03, -1.12909651e-03,
       -1.99027611e-03, -1.04073346e-03, -7.47178400e-03, -8.75291624e-05,
       -3.82160381e-04, -3.46329858e-03, -2.59496260e-03,  3.65859045e-03,
       -2.73836262e-04, -3.24242474e-03,  7.68538054e-03,  9.88163513e-04,
        1.96531718e-03,  2.74642598e-03, -4.23523004e-03, -5.25741507e-03,
       -2.13763885e-03,  6.30498847e-03, -7.16247274e-04, -4.61695256e-03,
        3.06169596e-03,  3.01164526e-04, -9.31444687e-03, -8.76811481e-03,
       -1.56148443e-03,  8.00177205e-03, -1.23142935e-03,  3.53072454e-03,
        3.14601198e-03, -

Vectorize a string of text. 

In [21]:
def vectorize(text, normalize=1):
    output = nlp(str(text)).vector
    if normalize==1:
        return norm(output)
    else:
        return output

In [22]:
#########################
#      Start Test
#########################

vectorize("how much wood would a wood chuck chuck, if a wood chuck could chuck wood?")

#########################
#       End Test
#########################

array([-5.50101742e-03,  8.06820326e-03, -3.11104545e-03, -8.06839201e-04,
        9.36138759e-04,  3.32848477e-03,  1.45701220e-03, -7.19250437e-03,
       -8.68764235e-03,  4.02694501e-02, -4.45449456e-03, -3.00806598e-04,
       -6.94055299e-03, -6.98452888e-04, -8.21709334e-03, -6.33765801e-03,
        1.34657817e-03,  3.99222300e-02, -2.91738078e-03,  2.73889566e-03,
        2.30158149e-03,  8.75944407e-04, -1.06160408e-03, -1.12909651e-03,
       -1.99027611e-03, -1.04073346e-03, -7.47178401e-03, -8.75291623e-05,
       -3.82160381e-04, -3.46329858e-03, -2.59496261e-03,  3.65859045e-03,
       -2.73836264e-04, -3.24242475e-03,  7.68538053e-03,  9.88163512e-04,
        1.96531718e-03,  2.74642598e-03, -4.23523004e-03, -5.25741506e-03,
       -2.13763885e-03,  6.30498847e-03, -7.16247275e-04, -4.61695257e-03,
        3.06169595e-03,  3.01164527e-04, -9.31444686e-03, -8.76811480e-03,
       -1.56148443e-03,  8.00177206e-03, -1.23142935e-03,  3.53072454e-03,
        3.14601197e-03, -

Given an auto-generated field name and context from the form where it appeared, this function attempts to normalize the field name. Here's what's going on:
1. It will `reCase` the variable text
2. Then it will run the output through `regex_norm_field`
3. If it doesn't find anything, it will use the ML model `clf_field_names`
4. If the prediction isn't very confident, it will run it through `reformat_field`  

In [23]:
def normalize_name(jur,group,n,per,last_field,this_field):

    # Add hard coded conversions maybe by calling a function
    # if returns 0 then fail over to ML or otherway around poor prob -> check hard-coded

    if this_field not in included_fields:
        this_field = reCase(this_field)

        out_put = regex_norm_field(this_field)
        conf = 1.0

        if out_put==this_field:
            params = []
            for item in jurisdictions:
                if jur== item:
                    params.append(1)
                else:
                    params.append(0)
            for item in groups:
                if group== item:
                    params.append(1)
                else:
                    params.append(0)
            params.append(n)
            params.append(per)
            for vec in vectorize(this_field):
                params.append(vec)

            for item in included_fields:
                if last_field==item:
                    params.append(1)
                else:
                    params.append(0)

            pred = clf_field_names.predict([params])
            prob = clf_field_names.predict_proba([params])

            conf = prob[0].tolist()[prob[0].tolist().index(max(prob[0].tolist()))]
            out_put = pred[0]

    else:
        out_put = this_field
        conf = 1

    if out_put in included_fields:
        if conf >= 0:
            return "*"+out_put,conf # this * is a hack to show when something is in the list of known fields later. I need to fix this
        else:
            return reformat_field(this_field),conf
    else:
        return reformat_field(this_field),conf

In [24]:
#########################
#      Start Test
#########################

normalize_name("UT",None,2,0.3,"null","Name thing")

#########################
#       End Test
#########################

('name_thing', 0.38)

Take a list of AL variables and spits out suggested groupings. Here's what's going on:

1. It reads in a list of fields (e.g., `["user_name","user_address"]`)
2. Splits each field into words (e.g., turning `user_name` into `user name`)
3. It then turns these ngrams/"sentences" into vectors using word2vec. 
4. For the collection of fields, it finds clusters of these "sentences" within the semantic space defined by word2vec. Currently it uses Affinity Propagation. See https://machinelearningmastery.com/clustering-algorithms-with-python/

In [25]:
def cluster_screens(fields=[],damping=0.7):
    # Takes in a list (fields) and returns a suggested screen grouping
    # Set damping to value >= 0.5 or < 1 to tune how related screens should be

    vec_mat = np.zeros([len(fields),300])
    for i in range(len(fields)):
        vec_mat[i] = [nlp(reCase(fields[i])).vector][0]

    # create model
    model = AffinityPropagation(damping=damping)
    #model = AffinityPropagation(damping=damping,random_state=4) consider using this to get consitent results. note will have to requier newer version
    # fit the model
    model.fit(vec_mat)
    # assign a cluster to each example
    yhat = model.predict(vec_mat)
    # retrieve unique clusters
    clusters = unique(yhat)

    screens = {}
    #sim = np.zeros([5,300])
    i=0
    for cluster in clusters:
        this_screen = where(yhat == cluster)[0]
        vars = []
        j=0
        for screen in this_screen:
            #sim[screen]=vec_mat[screen] # use this spot to add up vectors for compare to list
            vars.append(fields[screen])
            j+=1
        screens["screen_%s"%i]=vars
        i+=1

    return screens

In [26]:
#########################
#      Start Test
#########################

fields= [
        "users1_name",
        "users1_birthdate",
        "users1_address_line_one",
        "users1_address_line_two",
        "users1_address_city",
        "users1_address_state",
        "users1_address_zip",
        "users1_phone_number",
        "users1_email",
        "plantiffs1_name",
        "defendants1_name",
        "petitioners1_name",
        "respondents1_name",
        "docket_number",
        "trial_court_county",
        "users1_signature",
        "signature_date"
        ]

cluster_screens(fields,damping=0.7)

#########################
#       End Test
#########################

{'screen_0': ['users1_name',
  'users1_birthdate',
  'users1_address_line_one',
  'users1_address_line_two',
  'users1_address_city',
  'users1_address_state',
  'users1_address_zip',
  'users1_phone_number',
  'users1_email',
  'users1_signature'],
 'screen_1': ['plantiffs1_name',
  'defendants1_name',
  'petitioners1_name',
  'respondents1_name'],
 'screen_2': ['docket_number'],
 'screen_3': ['trial_court_county'],
 'screen_4': ['signature_date']}

In [27]:
#########################
#      Start Test
#########################

vec_mat = np.zeros([len(fields),300])
for i in range(len(fields)):
    vec_mat[i] = [nlp(reCase(fields[i])).vector][0]

parts = np.zeros([5,300])

for row in vec_mat:
    sim = []
    for part in parts:
        sim.append(cosine_similarity(vec_mat[0].reshape(1, -1),row.reshape(1, -1))[0][0])
    print(sim)   

#########################
#       End Test
#########################

[1.0000000000000002, 1.0000000000000002, 1.0000000000000002, 1.0000000000000002, 1.0000000000000002]
[0.8038686975124008, 0.8038686975124008, 0.8038686975124008, 0.8038686975124008, 0.8038686975124008]
[0.8481956135581707, 0.8481956135581707, 0.8481956135581707, 0.8481956135581707, 0.8481956135581707]
[0.826921329803093, 0.826921329803093, 0.826921329803093, 0.826921329803093, 0.826921329803093]
[0.8197875020998185, 0.8197875020998185, 0.8197875020998185, 0.8197875020998185, 0.8197875020998185]
[0.8059375150874605, 0.8059375150874605, 0.8059375150874605, 0.8059375150874605, 0.8059375150874605]
[0.7847333680991196, 0.7847333680991196, 0.7847333680991196, 0.7847333680991196, 0.7847333680991196]
[0.8077588614146448, 0.8077588614146448, 0.8077588614146448, 0.8077588614146448, 0.8077588614146448]
[0.8619738791744108, 0.8619738791744108, 0.8619738791744108, 0.8619738791744108, 0.8619738791744108]
[0.5654755902680005, 0.5654755902680005, 0.5654755902680005, 0.5654755902680005, 0.5654755902680

Get the text content of a pdf.

In [28]:
def read_pdf (file):
    try:
        pdfFile = PyPDF2.PdfFileReader(open(file, "rb"))
        if pdfFile.isEncrypted:
            try:
                pdfFile.decrypt('')
                #print ('File Decrypted (PyPDF2)')
            except:
                #
                #
                # This didn't go so well on my Windows box so I just ran this in the pdf folder's cmd:
                # for %f in (*.*) do copy %f temp.pdf /Y && "C:\Program Files (x86)\qpdf-8.0.2\bin\qpdf.exe" --password="" --decrypt temp.pdf %f
                #
                #
                #
                
                command="cp "+file+" tmp/temp.pdf; qpdf --password='' --decrypt tmp/temp.pdf "+file
                os.system(command)
                #print ('File Decrypted (qpdf)')
                #re-open the decrypted file
                pdfFile = PyPDF2.PdfFileReader(open(file, "rb"))
        text = ""
        for page in pdfFile.pages:
            text = text + " " + page.extractText()
        text = reCase(text)
        text = re.sub("(\.|,|;|:|!|\?|\n|\]|\))","\\1 ",text)
        text = re.sub("(\(|\[)"," \\1",text)
        text = re.sub(" +"," ",text)
        return text
    except:
        return ""

In [29]:
#########################
#      Start Test
#########################

read_pdf("ML_training/auto/3902bb0b832b4fa4b20e7635201017aa.pdf")

#########################
#       End Test
#########################

"Request\n \n for Order \n to Examine Respondent\n \n Approved Board of District Court Judges \n September 12, 2012\n \n Revised \n December 19\n , 201\n 9\n \n Page \n 1\n \n of \n 3\n \n \n \n \n Name\n \n \n \n \n Address\n \n \n \n \n City, State, Zip\n \n \n \n \n Phone\n \n \n \n Check your email. \n You will receive information and \n documents at this email address. \n \n \n Email\n \n \n \n I am \n \n [ ] Petitioner\n \n [ ] Respondent\n \n [ ] \n Interested Person\n \n [ ] Petitioner\n \n ttorney\n \n [ ] Respondent\n \n \n \n [ ] \n Interested Person's Attorney\n \n (Utah Bar #: \n _ _\n _ __\n _ _\n ) \n \n In the\n \n District \n Court of Utah\n \n _ __ __ __ Judicial District _ __ __ __ __ __ County\n \n Court Address _ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ _\n \n In t\n he Matter of Protection for\n \n _ __ __ __ __ __ __ __ __ __ __ ___, \n \n Respondent\n \n Request\n \n for Order \n to Examine \n Respondent\n \n _ __ __ __ __ __ __ __ __ __ __\n \n Case N

Read in a pdf, pull out basic stats, attempt to normalize its form fields, and re-write the file with the new fields (if `rewrite=1`). 

In [30]:
def parse_form(fileloc,title=None,jur=None,cat=None,normalize=1,use_spot=0,rewrite=0):
    f = PyPDF2.PdfFileReader(fileloc)

    if f.isEncrypted:
        pdf = pikepdf.open(fileloc, allow_overwriting_input=True)
        pdf.save(fileloc)
        f = PyPDF2.PdfFileReader(fileloc)
        
    npages = f.getNumPages()
  
    # When reading some pdfs, this can hang due to their crazy field structure
    try:
        with time_limit(15):
            ff = f.getFields()
    except TimeoutException as e:
        print("Timed out!")
        ff = None   
    
    if ff:
        fields = list(ff.keys())
    else:
        fields = []
    f_per_page = len(fields)/npages
    text = read_pdf(fileloc)
    
    if title is None:
        matches = re.search("(.*)\n",text)
        if matches:
            title = reCase(matches.group(1).strip())
        else:
            title = "(Untitled)"        

    try:
        #readbility = int(Readability(text).flesch_kincaid().grade_level)
        text = re.sub("_"," ",text)
        text = re.sub("\n",". ",text)
        text = re.sub(" +"," ",text)
        if text!= "":
            consensus = textstat.text_standard(text)
            readbility = eval(re.sub("^(\d+)[^0-9]+(\d+)\w*.*","(\\1+\\2)/2",consensus))
        else:
            readbility = None
    except:
        readbility = None

    if use_spot==1:
        nmsi = spot(title + ". " +text)      
    else:
        nmsi = []
        
    if normalize==1:
        i = 0 
        length = len(fields)
        last = "null"
        new_fields = []
        new_fields_conf = []
        for field in fields:
            #print(jur,cat,i,i/length,last,field)
            this_field,this_conf = normalize_name(jur,cat,i,i/length,last,field)
            new_fields.append(this_field)
            new_fields_conf.append(this_conf)
            last = field
        
        new_fields = [v + "__" + str(new_fields[:i].count(v) + 1) if new_fields.count(v) > 1 else v for i, v in enumerate(new_fields)]
    else:
        new_fields = fields
        new_fields_conf = []
    
    stats = {
            "title":title,
            "category":cat,
            "pages":npages,
            "reading grade level": readbility,
            "list":nmsi,
            "avg fields per page": f_per_page,
            "fields":new_fields,
            "fields_conf":new_fields_conf,
            "fields_old":fields,
            "text":text
            }    
    
    if rewrite==1:
        try:
            if 1==1:
                my_pdf = pikepdf.Pdf.open(fileloc, allow_overwriting_input=True)
                fields_too = my_pdf.Root.AcroForm.Fields #[0]["/Kids"][0]["/Kids"][0]["/Kids"][0]["/Kids"]
                #print(repr(fields_too))
                
                k =0
                for field in new_fields:
                    #print(k,field)
                    fields_too[k].T = re.sub("^\*","",field)
                    k+=1

                #f2.T = 'new_hospital_name'
                #filename = re.search("\/(\w*\.pdf)$",fileloc).groups()[0]
                #my_pdf.save('/%s'%(filename))
                my_pdf.save(fileloc)
            else:
                file = PdfFileWriter()

                first_page = f.getPage(0)

                file.cloneDocumentFromReader(f)
                #file.appendPagesFromReader(f)

                x ={}
                for y in ff:
                    x[y]=""

                #print(x)

                file.updatePageFormFieldValues(first_page,x)

                output = open('blankPdf.pdf', 'wb')
                file.write(output)  
        except:
            error = "could not change form fields"
    
    return stats

In [31]:
#########################
#      Start Test
#########################

text = read_pdf("ML_training/auto/095b9dc651ce47eb8b62e0790974970f.pdf")
text = re.sub("_"," ",text)
text = re.sub("\s",". ",text)
text = re.sub(" +"," ",text)
print(text)
print(text!="")
textstat.text_standard(text)

#########################
#       End Test
#########################

FOC. 78. . . (3/11). . . OBJECTION. TO. PROPOSED. ORDERMCR. 2.. 602. (B). Approved,. SCAOSTATE. OF. MICHIGANCASE. NO.. . . JUDICIAL. CIRCUITCOUNTYCourt. address. . Court. telephone. no.. . . Original. -. Court. 1. st. copy. -. Moving. party. . 2. nd. copy. -. Objecting. party. 3. rd. copy. -. Friend. of. the. court. 4. th. copy. -. Proof. of. service. . 5. th. copy. -. Proof. of. service. Plaintiff's. name,. address,. and. telephone. no.. moving. party. . Defendant's. name,. address,. and. telephone. no.. moving. party. . v. OBJECTION. TO. PROPOSED. ORDERI. received. a. notice. to. enter. a. proposed. order. without. a. hearing. dated. . I. object. to. the. entry. of. the. proposed. order. and. request. a. hearing. by. the. court.. My. objection. is. based. on. the. following. reason. (. . s). :. CERTIFICATE. OF. MAILINGI. certify. that. on. this. date. I. served. a. copy. of. this. objection. on. the. parties. or. their. attorneys. by. first. class. mail. addressed. to. theirlast. kno

'8th and 9th grade'

In [32]:
#########################
#      Start Test
#########################

#parse_form("../data/processed/www.utcourts.gov/forms/898269a99ff1c65be10b1ae35bb34ba469fc14b7301b7ed7b126d195.pdf",title=None,jur="UT",cat=None,normalize=1)
#parse_form("../data/processed/www.utcourts.gov/forms/2532cd2b6d3aaff8c47726a0abd168fb4e5cdb4977c065cd27bde8c7.pdf",title=None,jur="UT",cat=None,normalize=1)
#parse_form("../data/processed/www.utcourts.gov/forms/6ec7576210513907e699b5adf3397639507c688801a60bc34c201984.pdf",title=None,jur="UT",cat=None,normalize=1)
#parse_form("../data/processed/mjbportal.courts.maine.gov/forms/1519fe450d870a36a428a0b006c0665a.pdf",title=None,jur="UT",cat=None,normalize=1)
#parse_form("../data/processed/www.courts.ca.gov/forms/3979f1c1c9f165ccac026b26cf20252c.pdf",title=None,jur="UT",cat=None,normalize=1)
#parse_form("../data/processed/www.courts.michigan.gov/forms/52b2bf502a4bd8bc3a39a494a0ea5b0f491552e4d2da2ebe82beba3d.pdf",title=None,jur="UT",cat=None,normalize=1)

#parse_form("../data/processed/www.utcourts.gov/forms/d94720b568d800e2510fbc04955687282a7e7419b78565d3e52c461c.pdf",title=None,jur="MI",cat=None,normalize=1,use_spot=1,rewrite=0)
#parse_form("../data/processed/www.courts.michigan.gov/forms/147d1063a642a9f94693331190cc14599152610dc5cd489b5d17e46d.pdf",title=None,jur="MI",cat=None,normalize=1,use_spot=1,rewrite=0)
#parse_form("../data/processed/www.courts.ca.gov/forms/e2c17a8503879d28d12932434d7c755b.pdf",title=None,jur="CA",cat=None,normalize=1,use_spot=1,rewrite=0)

#parse_form("../data/processed/www.courts.ca.gov/forms/0d795fb4c4e35655370b5a6defa6b5cb.pdf",title=None,jur="CA",cat=None,normalize=1,use_spot=1,rewrite=0)

parse_form("ML_training/auto/095b9dc651ce47eb8b62e0790974970f.pdf",title=None,jur="UT",cat=None,normalize=1,use_spot=1,rewrite=0)

#my_pdf = pikepdf.Pdf.open("../data/processed/www.courts.ca.gov/forms/0d795fb4c4e35655370b5a6defa6b5cb.pdf", allow_overwriting_input=True)
#fields_too = my_pdf.Root.AcroForm.Fields #[0]["/Kids"][0]["/Kids"][0]["/Kids"][0]["/Kids"]
#print(repr(fields_too))


#########################
#       End Test
#########################

{'title': 'FOC 78',
 'category': None,
 'pages': 1,
 'reading grade level': 10.5,
 'list': [],
 'avg fields per page': 18.0,
 'fields': ['moving_party__1',
  'moving_party__2',
  'moving_party__3',
  '*signature_date__1',
  'name_type_print',
  '*signature_date__2',
  '*docket_number',
  'judge',
  'circuit',
  'county',
  '*users1_address_line_one',
  'telno',
  'plaintiffs_name_address_telephone',
  'defendants_name_address_telephone',
  'third_partys_name_address_telephone',
  'dated',
  'reasons',
  'form_instructions'],
 'fields_conf': [0.53,
  0.56,
  0.55,
  1.0,
  0.62,
  1.0,
  1.0,
  0.68,
  0.58,
  0.64,
  1.0,
  0.97,
  0.52,
  0.55,
  0.48,
  0.55,
  0.53,
  0.61],
 'fields_old': ['moving party',
  'moving party_2',
  'moving party_3',
  'Date',
  'Name type or print',
  'Date_2',
  'CASE NO',
  'judge',
  'circuit',
  'county',
  'address',
  'telno',
  'Plaintiffs name address and telephone',
  'Defendants name address and telephone',
  'Third partys name address and tel

In [74]:
def form_complexity(text,fields,reading_lv):
    
    # check for fields that requier user to look up info, when found add to complexity
    # maybe score these by minutes to recall/fill out
    # so, figure out words per minute, mix in with readability and page number and field numbers
    
    return 0

In [33]:
#########################
#      Start Test
#########################

# Save this notebook, then run this cell.

import nbformat
from nbconvert import PythonExporter
from datetime import date
today = date.today().strftime("%Y-%m-%d")

with open('functions.ipynb') as fh:
    nb = nbformat.reads(fh.read(), nbformat.NO_CONVERT)

exporter = PythonExporter()
source, meta = exporter.from_notebook_node(nb)

with open('../formfyxer/lit_explorer.py', 'w+') as fh:
    fh.writelines(source)

local_load ="""
included_fields = load(os.path.join(os.path.dirname(__file__), 'data', 'included_fields.joblib'))
jurisdictions = load(os.path.join(os.path.dirname(__file__), 'data', 'jurisdictions.joblib'))
groups = load(os.path.join(os.path.dirname(__file__), 'data', 'groups.joblib'))
clf_field_names = load(os.path.join(os.path.dirname(__file__), 'data', 'clf_field_names.joblib'))
with open(os.path.join(os.path.dirname(__file__), 'keys', 'spot_token.txt'), 'r') as file:
    spot_token = file.read().rstrip()
"""

with open("../formfyxer/lit_explorer.py", "r") as file:
    content = file.read() # read everything in the file
    content = re.sub("#!/usr/bin/env python\n","",content,flags=re.M)
    content = re.sub("# coding: utf-8\n","",content,flags=re.M)
    content = re.sub("# load local stuff\n",local_load,content,flags=re.M)
    content = re.sub("(?<=#{25}\n#\s{6}Start Test\n#{25}\n)(^(?!.*#{25}).*$\n)*(?=#{25}\n#\s{7}End Test\n#{25}\n)","",content,flags=re.M)
    content = re.sub("#{25}\n#\s{6}Start Test\n#{25}\n|#{25}\n#\s{7}End Test\n#{25}\n|#\sIn\[(\d*|\s*)\]:\n","",content,flags=re.M)
    content = re.sub("\n\n\n\n+","\n\n",content,flags=re.M)
    content = re.sub("^\n+","",content)
    
with open("../formfyxer/lit_explorer.py", "w") as file:
    file.write("# Updated on "+today+"\n\n"+content)
    
with open("../setup.py", "r") as file:
    content = file.read()
    version = re.findall("version='(\d+\.\d+\.\d+)'",content)[0]
    
print("Current version: %s"%version)


Current version: 0.0.1


In [34]:
new_v = input("Enter new version number or leave blank to keep.\n")
if len(re.findall("(\d+\.\d+\.\d+)",new_v))>0:
    
    with open("../setup.py", "w") as file:
        content = re.sub("version='(\d+\.\d+\.\d+)'","version='%s'"%new_v,content)
        file.write(content)

Enter new version number or leave blank to keep.
0.0.2


In [35]:
#########################
#       End Test
#########################