In [1]:
# import necessary modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#import pickle to dump extracted features 
import pickle

#import spacy
import spacy
from spacy.matcher import PhraseMatcher #import PhraseMatcher class

In [2]:
# Lood large English model 
nlp = spacy.load('en_core_web_lg')

In [3]:
# create the PhraseMatcher object
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

In [112]:
# read file and create nlp object
with open('./Data/Input/Legal_doc.txt') as f:
    doc = nlp(f.read())

In [113]:
#read terms as list
with open('./Data/Input/Legal_Terms.txt') as f:
    terms = f.readlines()
    
terms = [t.rstrip() for t in terms]

In [115]:
terms

['Identity of the parties',
 'Purpose of the agreement',
 'Contractual terms',
 'Underlying assumptions',
 'Warranties and disclaimers',
 'Liquidated damages',
 'Liability limitations',
 'Confidentiality provision',
 'Default',
 'Governing law',
 'Arbitration clause',
 'Indemnification agreement',
 'Lawsuit venues',
 'Signatures of authorized parties',
 'Statement constituting entire agreement',
 'Offer and acceptance',
 'Parties who can legally agree to terms',
 'Lawful subject matter',
 'Valuable consideration',
 'Mutuality of agreement and obligation',
 'acquittal',
 'affidavit',
 'agreement',
 'affirme',
 'answer',
 'appeal',
 'appellate',
 'arraignment',
 'bail',
 'bankruptcy',
 'bench trial',
 'brief',
 'chambers',
 'capital offense',
 'case law',
 'charge to the jury',
 'chief judge',
 'circumstantial evidence',
 'clerk of court',
 'common law',
 'complaint',
 'contract',
 'conviction',
 'counsel',
 'counterclaim',
 'court',
 'court reporter',
 'damages',
 'default judgement',
 

In [116]:
patterns = [nlp.make_doc(term) for term in terms]
matcher.add("LegalTerms", None, *patterns)

In [171]:
entry_list = []

matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end] 
    entry_list.append((span.text.lower(), str(doc[start-20:end+20]).lower()))
    

#convert to dataframe  
df= pd.DataFrame(entry_list,columns=['Terms','Sents'])

In [172]:
df.head(25)

Unnamed: 0,Terms,Sents
0,acquittal,
1,judgement,
2,defendant,
3,affidavit,mattis nunc sed blandit libero volutpat. at te...
4,appellate,"it, before a notary or officer having authorit..."
5,court,"of the appellate courts, the decree or order i..."
6,answer,"courts, the decree or order is declared valid ..."
7,defendant,declared valid and will stand as rendered in t...
8,complaint,as rendered in the lower court.\nanswer the fo...
9,appeal,formal written statement by a defendant respon...


In [173]:
def Clean_Punctuation(string): 
    
    # special characters * punctuations marks 
    punctuations = '\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n'    
    for x in string.lower(): 
        if x in punctuations: 
            string = string.replace(x, "")   
    
    return(string)  


In [174]:
#clean sents column
df['Sents'] = df['Sents'].apply(Clean_Punctuation)

#replace empty string with nan
df.replace(r'^\s*$', np.nan, regex=True)

Unnamed: 0,Terms,Sents
0,acquittal,
1,judgement,
2,defendant,
3,affidavit,mattis nunc sed blandit libero volutpat at tel...
4,appellate,it before a notary or officer having authority...
5,court,of the appellate courts the decree or order is...
6,answer,courts the decree or order is declared valid a...
7,defendant,declared valid and will stand as rendered in t...
8,complaint,as rendered in the lower courtanswer the forma...
9,appeal,formal written statement by a defendant respon...


In [176]:
df.to_csv('terms_with_sentences.csv', index=False)