In [1]:
# import necessary modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#import pickle to dump extracted features 
import pickle

#import spacy
import spacy
from spacy.matcher import PhraseMatcher #import PhraseMatcher class

In [2]:
# Lood large English model 
nlp = spacy.load('en_core_web_lg')

In [3]:
# create the PhraseMatcher object
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

In [112]:
# read file and create nlp object
with open('./Data/Input/Legal_doc.txt') as f:
    doc = nlp(f.read())

In [113]:
#read terms as list
with open('./Data/Input/Legal_Terms.txt') as f:
    terms = f.readlines()
    
terms = [t.rstrip() for t in terms]

In [115]:
terms

['Identity of the parties',
 'Purpose of the agreement',
 'Contractual terms',
 'Underlying assumptions',
 'Warranties and disclaimers',
 'Liquidated damages',
 'Liability limitations',
 'Confidentiality provision',
 'Default',
 'Governing law',
 'Arbitration clause',
 'Indemnification agreement',
 'Lawsuit venues',
 'Signatures of authorized parties',
 'Statement constituting entire agreement',
 'Offer and acceptance',
 'Parties who can legally agree to terms',
 'Lawful subject matter',
 'Valuable consideration',
 'Mutuality of agreement and obligation',
 'acquittal',
 'affidavit',
 'agreement',
 'affirme',
 'answer',
 'appeal',
 'appellate',
 'arraignment',
 'bail',
 'bankruptcy',
 'bench trial',
 'brief',
 'chambers',
 'capital offense',
 'case law',
 'charge to the jury',
 'chief judge',
 'circumstantial evidence',
 'clerk of court',
 'common law',
 'complaint',
 'contract',
 'conviction',
 'counsel',
 'counterclaim',
 'court',
 'court reporter',
 'damages',
 'default judgement',
 

In [116]:
patterns = [nlp.make_doc(term) for term in terms]
matcher.add("LegalTerms", None, *patterns)

In [117]:
entry_list = {}

matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end] 
    entry_list[start] = {'Terms': span.text.lower(), 'Sents': str(doc[start-15:end+15]).lower() }

In [119]:
entry_list

{0: {'Terms': 'acquittal', 'Sents': ''},
 1: {'Terms': 'judgement', 'Sents': ''},
 5: {'Terms': 'defendant', 'Sents': ''},
 181: {'Terms': 'affidavit',
  'Sents': 'volutpat. at tellus at urna condimentum mattis pellentesque id nibh tortor.\naffidavit a written statement of facts confirmed by the oath of the party making it,'},
 215: {'Terms': 'appellate',
  'Sents': 'or officer having authority to administer oaths.\naffirmed in the practice of the appellate courts, the decree or order is declared valid and will stand as rendered in'},
 233: {'Terms': 'court',
  'Sents': 'the decree or order is declared valid and will stand as rendered in the lower court.\nanswer the formal written statement by a defendant responding to a civil complaint'},
 236: {'Terms': 'answer',
  'Sents': 'order is declared valid and will stand as rendered in the lower court.\nanswer the formal written statement by a defendant responding to a civil complaint and setting forth'},
 243: {'Terms': 'defendant',
  'Sent

In [138]:
df = pd.DataFrame(entry_list).transpose()
df.head(25)

Unnamed: 0,Sents,Terms
0,,acquittal
1,,judgement
5,,defendant
181,volutpat. at tellus at urna condimentum mattis...,affidavit
215,or officer having authority to administer oath...,appellate
233,the decree or order is declared valid and will...,court
236,order is declared valid and will stand as rend...,answer
243,as rendered in the lower court.\nanswer the fo...,defendant
248,court.\nanswer the formal written statement by...,complaint
258,defendant responding to a civil complaint and ...,appeal


In [150]:
def Clean_Punctuation(string): 
    
    # punctuation marks 
    punctuations = '\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n'    
    for x in string.lower(): 
        if x in punctuations: 
            string = string.replace(x, "")   
    
    return(string)  


In [151]:
df['Sents'] = df['Sents'].apply(Clean_Punctuation)

df.replace(r'^\s*$', np.nan, regex=True)

Unnamed: 0,Sents,Terms
0,,acquittal
1,,judgement
5,,defendant
181,volutpat at tellus at urna condimentum mattis ...,affidavit
215,or officer having authority to administer oath...,appellate
233,the decree or order is declared valid and will...,court
236,order is declared valid and will stand as rend...,answer
243,as rendered in the lower courtanswer the forma...,defendant
248,courtanswer the formal written statement by a ...,complaint
258,defendant responding to a civil complaint and ...,appeal


In [153]:
df.to_csv('terms_with_sentences.csv', index=False)