In [2]:
# import necessary modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#import pickle to dump extracted features 
import pickle

#import spacy
import spacy
from spacy.matcher import PhraseMatcher #import PhraseMatcher class

In [3]:
# Lood large English model 
nlp = spacy.load('en_core_web_lg')

In [4]:
# create the PhraseMatcher object
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

In [5]:
def read_file(file_path):
    with open(file_path) as f:
        str_text = f.read()
    
    return str_text.replace("\n", "")
   

In [6]:
#text = read_file('/home/shaitender/Assignment/Data/Input/Legal_Terms.txt')
doc_text = read_file('./Data/Input/Legal_doc.txt')

In [7]:
print(doc_text)

acquittal Judgement that a criminal defendant has not been proved guilty beyond a reasonable doubt. at auctor urna nunc id. Porttitor leo a diam sollicitudin tempor. Justo nec ultrices dui sapien eget. Sapien nec sagittis aliquam malesuada bibendum arcu. Velit euismod in pellentesque massa placerat duis ultricies lacus sed. At elementum eu facilisis sed odio. Elementum nisi quis eleifend quam adipiscing. Posuere urna nec tincidunt praesent semper. Amet consectetur adipiscing elit ut aliquam purus sit amet luctus. Vel fringilla est ullamcorper eget. Enim lobortis scelerisque fermentum dui faucibus in ornare. Ornare suspendisse sed nisi lacus sed viverra tellus in hac. Enim neque volutpat ac tincidunt vitae semper. Tristique senectus et netus et malesuada. Lacus laoreet non curabitur gravida arcu ac tortor dignissim convallis. Habitant morbi tristique senectus et netus. Aliquam malesuada bibendum arcu vitae elementum curabitur vitae nunc. Viverra nibh cras pulvinar mattis nunc sed blandi

In [8]:
#read terms as list
with open('./Data/Input/Legal_Terms.txt') as f:
    terms = f.readlines()
    
terms = [t.rstrip() for t in terms]

In [9]:
len(terms)

113

In [10]:
terms

['Identity of the parties',
 'Purpose of the agreement',
 'Contractual terms',
 'Underlying assumptions',
 'Warranties and disclaimers',
 'Liquidated damages',
 'Liability limitations',
 'Confidentiality provision',
 'Default',
 'Governing law',
 'Arbitration clause',
 'Indemnification agreement',
 'Lawsuit venues',
 'Signatures of authorized parties',
 'Statement constituting entire agreement',
 'Offer and acceptance',
 'Parties who can legally agree to terms',
 'Lawful subject matter',
 'Valuable consideration',
 'Mutuality of agreement and obligation',
 'acquittal',
 'affidavit',
 'agreement',
 'affirme',
 'answer',
 'appeal',
 'appellate',
 'arraignment',
 'bail',
 'bankruptcy',
 'bench trial',
 'brief',
 'chambers',
 'capital offense',
 'case law',
 'charge to the jury',
 'chief judge',
 'circumstantial evidence',
 'clerk of court',
 'common law',
 'complaint',
 'contract',
 'conviction',
 'counsel',
 'counterclaim',
 'court',
 'court reporter',
 'damages',
 'default judgement',
 

In [11]:
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("LegalTerms", None, *patterns)

In [12]:
#tterns = list(nlp.tokenizer.pipe(terms))
#atcher.add("Terms", None, *patterns)

In [13]:
extracted_terms =[]
doc = nlp(doc_text)
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    extracted_terms.append(span.text.lower())

In [16]:
'''for token in set(extracted_terms):
    print(token)'''

charge to the jury
settlement
jurisprudence
agreement
acquittal
docket
misdemeanor
petit jury
judge
damages
mistrial
verdict
clerk of court
information
bankruptcy
magistrate judges
pleadings
prosecute
lawsuit
felony
plea
impeachment
plaintiff
bail
chambers
counsel
jurisdiction
interrogatories
witness
injunction
chief judge
appellate
issue
instructions
conviction
deposition
federal question
defendant
record
transcript
file
evidence
parties
default judgement
hearsay
jury
warrant
u.s. attorney
precedent
reverse
default
testimony
brief
indictment
answer
opinion
court
sentence
habeas corpus
complaint
court reporter
appeal
common law
discovery
contract
judgement


In [17]:
# imports
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.svm import SVC
from sklearn.decomposition import PCA

In [59]:
# Turning a collection of terms into numerical feature vectors.
vectorizer = TfidfVectorizer(max_df=0.5, max_features=100,min_df=2, use_idf=True)      
        
X_matrix = vectorizer.fit_transform(extracted_terms)  
#terms = vectorizer.get_feature_names()

In [28]:
X_matrix[0:1]

<1x31 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [29]:
print(X_matrix.shape)

(228, 31)


In [30]:
#from sklearn.metrics.pairwise import cosine_similarity
#cosine_similarity(X_matrix[0:1], X_matrix)

In [31]:
from sklearn.metrics.pairwise import linear_kernel
cosine_similarities = linear_kernel(X_matrix[0:1], X_matrix).flatten()

In [32]:
cosine_similarities

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [45]:
#Build model
import sklearn.pipeline
import sklearn.decomposition
import sklearn.feature_selection

model = sklearn.pipeline.Pipeline([
    #('pca', sklearn.decomposition.PCA()),
    ('select', sklearn.feature_selection.SelectKBest(score_func=sklearn.feature_selection.f_classif,k=5)),
    ('classify', sklearn.linear_model.LogisticRegressionCV())
])

model.fit(X_matrix, extracted_terms)




  f = msb / msw


Pipeline(memory=None,
     steps=[('select', SelectKBest(k=2, score_func=<function f_classif at 0x7f985578eb70>)), ('classify', LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0))])

In [46]:
# Assess data
predicted_log = model.predict(X_matrix)
predicted_log_prob = model.predict_proba(X_matrix)
print(sklearn.metrics.classification_report(extracted_terms, predicted_log))

                    precision    recall  f1-score   support

         acquittal       0.00      0.00      0.00         1
         agreement       0.00      0.00      0.00         1
            answer       0.00      0.00      0.00         5
            appeal       0.00      0.00      0.00         3
         appellate       0.00      0.00      0.00         8
              bail       0.00      0.00      0.00         1
        bankruptcy       0.00      0.00      0.00         3
             brief       0.00      0.00      0.00         1
          chambers       0.00      0.00      0.00         1
charge to the jury       0.00      0.00      0.00         1
       chief judge       0.00      0.00      0.00         1
    clerk of court       0.00      0.00      0.00         2
        common law       0.00      0.00      0.00         1
         complaint       0.00      0.00      0.00         5
          contract       0.00      0.00      0.00         1
        conviction       0.00      0.00

  'precision', 'predicted', average, warn_for)


In [48]:
# pickle model
from sklearn.externals import joblib
joblib.dump(model, "terms_model.pickle")

['terms_model.pickle']

In [None]:
model