In [1]:
import os
import re
import string

import bs4 as bs
import urllib.request
import requests
import json
import glob
import time
import pprint

import pandas as pd
import numpy as np

import dateparser

from cleantext import clean
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
from legal_doc_processing.utils import load_data
from legal_doc_processing.legal_doc.segmentation.utils import clean_spec_chars
from legal_doc_processing.legal_doc.segmentation.structure import get_structure, get_header

[nltk_data] Downloading package stopwords to /home/alex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
nltk.download('punkt')
nltk.download("stopwords")
chachedWords = stopwords.words('english')
nlpipe = pipeline('question-answering', model='distilbert-base-cased-distilled-squad', tokenizer='distilbert-base-cased')

[nltk_data] Downloading package punkt to /home/alex/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/alex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
"""text=unicode(file)"""

'text=unicode(file)'

In [None]:
# load ciolab
from google.colab import drive
drive.mount('/content/drive')

In [5]:
cd drive/MyDrive/Theolex/theolex_document_processing/

/content/drive/MyDrive/Theolex/theolex_document_processing


In [5]:
# text_path
text_path = "./data/files/7100-15/order-allied-markets-llc-et-al.txt"

In [7]:
# load raw text
raw_text = load_data(text_path)
raw_text[:300]

'UNITED STATES DISTRICT COURT\nMIDDLE DISTRICT OF FLORIDA\nJacksonville Division\n\nU.S. COMMODITY FUTURES\nTRADING COMMISSION,\nPlaintiff,\nv. Case No. 3:15-cv-5-J-34MCR\nALLIED MARKETS LLC,\nJOSHUA GILLILAND, and\nCHAWALIT WONGKHIAOQO,\n\nDefendants.\n\n \n\n \n\n \n\nORDER GRANTING PLAINTIFF’S EX PARTE\nMOTION FOR STA'

In [8]:
# clean_spec_chars
cleaned_text, cleaned_formatted_text = clean_spec_chars(raw_text)
cleaned_text[:300]

'UNITED STATES DISTRICT COURT MIDDLE DISTRICT OF FLORIDA Jacksonville Division U.S. COMMODITY FUTURES TRADING COMMISSION, Plaintiff, v. Case No. 3:15-cv-5-J-34MCR ALLIED MARKETS LLC, JOSHUA GILLILAND, and CHAWALIT WONGKHIAOQO, Defendants. ORDER GRANTING PLAINTIFF’S EX PARTE MOTION FOR STATUTORY RESTR'

In [9]:
# get_structure
structured_text = get_structure(cleaned_text)
structured_text[:10]

[{'content': ['UNITED STATES DISTRICT COURT MIDDLE DISTRICT OF FLORIDA Jacksonville Division U.S. COMMODITY FUTURES TRADING COMMISSION, Plaintiff, v. Case No.'],
  'header': 'UNITED STATES DISTRICT COURT MIDDLE DISTRICT OF FLORIDA Jacksonville Division U.S. COMMODITY FUTURES TRADING COMMISSION, Plaintiff, v. Case No.',
  'id': 0},
 {'content': ['3:15-cv-5-J-34MCR ALLIED MARKETS LLC, JOSHUA GILLILAND, and CHAWALIT WONGKHIAOQO, Defendants.'],
  'header': '3:15-cv-5-J-34MCR ALLIED MARKETS LLC, JOSHUA GILLILAND, and CHAWALIT WONGKHIAOQO, Defendants.',
  'id': 1},
 {'content': ["ORDER GRANTING PLAINTIFF’S EX PARTE MOTION FOR STATUTORY RESTRAINING ORDER AND SCHEDULING PRELIMINARY INJUNCTION HEARING THIS CAUSE is before the Court on Plaintiff's Ex Parte Motion for Statutory Restraining Order and Motion for Preliminary Injunction (Doc.",
   'S-4; Motion), filed on January 5, 2015.',
   'Simultaneously with the Motion, the United States Commodity Futures Trading Commission (Plaintiff or CFTC) f

In [10]:
# explore get_structure 1
for k in structured_text[:10]: 
    print(pprint.pprint(k))
    print("\n------\n")

{'content': ['UNITED STATES DISTRICT COURT MIDDLE DISTRICT OF FLORIDA '
             'Jacksonville Division U.S. COMMODITY FUTURES TRADING COMMISSION, '
             'Plaintiff, v. Case No.'],
 'header': 'UNITED STATES DISTRICT COURT MIDDLE DISTRICT OF FLORIDA '
           'Jacksonville Division U.S. COMMODITY FUTURES TRADING COMMISSION, '
           'Plaintiff, v. Case No.',
 'id': 0}
None

------

{'content': ['3:15-cv-5-J-34MCR ALLIED MARKETS LLC, JOSHUA GILLILAND, and '
             'CHAWALIT WONGKHIAOQO, Defendants.'],
 'header': '3:15-cv-5-J-34MCR ALLIED MARKETS LLC, JOSHUA GILLILAND, and '
           'CHAWALIT WONGKHIAOQO, Defendants.',
 'id': 1}
None

------

{'content': ['ORDER GRANTING PLAINTIFF’S EX PARTE MOTION FOR STATUTORY '
             'RESTRAINING ORDER AND SCHEDULING PRELIMINARY INJUNCTION HEARING '
             "THIS CAUSE is before the Court on Plaintiff's Ex Parte Motion "
             'for Statutory Restraining Order and Motion for Preliminary '
             'Inju

In [11]:
# explore get_structure 2
for k in structured_text : 
    print(f"header : {k.get('header')}\n id:{k.get('id')},\n content:{k.get('content')[:400]} ")
    print("\n-----\n")

header : UNITED STATES DISTRICT COURT MIDDLE DISTRICT OF FLORIDA Jacksonville Division U.S. COMMODITY FUTURES TRADING COMMISSION, Plaintiff, v. Case No.
 id:0,
 content:['UNITED STATES DISTRICT COURT MIDDLE DISTRICT OF FLORIDA Jacksonville Division U.S. COMMODITY FUTURES TRADING COMMISSION, Plaintiff, v. Case No.'] 

-----

header : 3:15-cv-5-J-34MCR ALLIED MARKETS LLC, JOSHUA GILLILAND, and CHAWALIT WONGKHIAOQO, Defendants.
 id:1,
 content:['3:15-cv-5-J-34MCR ALLIED MARKETS LLC, JOSHUA GILLILAND, and CHAWALIT WONGKHIAOQO, Defendants.'] 

-----

header : A.
 id:2,
 content:["ORDER GRANTING PLAINTIFF’S EX PARTE MOTION FOR STATUTORY RESTRAINING ORDER AND SCHEDULING PRELIMINARY INJUNCTION HEARING THIS CAUSE is before the Court on Plaintiff's Ex Parte Motion for Statutory Restraining Order and Motion for Preliminary Injunction (Doc.", 'S-4; Motion), filed on January 5, 2015.', 'Simultaneously with the Motion, the United States Commodity Futures Trading Commission (Plaintiff or CFTC) filed 

In [12]:
# explore get_structure
structured_text[2]

{'content': ["ORDER GRANTING PLAINTIFF’S EX PARTE MOTION FOR STATUTORY RESTRAINING ORDER AND SCHEDULING PRELIMINARY INJUNCTION HEARING THIS CAUSE is before the Court on Plaintiff's Ex Parte Motion for Statutory Restraining Order and Motion for Preliminary Injunction (Doc.",
  'S-4; Motion), filed on January 5, 2015.',
  'Simultaneously with the Motion, the United States Commodity Futures Trading Commission (Plaintiff or CFTC) filed a Complaint for Injunctive Relief, Civil Monetary Penalty, and Other Equitable Relief (Doc.',
  'S-3) against Defendants Allied Markets LLC, Joshua Gilliland, and Chawalit Wongkhiao (collectively, Defendants).',
  'In the Motion, the CFTC moves, pursuant to Section 6c(a) of the Commodity Exchange Act (CEA), 7 U.S.C.',
  '§ 13a-1(a) (2012), for an ex parte statutory restraining order freezing assets and prohibiting the destruction of books, records, or other documents; and for an order requiring Defendants to show cause why a preliminary injunction should not

In [13]:
# get_header 
text_header = get_header(structured_text)
text_header

'UNITED STATES DISTRICT COURT MIDDLE DISTRICT OF FLORIDA Jacksonville Division U.S. COMMODITY FUTURES TRADING COMMISSION, Plaintiff, v. Case No.3:15-cv-5-J-34MCR ALLIED MARKETS LLC, JOSHUA GILLILAND, and CHAWALIT WONGKHIAOQO, Defendants.'

In [21]:
# get_defendant 

def who_defendant(text:str, nlpipe = nlpipe):
    return nlpipe(question="Who is the Defendant?", context=text,topk=3)

def who_violated(text:str, nlpipe = nlpipe):   
    return nlpipe(question="Who violated?", context=text,topk=3)

def who_accused(text:str, nlpipe = nlpipe):
    return nlpipe(question="Who is acused?", context=text,topk=3)

def get_charged(text:str, nlpipe = nlpipe):
    return nlpipe(question="Who is charged?", context=text,topk=3)

defendant_funct_list = [(who_defendant, "who_defendant"), (who_violated, "who_violated"), (who_accused, "who_accused"), (get_charged, "get_charged")]

In [22]:
# get_plaintiff

def who_plaintiff(text:str, nlpipe = nlpipe):
    return nlpipe(question="Who is the plaintiff?", context=text,topk=3)

def who_victim(text:str, nlpipe = nlpipe):   
    return nlpipe(question="Who is the victim?", context=text,topk=3)

def who_accuse(text:str, nlpipe = nlpipe):
    return nlpipe(question="Who is the acusator?", context=text,topk=3)

def get_charge(text:str, nlpipe = nlpipe):
    return nlpipe(question="Who is charging?", context=text,topk=3)

plaintiff_funct_list = [(who_plaintiff, "who_plaintiff"), (who_victim, "who_victim"), (who_accuse, "who_accuse"), (get_charge, "get_charge")]

In [23]:
# join a context section in text
context_text = " ".join( structured_text[2].get("content"))
context_text[:300]

"ORDER GRANTING PLAINTIFF’S EX PARTE MOTION FOR STATUTORY RESTRAINING ORDER AND SCHEDULING PRELIMINARY INJUNCTION HEARING THIS CAUSE is before the Court on Plaintiff's Ex Parte Motion for Statutory Restraining Order and Motion for Preliminary Injunction (Doc. S-4; Motion), filed on January 5, 2015. S"

In [24]:
# predict from context
defendant_from_content = who_defendant(text=context_text, nlpipe=nlpipe)
defendant_from_content

[{'score': 0.09882518649101257,
  'start': 2450,
  'end': 2475,
  'answer': 'Gilliland’s and Wongkhiao'},
 {'score': 0.08408939093351364,
  'start': 570,
  'end': 588,
  'answer': 'Chawalit Wongkhiao'},
 {'score': 0.05622170865535736,
  'start': 528,
  'end': 546,
  'answer': 'Allied Markets LLC'}]

In [25]:
# predict from header
defendant_from_header = get_defendant(text=text_header, nlpipe=nlpipe)
defendant_from_header

NameError: name 'get_defendant' is not defined

In [26]:
for sec in structured_text[2:10] : 
    _content = " ".join( sec["content"])
    if len(_content)> 500 : 
        defendant = get_defendant(_content, nlpipe)
        print(f"id: {sec.get('id')}\nheader : {sec.get('header')}\n _content : {_content[:300]}\ndefendant : {defendant}\
    \n-----\n" )

NameError: name 'get_defendant' is not defined

In [30]:
def predict_something(funct, structured_text, sections=[2, 7]) : 
    """ """
    
    preds = list()
    for k in  sections: 
        _content = " ".join( structured_text[k]["content"])
        preds.extend( funct(_content, nlpipe=nlpipe))
    preds = sorted(preds, reverse=True, key=lambda i : i["score"] ) 
    return preds    
    
    
pred_defendant = predict_something(who_defendant, structured_text )
pred_defendant

[{'score': 0.09882518649101257,
  'start': 2450,
  'end': 2475,
  'answer': 'Gilliland’s and Wongkhiao'},
 {'score': 0.08408939093351364,
  'start': 570,
  'end': 588,
  'answer': 'Chawalit Wongkhiao'},
 {'score': 0.06407205015420914,
  'start': 1392,
  'end': 1410,
  'answer': 'Allied Markets LLC'},
 {'score': 0.05622170865535736,
  'start': 528,
  'end': 546,
  'answer': 'Allied Markets LLC'},
 {'score': 0.006931839045137167,
  'start': 1392,
  'end': 1428,
  'answer': 'Allied Markets LLC, Joshua Gilliland'},
 {'score': 0.0056947264820337296,
  'start': 1392,
  'end': 1410,
  'answer': 'Allied Markets LLC'}]

In [28]:
def grouped_predictions(funct, structured_text, sections=[2, 7]) : 
    """ """
    
    pred_defendant = predict_something(get_defendant, structured_text , sections=sections)
    df = pd.DataFrame(pred_defendant)
    
    gp_answer = list() 
    for k, sub_df in df.groupby("answer") : 
        gp_answer.append({"ans": k, "cum_score" : sub_df.score.sum()})
    
    return gp_answer

# ---- 

grouped_predictions(get_defendant, structured_text)

NameError: name 'get_defendant' is not defined

In [135]:
# defenant question list

for funct, name in defendant_funct_list: 
    print(f"---- {name} ----" )
    print(grouped_predictions(get_defendant, structured_text))
    print("\n")

---- who_defendant ----
[{'ans': 'Allied Markets LLC', 'cum_score': 0.12598848529160023}, {'ans': 'Allied Markets LLC, Joshua Gilliland', 'cum_score': 0.006931839045137167}, {'ans': 'Chawalit Wongkhiao', 'cum_score': 0.08408939093351364}, {'ans': 'Gilliland’s and Wongkhiao', 'cum_score': 0.09882518649101257}]


---- who_violated ----
[{'ans': 'Allied Markets LLC', 'cum_score': 0.12598848529160023}, {'ans': 'Allied Markets LLC, Joshua Gilliland', 'cum_score': 0.006931839045137167}, {'ans': 'Chawalit Wongkhiao', 'cum_score': 0.08408939093351364}, {'ans': 'Gilliland’s and Wongkhiao', 'cum_score': 0.09882518649101257}]


---- who_accused ----
[{'ans': 'Allied Markets LLC', 'cum_score': 0.12598848529160023}, {'ans': 'Allied Markets LLC, Joshua Gilliland', 'cum_score': 0.006931839045137167}, {'ans': 'Chawalit Wongkhiao', 'cum_score': 0.08408939093351364}, {'ans': 'Gilliland’s and Wongkhiao', 'cum_score': 0.09882518649101257}]


---- get_charged ----
[{'ans': 'Allied Markets LLC', 'cum_score'

In [136]:
# plaintiff_funct_list

for funct, name in plaintiff_funct_list: 
    print(f"---- {name} ----" )
    print(grouped_predictions(get_defendant, structured_text))
    print("\n")

---- who_plaintiff ----
[{'ans': 'Allied Markets LLC', 'cum_score': 0.12598848529160023}, {'ans': 'Allied Markets LLC, Joshua Gilliland', 'cum_score': 0.006931839045137167}, {'ans': 'Chawalit Wongkhiao', 'cum_score': 0.08408939093351364}, {'ans': 'Gilliland’s and Wongkhiao', 'cum_score': 0.09882518649101257}]


---- who_victim ----
[{'ans': 'Allied Markets LLC', 'cum_score': 0.12598848529160023}, {'ans': 'Allied Markets LLC, Joshua Gilliland', 'cum_score': 0.006931839045137167}, {'ans': 'Chawalit Wongkhiao', 'cum_score': 0.08408939093351364}, {'ans': 'Gilliland’s and Wongkhiao', 'cum_score': 0.09882518649101257}]


---- who_accuse ----
[{'ans': 'Allied Markets LLC', 'cum_score': 0.12598848529160023}, {'ans': 'Allied Markets LLC, Joshua Gilliland', 'cum_score': 0.006931839045137167}, {'ans': 'Chawalit Wongkhiao', 'cum_score': 0.08408939093351364}, {'ans': 'Gilliland’s and Wongkhiao', 'cum_score': 0.09882518649101257}]


---- get_charge ----
[{'ans': 'Allied Markets LLC', 'cum_score': 0.

[{'ans': 'Allied Markets LLC', 'cum_score': 0.12598848529160023},
 {'ans': 'Allied Markets LLC, Joshua Gilliland',
  'cum_score': 0.006931839045137167},
 {'ans': 'Chawalit Wongkhiao', 'cum_score': 0.08408939093351364},
 {'ans': 'Gilliland’s and Wongkhiao', 'cum_score': 0.09882518649101257}]