In [1]:
#!pip install pandas
#!pip install numpy
#!pip sklearn
#!pip install requests
#!pip install bs4
#!pip install nltk
#!pip install spacy
#!pip install cleantext
#!pip install clean-text
#!pip install unidecode
#!python3 -m spacy download en_core_web_sm


In [2]:
# builtin
import os
import sys
import dateparser
import string 
import re
import urllib.request
import json
import glob
import heapq

# request and web parse
import requests
import bs4 as bs

# data
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# nlp
import nltk
from nltk.tokenize import sent_tokenize
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
from cleantext import clean
import spacy
import textsplit
from textsplit.tools import SimpleSentenceTokenizer
from textsplit.tools import get_penalty, get_segments
from textsplit.algorithm import split_optimal, split_greedy, get_total
import word2vec


Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


In [3]:
# bootsparp pâckages
nltk.download("popular")
nlp = spacy.load("en_core_web_sm")
%matplotlib inline


[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /home/alex/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /home/alex/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /home/alex/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /home/alex/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /home/alex/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /home/alex/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk

In [4]:
# pwd
!pwd

# update sys path
# module_path = os.path.abspath(os.path.join('..'))
# print(module_path)
# if module_path not in sys.path:
#     sys.path.append(module_path+"/"+"legal_doc_processing")

/home/alex/Desktop/legal_doc_processing


In [5]:
# package import
import legal_doc_processing as ldp
from legal_doc_processing.utils import *

# test ok import  
hello()

'world'

In [6]:
# root
root = os.getcwd()
print(root)

# file
file = "data/cftc_text_7100-15_press-release.txt"
print(file)

# text_path
text_path = root + "/" + file
print(text_path)

# sanitary check
os.path.isfile(text_path)

/home/alex/Desktop/legal_doc_processing
data/cftc_text_7100-15_press-release.txt
/home/alex/Desktop/legal_doc_processing/data/cftc_text_7100-15_press-release.txt


True

In [7]:
# load doc
raw_text=load_data(text_path)
raw_text[:300]

'Release\tNumber\t7100-15\n\nJanuary\t12,\t2015\n\nFederal\tCourt\tin\tFlorida\tEnters\tOrder\tFreezing\tAssets\tin\tCFTC\tForeign\tCurrency\tAnti-\nFraud\tAction\tagainst\tAllied\tMarkets\tLLC\tand\tits\tPrincipals\tJoshua\tGilliland\tand\tChawalit\nWongkhiao\n\nCFTC\tCharges\tJacksonville,\tFlorida,\tDefendants\twith\tOperating\ta\nFraudulen'

In [8]:
# first text cleaning based on regex, just keep text not spec chars
clean_text, formatted_clean_text = clean_spec_chars(raw_text)
clean_text[:300]

'Release Number 7100-15 January 12, 2015 Federal Court in Florida Enters Order Freezing Assets in CFTC Foreign Currency Anti- Fraud Action against Allied Markets LLC and its Principals Joshua Gilliland and Chawalit Wongkhiao CFTC Charges Jacksonville, Florida, Defendants with Operating a Fraudulent F'

In [9]:
# handle encoding problems and force ascii conversion ; return clean text
clean_text = handle_encoding(clean_text)
clean_text[:300]

'Release Number 7100-15 January 12, 2015 Federal Court in Florida Enters Order Freezing Assets in CFTC Foreign Currency Anti- Fraud Action against Allied Markets LLC and its Principals Joshua Gilliland and Chawalit Wongkhiao CFTC Charges Jacksonville, Florida, Defendants with Operating a Fraudulent F'

In [10]:
# test tokenize sentences
# ???
tokenized_text = sent_tokenize("Three days later, the SEC  filed a federal injunctive action against Sentinel, SEC v.  Sentinel,  et al., No.  07 CV 4684 (N.D. Ill. filed Aug. 20, 2007), and on April 28, 2008, the CFTC filed a Complaint against Sentinel, Bloom and Sentinel's Senior Vice President and head trader, Charles K. Mosley, seeking injunctive and other equitable relief, as well as the imposition of civil penalties, for violating various provisions of the Act and Commission Regulations.")
tokenized_text[0]

'Three days later, the SEC  filed a federal injunctive action against Sentinel, SEC v.  Sentinel,  et al., No.'

In [11]:
tokenized_text[1]

"07 CV 4684 (N.D. Ill. filed Aug. 20, 2007), and on April 28, 2008, the CFTC filed a Complaint against Sentinel, Bloom and Sentinel's Senior Vice President and head trader, Charles K. Mosley, seeking injunctive and other equitable relief, as well as the imposition of civil penalties, for violating various provisions of the Act and Commission Regulations."

In [12]:
punctuation=[".", "?", "!"]

In [14]:
section=get_structured_document(raw_text)
str(section[4])[:300]

'{\'content\': [" (\'Release Number 7100-15\', \'Release Number \')", " (\'January 12, 2015\', \'January \')", " (\'Federal Court in Florida Enters Order Freezing Assets in CFTC Foreign Currency Anti-\', \'Federal Court in Florida Enters Order Freezing Assets in CFTC Foreign Currency Anti \')", " (\'Fraud Action ag'

In [15]:
def summary(sentence_scores):
    summary_sentences = heapq.nlargest(10, sentence_scores, key=sentence_scores.get)
    return summary_sentences

In [16]:
# root
root = os.getcwd()
print(root)

# file
file = "data/cftc_text_7100-15_order-allied-markets-llc-et-al.txt"
print(file)

# text_path
text_path = root + "/" + file
print(text_path)

# sanitary check
os.path.isfile(text_path)

/home/alex/Desktop/legal_doc_processing
data/cftc_text_7100-15_order-allied-markets-llc-et-al.txt
/home/alex/Desktop/legal_doc_processing/data/cftc_text_7100-15_order-allied-markets-llc-et-al.txt


True

In [17]:
# load text
raw_text = load_data(text_path)

In [18]:
# clean
article_text, formatted_article_text=clean_spec_chars(file)

# frequencies
sentence_list = nltk.sent_tokenize(article_text)
word_frequencies=word_frequency(formatted_article_text)
stopwords = nltk.corpus.stopwords.words('english')

maximum_frequncy = max(word_frequencies.values())

In [19]:
# summary
for word in word_frequencies.keys():
    word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)
    
sentence_scores = sentence_score(sentence_list, word_frequencies)
    
summary(sentence_scores)

[]

##  Training textsplit model using word vector

In [None]:
corpus_path = './text8'  # be sure your corpus is cleaned from punctuation and lowercased
if not os.path.exists(corpus_path):
    !wget http://mattmahoney.net/dc/text8.zip
    !unzip {corpus_path}

In [None]:
links = './7100-15/cftc_text_7100-15_order-allied-markets-llc-et-al.txt'

In [None]:
wrdvec_path = 'wrdvecs.bin'
if not os.path.exists(wrdvec_path):
    %time word2vec.word2vec(corpus_path, wrdvec_path, cbow=1, iter_=5, hs=1, threads=8, sample='1e-5', window=15, size=200, binary=1)

In [None]:
model = word2vec.load(wrdvec_path)
wrdvecs = pd.DataFrame(model.vectors, index=model.vocab)
del model
print(wrdvecs.shape)

In [None]:
sentence_tokenizer = SimpleSentenceTokenizer()


In [None]:
link = links
segment_len = 30  # segment target length in sentences
book_path = os.path.basename(link)

with open(book_path, 'rt') as f:
    text = f.read()  #.replace('\n', ' ')  # punkt tokenizer handles newlines not so nice

sentenced_text = sentence_tokenizer(text)
vecr = CountVectorizer(vocabulary=wrdvecs.index)

sentence_vectors = vecr.transform(sentenced_text).dot(wrdvecs)

penalty = get_penalty([sentence_vectors], segment_len)
print('penalty %4.2f' % penalty)

optimal_segmentation = split_optimal(sentence_vectors, penalty, seg_limit=250)
segmented_text = get_segments(sentenced_text, optimal_segmentation)

print('%d sentences, %d segments, avg %4.2f sentences per segment' % (
    len(sentenced_text), len(segmented_text), len(sentenced_text) / len(segmented_text)))

with open(book_path + '.seg', 'wt') as f:
    for i, segment_sentences in enumerate(segmented_text):
        segment_str = ' // '.join(segment_sentences)
        gain = optimal_segmentation.gains[i] if i < len(segmented_text) - 1 else 0
        segment_info = ' [%d sentences, %4.3f] ' % (len(segment_sentences), gain) 
        print(segment_str + '\n8<' + '=' * 30 + segment_info + "=" * 30, file=f)

greedy_segmentation = split_greedy(sentence_vectors, max_splits=len(optimal_segmentation.splits))
greedy_segmented_text = get_segments(sentenced_text, greedy_segmentation)
lengths_optimal = [len(segment) for segment in segmented_text for sentence in segment]
lengths_greedy = [len(segment) for segment in greedy_segmented_text for sentence in segment]
df = pd.DataFrame({'greedy':lengths_greedy, 'optimal': lengths_optimal})
df.plot.line(figsize=(18, 3), title='Segment lenghts over text')
df.plot.hist(bins=30, alpha=0.5, figsize=(10, 3), title='Histogram of segment lengths')

totals = [get_total(sentence_vectors, seg.splits, penalty) 
          for seg in [optimal_segmentation, greedy_segmentation]]
print('optimal score %4.2f, greedy score %4.2f' % tuple(totals))
print('ratio of scores %5.4f' % (totals[0] / totals[1]))

## No case/docket

In [21]:
clean_page = clean_doc(load_data(text_path))
clean_page[1][2:4]

[" ('The Court possesses jurisdiction over the parties and over the subject matter ', 'The Court possesses jurisdiction over the parties and over the subject matter ')",
 " ('of this action pursuant to Section 6c of the CEA, 7 U.S.C. § 13a- l (2012). ', 'of this action pursuant to Section c of the CEA U S C a l ')"]

In [22]:
def get_case(first_page):
    for line in first_page:
        if len(line) < 40:
            p = re.compile("NO[\.:]\s*.+")
            result = p.search(line.upper())
            if result:
                return result.group(0).strip()
            
    for line in first_page:   
        if len(line) < 40:    
            p = re.compile("\d*-?CV-\d+.*")
            result = p.search(line.upper())
            if result:
                return result.group(0).strip()

In [24]:
get_case(clean_page[1])

In [25]:
get_case([' CFTC Docket No. SD 20-01'])

'NO. SD 20-01'

## Defendant

In [26]:
# Load English tokenizer, tagger, parser and NER
nlp = spacy.load("en_core_web_sm")

# Process whole documents
text = ("\n".join(clean_page[0]))
print(text)
doc = nlp(text)

# Analyze syntax
#print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
#print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

# Find named entities, phrases and concepts
for entity in doc.ents:
    if entity.label_ == "ORG":
        print(entity.text,":       ", entity.label_)

 ('UNITED STATES DISTRICT COURT ', 'UNITED STATES DISTRICT COURT ')
 ('MIDDLE DISTRICT OF FLORIDA ', 'MIDDLE DISTRICT OF FLORIDA ')
 ('Jacksonville Division ', 'Jacksonville Division ')
 ('v. ', 'v ')
 ('Case No. 3:15-cv-5-J-34MCR ', 'Case No cv J MCR ')
 ('U.S. COMMODITY FUTURES ', 'U S COMMODITY FUTURES ')
 ('TRADING COMMISSION, ', 'TRADING COMMISSION ')
 ('Plaintiff, ', 'Plaintiff ')
 ('ALLIED MARKETS LLC, ', 'ALLIED MARKETS LLC ')
 ('JOSHUA GILLILAND, and ', 'JOSHUA GILLILAND and ')
 ('CHAWALIT WONGKHIAO, ', 'CHAWALIT WONGKHIAO ')
 ('Defendants. ', 'Defendants ')
 ("ORDER GRANTING PLAINTIFF'S EX PARTE ", 'ORDER GRANTING PLAINTIFF S EX PARTE ')
 ('MOTION FOR STATUTORY RESTRAINING ORDER AND SCHEDULING ', 'MOTION FOR STATUTORY RESTRAINING ORDER AND SCHEDULING ')
 ('PRELIMINARY fNJUNCTION HEARING ', 'PRELIMINARY fNJUNCTION HEARING ')
 ('THIS CAUSE is before the Court on Plaintiffs Ex Parte Motion for Statutory ', 'THIS CAUSE is before the Court on Plaintiffs Ex Parte Motion for Statuto

In [27]:
print("\n".join(clean_page[0]))

 ('UNITED STATES DISTRICT COURT ', 'UNITED STATES DISTRICT COURT ')
 ('MIDDLE DISTRICT OF FLORIDA ', 'MIDDLE DISTRICT OF FLORIDA ')
 ('Jacksonville Division ', 'Jacksonville Division ')
 ('v. ', 'v ')
 ('Case No. 3:15-cv-5-J-34MCR ', 'Case No cv J MCR ')
 ('U.S. COMMODITY FUTURES ', 'U S COMMODITY FUTURES ')
 ('TRADING COMMISSION, ', 'TRADING COMMISSION ')
 ('Plaintiff, ', 'Plaintiff ')
 ('ALLIED MARKETS LLC, ', 'ALLIED MARKETS LLC ')
 ('JOSHUA GILLILAND, and ', 'JOSHUA GILLILAND and ')
 ('CHAWALIT WONGKHIAO, ', 'CHAWALIT WONGKHIAO ')
 ('Defendants. ', 'Defendants ')
 ("ORDER GRANTING PLAINTIFF'S EX PARTE ", 'ORDER GRANTING PLAINTIFF S EX PARTE ')
 ('MOTION FOR STATUTORY RESTRAINING ORDER AND SCHEDULING ', 'MOTION FOR STATUTORY RESTRAINING ORDER AND SCHEDULING ')
 ('PRELIMINARY fNJUNCTION HEARING ', 'PRELIMINARY fNJUNCTION HEARING ')
 ('THIS CAUSE is before the Court on Plaintiffs Ex Parte Motion for Statutory ', 'THIS CAUSE is before the Court on Plaintiffs Ex Parte Motion for Statuto

In [28]:
# Question answering pipeline, specifying the checkpoint identifier
nlp = pipeline('question-answering', model='distilbert-base-cased-distilled-squad', tokenizer='distilbert-base-cased')

In [29]:
first_page = [text for text in clean_page[0] if len(text) > 5]

In [30]:
first_page

[" ('UNITED STATES DISTRICT COURT ', 'UNITED STATES DISTRICT COURT ')",
 " ('MIDDLE DISTRICT OF FLORIDA ', 'MIDDLE DISTRICT OF FLORIDA ')",
 " ('Jacksonville Division ', 'Jacksonville Division ')",
 " ('v. ', 'v ')",
 " ('Case No. 3:15-cv-5-J-34MCR ', 'Case No cv J MCR ')",
 " ('U.S. COMMODITY FUTURES ', 'U S COMMODITY FUTURES ')",
 " ('TRADING COMMISSION, ', 'TRADING COMMISSION ')",
 " ('Plaintiff, ', 'Plaintiff ')",
 " ('ALLIED MARKETS LLC, ', 'ALLIED MARKETS LLC ')",
 " ('JOSHUA GILLILAND, and ', 'JOSHUA GILLILAND and ')",
 " ('CHAWALIT WONGKHIAO, ', 'CHAWALIT WONGKHIAO ')",
 " ('Defendants. ', 'Defendants ')",
 ' ("ORDER GRANTING PLAINTIFF\'S EX PARTE ", \'ORDER GRANTING PLAINTIFF S EX PARTE \')',
 " ('MOTION FOR STATUTORY RESTRAINING ORDER AND SCHEDULING ', 'MOTION FOR STATUTORY RESTRAINING ORDER AND SCHEDULING ')",
 " ('PRELIMINARY fNJUNCTION HEARING ', 'PRELIMINARY fNJUNCTION HEARING ')",
 " ('THIS CAUSE is before the Court on Plaintiffs Ex Parte Motion for Statutory ', 'THIS CA

In [31]:
print(nlp(question="Who violeted?", context=formatted_article_text))

{'score': 0.19786383211612701, 'start': 21, 'end': 49, 'answer': 'allied markets llc et al txt'}


In [32]:
print(nlp(question="Who is the defendant?", context=formatted_article_text, topk=2))

[{'score': 0.38688942790031433, 'start': 21, 'end': 35, 'answer': 'allied markets'}, {'score': 0.2808257043361664, 'start': 21, 'end': 27, 'answer': 'allied'}]


# STOP HERE

## Extracted authorities

In [None]:
UNITED STATES DISTRICT COURT
FOR THE MIDDLE DISTRICT OF FLORIDA 

## Code law violation

In [None]:
violated Section 4c(a)(5)(C) of the Commodity Exchange Act ("Act"), 7 U.S.C.
§ 6c(a)(5)(C) (2018).

In [None]:
Section 4c(a)(5) of the Act, 7 U.S.C.
§ 6c(a)(5) (2012).

## Violation period

In [None]:
line = 'The Commodity Futures Trading Commission ("Commission") has reason to believe that from at least July 2012 through March 2017 ("Relevant Period"), Propex Derivatives Pty Ltd. ("Propex") violated Section 4c(a)(5)(C) of the Commodity Exchange Act ("Act"), 7 U.S.C. 6c(a)(5)(C) (2018). Therefore, the Commission deems it appropriate and in the public interest that public administrative proceedings be, and hereby are, instituted to determine whether Propex engaged in the violations set forth herein and to determine whether any order should be issued imposing remedial sanctions.'

In [None]:
("Relevant Period")

## Transaction amount

In [None]:
pour complaint tous les montants vers Transaction amounts

## Defendant

In [None]:
Propex Derivatives Pty Ltd, Respondent.

In [None]:
#!pip install -U spacy
#!python -m spacy download en_core_web_sm

## Nature of violations

In [None]:
II. FINDINGS
The Commission finds the following:
A. SUMMARY
During the Relevant Period, Propex, by and through a Propex trader ("Trader A"),·
engaged in thousands of instances of the disruptive trading practice known as "spoofing"
(bidding or offering with the intent to cancel the bid or offer before execution) in the E-mini
S&P 500 futures contracts traded on the Chicago Mercantile Exchange ("CME"), a futures
exchange and designated contract market which is owned and operated by CME Group Inc. This
conduct violated Section 4c(a)(5)(C) of the Act, 7 U.S.C. § 6c(a)(5)(C) (2018). 

In [None]:
cftc_full_list = pd.read_excel("cftc_full_list.xlsx")

In [None]:
def gen_line_rep(rep):
    original_path = f"./cftc/original/{rep}"
    text_path = f"./cftc/text/{rep}"
    meta_path = f"./cftc/meta-data/{rep}"
    lines = []
    for file in glob.glob(f"{meta_path}/*.json"):
        filename = os.path.basename(file)

        with open(file) as json_file: 
            data = json.load(json_file)     

        data['filename'] = filename.replace(".json","")
        data['folder'] = rep
        with open(f"{text_path}/{filename.replace('.json','.txt')}") as f:
            data['doc_text'] = f.read()  


        lines.append(data)
    return lines

In [None]:
meta_data = []
for index, row in cftc_full_list[~cftc_full_list.scraped_folder.isnull()].iterrows():
    try:
        meta_data.extend(gen_line_rep(row['scraped_folder']))
    except:
        print("error on",  row['scraped_folder'])
len(meta_data)

In [None]:
df_meta_data = pd.DataFrame(meta_data)

In [None]:
df_meta_data['doc_clean'] = df_meta_data.doc_text.apply(clean_doc)

In [None]:
df_meta_data['first_page'] = df_meta_data['doc_clean'].str[0]

In [None]:
df_meta_data = df_meta_data[~df_meta_data['first_page'].isnull()]

## Get reference

In [None]:
df_meta_data['reference'] = df_meta_data.first_page.apply(get_case)

## Get defendant

In [None]:
first_page = [text for text in df_meta_data.first_page.values[3] if len(text) > 100]

In [None]:
first_page

In [None]:
print(nlp(question="Who violeted?", context=".".join(first_page)))

In [None]:
print(nlp(question="Who is the defendant?", context=".".join(first_page), topk=3))

In [None]:
#df_meta_data[['reference', 'folder', 'filename']][df_meta_data.reference.isnull()]

## Type

In [None]:
df_meta_data['is_order'] = df_meta_data.filename.str.contains("order")

In [None]:
df_meta_data['is_complaint'] = df_meta_data.filename.str.contains("complaint")

In [None]:
df_meta_data['type'] = np.where(df_meta_data['is_order'], "Order CFTC", 
         np.where(df_meta_data['is_complaint'], 
                  "Complaint CFTC", None))