## 1 - INTRO
--------------------

### 1.1 - import builtin packages

In [1]:
# builtin
import os
import sys
import dateparser
import string 
import re
import urllib.request
import json
import glob
import heapq

### 1.2 - import external packages

In [2]:
# request and web parse
import requests
import bs4 as bs

# data
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# nlp
import nltk
from nltk.tokenize import sent_tokenize
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
from cleantext import clean
import spacy
import textsplit
from textsplit.tools import SimpleSentenceTokenizer
from textsplit.tools import get_penalty, get_segments
from textsplit.algorithm import split_optimal, split_greedy, get_total
import word2vec

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


### 1.3 - Import local packages

In [3]:
import legal_doc_processing as ldp
from legal_doc_processing.information_extraction import * 
from legal_doc_processing.segmentation import * 
from legal_doc_processing.utils import * 

### 1.4 - Bootstrap packages

In [4]:
nltk.download("popular")
nlp = spacy.load("en_core_web_sm")
%matplotlib inline

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /home/alex/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /home/alex/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /home/alex/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /home/alex/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /home/alex/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /home/alex/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk

## 2 - CASE
----------------------------------


### 2.1 -  For one doc


from data

In [5]:
# file_path
file = "data/order-vision-financial-markets-llc.txt"
file_path = os.getcwd() + "/" + file
assert os.path.isfile(file_path)

In [6]:
# read file 
raw_text = load_data(file_path)
raw_text[:300]

' \n\n(— RECEIVED CFTC >\n\nUNITED STATES OF AMERICA\nBefore the\nCOMMODITY FUTURES TRADING COMMISSION\n\n \n\n \n\nOffice of Proceedings\nProceedings Clerk\n\n\\ 5:00 pm, Jul 12, 2019]\n\nIn the Matter of:\n\n \n\n \n\nVision Financial Markets LLC,\nCFTC Docket No. _ 19-13\n\n \n\nRespondent.\n\n \n\nORDER INSTITUTING PROCEEDINGS P'

In [7]:
# clean and first 
clean_pages = clean_doc(raw_text)
first_page = clean_pages[0]
first_page[:10]

[' ( RECEIVED CFTC >',
 ' UNITED STATES OF AMERICA',
 ' Before the',
 ' COMMODITY FUTURES TRADING COMMISSION',
 ' Office of Proceedings',
 ' Proceedings Clerk',
 ' \\ 5:00 pm, Jul 12, 2019]',
 ' In the Matter of:',
 ' Vision Financial Markets LLC,',
 ' CFTC Docket No.  19-13']

In [8]:
# pred case
case = get_case(clean_pages[0])
case

'NO.  19-13'

### 2.2 -  For all test doc

In [9]:
# file_path
folder = "tests/dataset/features"
_files_path = os.getcwd() + "/" + folder
files_path_list = [_files_path + "/" + filename for filename in os.listdir(_files_path) ]
files_path_list

['/home/alex/Desktop/legal_doc_processing/tests/dataset/features/szatmari.txt',
 '/home/alex/Desktop/legal_doc_processing/tests/dataset/features/hartshorn.txt',
 '/home/alex/Desktop/legal_doc_processing/tests/dataset/features/eyal.txt',
 '/home/alex/Desktop/legal_doc_processing/tests/dataset/features/vorley.txt',
 '/home/alex/Desktop/legal_doc_processing/tests/dataset/features/mirae.txt']

In [10]:
# read file
raw_text_list = [load_data(file_path) for file_path in files_path_list]
raw_text_list[0]

'Case 1:19-cv-00544-DKW-KJM Document 34 Filed 08/13/20 Pagelof2 PagelD#: 716\n\nIN THE UNITED STATES DISTRICT COURT\n\nFOR THE DISTRICT OF HAWAITI\n\nUNITED STATES COMMODITY CV 19-00544 DKW-KJM\n\n)\nFUTURES TRADING )\nCOMMISSION, )\n) ORDER ADOPTING\nPlaintiff, ) MAGISTRATE JUDGE’S\n) FINDINGS AND\nVS. ) RECOMMENDATION\n)\nPETER SZATMARI, )\n)\nDefendant. )\n)\n\n \n\nORDER ADOPTING MAGISTRATE JUDGE’S\nFINDINGS AND RECOMMENDATION\n\nFindings and Recommendation having been filed on July 28, 2020 and\nserved on all parties on July 29, 2020, and no objections having been filed by any\nparty,\n\nIT IS HEREBY ORDERED AND ADJUDGED that, pursuant to Title 28,\nUnited States Code, Section 636(b)(1)(C) and Local Rule 74.1, the "Findings and\nRecommendation to Grant Plaintiff\'s Motion for Final Judgment by Default,\n\n//\n//\n\n//\n\x0cCase 1:19-cv-00544-DKW-KJM Document 34 Filed 08/13/20 Page2of2 PagelD#: 717\n\nPermanent Injunction, Civil Monetary Penalties, and Other Statutory Equitable\nRe

In [11]:
# clean and first 
clean_pages_list = [ clean_doc(raw_text) for  raw_text in raw_text_list]
first_page_list = [pages[0] for pages in clean_pages_list]
first_page_list[0]

[' Case 1:19-cv-00544-DKW-KJM Document 34 Filed 08/13/20 Pagelof2 PagelD#: 716',
 ' IN THE UNITED STATES DISTRICT COURT',
 ' FOR THE DISTRICT OF HAWAITI',
 ' UNITED STATES COMMODITY CV 19-00544 DKW-KJM',
 ' FUTURES TRADING )',
 ' COMMISSION, )',
 ' ) ORDER ADOPTING',
 " Plaintiff, ) MAGISTRATE JUDGE'S",
 ' ) FINDINGS AND',
 ' VS. ) RECOMMENDATION',
 ' PETER SZATMARI, )',
 ' Defendant. )',
 " ORDER ADOPTING MAGISTRATE JUDGE'S",
 ' FINDINGS AND RECOMMENDATION',
 ' Findings and Recommendation having been filed on July 28, 2020 and served on all parties on July 29, 2020, and no objections having been filed by any party,']

In [12]:
# pred case
case_list = [get_case(fp) for fp in first_page_list]
case_list

['-- error : case not founded --',
 'NO. 16-CV-9802',
 'NO. 7:19-CV-09832-CS',
 'NO: 18-CV-00603',
 'NO. 20-11']

## 2 - Defendant
-------------------------------------

### 2.1 - One file

In [13]:
# file_path
file = "data/order-vision-financial-markets-llc.txt"
file_path = os.getcwd() + "/" + file
assert os.path.isfile(file_path)

In [14]:
# read file 
raw_text = load_data(file_path)
raw_text[:300]

' \n\n(— RECEIVED CFTC >\n\nUNITED STATES OF AMERICA\nBefore the\nCOMMODITY FUTURES TRADING COMMISSION\n\n \n\n \n\nOffice of Proceedings\nProceedings Clerk\n\n\\ 5:00 pm, Jul 12, 2019]\n\nIn the Matter of:\n\n \n\n \n\nVision Financial Markets LLC,\nCFTC Docket No. _ 19-13\n\n \n\nRespondent.\n\n \n\nORDER INSTITUTING PROCEEDINGS P'

In [26]:
# clean first join
clean_pages = clean_doc(raw_text)
first_page = clean_pages[0]
joined_first_page = ("\n".join(first_page))


 ( RECEIVED CFTC >
 UNITED STATES OF AMERICA
 Before the
 COMMODITY FUTURES TRADING COMMISSION
 Office of Proceedings
 Proceedings Clerk
 \ 5:00 pm, Jul 12, 2019]
 In the Matter of:
 Vision Financial Markets LLC,
 CFTC Docket No.  19-13
 Respondent.
 ORDER INSTITUTING PROCEEDINGS PURSUANT TO
 SECTIO


In [21]:
# Process whole documents
doc = nlp(joined_first_page)

In [33]:
# Analyze syntax
#print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
#print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

# Find named entities, phrases and concepts
for entity in doc.ents:
    if entity.label_ == "ORG":
        #  print(entity.text,":       ", entity.label_)
        print(f"{entity.text[:30].ljust(40)} :  {entity.label_} ")

CFTC                                     :  ORG 
THE COMMODITY EXCHANGE ACT               :  ORG 
The Commodity Futures Trading            :  ORG 
Vision Financial Markets LLC             :  ORG 
Commission Regulation                    :  ORG 
Commission                               :  ORG 
this Order Instituting Proceed           :  ORG 
the Commodity Exchange Act               :  ORG 
Imposing Remedial Sanctions              :  ORG 
Commission                               :  ORG 
Commission                               :  ORG 
Commission                               :  ORG 
Commission                               :  ORG 


In [34]:
# Question answering pipeline, specifying the checkpoint identifier
nlpipe = pipeline('question-answering', model='distilbert-base-cased-distilled-squad', tokenizer='distilbert-base-cased')

In [38]:
first_page_100 = [text for text in first_page if len(text) > 100]
first_page_100[:10]

[' The Commodity Futures Trading Commission ("Commission") has reason to believe that Vision Financial Markets LLC ("Respondent") violated Commission Regulation ("Regulation") 166.3, 17 C.F.R. 166.3 (2018). Therefore, the Commission deems it appropriate and in the public interest that public administrative proceedings be, and hereby are, instituted to determine whether Respondent engaged in the violations set forth herein and to determine whether any order should be issued imposing remedial sanctions.',
 ' In anticipation of the institution of an administrative proceeding, Respondent has submitted an Offer of Settlement ("Offer"), which the Commission has determined to accept.',
 ' Without admitting or denying any of the findings or conclusions herein, Respondent consents to the entry of this Order Instituting Proceedings Pursuant to Section 6(c) and (d) of the Commodity Exchange Act, Making Findings, and Imposing Remedial Sanctions ("Order") and acknowledges service of this Order.',
 

In [49]:
violeted_ans = nlpipe(question="Who violeted?", context=".".join(first_page_100), topk=3)
violeted_ans

[{'score': 0.5201619863510132,
  'start': 1326,
  'end': 1336,
  'answer': 'Respondent'},
 {'score': 0.018238907679915428,
  'start': 1657,
  'end': 1667,
  'answer': 'Respondent'},
 {'score': 0.0016895169392228127,
  'start': 1326,
  'end': 1353,
  'answer': 'Respondent does not consent'}]

In [51]:
defendant_ans = nlpipe(question="Who is the defendant?", context=".".join(first_page_100), topk=3)
defendant_ans[:3]

[{'score': 0.35667505860328674,
  'start': 1528,
  'end': 1547,
  'answer': 'a party or claimant'},
 {'score': 0.1386204957962036,
  'start': 1186,
  'end': 1205,
  'answer': 'a party or claimant'},
 {'score': 0.11748140305280685,
  'start': 1528,
  'end': 1547,
  'answer': 'a party or claimant'}]

# STOP HERE

## Extracted authorities

In [None]:
UNITED STATES DISTRICT COURT
FOR THE MIDDLE DISTRICT OF FLORIDA 

## Code law violation

In [None]:
violated Section 4c(a)(5)(C) of the Commodity Exchange Act ("Act"), 7 U.S.C.
§ 6c(a)(5)(C) (2018).

In [None]:
Section 4c(a)(5) of the Act, 7 U.S.C.
§ 6c(a)(5) (2012).

## Violation period

In [None]:
line = 'The Commodity Futures Trading Commission ("Commission") has reason to believe that from at least July 2012 through March 2017 ("Relevant Period"), Propex Derivatives Pty Ltd. ("Propex") violated Section 4c(a)(5)(C) of the Commodity Exchange Act ("Act"), 7 U.S.C. 6c(a)(5)(C) (2018). Therefore, the Commission deems it appropriate and in the public interest that public administrative proceedings be, and hereby are, instituted to determine whether Propex engaged in the violations set forth herein and to determine whether any order should be issued imposing remedial sanctions.'

In [None]:
("Relevant Period")

## Transaction amount

In [None]:
pour complaint tous les montants vers Transaction amounts

## Defendant

In [None]:
Propex Derivatives Pty Ltd, Respondent.

In [None]:
#!pip install -U spacy
#!python -m spacy download en_core_web_sm

## Nature of violations

In [None]:
II. FINDINGS
The Commission finds the following:
A. SUMMARY
During the Relevant Period, Propex, by and through a Propex trader ("Trader A"),·
engaged in thousands of instances of the disruptive trading practice known as "spoofing"
(bidding or offering with the intent to cancel the bid or offer before execution) in the E-mini
S&P 500 futures contracts traded on the Chicago Mercantile Exchange ("CME"), a futures
exchange and designated contract market which is owned and operated by CME Group Inc. This
conduct violated Section 4c(a)(5)(C) of the Act, 7 U.S.C. § 6c(a)(5)(C) (2018). 

In [None]:
cftc_full_list = pd.read_excel("cftc_full_list.xlsx")

In [None]:
def gen_line_rep(rep):
    original_path = f"./cftc/original/{rep}"
    text_path = f"./cftc/text/{rep}"
    meta_path = f"./cftc/meta-data/{rep}"
    lines = []
    for file in glob.glob(f"{meta_path}/*.json"):
        filename = os.path.basename(file)

        with open(file) as json_file: 
            data = json.load(json_file)     

        data['filename'] = filename.replace(".json","")
        data['folder'] = rep
        with open(f"{text_path}/{filename.replace('.json','.txt')}") as f:
            data['doc_text'] = f.read()  


        lines.append(data)
    return lines

In [None]:
meta_data = []
for index, row in cftc_full_list[~cftc_full_list.scraped_folder.isnull()].iterrows():
    try:
        meta_data.extend(gen_line_rep(row['scraped_folder']))
    except:
        print("error on",  row['scraped_folder'])
len(meta_data)

In [None]:
df_meta_data = pd.DataFrame(meta_data)

In [None]:
df_meta_data['doc_clean'] = df_meta_data.doc_text.apply(clean_doc)

In [None]:
df_meta_data['first_page'] = df_meta_data['doc_clean'].str[0]

In [None]:
df_meta_data = df_meta_data[~df_meta_data['first_page'].isnull()]

## Get reference

In [None]:
df_meta_data['reference'] = df_meta_data.first_page.apply(get_case)

## Get defendant

In [None]:
first_page = [text for text in df_meta_data.first_page.values[3] if len(text) > 100]

In [None]:
first_page

In [None]:
print(nlp(question="Who violeted?", context=".".join(first_page)))

In [None]:
print(nlp(question="Who is the defendant?", context=".".join(first_page), topk=3))

In [None]:
#df_meta_data[['reference', 'folder', 'filename']][df_meta_data.reference.isnull()]

## Type

In [None]:
df_meta_data['is_order'] = df_meta_data.filename.str.contains("order")

In [None]:
df_meta_data['is_complaint'] = df_meta_data.filename.str.contains("complaint")

In [None]:
df_meta_data['type'] = np.where(df_meta_data['is_order'], "Order CFTC", 
         np.where(df_meta_data['is_complaint'], 
                  "Complaint CFTC", None))