In [3]:
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

import pickle
import numpy as np
import json
from json  import JSONEncoder

import fasttext

from transformers import AutoTokenizer, AutoModelForSequenceClassification

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\panka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\panka\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\panka\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [21]:
#Reading the text file
with open('inputdata.txt', 'r', encoding='utf8') as file:
    txt = file.read()

In [22]:
txt

'[1] This is an appeal from a chambers judge’s order setting the appellant mother’s application\nfor an order of civil habeus corpus regarding her young child over to a one and one\xadhalf day\nspecial chambers hearing (“adjournment issue”). That hearing is now scheduled for September 13\nand 14, 2010. The chambers judge was of the view that given the number of parties and the\ncomplexity of the matter, it was unsuitable for a morning chambers application. The motion for\nhabeus corpus was brought on approximately two days’ notice.\n[2] The appellant asks this Court to grant the habeus corpus relief she seeks and order the\nchild returned to her.\n[3] In the interim, a Provincial Court Family Division trial is schedule to commence\ntomorrow, June 1, 2010. At that trial the issues will include who should be the child’s guardian.\nThe potential guardians include: the appellant, J.B., J.B.’s mother (K.C.), a maternal aunt of the\nchild, and the respondent Director. Seven days of trial tim

In [68]:
#Splitting it based on para
data = re.split(r'\[\S*\]', txt, flags=re.MULTILINE)

In [71]:
data.remove('')

In [72]:
data

[' This is an appeal from a chambers judge’s order setting the appellant mother’s application\nfor an order of civil habeus corpus regarding her young child over to a one and one\xadhalf day\nspecial chambers hearing (“adjournment issue”). That hearing is now scheduled for September 13\nand 14, 2010. The chambers judge was of the view that given the number of parties and the\ncomplexity of the matter, it was unsuitable for a morning chambers application. The motion for\nhabeus corpus was brought on approximately two days’ notice.\n',
 ' The appellant asks this Court to grant the habeus corpus relief she seeks and order the\nchild returned to her.\n',
 ' In the interim, a Provincial Court Family Division trial is schedule to commence\ntomorrow, June 1, 2010. At that trial the issues will include who should be the child’s guardian.\nThe potential guardians include: the appellant, J.B., J.B.’s mother (K.C.), a maternal aunt of the\nchild, and the respondent Director. Seven days of trial t

In [75]:
#convert to lowercase, strip and remove punctuations
def preprocess(text):
    text = str(text).lower() 
    text= str(text).strip()  
    text=re.compile('<.*?>').sub('', text) 
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)  
    text = re.sub('\s+', ' ', text)  
    text = re.sub(r'\[[0-9]*\]',' ',text) 
    text=re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text) 
    text = re.sub(r'\s+',' ',text) 
    return text

 
# STOPWORD REMOVAL
def stopword(string):
    a= [i for i in string.split() if i not in stopwords.words('english')]
    return ' '.join(a)
#LEMMATIZATION
# Initialize the lemmatizer
wl = WordNetLemmatizer()
 
# This is a helper function to map NTLK position tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
# Tokenize the sentence
def lemmatizer(string):
    word_pos_tags = nltk.pos_tag(word_tokenize(string)) # Get position tags
    a=[wl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for idx, tag in enumerate(word_pos_tags)] # Map the position tag and lemmatize the word/token
    return " ".join(a)

In [76]:
newdata = []

for d in data:
    s = lemmatizer(stopword(preprocess(d)))
    newdata.append(s)

In [77]:
newdata

['appeal chamber judge order set appellant mother application order civil habeus corpus regard young child one onehalf day special chamber hear adjournment issue hear scheduled september chamber judge view give number party complexity matter unsuitable morning chamber application motion habeus corpus bring approximately two day notice',
 'appellant ask court grant habeus corpus relief seek order child return',
 'interim provincial court family division trial schedule commence tomorrow june trial issue include child guardian potential guardian include appellant j b j b mother k c maternal aunt child respondent director seven day trial time set purpose',
 'find unnecessary deal adjournment issue habeus corpus order available record u j b mother care child pursuant unappealed consent interim guardianship order date october continue december role respondent director limit determine condition appellant access child',
 'habeus corpus can not grant respondent director respondent director chil

In [101]:
class NumpyArrayEncoder(JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return JSONEncoder.default(self, obj)

In [141]:
#BOW json file
#jsondata = []
fl = open('bow_core2.json', 'w', encoding='utf-8')

mdl = pickle.load(open('bow.pkl', 'rb'))
tfidf_vectorizer = pickle.load(open('vectorize.pkl', 'rb'))

for d in newdata:
    X_test = [d]
    X_vector=tfidf_vectorizer.transform(X_test) #converting X_test to vector
    y_predict = mdl.predict(X_vector)      #use the trained model on X_vector
    arr = mdl.predict_proba(X_vector)[:,:]
    y_prob = arr[0][np.argmax(arr)]
    
    x = {
        d: [
            {"y_predict": y_predict},
            {"y_prob": y_prob}
        ]
    }
    
    #jsondata.append(json.dumps(x, cls = NumpyArrayEncoder))
    json.dump(x, fl, cls = NumpyArrayEncoder)
    fl.write("\n")


In [128]:
data_f = []

for d in data:
    s = preprocess(d)
    data_f.append(s)

In [134]:
#Fasttext Json file
fl = open('fasttext_core2.json', 'w', encoding='utf-8')

fst_mdl = fasttext.load_model("model_fasttext.bin")

for d in data_f:
    output = fst_mdl.predict(d, k = 1)
    
    x = {
        d: [
            {"y_predict": output[0]},
            {"y_prob": output[1]}
        ]
    }
    
    #jsondata.append(json.dumps(x, cls = NumpyArrayEncoder))
    json.dump(x, fl, cls = NumpyArrayEncoder)
    fl.write("\n")




In [139]:
#Bert json file
fl = open('bert_core2.json', 'w', encoding='utf-8')

model_name = "nlpaueb/legal-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
label_names = ['dec_name', 'counsel', 'court', 'facts', 'judge', 'outcome']
loaded_model = AutoModelForSequenceClassification.from_pretrained("legal_bert")


for d in data_f:
    text = d
    inputs = tokenizer(text, padding="max_length", truncation=True, return_tensors="pt")

    outputs = loaded_model(**inputs)
    pred = np.argmax(outputs['logits'].detach().numpy(), axis=-1)

    x = {
        d: str(label_names[pred[0]])
    }
    
    #jsondata.append(json.dumps(x, cls = NumpyArrayEncoder))
    json.dump(x, fl, cls = NumpyArrayEncoder)
    fl.write("\n")


Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [1]:
#open and load saved json file
fj = open('2021onsc5263.json',)
dataj = json.load(fj)

In [13]:
newdataj = []
for d in dataj['elements']:
    print(d['Text'])

CITATION: 
Chong v. Donnelly, 2021 ONSC 5263 
COURT FILE NO.: FS-17-40295 
DATE: 20210728 
BETWEEN: 
Ting Mei CHONG 
– and – 
Timothy DONNELLY 
CHOZIK J. 
ONTARIO 
SUPERIOR COURT OF JUSTICE 
Applicant 
Respondent 
) ) ) ) ) ) ) ) ) ) ) ) 
) 
) 
Ting Mei Chong, Self-represented 
Timothy Donnelly, Self-represented 
HEARD: March 29, 30, 31, April 1 and 6, 2021 by video conference 
REASONS FOR JUDGMENT 
[1] 
The parties were married on September 20, 2009. They have two children, Adella (age 10), and Aria (age 9). They separated on July 16, 2016 but continued to reside together in the same home until late June 2018. 
[2] 
Since their separation in 2016, the parties settled many of the issues between them. They resolved the sale of the matrimonial home, property division and equalization, spousal support and travel with the children. They have abided by a “partial” parenting plan, set out in the interim Order made on consent by Coats J. on April 10, 2018.  
[3] 
At this trial, the parties so