## This notebook uses spaCy to extract legal entities from text and label each entity with its type

### The training data looks like this:
#### *6_3.json* contains JSON records in the following format:
----
{"doc_id": 1835, "text": "1 The applicants Sharman Networks Ltd ('Sharman Networks'), Sharman License Holdings Ltd ('Sharman License') and Ms Nicola Anne Hemming ('Ms Hemming') are each the subject of asset preservation orders made by Wilcox J on 22 March 2005 ('the Mareva orders').", "entities": [[17, 37, "Organization"], [38, 58, "Organization"], [60, 88, "Organization"], [89, 108, "Organization"], [113, 135, "person"], [136, 150, "person"], [221, 234, "Date"], [209, 217, "Judges"]], "username": "admin"}

----

#### *legal_train.txt* has annotations of the form:
----
-DOCSTART- -X- -X- O

1 Others 

The Others

applicants Others

Sharman B-Organization

Networks I-Organization

Ltd L-Organization

Sharman B-Organization

Networks L-Organization

Sharman B-Organization

License I-Organization

Holdings I-Organization

Ltd L-Organization

Sharman B-Organization

License L-Organization

and Others

Ms B-person

Nicola I-person

Anne I-person

Hemming L-person

Ms B-person

Hemming L-person

are Others

each Others

the Others

subject Others

of Others

asset Others

preservation Others

orders Others

made Others

by Others

Wilcox Others

J Others

on Others

22 B-Date

March I-Date

2005 L-Date

the Others

Mareva Others

----

#### We will use *spaCy* to get a pretrained base model and then train it on the legal data

In [9]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
from tqdm import tqdm
import os

In [10]:
import spacy
import random
import json
from spacy.training.example import Example

#### Load the pre-trained spaCy model

In [11]:
# Download the small English pipeline package so that spacy.load("en_core_web_sm") can find it.
!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 10.1 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [12]:

# Load the small pretrained English pipeline.
# NOTE: while `model` is None in the configuration cell, `nlp` is rebound to a
# blank English pipeline further down, so this instance is not the one trained.
nlp = spacy.load(name="en_core_web_sm")

#### Load the training data from the JSON file

In [13]:
# Reference sample only (kept commented out): one record written as a
# (text, {"entities": [[start, end, label], ...]}) pair, which is the layout the
# later loops unpack from TRAIN_DATA / TEST_DATA.
# TRAINING_DATA = [
# ("1 The applicants Sharman Networks Ltd ('Sharman Networks'), Sharman License Holdings Ltd ('Sharman License') and Ms Nicola Anne Hemming ('Ms Hemming') are each the subject of asset preservation orders made by Wilcox J on 22 March 2005 ('the Mareva orders').", {"entities": [[17, 37, "Organization"], [38, 58, "Organization"], [60, 88, "Organization"], [89, 108, "Organization"], [113, 135, "person"], [136, 150, "person"], [221, 234, "Date"], [209, 217, "Judges"]]})

# ]

In [14]:
# Read the training records. JSON text is UTF-8, so decode it explicitly instead of
# relying on the platform's default encoding (which is not UTF-8 everywhere).
# NOTE(review): later cells unpack each record as (text, {"entities": [...]}) —
# confirm the file is stored in that layout.
with open("datasets/6_3.json", "r", encoding="utf-8") as file:
    TRAIN_DATA = json.load(file)

In [None]:
# Read the held-out records. JSON text is UTF-8, so decode it explicitly instead of
# relying on the platform's default encoding (which is not UTF-8 everywhere).
# NOTE(review): the test cell unpacks each record as a (text, annotations) pair —
# confirm the file is stored in that layout.
with open("datasets/06_9.json", "r", encoding="utf-8") as file:
    TEST_DATA = json.load(file)

In [15]:
# Training configuration.
model = None            # pipeline name/path to load; None means a blank English pipeline is created
output_dir = 'output'   # directory the trained pipeline is written to
n_iter = 100            # number of passes over the training data

# Make sure the output directory is present before anything is written to it.
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

##### Load an existing pipeline, or create a blank English one

In [17]:


# Choose the pipeline to train: start from scratch when no model name/path was
# configured, otherwise load the configured pipeline.
if model is None:
    nlp = spacy.blank('en')
    print("Created blank 'en' model")
else:
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)



#### Set up the pipeline: make sure it has an NER component

In [None]:
# Fetch the NER component, creating it when the pipeline does not have one yet.
# In spaCy v3, add_pipe takes the registered factory name (a string) and returns
# the new component; passing a component object built with create_pipe is rejected.
if 'ner' in nlp.pipe_names:
    ner = nlp.get_pipe('ner')
else:
    ner = nlp.add_pipe('ner', last=True)

#### Register the entity labels from the training data with the NER component

In [None]:
# Register every entity label that occurs in the training annotations.
# Each record is unpacked as (text, annotations); only the annotations are needed here.
for _text, annotation in TRAIN_DATA:
    for span in annotation.get('entities'):
        ner.add_label(span[2])  # third element of a span is its label

# Names of all pipeline components other than NER.
other_pipes = [name for name in nlp.pipe_names if name != 'ner']

#### Training loop

In [None]:
# Train only the NER component: every other pipe is disabled inside this block.
# (select_pipes / initialize replace the deprecated disable_pipes / begin_training.)
with nlp.select_pipes(disable=other_pipes):
    # A blank pipeline needs freshly initialised weights; a loaded pipeline
    # continues from the weights it already has.
    optimizer = nlp.initialize() if model is None else nlp.resume_training()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)  # new record order on every pass (shuffles in place)
        losses = {}
        for text, annotations in tqdm(TRAIN_DATA):
            # spaCy v3's update() consumes Example objects rather than parallel
            # lists of texts and annotation dicts. Only the entity spans are
            # passed on, since NER is the only component being trained.
            example = Example.from_dict(
                nlp.make_doc(text),
                {"entities": annotations.get('entities', [])})
            nlp.update(
                [example],
                drop=0.5,
                sgd=optimizer,
                losses=losses)
        print(losses)

#### Save the trained model to a directory

In [None]:

# Write the trained pipeline to the configured output directory (skipped when
# output_dir is None).
if output_dir is not None:
    output_dir = Path(output_dir)
    # parents=True allows a nested output path; exist_ok=True makes re-running
    # the cell safe and avoids the separate exists() check.
    output_dir.mkdir(parents=True, exist_ok=True)
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

#### Test the trained model

In [None]:
# Run the trained pipeline over each held-out text and show the entities it finds.
for sample_text, _annotations in TEST_DATA:
    parsed = nlp(sample_text)
    found_entities = [(span.text, span.label_) for span in parsed.ents]
    print('Entities', found_entities)

#### Load the saved model

# Reload the pipeline from the directory it was saved to. The previous argument,
# 'model_name', was a placeholder that nothing in this notebook creates.
# NOTE: this rebinds `model` (the configured name/path, or None) to the loaded pipeline.
model = spacy.load(output_dir)