In [1]:
!unzip "/content/datasets_dl.zip"

Archive:  /content/datasets_dl.zip
   creating: datasets/
  inflating: datasets/06_16 .json    
  inflating: datasets/06_16.json     
  inflating: datasets/06_9.json      
  inflating: datasets/6_3.json       
  inflating: datasets/legal_test.txt  
  inflating: datasets/legal_train.txt  
  inflating: datasets/legal_valid.txt  


In [1]:
import json
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

# Getting the data ready in spaCy format.
#
# The annotation file is read as JSON Lines: every non-blank line is parsed as
# one JSON object, and its "text" and "entities" fields are kept as a dict in
# `train_data`.
train_data = []
with open('/content/datasets/6_3.json', 'r', encoding='utf-8') as f:
    for line in f:
        # A blank line is not a record; json.loads would raise on it, so skip
        # it instead of aborting the whole load.
        if not line.strip():
            continue

        example = json.loads(line)

        # Keep only the two fields used downstream (a KeyError here means the
        # record is missing "text" or "entities").
        train_data.append({'text': example['text'], 'entities': example['entities']})


from spacy.util import filter_spans

# Turn every example in `train_data` into a spaCy Doc carrying its entity
# spans, collect the Docs in a DocBin and write it to "train.spacy".
nlp = spacy.blank("en")
doc_bin = DocBin()

skipped_entities = 0   # annotations for which char_span returned None
dropped_overlaps = 0   # spans removed by filter_spans

for training_example in tqdm(train_data):
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in labels:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # Report exactly which annotation is lost so it can be corrected
            # in the source data; tqdm.write keeps the progress bar intact.
            skipped_entities += 1
            tqdm.write(
                f"Skipping entity {label!r} at [{start}:{end}] "
                f"({text[start:end]!r}): could not be aligned to tokens"
            )
        else:
            ents.append(span)
    # doc.ents only accepts non-overlapping spans, so overlaps are filtered
    # out first; count how many annotations that removes.
    filtered_ents = filter_spans(ents)
    dropped_overlaps += len(ents) - len(filtered_ents)
    doc.ents = filtered_ents
    doc_bin.add(doc)

doc_bin.to_disk("train.spacy")
print(
    f"Wrote {len(train_data)} docs to train.spacy "
    f"({skipped_entities} entities skipped, {dropped_overlaps} overlapping spans dropped)"
)




100%|██████████| 164/164 [00:00<00:00, 1940.59it/s]


In [13]:
# Expand /content/base_config.cfg into a complete training config, saved as
# /content/config.cfg.
# NOTE(review): base_config.cfg is not created by any cell visible here —
# presumably it is uploaded by hand before this cell runs; confirm.
!python -m spacy init fill-config /content/base_config.cfg /content/config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/content/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [14]:
!python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./train.spacy

[38;5;4mℹ No output directory provided[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     36.66    0.00    0.00    0.00    0.00
  2     200        965.37   2141.29   57.61   62.94   53.10    0.58
  4     400       6065.08   1044.39   81.55   80.81   82.30    0.82
  8     600       1514.61    838.68   88.61   88.71   88.51    0.89
 12     800        635.78    623.08   95.31   94.77   95.86    0.95
 18    1000       1132.32    440.16   98.16   98.38   97.93    0.98
 25    1200        177.50    264.19   98.16   98.16   98.16    0.98
 33    1400        117.41    161.33   98.39   98.61   98.16    0.98
 44    1600        107.49    150.88   98.96   99.31   98.62    0.99
 57    1800        129.01    165.84   99.20   99.08 

In [15]:
!python -m spacy train /content/config.cfg --output ./models/output

[38;5;4mℹ Saving to output directory: models/output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     36.66    0.00    0.00    0.00    0.00
  2     200        965.37   2141.29   57.61   62.94   53.10    0.58
  4     400       6065.08   1044.39   81.55   80.81   82.30    0.82
  8     600       1514.61    838.68   88.61   88.71   88.51    0.89
 12     800        635.78    623.08   95.31   94.77   95.86    0.95
 18    1000       1132.32    440.16   98.16   98.38   97.93    0.98
 25    1200        177.50    264.19   98.16   98.16   98.16    0.98
 33    1400        117.41    161.33   98.39   98.61   98.16    0.98
 44    1600        107.49    150.88   98.96   99.31   98.62    0.99
 57    1800        129.01    165.84   9

In [21]:
import spacy
from spacy import displacy

# Highlight colour per entity label, wrapped in the options dict displacy expects.
colors = {"Organization": "#F67DE3", "person": "#7DF6D9", "Courts": "#a6e22d"}
options = {"colors": colors}

# Load the pipeline stored under models/output/model-best.
nlp_ner = spacy.load("/content/models/output/model-best")

# Read the whole text file into one string and run it through the pipeline
# in a single call.
# NOTE(review): very long inputs may exceed the pipeline's maximum text
# length — confirm the file size is within limits.
with open('/content/datasets/legal_train.txt', encoding='utf-8') as handle:
    text = handle.read()
doc = nlp_ner(text)

# Render the document inline in the notebook with its entities highlighted.
displacy.render(doc, options=options, style="ent", jupyter=True)
