In [8]:
import nltk
import spacy

In [9]:
# Load spaCy's small pretrained English pipeline for the baseline NER demo below.
nlp = spacy.load(name='en_core_web_sm')

In [10]:
# Run the currently loaded pipeline on one legal-style sentence and list the
# (text, label) pair of every entity it detects.
doc = nlp("The contract between Birla Corporation and Hindustan Inc. was signed on March 10, 2020.")
detected_entities = []
for span in doc.ents:
    detected_entities.append((span.text, span.label_))
print(detected_entities)

[('Birla Corporation', 'ORG'), ('Hindustan Inc.', 'ORG'), ('March 10, 2020', 'DATE')]


In [11]:
import spacy
from spacy import displacy
# Highlight the entities of `doc` inline in the notebook output.
displacy.render(
    doc,
    jupyter=True,
    style='ent',
)

In [12]:
from __future__ import unicode_literals, print_function

import random
from pathlib import Path
import spacy
from tqdm import tqdm
from spacy.training.example import Example
import pickle

In [13]:
# Hand-labelled training examples as (text, {"entities": [(start, end, label), ...]})
# pairs. start/end are character offsets into the text (end is exclusive).
TRAIN_DATA = [
    (
        "The contract between Birla Corporation and Hindustan Inc. was signed on March 10, 2020.",
        {
            "entities": [
                (21, 38, "CORPORATION"),
                (43, 57, "CORPORATION"),
                (72, 86, "DATE"),
            ]
        },
    ),
    (
        "The court in New York ruled in favor of Plaintiff in the case Doe v. Smith.",
        {
            "entities": [
                (13, 21, "LOCATION"),
                (40, 49, "PERSON"),
                (62, 74, "PERSON"),
            ]
        },
    ),
    (
        "The shareholders agreed to invest $5 million in Willhome Ventures.",
        {
            "entities": [
                (34, 44, "CURRENCY"),
                (48, 65, "CORPORATION"),
            ]
        },
    ),
    (
        "The contract stated that payment must be made within 30 days of signing.",
        {
            "entities": [
                (53, 60, "TIMEFRAME"),
            ]
        },
    ),
]


In [14]:
# Training configuration:
#   model      - name/path of an existing pipeline to start from, or None for a blank one
#   output_dir - directory the trained pipeline is written to
#   n_iter     - number of passes over TRAIN_DATA
model = None
output_dir = Path("ner/")
n_iter = 100

# Build the pipeline object: a blank English pipeline when no model is
# configured, otherwise the configured one.
if model is None:
    nlp = spacy.blank('en')
    print("Created blank 'en' model")
else:
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)

Created blank 'en' model


In [17]:
# Set up the pipeline: make sure it has an NER component and keep a handle to it.
#
# spaCy v3 (required by the `spacy.training` imports used in this notebook)
# expects `nlp.add_pipe` to receive the registered factory *name*; passing a
# component object built with `nlp.create_pipe` raises an error there.
# `add_pipe` returns the created component, so no separate create step is needed.
if 'ner' not in nlp.pipe_names:
    ner = nlp.add_pipe('ner', last=True)
else:
    ner = nlp.get_pipe('ner')

In [18]:
# Register every entity label used in the training annotations with the NER component.
for _, annotations in TRAIN_DATA:
    # Default to an empty list so an example without an "entities" key is skipped
    # instead of failing on iteration over None.
    for ent in annotations.get('entities', []):
        ner.add_label(ent[2])

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
# select_pipes(disable=...) is the spaCy v3 replacement for the deprecated
# disable_pipes(*names); the listed pipes are restored when the block exits.
with nlp.select_pipes(disable=other_pipes):  # only train NER
    # initialize() is the spaCy v3 name of the deprecated begin_training().
    # NOTE(review): if `model` ever names a pretrained pipeline, initialize()
    # re-initializes the NER weights; nlp.resume_training() may be what is
    # wanted in that case - confirm before training from a loaded model.
    optimizer = nlp.initialize()
    for itn in range(n_iter):
        # Shuffle a per-epoch copy so TRAIN_DATA keeps the order it was defined in.
        epoch_data = list(TRAIN_DATA)
        random.shuffle(epoch_data)
        losses = {}
        for text, annotations in tqdm(epoch_data):
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            # One example per update step, with 50% dropout.
            nlp.update(
                [example],
                drop=0.5,
                sgd=optimizer,
                losses=losses)
        # Accumulated NER loss over this pass through the data.
        print(losses)

100%|██████████| 4/4 [00:00<00:00, 51.29it/s]


{'ner': 55.60271668434143}


100%|██████████| 4/4 [00:00<00:00, 43.57it/s]


{'ner': 51.79668301343918}


100%|██████████| 4/4 [00:00<00:00, 61.74it/s]


{'ner': 43.51699072122574}


100%|██████████| 4/4 [00:00<00:00, 58.87it/s]


{'ner': 31.86742257885635}


100%|██████████| 4/4 [00:00<00:00, 48.61it/s]


{'ner': 22.036731922533363}


100%|██████████| 4/4 [00:00<00:00, 57.87it/s]


{'ner': 17.467854714486748}


100%|██████████| 4/4 [00:00<00:00, 60.92it/s]


{'ner': 18.073163681192455}


100%|██████████| 4/4 [00:00<00:00, 59.61it/s]


{'ner': 15.86498955883144}


100%|██████████| 4/4 [00:00<00:00, 59.66it/s]


{'ner': 15.72886565546105}


100%|██████████| 4/4 [00:00<00:00, 63.85it/s]


{'ner': 14.843900780921103}


100%|██████████| 4/4 [00:00<00:00, 78.69it/s]


{'ner': 13.858238188693576}


100%|██████████| 4/4 [00:00<00:00, 74.51it/s]


{'ner': 12.998064691132186}


100%|██████████| 4/4 [00:00<00:00, 59.58it/s]


{'ner': 12.822726690636046}


100%|██████████| 4/4 [00:00<00:00, 59.63it/s]


{'ner': 13.592955290670034}


100%|██████████| 4/4 [00:00<00:00, 50.87it/s]


{'ner': 11.96424583113162}


100%|██████████| 4/4 [00:00<00:00, 79.54it/s]


{'ner': 15.51527062425157}


100%|██████████| 4/4 [00:00<00:00, 73.19it/s]


{'ner': 24.869469557389294}


100%|██████████| 4/4 [00:00<00:00, 75.62it/s]


{'ner': 13.61923466144291}


100%|██████████| 4/4 [00:00<00:00, 59.56it/s]


{'ner': 10.896845372049086}


100%|██████████| 4/4 [00:00<00:00, 63.20it/s]


{'ner': 14.556281975564152}


100%|██████████| 4/4 [00:00<00:00, 80.74it/s]


{'ner': 11.326280276323246}


100%|██████████| 4/4 [00:00<00:00, 59.74it/s]


{'ner': 12.141887321929232}


100%|██████████| 4/4 [00:00<00:00, 61.65it/s]


{'ner': 11.913068894213236}


100%|██████████| 4/4 [00:00<00:00, 73.67it/s]


{'ner': 9.980901516981566}


100%|██████████| 4/4 [00:00<00:00, 71.71it/s]


{'ner': 8.457297132736894}


100%|██████████| 4/4 [00:00<00:00, 63.32it/s]


{'ner': 9.173804965173758}


100%|██████████| 4/4 [00:00<00:00, 80.77it/s]


{'ner': 6.382178040280612}


100%|██████████| 4/4 [00:00<00:00, 58.64it/s]


{'ner': 10.296279273845416}


100%|██████████| 4/4 [00:00<00:00, 60.77it/s]


{'ner': 11.114397029632643}


100%|██████████| 4/4 [00:00<00:00, 49.49it/s]


{'ner': 11.615675930773524}


100%|██████████| 4/4 [00:00<00:00, 76.08it/s]


{'ner': 32.08902486524223}


100%|██████████| 4/4 [00:00<00:00, 60.69it/s]


{'ner': 11.144023739038552}


100%|██████████| 4/4 [00:00<00:00, 61.68it/s]


{'ner': 20.35589279170919}


100%|██████████| 4/4 [00:00<00:00, 79.89it/s]


{'ner': 12.454135850487123}


100%|██████████| 4/4 [00:00<00:00, 59.11it/s]


{'ner': 13.158613453199166}


100%|██████████| 4/4 [00:00<00:00, 79.96it/s]


{'ner': 10.218105586558003}


100%|██████████| 4/4 [00:00<00:00, 61.50it/s]


{'ner': 11.502492220012506}


100%|██████████| 4/4 [00:00<00:00, 59.17it/s]


{'ner': 12.359480361216539}


100%|██████████| 4/4 [00:00<00:00, 80.31it/s]


{'ner': 14.43813214246892}


100%|██████████| 4/4 [00:00<00:00, 58.03it/s]


{'ner': 10.153263776133702}


100%|██████████| 4/4 [00:00<00:00, 61.65it/s]


{'ner': 11.138862212431937}


100%|██████████| 4/4 [00:00<00:00, 77.20it/s]


{'ner': 10.918232940531142}


100%|██████████| 4/4 [00:00<00:00, 62.07it/s]


{'ner': 11.512688264233867}


100%|██████████| 4/4 [00:00<00:00, 62.13it/s]


{'ner': 10.51411152806375}


100%|██████████| 4/4 [00:00<00:00, 60.42it/s]


{'ner': 7.9719702995772295}


100%|██████████| 4/4 [00:00<00:00, 60.51it/s]


{'ner': 6.351084019552922}


100%|██████████| 4/4 [00:00<00:00, 58.76it/s]


{'ner': 8.012312240567084}


100%|██████████| 4/4 [00:00<00:00, 80.63it/s]


{'ner': 10.418053254115657}


100%|██████████| 4/4 [00:00<00:00, 67.67it/s]


{'ner': 9.68307561470255}


100%|██████████| 4/4 [00:00<00:00, 67.31it/s]


{'ner': 6.39069854067298}


100%|██████████| 4/4 [00:00<00:00, 61.67it/s]


{'ner': 9.296586084244527}


100%|██████████| 4/4 [00:00<00:00, 80.00it/s]


{'ner': 3.7954329753124028}


100%|██████████| 4/4 [00:00<00:00, 61.05it/s]


{'ner': 9.128368547499296}


100%|██████████| 4/4 [00:00<00:00, 61.74it/s]


{'ner': 5.554209647590019}


100%|██████████| 4/4 [00:00<00:00, 79.15it/s]


{'ner': 10.547050749105356}


100%|██████████| 4/4 [00:00<00:00, 57.95it/s]


{'ner': 8.05646898113443}


100%|██████████| 4/4 [00:00<00:00, 81.07it/s]


{'ner': 9.113820622969563}


100%|██████████| 4/4 [00:00<00:00, 60.08it/s]


{'ner': 7.101308711463672}


100%|██████████| 4/4 [00:00<00:00, 59.57it/s]


{'ner': 3.852690299077685}


100%|██████████| 4/4 [00:00<00:00, 61.05it/s]


{'ner': 5.100588498541426}


100%|██████████| 4/4 [00:00<00:00, 59.74it/s]


{'ner': 4.076642858141266}


100%|██████████| 4/4 [00:00<00:00, 58.75it/s]


{'ner': 5.07575478091689}


100%|██████████| 4/4 [00:00<00:00, 80.07it/s]


{'ner': 3.771728207042269}


100%|██████████| 4/4 [00:00<00:00, 76.93it/s]


{'ner': 11.273273790579127}


100%|██████████| 4/4 [00:00<00:00, 59.62it/s]


{'ner': 3.953771254258344}


100%|██████████| 4/4 [00:00<00:00, 62.34it/s]


{'ner': 8.893564090234834}


100%|██████████| 4/4 [00:00<00:00, 75.31it/s]


{'ner': 3.46552583769006}


100%|██████████| 4/4 [00:00<00:00, 60.56it/s]


{'ner': 1.7926510825173372}


100%|██████████| 4/4 [00:00<00:00, 60.18it/s]


{'ner': 4.2162974049981505}


100%|██████████| 4/4 [00:00<00:00, 79.89it/s]


{'ner': 8.605124425234697}


100%|██████████| 4/4 [00:00<00:00, 74.86it/s]


{'ner': 4.349516101693631}


100%|██████████| 4/4 [00:00<00:00, 60.67it/s]


{'ner': 4.343539192299873}


100%|██████████| 4/4 [00:00<00:00, 47.64it/s]


{'ner': 1.4484959177656263}


100%|██████████| 4/4 [00:00<00:00, 60.92it/s]


{'ner': 3.1517908882978}


100%|██████████| 4/4 [00:00<00:00, 79.97it/s]


{'ner': 3.4601594157071567}


100%|██████████| 4/4 [00:00<00:00, 60.06it/s]


{'ner': 4.665686793841354}


100%|██████████| 4/4 [00:00<00:00, 62.12it/s]


{'ner': 2.08269862961142}


100%|██████████| 4/4 [00:00<00:00, 80.58it/s]


{'ner': 0.5854988863015054}


100%|██████████| 4/4 [00:00<00:00, 60.72it/s]


{'ner': 1.7846320869860672}


100%|██████████| 4/4 [00:00<00:00, 59.99it/s]


{'ner': 2.405072631188743}


100%|██████████| 4/4 [00:00<00:00, 59.16it/s]


{'ner': 4.182791737474185}


100%|██████████| 4/4 [00:00<00:00, 80.10it/s]


{'ner': 0.9611453897110319}


100%|██████████| 4/4 [00:00<00:00, 60.90it/s]


{'ner': 1.5582764200671515}


100%|██████████| 4/4 [00:00<00:00, 59.66it/s]


{'ner': 3.2596349165800422}


100%|██████████| 4/4 [00:00<00:00, 80.58it/s]


{'ner': 0.4412684103089785}


100%|██████████| 4/4 [00:00<00:00, 59.03it/s]


{'ner': 1.0294168892266893}


100%|██████████| 4/4 [00:00<00:00, 48.42it/s]


{'ner': 1.2115868068177433}


100%|██████████| 4/4 [00:00<00:00, 59.27it/s]


{'ner': 3.8459687192025034}


100%|██████████| 4/4 [00:00<00:00, 61.20it/s]


{'ner': 0.4742712711611058}


100%|██████████| 4/4 [00:00<00:00, 59.70it/s]


{'ner': 0.32066663939059076}


100%|██████████| 4/4 [00:00<00:00, 80.79it/s]


{'ner': 1.2942937609939378}


100%|██████████| 4/4 [00:00<00:00, 75.33it/s]


{'ner': 2.4446228816282165}


100%|██████████| 4/4 [00:00<00:00, 60.42it/s]


{'ner': 1.161934163457889}


100%|██████████| 4/4 [00:00<00:00, 77.59it/s]


{'ner': 1.2437320220798571}


100%|██████████| 4/4 [00:00<00:00, 59.55it/s]


{'ner': 0.07974813664279055}


100%|██████████| 4/4 [00:00<00:00, 62.89it/s]


{'ner': 0.942680739251011}


100%|██████████| 4/4 [00:00<00:00, 76.68it/s]


{'ner': 0.007296600299468586}


100%|██████████| 4/4 [00:00<00:00, 59.29it/s]


{'ner': 0.11718145636010269}


100%|██████████| 4/4 [00:00<00:00, 58.99it/s]


{'ner': 1.2782625435259851}


100%|██████████| 4/4 [00:00<00:00, 61.17it/s]

{'ner': 0.007724255048116881}





In [19]:
# Persist the trained pipeline: as a spaCy model directory (when output_dir is
# set) and, unconditionally, as a pickle file.
if output_dir is not None:
    output_dir = Path(output_dir)
    # parents/exist_ok make this safe for nested paths and for re-running the cell.
    output_dir.mkdir(parents=True, exist_ok=True)
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

# Use a context manager so the file handle is flushed and closed deterministically.
# NOTE: pickle files must only ever be loaded from trusted sources.
with open("legal.pkl", "wb") as pickle_file:
    pickle.dump(nlp, pickle_file)

Saved model to ner


In [20]:
# Run the current `nlp` pipeline on a short sentence and print each detected
# entity next to its label.
doc = nlp("The Birla Corporation is legally entitled")
for entity in doc.ents:
    print(entity.text, entity.label_, sep='  ------>   ')

Birla Corporation  ------>   CORPORATION


In [62]:
import spacy
from spacy.training import Example
from spacy.scorer import Scorer

# Reload the fine-tuned pipeline from the "ner" directory on disk.
nlp = spacy.load("ner")

# Scorer instance; it is created here but not referenced again in this cell.
scorer = Scorer()

# Held-out evaluation sentences, in the same
# (text, {"entities": [(start, end, label), ...]}) format as the training data.
EVAL_DATA = [
    (
        "ABC Corp. filed a lawsuit against XYZ Ltd. on December 5, 2021.",
        {"entities": [(0, 9, "CORPORATION"), (34, 42, "CORPORATION"), (46, 62, "DATE")]},
    ),
    (
        "The court in Los Angeles issued a verdict in favor of the Plaintiff, John Doe.",
        {"entities": [(13, 24, "LOCATION"), (58, 67, "PERSON"), (69, 77, "PERSON")]},
    ),
    (
        "Smith & Sons invested €3 million in a new project.",
        {"entities": [(0, 12, "CORPORATION"), (22, 32, "CURRENCY")]},
    ),
    (
        "The lease agreement requires payment within 90 days of signing.",
        {"entities": [(44, 51, "TIMEFRAME")]},
    ),
]

# Print the predicted and the gold (start_char, end_char, label) spans for each
# evaluation sentence so they can be compared by eye.
#
# Only the NER component is left enabled. select_pipes(enable=...) is the
# spaCy v3 replacement for the deprecated disable_pipes(*names) and restores
# the other components when the block exits.
with nlp.select_pipes(enable=['ner']):
    for text, annotations in EVAL_DATA:
        doc = nlp(text)

        # Spans predicted by the model
        predicted_entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
        print('Predicted Entities:', predicted_entities)

        # Gold-standard spans from the annotations
        gold_entities = annotations['entities']
        print('Real Entities:', gold_entities)



Predicted Entities: [(46, 62, 'DATE')]
Real Entities: [(0, 9, 'CORPORATION'), (34, 42, 'CORPORATION'), (46, 62, 'DATE')]
Predicted Entities: [(13, 24, 'CORPORATION'), (58, 67, 'PERSON'), (67, 73, 'CORPORATION'), (74, 78, 'PERSON')]
Real Entities: [(13, 24, 'LOCATION'), (58, 67, 'PERSON'), (69, 77, 'PERSON')]
Predicted Entities: [(8, 21, 'PERSON'), (22, 32, 'CURRENCY')]
Real Entities: [(0, 12, 'CORPORATION'), (22, 32, 'CURRENCY')]
Predicted Entities: [(44, 51, 'TIMEFRAME')]
Real Entities: [(44, 51, 'TIMEFRAME')]


In [67]:
import spacy

# Reload the fine-tuned pipeline from the "ner" directory on disk.
nlp = spacy.load("ner")

# Held-out evaluation sentences, in the same
# (text, {"entities": [(start, end, label), ...]}) format as the training data.
EVAL_DATA = [
    (
        "ABC Corp. filed a lawsuit against XYZ Ltd. on December 5, 2021.",
        {"entities": [(0, 9, "CORPORATION"), (34, 42, "CORPORATION"), (46, 62, "DATE")]},
    ),
    (
        "The court in Los Angeles issued a verdict in favor of the Plaintiff, John Doe.",
        {"entities": [(13, 24, "LOCATION"), (58, 67, "PERSON"), (69, 77, "PERSON")]},
    ),
    (
        "Smith & Sons invested €3 million in a new project.",
        {"entities": [(0, 12, "CORPORATION"), (22, 32, "CURRENCY")]},
    ),
    (
        "The lease agreement requires payment within 90 days of signing.",
        {"entities": [(44, 51, "TIMEFRAME")]},
    ),
]

# Per-label tallies of true positives, false positives and false negatives,
# all starting at zero; each label gets its own independent counter dict.
eval_counts = {
    label: dict.fromkeys(("tp", "fp", "fn"), 0)
    for label in ("CORPORATION", "DATE", "LOCATION", "PERSON", "CURRENCY", "TIMEFRAME")
}

# Tally exact-match results per label: a prediction counts as correct only if
# its (start_char, end_char, label) triple is identical to a gold triple.
#
# Only the NER component is left enabled. select_pipes(enable=...) is the
# spaCy v3 replacement for the deprecated disable_pipes(*names).
with nlp.select_pipes(enable=['ner']):
    for text, annotations in EVAL_DATA:
        doc = nlp(text)

        # Predicted and gold spans as sets of (start, end, label) triples
        predicted_entities = {(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents}
        gold_entities = {(start, end, label) for start, end, label in annotations['entities']}

        # Set algebra gives the three outcome groups directly.
        outcomes = (
            ("tp", predicted_entities & gold_entities),   # predicted and correct
            ("fp", predicted_entities - gold_entities),   # predicted but not in gold
            ("fn", gold_entities - predicted_entities),   # in gold but missed
        )
        for outcome, triples in outcomes:
            for _start, _end, label in triples:
                # setdefault keeps the tally from raising KeyError when a label
                # appears that was not pre-registered in eval_counts.
                label_counts = eval_counts.setdefault(label, {"tp": 0, "fp": 0, "fn": 0})
                label_counts[outcome] += 1

# Convert the raw tp/fp/fn tallies into precision, recall and F1 per label.
# Any metric whose denominator would be zero is reported as 0.0.
results = {}
for label in eval_counts:
    counts = eval_counts[label]
    tp, fp, fn = counts["tp"], counts["fp"], counts["fn"]

    if tp + fp > 0:
        precision = tp / (tp + fp)
    else:
        precision = 0.0

    if tp + fn > 0:
        recall = tp / (tp + fn)
    else:
        recall = 0.0

    if precision + recall > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0.0

    results[label] = dict(precision=precision, recall=recall, f1_score=f1_score)

# Report the metrics of every label, four decimals each, with a blank line
# after each label's block.
for label in results:
    scores = results[label]
    report_lines = [
        f"Entity: {label}",
        f"  Precision: {scores['precision']:.4f}",
        f"  Recall: {scores['recall']:.4f}",
        f"  F1 score: {scores['f1_score']:.4f}",
        "",
    ]
    print("\n".join(report_lines))


Entity: CORPORATION
  Precision: 0.0000
  Recall: 0.0000
  F1 score: 0.0000

Entity: DATE
  Precision: 1.0000
  Recall: 1.0000
  F1 score: 1.0000

Entity: LOCATION
  Precision: 0.0000
  Recall: 0.0000
  F1 score: 0.0000

Entity: PERSON
  Precision: 0.3333
  Recall: 0.5000
  F1 score: 0.4000

Entity: CURRENCY
  Precision: 1.0000
  Recall: 1.0000
  F1 score: 1.0000

Entity: TIMEFRAME
  Precision: 1.0000
  Recall: 1.0000
  F1 score: 1.0000

