In [49]:
# Import the libraries
import json
import random
import logging
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from sklearn.metrics import accuracy_score

In [50]:
# JSON file created from annotated artciles is converted to a format understandable by spaCy.

def convert_data_to_spacy(data_JSON_FilePath):
   #  try:
        training_data = []
        lines=[]
        with open(data_JSON_FilePath, 'r') as f:
            lines = f.readlines()

        for line in lines:
            data = json.loads(line)
            text = data['content']
            entities = []
            for annotation in data['annotation']:
                #only a single point in text annotation.
                point = annotation['points'][0]
                labels = annotation['label']
                # handle both list of labels or a single label.
                if not isinstance(labels, list):
                    labels = [labels]

                for label in labels:
                    #data indices are both inclusive [start, end] but spacy is not [start, end)
                    entities.append((point['start'], point['end'] + 1 ,label))


            training_data.append((text, {"entities" : entities}))

        return training_data
    #except Exception as e:
         #logging.exception("Unable to process " + 'C:\\Users\\rohin\ROHINI\\Rohini\\spacy_data.json + "\n" + "error = " + str(e))
        #return None


In [51]:
# pass the training data to convert from json to spaCy format.

convert_data_to_spacy('/Users/shankargowrishankar/Desktop/Rohini/CRF/train_data.json')


[('A Review Of The Labour And Employment Year In Atlantic Canada : Part 1 The first part of this two - part review of the labour and employment year in Atlantic Canada covers developments in Nova Scotia and Newfoundland ',
  {'entities': [(46, 54, 'B-Location'),
    (188, 192, 'B-Location'),
    (204, 216, 'B-Location'),
    (55, 61, 'O-Location'),
    (193, 199, 'O-Location')]}),
 ('Introduction 2019 saw a number of changes to the legal landscape across Canada ',
  {'entities': [(13, 17, 'B-Date'), (72, 78, 'B-Location')]}),
 ('We experienced a year of legalisation of cannabis for recreational use "," the continuing impact of the #MeToo movement "," and a federal election ; all of which has impacted workplaces from coast to coast ',
  {'entities': []}),
 ('This article looks back at some of the developments in labour and employment law across in Nova Scotia and Newfoundland and upcoming changes for 2020 ',
  {'entities': [(91, 95, 'B-Location'),
    (107, 119, 'B-Location'),
    (96, 

In [52]:
# Divide the data in to small batches for training
def get_batches(train_data, model_type):
    max_batch_sizes = {"tagger": 32, "parser": 16, "ner": 16, "textcat": 64}
    max_batch_size = max_batch_sizes[model_type]
    if len(train_data) < 1000:
        max_batch_size /= 2
    if len(train_data) < 500:
        max_batch_size /= 2
    batch_size = compounding(1, max_batch_size, 1.001)
    batches = minibatch(train_data, size=batch_size)
    return batches


In [57]:
# Training and evaluating the model

import spacy
from spacy.util import minibatch, compounding, decaying

from pathlib import Path

def train_spacy(retrain):
    model_dir='/Users/shankargowrishankar/Desktop/Rohini/Report/latest_result'
# Model will be trained if retrain is given as True. The model can be saved and reused for testing. To use the existing
# trained model, pass this as False. 
    
    if(retrain == True):
    
        TRAIN_DATA = convert_data_to_spacy("/Users/shankargowrishankar/Desktop/Rohini/Report/data/train_data.json")
        nlp = spacy.blank('en')  # create blank Language class
        # create the built-in pipeline components and add them to the pipeline
        # nlp.create_pipe works for built-ins that are registered with spaCy
        if 'ner' not in nlp.pipe_names:
            ner = nlp.create_pipe('ner')
            nlp.add_pipe(ner, last=True)


        # add labels
        for _, annotations in TRAIN_DATA:
             for ent in annotations.get('entities'):
                ner.add_label(ent[2])

        optimizer = nlp.begin_training()
        # Get names of other pipes to disable them during training to train only NER
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
        with nlp.disable_pipes(*other_pipes):  # only train NER
            dropouts = decaying(.9,.1, 1e-4)
            for epoch in range(15):
                random.shuffle(TRAIN_DATA)
                dropouts = decaying(.9,.1, 1e-4)

                losses = {}
                batches = get_batches(TRAIN_DATA, 'ner')
                for batch in batches:
                    size=next(dropouts)
                    texts, annotations = zip(*batch)
                    nlp.update(texts, annotations, sgd=optimizer, drop=size,
                               losses=losses)
                    
                print('Losses', losses)


        # Save model 
        output_dir=("C:\\Users\\rohin\\Rohini_Mondaq_Dessertation\\spacy\\latest_result")
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = "Spacy Model for NER"
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
            
# Test and evaluation of the model

    print("Loading from", model_dir)
    nlp = spacy.load(model_dir)
    examples = convert_data_to_spacy("C:\\Users\\rohin\\Rohini_Mondaq_Dessertation\\spacy\\data\\test_data.json")
    tp=0
    tr=0
    tf=0
    ta=0
    c=0 
    result={}


    for text,annot in examples:
        
        f=open("Legal_article"+str(c)+".txt","w",encoding='utf-8')

        doc_to_test=nlp(text)
        d={}
        for ent in doc_to_test.ents:

            d[ent.label_]=[]
        for ent in doc_to_test.ents:
            d[ent.label_].append(ent.text)

        for i in set(d.keys()):
            f.write("\n\n")
            f.write(i +":"+"\n")
            for j in set(d[i]):
                f.write(j.replace('\n','')+"\n")

        for ent in doc_to_test.ents:
            result[ent.label_]=[0,0,0,0,0,0]

        for ent in doc_to_test.ents:
            doc_gold_text= nlp.make_doc(text)
            gold = GoldParse(doc_gold_text, entities=annot.get("entities"))
            y_true = [ent.label_ if ent.label_ in x else 'Not '+ent.label_ for x in gold.ner]
            y_pred = [x.ent_type_ if x.ent_type_ ==ent.label_ else 'Not '+ent.label_ for x in doc_to_test]  
            if(result[ent.label_][0]==0):

                (p,r,f,s)= precision_recall_fscore_support(y_true,y_pred,average='weighted')
                a=accuracy_score(y_true,y_pred)
                result[ent.label_][0]=1
                result[ent.label_][1]+=p
                result[ent.label_][2]+=r
                result[ent.label_][3]+=f
                result[ent.label_][4]+=a
                result[ent.label_][5]+=1
        c+=1
    print(result)
    for i in result:
        print("\n For Entity "+i+"\n")
        print("Accuracy : "+str((result[i][4]/result[i][5])*100)+"%")
        print("Precision : "+str(result[i][1]/result[i][5]))
        print("Recall : "+str(result[i][2]/result[i][5]))
        print("F-score : "+str(result[i][3]/result[i][5]))
        

train_spacy(True)

Losses {'ner': 31366.073115904615}
Losses {'ner': 26957.77985793195}
Losses {'ner': 24347.59257559074}
Losses {'ner': 22772.641735871417}
Losses {'ner': 21434.174839220752}
Losses {'ner': 20717.946966626463}
Losses {'ner': 19734.5409174344}
Losses {'ner': 18947.89682487511}
Losses {'ner': 18541.109052032032}
Losses {'ner': 18102.980566241516}
Losses {'ner': 17568.08491722641}
Losses {'ner': 16898.90418359255}
Losses {'ner': 16737.22887945064}
Losses {'ner': 16528.89685410268}
Losses {'ner': 16329.373104419461}
Saved model to C:\Users\rohin\Rohini_Mondaq_Dessertation\spacy\latest_result
Loading from C:\Users\rohin\Rohini_Mondaq_Dessertation\spacy\latest_result
UK
UK
January
2021
EU
January
2021
EU
UK
12
February
2020
FATF
)
FATF
Cyprus
Cyprus
Cyprus
Cyprus
Cyprus
Cyprus
Cyprus
2021
Cyprus
Mauritius
February
2020
Mauritius
Regulatory
Alerts
Mauritius
Mauritius
October
2019
27
February
2020
Financial
Services
Commission
of
Mauritius
(
14
February
2020
Financial
Services
Commission
of
Maur

Workers
'
Compensation
Ikeja
Local
Government
&
ANR
Local
Government
Lagos
High
Court
May
Nigeria
Federal
Government
Federal
Ministry
of
Interior
29th
June
2017
Nigeria
Brittania
Nig
.
Ltd
Imunze
Federal
Republic
of
Nigeria
2006
IGP
MCA
MCA
Major
Nig
.
Ltd
Mobil
Prod
.
(
Nig
.
)
UnLtd
.
LASEPA
Supreme
Court
Anyaegbunam
Supreme
Court
ANYAEBUNAM
MCA
Local
Government
Federal
Marriage
Registry
Department
Of
Justice
For
Northern
Ireland
Department
of
Justice
Department
of
Justice
Justice
Policy
Division
Department
of
Finance
Northern
Ireland
Northern
Ireland
England
&
Wales
Northern
Ireland
Northern
Ireland
Northern
Ireland
England
&
Wales
Northern
Ireland
2017
England
&
Wales
2001
Department
of
Justice
Department
of
Justice
Northern
Ireland
Northern
Ireland
NHS
Ireland
January
2017
January
2020
Department
of
Justice
Department
of
Justice
2019
Union
May
2018
Council
of
Europe
Nigerian
National
Information
Technology
Agency
(
NITDA
)
25th
January
2019
Nigerian
Nigeria
Nigeria
Nigerian
ICT
Ni

19
March
2015
French
National
Institute
of
Industrial
Paris
Court
of
Appeal
Paris
Court
of
Appeal
EU
Paris
Court
of
Appeal
Regulation
(
EEC
18
June
1992
6
May
2009
National
Institute
of
Intellectual
EU
EU
2004
EU
2006
EU
2007
Health
&
Safety
First
Health
Service
Protection
Surveillance
Centre
(
Department
of
Foreign
Affairs
Safety
Health
&
Welfare
2005
2005
2005
2005
2005
Department
of
Foreign
Affairs
2005
Department
of
Foreign
Affairs
2005
European
Commission
EU
2020
California
2017
2016
2015
NHTSA
(
National
Highway
Traffic
Safety
Administration
NHTSA
2018
2018
Arizona
Mesut
2018
Mesut
Sorumluluk
US
Government
European
Commission
19
February
2020
US
government
January
2020
US
Europe
19
February
EU
EU
EU
Europe
HLEG
April
2019
December
2018
April
2018
HLEG
United
States
US
government
early
January
2020
EU
EU
High
US
US
US
HLEG
EU
US
US
government
EU
HLEG
European
Data
Protection
Board
31
May
2020
Europe
Standard
2018
2019
and
Health
Administration
(
OSHA
)
Standard
2019
2018
2018
June

AIFC
AIFC
AIFC
AIFC
Bermuda
2004
Kazakhstan
AIFC
AIFC
AIFC
AIFC
Bermuda
British
Virgin
Islands
Cayman
Islands
AIFC
AIFC
Astana
Financial
Services
Authority
7
December
2015
2017
2017
2017
January
2020
February
2020
Turkish
Competition
Authority
(
TCA
January
2020
January
Google
2020
European
Commission
EU
Germany
France
UK
EU
January
2020
Turkish
Competition
Board
January
DHL
UPS
Allianz
Dubai
Aksigorta
Mastervolt
International
Mastervolt
2016
Ankara
Administrative
Court
2016
January
United
Technologies
Corporation
L.P.
L.P.
L.P.
Brand
Industrial
Holdings
INC
Acquisitions
LLC
CDR
Boston
Holdings
State
Investments
International
Limited
LGC
Science
Group
Holdings
Limited
Company
Limited
Wilhelmsen
Inland
Services
Holdings
Club
Asteria
Motor
Company
Ltd.
Hitachi
Ltd.
Marubeni
Corporation
2019
World
Health
U.S.
Centers
for
Disease
Control
and
Prevention
(
CDC
)
March
2
","
2020
United
States
United
States
Wuhan
China
WHO
Hubei
China
March
1
","
2020
China
WHO
Italy
Japan
CDC
United
States
O