**<center><font size = "5">Create custom NER model<center>**
***


In [None]:
!pip install spacy

In [None]:
!python3 -m spacy download en_core_web_lg

In [2]:
from __future__ import unicode_literals, print_function

import pandas as pd
import os, time

from pathlib import Path
from tqdm import tqdm 
from spacy.training import Example
from spacy import displacy
import spacy
import base64

### NER pipeline

In [3]:
nlpSpacy = spacy.load("en_core_web_lg")
 
print(nlpSpacy.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [4]:
# Load the CSV file into a DataFrame
df = pd.read_csv('known_exploited_vulnerabilities.csv')


# Create a random sample of 300 rows
sample_df = df.sample(n=300, random_state=1)

# Display the sample
print(sample_df)

                 cveID vendorProject   
919      CVE-2023-2136        Google  \
298     CVE-2021-35394       Realtek   
926      CVE-2014-0196         Linux   
323   CVE-2017-1000486      Primetek   
239     CVE-2019-16256   SIMalliance   
...                ...           ...   
322      CVE-2015-7450           IBM   
311     CVE-2021-22017        VMware   
831      CVE-2022-3236        Sophos   
1004    CVE-2023-41993         Apple   
1125    CVE-2022-24816         OSGeo   

                                                product   
919                                       Chromium Skia  \
298               Jungle Software Development Kit (SDK)   
926                                              Kernel   
323                              Primefaces Application   
239                                     Toolbox Browser   
...                                                 ...   
322   WebSphere Application Server and Server Hyperv...   
311                                      vCente

In [5]:
df.head()

Unnamed: 0,cveID,vendorProject,product,vulnerabilityName,dateAdded,shortDescription,requiredAction,dueDate,knownRansomwareCampaignUse,notes,cwes
0,CVE-2021-27104,Accellion,FTA,Accellion FTA OS Command Injection Vulnerability,2021-11-03,Accellion FTA contains an OS command injection...,Apply updates per vendor instructions.,2021-11-17,Known,https://nvd.nist.gov/vuln/detail/CVE-2021-27104,"CWE-20, CWE-78"
1,CVE-2021-27102,Accellion,FTA,Accellion FTA OS Command Injection Vulnerability,2021-11-03,Accellion FTA contains an OS command injection...,Apply updates per vendor instructions.,2021-11-17,Known,https://nvd.nist.gov/vuln/detail/CVE-2021-27102,"CWE-20, CWE-78"
2,CVE-2021-27101,Accellion,FTA,Accellion FTA SQL Injection Vulnerability,2021-11-03,Accellion FTA contains a SQL injection vulnera...,Apply updates per vendor instructions.,2021-11-17,Known,https://nvd.nist.gov/vuln/detail/CVE-2021-27101,"CWE-89, CWE-138"
3,CVE-2021-27103,Accellion,FTA,Accellion FTA Server-Side Request Forgery (SSR...,2021-11-03,Accellion FTA contains a server-side request f...,Apply updates per vendor instructions.,2021-11-17,Known,https://nvd.nist.gov/vuln/detail/CVE-2021-27103,CWE-918
4,CVE-2021-21017,Adobe,Acrobat and Reader,Adobe Acrobat and Reader Heap-based Buffer Ove...,2021-11-03,Acrobat Acrobat and Reader contain a heap-base...,Apply updates per vendor instructions.,2021-11-17,Unknown,https://nvd.nist.gov/vuln/detail/CVE-2021-21017,CWE-122


In [6]:
text1 = sample_df["vulnerabilityName"]
text1.head()


919    Google Chrome Skia Integer Overflow Vulnerability
298    Realtek Jungle SDK Remote Code Execution Vulne...
926            Linux Kernel Race Condition Vulnerability
323    Primetek Primefaces Remote Code Execution Vuln...
239    SIMalliance Toolbox Browser Command Injection ...
Name: vulnerabilityName, dtype: object

In [7]:
text2 = sample_df["shortDescription"]
text2.head()

919    Google Chromium Skia contains an integer overf...
298    RealTek Jungle SDK contains multiple memory co...
926    Linux Kernel contains a race condition vulnera...
323    Primetek Primefaces is vulnerable to a weak en...
239    SIMalliance Toolbox Browser contains an comman...
Name: shortDescription, dtype: object

In [10]:
# Save text1 to a file
with open('text1.txt', 'w') as file1:
    file1.write('\n'.join(text1))

# Save text2 to a file
with open('text2.txt', 'w') as file2:
    file2.write('\n'.join(text2))

In [13]:
file = open('text2.txt')
text = file.read()
file.close()
doc = nlpSpacy(text)
displacy.render(doc, style="ent", jupyter=True)

In [14]:
for entity in doc.ents:
  print(f"Name of the entity {entity.text} type is {entity.label_}")

Name of the entity Google Chromium Skia type is ORG
Name of the entity Google Chrome type is ORG
Name of the entity RealTek Jungle type is PERSON
Name of the entity Linux Kernel type is PRODUCT
Name of the entity Primetek Primefaces type is ORG
Name of the entity SIMalliance Toolbox Browser type is PERSON
Name of the entity Microsoft Windows Adobe Font Manager Library type is ORG
Name of the entity Adobe Type type is PRODUCT
Name of the entity 1 type is CARDINAL
Name of the entity Windows 10 type is PRODUCT
Name of the entity Windows 10 type is PRODUCT
Name of the entity AppContainer type is ORG
Name of the entity Google Chrome WebAudio type is PRODUCT
Name of the entity Microsoft Internet Explorer type is ORG
Name of the entity 2 type is CARDINAL
Name of the entity Microsoft Internet Explorer type is ORG
Name of the entity Apache CouchDB type is PRODUCT
Name of the entity Apple iOS type is ORG
Name of the entity TOCTOU type is ORG
Name of the entity Pointer Authentication type is FAC


## Annotate the text for NER 

Navigate to 

https://tecoholic.github.io/ner-annotator/

and make all the custom NER and annotate the entities on the text one by one

## Make a Spacy custom NER MODEL

In [15]:
import json
 
with open('annotations 1.json', 'r') as f:
    data = json.load(f)
    
print(data['annotations'][0])

['Google Chrome Skia Integer Overflow Vulnerability', {'entities': [[0, 6, 'ORG'], [7, 13, 'PRODUCT'], [14, 18, 'LIBRARY'], [19, 35, 'FIELD']]}]


In [16]:
data['classes']

['PRODUCT', 'FIELD', 'LIBRARY', 'ORG', 'FUNCTION']

## Create empty model

In [17]:
training_data = data.copy()
training_data['classes'] = data['classes']
training_data['annotations'] = []
for text, annotation in data['annotations']:
    try:
        if text!="" and len(annotation['entities']) > 0:
            temp_dict = {}
            temp_dict['text'] = text
            temp_dict['entities'] = []
            for ent in annotation['entities']:
                start = ent[0]
                end = ent[1]
                label = ent[2].upper()
                temp_dict['entities'].append((start, end, label))
        training_data['annotations'].append(temp_dict)
    except:
        pass
print(training_data)

{'classes': ['PRODUCT', 'FIELD', 'LIBRARY', 'ORG', 'FUNCTION'], 'annotations': [{'text': 'Google Chrome Skia Integer Overflow Vulnerability', 'entities': [(0, 6, 'ORG'), (7, 13, 'PRODUCT'), (14, 18, 'LIBRARY'), (19, 35, 'FIELD')]}, {'text': 'Realtek Jungle SDK Remote Code Execution Vulnerability', 'entities': [(0, 7, 'ORG'), (8, 18, 'PRODUCT'), (19, 40, 'FIELD')]}, {'text': 'Linux Kernel Race Condition Vulnerability', 'entities': [(0, 5, 'ORG'), (6, 27, 'FIELD')]}, {'text': 'Primetek Primefaces Remote Code Execution Vulnerability', 'entities': [(0, 8, 'ORG'), (9, 19, 'PRODUCT'), (20, 41, 'FIELD')]}, {'text': 'SIMalliance Toolbox Browser Command Injection Vulnerability', 'entities': [(0, 11, 'ORG'), (12, 27, 'PRODUCT'), (28, 45, 'FIELD')]}, {'text': 'Microsoft Windows Adobe Font Manager Library Remote Code Execution Vulnerability', 'entities': [(0, 9, 'ORG'), (10, 17, 'PRODUCT'), (18, 36, 'FUNCTION'), (45, 66, 'FIELD')]}, {'text': 'Google Chrome WebAudio Use-After-Free Vulnerability', '

### Configuration variables

In [18]:
modelSpacy = None
n_iter=100

### Load the model

In [19]:
if modelSpacy is not None:
    nlp = spacy.load(modelSpacy)  
    print("Loaded model '%s'" % modelSpacy)
else:
    nlp = spacy.blank('en')  
    print("Created blank 'en' model")

Created blank 'en' model


### Set up pipeline

In [20]:
if 'ner' not in nlp.pipe_names:
    ner = nlp.add_pipe('ner')
else:
    ner = nlp.get_pipe('ner')

### Train the NER model

In [21]:
#getting all the entities
for annotations in training_data["annotations"]:
    for ent in annotations['entities']:
        ner.add_label(ent[2])
        
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        losses = {}
        for text, annotations in tqdm(data["annotations"]):
            try:
                if text!="":
                    example = Example.from_dict(nlp.make_doc(text), annotations)
                    #Update the model
                    nlp.update(
                        [example],  
                        drop=0.5,  
                        sgd=optimizer,
                        losses=losses)
            except:
                pass
        print(losses)


100%|██████████| 300/300 [00:02<00:00, 129.48it/s]


{'ner': 1281.9669577487327}


100%|██████████| 300/300 [00:01<00:00, 150.50it/s]


{'ner': 840.0535356115821}


100%|██████████| 300/300 [00:02<00:00, 144.71it/s]


{'ner': 766.8872955690897}


100%|██████████| 300/300 [00:01<00:00, 151.61it/s]


{'ner': 621.9582066397048}


100%|██████████| 300/300 [00:02<00:00, 147.53it/s]


{'ner': 579.9971837363139}


100%|██████████| 300/300 [00:01<00:00, 150.47it/s]


{'ner': 522.984526827317}


100%|██████████| 300/300 [00:01<00:00, 150.89it/s]


{'ner': 489.3154441261283}


100%|██████████| 300/300 [00:01<00:00, 150.77it/s]


{'ner': 408.7254339386059}


100%|██████████| 300/300 [00:01<00:00, 150.95it/s]


{'ner': 382.1992948738301}


100%|██████████| 300/300 [00:02<00:00, 148.82it/s]


{'ner': 370.49139371129695}


100%|██████████| 300/300 [00:01<00:00, 151.16it/s]


{'ner': 356.29948546571694}


100%|██████████| 300/300 [00:01<00:00, 150.84it/s]


{'ner': 329.30499883781556}


100%|██████████| 300/300 [00:01<00:00, 151.11it/s]


{'ner': 330.8009111004652}


100%|██████████| 300/300 [00:02<00:00, 147.61it/s]


{'ner': 317.90230533663527}


100%|██████████| 300/300 [00:02<00:00, 142.30it/s]


{'ner': 315.89062723259366}


100%|██████████| 300/300 [00:02<00:00, 146.58it/s]


{'ner': 248.43661278295198}


100%|██████████| 300/300 [00:02<00:00, 144.16it/s]


{'ner': 281.3452590747915}


100%|██████████| 300/300 [00:02<00:00, 147.47it/s]


{'ner': 247.0803682023971}


100%|██████████| 300/300 [00:02<00:00, 148.80it/s]


{'ner': 201.76588304577834}


100%|██████████| 300/300 [00:01<00:00, 151.32it/s]


{'ner': 228.53862924777167}


100%|██████████| 300/300 [00:01<00:00, 151.67it/s]


{'ner': 154.91019140699026}


100%|██████████| 300/300 [00:01<00:00, 151.95it/s]


{'ner': 188.83103589074562}


100%|██████████| 300/300 [00:02<00:00, 147.44it/s]


{'ner': 222.3397565547952}


100%|██████████| 300/300 [00:02<00:00, 146.69it/s]


{'ner': 187.77009310556218}


100%|██████████| 300/300 [00:01<00:00, 151.52it/s]


{'ner': 168.47857202230705}


100%|██████████| 300/300 [00:02<00:00, 147.88it/s]


{'ner': 168.17331818481333}


100%|██████████| 300/300 [00:01<00:00, 151.59it/s]


{'ner': 163.0097961921721}


100%|██████████| 300/300 [00:01<00:00, 151.99it/s]


{'ner': 172.55120058600164}


100%|██████████| 300/300 [00:01<00:00, 152.38it/s]


{'ner': 154.3882574220234}


100%|██████████| 300/300 [00:01<00:00, 150.78it/s]


{'ner': 160.34974528754518}


100%|██████████| 300/300 [00:02<00:00, 144.08it/s]


{'ner': 154.11349355710573}


100%|██████████| 300/300 [00:01<00:00, 152.19it/s]


{'ner': 173.05557579307848}


100%|██████████| 300/300 [00:02<00:00, 147.66it/s]


{'ner': 182.07398041901888}


100%|██████████| 300/300 [00:02<00:00, 148.99it/s]


{'ner': 143.96682784036523}


100%|██████████| 300/300 [00:02<00:00, 147.65it/s]


{'ner': 140.3697529599119}


100%|██████████| 300/300 [00:02<00:00, 147.07it/s]


{'ner': 135.2666948198769}


100%|██████████| 300/300 [00:01<00:00, 151.59it/s]


{'ner': 154.88981302857542}


100%|██████████| 300/300 [00:01<00:00, 151.80it/s]


{'ner': 139.52329179664832}


100%|██████████| 300/300 [00:01<00:00, 152.90it/s]


{'ner': 110.35705882160639}


100%|██████████| 300/300 [00:02<00:00, 144.72it/s]


{'ner': 136.60331439014095}


100%|██████████| 300/300 [00:02<00:00, 143.98it/s]


{'ner': 97.42932154182118}


100%|██████████| 300/300 [00:01<00:00, 150.71it/s]


{'ner': 106.80441110350164}


100%|██████████| 300/300 [00:02<00:00, 147.06it/s]


{'ner': 103.07596747027989}


100%|██████████| 300/300 [00:02<00:00, 135.36it/s]


{'ner': 116.82368093521004}


100%|██████████| 300/300 [00:02<00:00, 146.30it/s]


{'ner': 83.85791959313549}


100%|██████████| 300/300 [00:02<00:00, 139.29it/s]


{'ner': 129.29232206087488}


100%|██████████| 300/300 [00:02<00:00, 144.33it/s]


{'ner': 94.26230627899699}


100%|██████████| 300/300 [00:02<00:00, 141.22it/s]


{'ner': 119.53293453730436}


100%|██████████| 300/300 [00:02<00:00, 146.79it/s]


{'ner': 135.30904889523342}


100%|██████████| 300/300 [00:01<00:00, 152.23it/s]


{'ner': 99.05282406612254}


100%|██████████| 300/300 [00:01<00:00, 152.99it/s]


{'ner': 104.03353796514313}


100%|██████████| 300/300 [00:01<00:00, 150.71it/s]


{'ner': 102.3485964308939}


100%|██████████| 300/300 [00:01<00:00, 152.96it/s]


{'ner': 93.07427838620458}


100%|██████████| 300/300 [00:01<00:00, 152.97it/s]


{'ner': 107.37481146386799}


100%|██████████| 300/300 [00:01<00:00, 151.47it/s]


{'ner': 103.04913830032989}


100%|██████████| 300/300 [00:02<00:00, 146.60it/s]


{'ner': 104.32575380293689}


100%|██████████| 300/300 [00:02<00:00, 144.45it/s]


{'ner': 101.0787321629625}


100%|██████████| 300/300 [00:02<00:00, 144.52it/s]


{'ner': 117.87798588377488}


100%|██████████| 300/300 [00:01<00:00, 150.35it/s]


{'ner': 93.17317045790986}


100%|██████████| 300/300 [00:02<00:00, 143.90it/s]


{'ner': 64.82527822835137}


100%|██████████| 300/300 [00:02<00:00, 145.84it/s]


{'ner': 96.67483149959716}


100%|██████████| 300/300 [00:01<00:00, 155.38it/s]


{'ner': 108.8433394990334}


100%|██████████| 300/300 [00:01<00:00, 155.36it/s]


{'ner': 85.56142929553701}


100%|██████████| 300/300 [00:01<00:00, 155.01it/s]


{'ner': 92.84505525870621}


100%|██████████| 300/300 [00:01<00:00, 155.42it/s]


{'ner': 78.9593197678924}


100%|██████████| 300/300 [00:02<00:00, 145.31it/s]


{'ner': 87.23420945885502}


100%|██████████| 300/300 [00:02<00:00, 148.08it/s]


{'ner': 84.99744159560021}


100%|██████████| 300/300 [00:02<00:00, 139.13it/s]


{'ner': 94.70047409114488}


100%|██████████| 300/300 [00:01<00:00, 150.00it/s]


{'ner': 112.141392301845}


100%|██████████| 300/300 [00:01<00:00, 152.57it/s]


{'ner': 84.49978640322291}


100%|██████████| 300/300 [00:01<00:00, 150.74it/s]


{'ner': 89.25598050707883}


100%|██████████| 300/300 [00:01<00:00, 151.54it/s]


{'ner': 94.38121175283409}


100%|██████████| 300/300 [00:01<00:00, 152.32it/s]


{'ner': 87.1340484571713}


100%|██████████| 300/300 [00:02<00:00, 145.96it/s]


{'ner': 79.42422373441713}


100%|██████████| 300/300 [00:01<00:00, 152.46it/s]


{'ner': 83.78087170918491}


100%|██████████| 300/300 [00:01<00:00, 150.73it/s]


{'ner': 75.36906342589768}


100%|██████████| 300/300 [00:01<00:00, 152.79it/s]


{'ner': 105.47241901896996}


100%|██████████| 300/300 [00:01<00:00, 152.55it/s]


{'ner': 58.23608758745549}


100%|██████████| 300/300 [00:01<00:00, 152.49it/s]


{'ner': 69.91618530163997}


100%|██████████| 300/300 [00:01<00:00, 152.69it/s]


{'ner': 79.10005010381546}


100%|██████████| 300/300 [00:01<00:00, 150.31it/s]


{'ner': 73.92910966754287}


100%|██████████| 300/300 [00:01<00:00, 152.78it/s]


{'ner': 83.7119213343703}


100%|██████████| 300/300 [00:01<00:00, 151.76it/s]


{'ner': 74.03410367556239}


100%|██████████| 300/300 [00:01<00:00, 152.78it/s]


{'ner': 74.87024671173259}


100%|██████████| 300/300 [00:02<00:00, 145.79it/s]


{'ner': 64.82319179774322}


100%|██████████| 300/300 [00:02<00:00, 149.33it/s]


{'ner': 68.39488677394267}


100%|██████████| 300/300 [00:02<00:00, 145.33it/s]


{'ner': 72.07917041642511}


100%|██████████| 300/300 [00:01<00:00, 151.00it/s]


{'ner': 61.427306504864106}


100%|██████████| 300/300 [00:01<00:00, 150.11it/s]


{'ner': 103.35852474489408}


100%|██████████| 300/300 [00:01<00:00, 151.40it/s]


{'ner': 69.8113688070033}


100%|██████████| 300/300 [00:02<00:00, 146.86it/s]


{'ner': 53.66956653237426}


100%|██████████| 300/300 [00:01<00:00, 152.00it/s]


{'ner': 72.62520633347525}


100%|██████████| 300/300 [00:02<00:00, 148.80it/s]


{'ner': 72.89700632710158}


100%|██████████| 300/300 [00:02<00:00, 143.44it/s]


{'ner': 77.8504591670615}


100%|██████████| 300/300 [00:01<00:00, 150.90it/s]


{'ner': 74.84911607340268}


100%|██████████| 300/300 [00:01<00:00, 152.27it/s]


{'ner': 65.63667760831115}


100%|██████████| 300/300 [00:01<00:00, 152.49it/s]


{'ner': 72.43354419085117}


100%|██████████| 300/300 [00:01<00:00, 150.82it/s]


{'ner': 71.09343940391909}


100%|██████████| 300/300 [00:01<00:00, 152.87it/s]


{'ner': 77.6210527065922}


100%|██████████| 300/300 [00:01<00:00, 152.72it/s]

{'ner': 75.89528029367014}





### Test the trained model

In [22]:
for annotations in training_data["annotations"]:
    doc = nlp(annotations['text'])
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Entities [('Google', 'ORG'), ('Chrome', 'PRODUCT'), ('Skia', 'LIBRARY'), ('Integer Overflow', 'FIELD')]
Tokens [('Google', 'ORG', 3), ('Chrome', 'PRODUCT', 3), ('Skia', 'LIBRARY', 3), ('Integer', 'FIELD', 3), ('Overflow', 'FIELD', 1), ('Vulnerability', '', 2)]
Entities [('Realtek', 'ORG'), ('Jungle SDK', 'PRODUCT'), ('Remote Code Execution', 'FIELD')]
Tokens [('Realtek', 'ORG', 3), ('Jungle', 'PRODUCT', 3), ('SDK', 'PRODUCT', 1), ('Remote', 'FIELD', 3), ('Code', 'FIELD', 1), ('Execution', 'FIELD', 1), ('Vulnerability', '', 2)]
Entities [('Linux Kernel Race', 'PRODUCT')]
Tokens [('Linux', 'PRODUCT', 3), ('Kernel', 'PRODUCT', 1), ('Race', 'PRODUCT', 1), ('Condition', '', 2), ('Vulnerability', '', 2)]
Entities [('Primetek', 'ORG'), ('Primefaces', 'PRODUCT'), ('Remote Code Execution', 'FIELD')]
Tokens [('Primetek', 'ORG', 3), ('Primefaces', 'PRODUCT', 3), ('Remote', 'FIELD', 3), ('Code', 'FIELD', 1), ('Execution', 'FIELD', 1), ('Vulnerability', '', 2)]
Entities [('SIMalliance', 'ORG'), ('T

### Save the cusom NER model

In [23]:
output_dir= os.path.abspath(os.getcwd())
nlp.to_disk(output_dir)
print("Saved model to", output_dir) 

Saved model to /Users/dj/Desktop/Capstone/Capstone


### Test the saved custom model

In [24]:
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
for annotations in training_data["annotations"][3:5]:
    doc = nlp2(annotations['text'])
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Loading from /Users/dj/Desktop/Capstone/Capstone
Entities [('Primetek', 'ORG'), ('Primefaces', 'PRODUCT'), ('Remote Code Execution', 'FIELD')]
Tokens [('Primetek', 'ORG', 3), ('Primefaces', 'PRODUCT', 3), ('Remote', 'FIELD', 3), ('Code', 'FIELD', 1), ('Execution', 'FIELD', 1), ('Vulnerability', '', 2)]
Entities [('SIMalliance', 'ORG'), ('Toolbox Browser', 'PRODUCT'), ('Command Injection', 'FIELD')]
Tokens [('SIMalliance', 'ORG', 3), ('Toolbox', 'PRODUCT', 3), ('Browser', 'PRODUCT', 1), ('Command', 'FIELD', 3), ('Injection', 'FIELD', 1), ('Vulnerability', '', 2)]


In [25]:
file = open('text2.txt')
text = file.read()
file.close()
nlp2 = spacy.load(output_dir)
doc = nlp2(text)
displacy.render(doc, style="ent", jupyter=True)

In [26]:
import json
 
with open('annotations.json', 'r') as f:
    data = json.load(f)
    
print(data['annotations'][0])
data['classes']
## Create empty model
training_data = data.copy()
training_data['classes'] = data['classes']
training_data['annotations'] = []
for text, annotation in data['annotations']:
    try:
        if text!="" and len(annotation['entities']) > 0:
            temp_dict = {}
            temp_dict['text'] = text
            temp_dict['entities'] = []
            for ent in annotation['entities']:
                start = ent[0]
                end = ent[1]
                label = ent[2].upper()
                temp_dict['entities'].append((start, end, label))
        training_data['annotations'].append(temp_dict)
    except:
        pass
print(training_data)
### Configuration variables
modelSpacy = None
n_iter=100
### Load the model
if modelSpacy is not None:
    nlp = spacy.load(modelSpacy)  
    print("Loaded model '%s'" % modelSpacy)
else:
    nlp = spacy.blank('en')  
    print("Created blank 'en' model")
### Set up pipeline
if 'ner' not in nlp.pipe_names:
    ner = nlp.add_pipe('ner')
else:
    ner = nlp.get_pipe('ner')
### Train the NER model
#getting all the entities
for annotations in training_data["annotations"]:
    for ent in annotations['entities']:
        ner.add_label(ent[2])
        
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        losses = {}
        for text, annotations in tqdm(data["annotations"]):
            try:
                if text!="":
                    example = Example.from_dict(nlp.make_doc(text), annotations)
                    #Update the model
                    nlp.update(
                        [example],  
                        drop=0.5,  
                        sgd=optimizer,
                        losses=losses)
            except:
                pass
        print(losses)

### Test the trained model
for annotations in training_data["annotations"]:
    doc = nlp(annotations['text'])
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
### Save the cusom NER model
output_dir= os.path.abspath(os.getcwd())
nlp.to_disk(output_dir)
print("Saved model to", output_dir) 
### Test the saved custom model
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
for annotations in training_data["annotations"][3:5]:
    doc = nlp2(annotations['text'])
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
file = open('text2.txt')
text = file.read()
file.close()
nlp2 = spacy.load(output_dir)
doc = nlp2(text)
displacy.render(doc, style="ent", jupyter=True)

['Microsoft Office Publisher contains a security feature bypass vulnerability that allows for a local, authenticated attack on a targeted system.', {'entities': [[0, 9, 'VENDOR'], [10, 26, 'PRODUCT'], [38, 61, 'VULNERABILITY NAME'], [94, 121, 'ACTOR'], [127, 142, 'MODULE']]}]
{'classes': ['VENDOR', 'PRODUCT', 'FEATURE', 'ACTOR', 'EXPLOIT', 'MODULE', 'VULNERABILITY NAME'], 'annotations': [{'text': 'Microsoft Office Publisher contains a security feature bypass vulnerability that allows for a local, authenticated attack on a targeted system.', 'entities': [(0, 9, 'VENDOR'), (10, 26, 'PRODUCT'), (38, 61, 'VULNERABILITY NAME'), (94, 121, 'ACTOR'), (127, 142, 'MODULE')]}, {'text': 'Microsoft Exchange Server contains an unspecified vulnerability that allows for remote code execution. This vulnerability is part of the ProxyLogon exploit chain.', 'entities': [(0, 9, 'VENDOR'), (10, 25, 'PRODUCT'), (38, 49, 'VULNERABILITY NAME'), (80, 102, 'EXPLOIT'), (137, 147, 'EXPLOIT')]}, {'text': 'Unspecifi

100%|██████████| 31/31 [00:00<00:00, 80.35it/s]


{'ner': 511.01224369685497}


100%|██████████| 31/31 [00:00<00:00, 72.73it/s]


{'ner': 324.8892041780855}


100%|██████████| 31/31 [00:00<00:00, 82.68it/s]


{'ner': 298.92395720276704}


100%|██████████| 31/31 [00:00<00:00, 82.23it/s]


{'ner': 274.04506498165756}


100%|██████████| 31/31 [00:00<00:00, 82.53it/s]


{'ner': 378.2655463790519}


100%|██████████| 31/31 [00:00<00:00, 83.67it/s]


{'ner': 267.0504632670235}


100%|██████████| 31/31 [00:00<00:00, 77.05it/s]


{'ner': 278.77798714209223}


100%|██████████| 31/31 [00:00<00:00, 77.87it/s]


{'ner': 265.8112647497519}


100%|██████████| 31/31 [00:00<00:00, 83.28it/s]


{'ner': 250.84692137300118}


100%|██████████| 31/31 [00:00<00:00, 82.96it/s]


{'ner': 304.8193462071319}


100%|██████████| 31/31 [00:00<00:00, 77.68it/s]


{'ner': 269.331589568524}


100%|██████████| 31/31 [00:00<00:00, 76.40it/s]


{'ner': 245.6555304862586}


100%|██████████| 31/31 [00:00<00:00, 83.38it/s]


{'ner': 265.6732646022625}


100%|██████████| 31/31 [00:00<00:00, 82.53it/s]


{'ner': 223.8592767043765}


100%|██████████| 31/31 [00:00<00:00, 82.83it/s]


{'ner': 271.41076546216493}


100%|██████████| 31/31 [00:00<00:00, 83.04it/s]


{'ner': 221.08538702611978}


100%|██████████| 31/31 [00:00<00:00, 83.15it/s]


{'ner': 226.51630779637736}


100%|██████████| 31/31 [00:00<00:00, 83.02it/s]


{'ner': 219.84219469043984}


100%|██████████| 31/31 [00:00<00:00, 83.02it/s]


{'ner': 212.20535975810955}


100%|██████████| 31/31 [00:00<00:00, 77.87it/s]


{'ner': 217.82540056722794}


100%|██████████| 31/31 [00:00<00:00, 82.66it/s]


{'ner': 210.18899472163378}


100%|██████████| 31/31 [00:00<00:00, 83.26it/s]


{'ner': 204.93369287984441}


100%|██████████| 31/31 [00:00<00:00, 83.27it/s]


{'ner': 186.16770926909095}


100%|██████████| 31/31 [00:00<00:00, 83.24it/s]


{'ner': 171.25698772988616}


100%|██████████| 31/31 [00:00<00:00, 83.30it/s]


{'ner': 157.40536022031662}


100%|██████████| 31/31 [00:00<00:00, 82.43it/s]


{'ner': 188.3831356538368}


100%|██████████| 31/31 [00:00<00:00, 82.65it/s]


{'ner': 148.3632485609029}


100%|██████████| 31/31 [00:00<00:00, 82.74it/s]


{'ner': 156.1414489213178}


100%|██████████| 31/31 [00:00<00:00, 77.99it/s]


{'ner': 172.61081754092942}


100%|██████████| 31/31 [00:00<00:00, 82.35it/s]


{'ner': 169.14676880301766}


100%|██████████| 31/31 [00:00<00:00, 82.34it/s]


{'ner': 145.49253001636836}


100%|██████████| 31/31 [00:00<00:00, 82.50it/s]


{'ner': 142.47594444654774}


100%|██████████| 31/31 [00:00<00:00, 82.88it/s]


{'ner': 132.42227125293903}


100%|██████████| 31/31 [00:00<00:00, 83.05it/s]


{'ner': 148.677973093293}


100%|██████████| 31/31 [00:00<00:00, 82.72it/s]


{'ner': 142.843559083917}


100%|██████████| 31/31 [00:00<00:00, 82.45it/s]


{'ner': 135.87309507032404}


100%|██████████| 31/31 [00:00<00:00, 82.50it/s]


{'ner': 152.38327804621412}


100%|██████████| 31/31 [00:00<00:00, 73.13it/s]


{'ner': 121.8362503263298}


100%|██████████| 31/31 [00:00<00:00, 80.80it/s]


{'ner': 123.95348926643099}


100%|██████████| 31/31 [00:00<00:00, 81.23it/s]


{'ner': 126.75781361413864}


100%|██████████| 31/31 [00:00<00:00, 80.04it/s]


{'ner': 114.26250731254511}


100%|██████████| 31/31 [00:00<00:00, 81.95it/s]


{'ner': 99.3548051178024}


100%|██████████| 31/31 [00:00<00:00, 77.13it/s]


{'ner': 98.83262172563265}


100%|██████████| 31/31 [00:00<00:00, 82.74it/s]


{'ner': 109.92822820518236}


100%|██████████| 31/31 [00:00<00:00, 82.58it/s]


{'ner': 110.12926451481903}


100%|██████████| 31/31 [00:00<00:00, 80.35it/s]


{'ner': 76.35469584550083}


100%|██████████| 31/31 [00:00<00:00, 82.19it/s]


{'ner': 79.39286640717339}


100%|██████████| 31/31 [00:00<00:00, 82.24it/s]


{'ner': 98.82805911297432}


100%|██████████| 31/31 [00:00<00:00, 82.27it/s]


{'ner': 85.84249866118819}


100%|██████████| 31/31 [00:00<00:00, 82.02it/s]


{'ner': 85.08741706447275}


100%|██████████| 31/31 [00:00<00:00, 82.76it/s]


{'ner': 91.14309153610587}


100%|██████████| 31/31 [00:00<00:00, 77.75it/s]


{'ner': 82.20616080480863}


100%|██████████| 31/31 [00:00<00:00, 82.24it/s]


{'ner': 78.2019926880512}


100%|██████████| 31/31 [00:00<00:00, 82.17it/s]


{'ner': 83.91605972828013}


100%|██████████| 31/31 [00:00<00:00, 82.85it/s]


{'ner': 67.1574175537916}


100%|██████████| 31/31 [00:00<00:00, 82.75it/s]


{'ner': 110.30641336332621}


100%|██████████| 31/31 [00:00<00:00, 82.63it/s]


{'ner': 73.87392652443157}


100%|██████████| 31/31 [00:00<00:00, 82.45it/s]


{'ner': 88.76891649853195}


100%|██████████| 31/31 [00:00<00:00, 82.48it/s]


{'ner': 62.140642151919224}


100%|██████████| 31/31 [00:00<00:00, 82.51it/s]


{'ner': 61.58763159803968}


100%|██████████| 31/31 [00:00<00:00, 76.93it/s]


{'ner': 58.20427252861253}


100%|██████████| 31/31 [00:00<00:00, 82.81it/s]


{'ner': 73.24328671410044}


100%|██████████| 31/31 [00:00<00:00, 82.38it/s]


{'ner': 72.90203213391213}


100%|██████████| 31/31 [00:00<00:00, 81.67it/s]


{'ner': 40.183011806486654}


100%|██████████| 31/31 [00:00<00:00, 82.69it/s]


{'ner': 84.35267154179908}


100%|██████████| 31/31 [00:00<00:00, 81.97it/s]


{'ner': 59.27750394450004}


100%|██████████| 31/31 [00:00<00:00, 79.50it/s]


{'ner': 94.10358983715533}


100%|██████████| 31/31 [00:00<00:00, 81.34it/s]


{'ner': 53.13480264536272}


100%|██████████| 31/31 [00:00<00:00, 78.89it/s]


{'ner': 64.67557370006428}


100%|██████████| 31/31 [00:00<00:00, 74.50it/s]


{'ner': 54.47531811464736}


100%|██████████| 31/31 [00:00<00:00, 82.33it/s]


{'ner': 77.89096298072062}


100%|██████████| 31/31 [00:00<00:00, 82.42it/s]


{'ner': 61.442109741733674}


100%|██████████| 31/31 [00:00<00:00, 82.64it/s]


{'ner': 57.75193529695368}


100%|██████████| 31/31 [00:00<00:00, 82.45it/s]


{'ner': 42.805538947348545}


100%|██████████| 31/31 [00:00<00:00, 82.60it/s]


{'ner': 59.93375707848889}


100%|██████████| 31/31 [00:00<00:00, 80.75it/s]


{'ner': 51.14118389957376}


100%|██████████| 31/31 [00:00<00:00, 78.40it/s]


{'ner': 45.38783609075748}


100%|██████████| 31/31 [00:00<00:00, 84.73it/s]


{'ner': 47.653169962143686}


100%|██████████| 31/31 [00:00<00:00, 85.35it/s]


{'ner': 52.036474302880926}


100%|██████████| 31/31 [00:00<00:00, 85.41it/s]


{'ner': 51.31606878675281}


100%|██████████| 31/31 [00:00<00:00, 85.71it/s]


{'ner': 68.20029088065004}


100%|██████████| 31/31 [00:00<00:00, 86.53it/s]


{'ner': 46.59918236029379}


100%|██████████| 31/31 [00:00<00:00, 79.10it/s]


{'ner': 45.46004651518803}


100%|██████████| 31/31 [00:00<00:00, 67.04it/s]


{'ner': 36.003943076603704}


100%|██████████| 31/31 [00:00<00:00, 76.86it/s]


{'ner': 28.48073139578559}


100%|██████████| 31/31 [00:00<00:00, 82.97it/s]


{'ner': 45.74753610265821}


100%|██████████| 31/31 [00:00<00:00, 78.87it/s]


{'ner': 50.35194180103279}


100%|██████████| 31/31 [00:00<00:00, 79.17it/s]


{'ner': 33.59984976097147}


100%|██████████| 31/31 [00:00<00:00, 79.94it/s]


{'ner': 41.91905209621664}


100%|██████████| 31/31 [00:00<00:00, 79.92it/s]


{'ner': 32.833154104585056}


100%|██████████| 31/31 [00:00<00:00, 76.30it/s]


{'ner': 35.38371870071802}


100%|██████████| 31/31 [00:00<00:00, 83.90it/s]


{'ner': 50.781888568879694}


100%|██████████| 31/31 [00:00<00:00, 80.54it/s]


{'ner': 36.82392651336295}


100%|██████████| 31/31 [00:00<00:00, 85.12it/s]


{'ner': 38.36881006607929}


100%|██████████| 31/31 [00:00<00:00, 84.56it/s]


{'ner': 27.054100542857658}


100%|██████████| 31/31 [00:00<00:00, 84.21it/s]


{'ner': 16.771503538299026}


100%|██████████| 31/31 [00:00<00:00, 84.18it/s]


{'ner': 45.22882965795594}


100%|██████████| 31/31 [00:00<00:00, 84.87it/s]


{'ner': 48.065608580371034}


100%|██████████| 31/31 [00:00<00:00, 76.80it/s]


{'ner': 19.537979260562864}


100%|██████████| 31/31 [00:00<00:00, 69.34it/s]


{'ner': 16.280409569208356}
Entities [('Microsoft', 'VENDOR'), ('Office Publisher', 'PRODUCT'), ('security feature bypass', 'VULNERABILITY NAME'), ('local, authenticated attack', 'ACTOR'), ('targeted system', 'MODULE')]
Tokens [('Microsoft', 'VENDOR', 3), ('Office', 'PRODUCT', 3), ('Publisher', 'PRODUCT', 1), ('contains', '', 2), ('a', '', 2), ('security', 'VULNERABILITY NAME', 3), ('feature', 'VULNERABILITY NAME', 1), ('bypass', 'VULNERABILITY NAME', 1), ('vulnerability', '', 2), ('that', '', 2), ('allows', '', 2), ('for', '', 2), ('a', '', 2), ('local', 'ACTOR', 3), (',', 'ACTOR', 1), ('authenticated', 'ACTOR', 1), ('attack', 'ACTOR', 1), ('on', '', 2), ('a', '', 2), ('targeted', 'MODULE', 3), ('system', 'MODULE', 1), ('.', '', 2)]
Entities [('Microsoft', 'VENDOR'), ('Exchange Server', 'PRODUCT'), ('unspecified', 'VULNERABILITY NAME'), ('remote code execution.', 'EXPLOIT'), ('ProxyLogon', 'EXPLOIT')]
Tokens [('Microsoft', 'VENDOR', 3), ('Exchange', 'PRODUCT', 3), ('Server', 'PRODUCT'