In [1]:
import spacy
from spacy import displacy
from spacy.tokens import DocBin
from tqdm import tqdm



In [2]:
# Load the pretrained large English pipeline; `nlp` is reused by the cells below.
nlp=spacy.load("en_core_web_lg")

In [3]:
# Show the names of the components in the loaded pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [4]:
# Example sentence to run through the pretrained pipeline.
sentence = "Daniil Medvedev and Novak Djokovic have built an intriguing rivalry since the Australian Open decider, which the Serb won comprehensively."
doc = nlp(sentence)  # process the text into a Doc; its entities are inspected below

In [5]:
# Visualize the entities of `doc` inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

In [6]:
# Show the entity labels known to the pipeline's 'ner' component.
nlp.pipe_labels['ner']

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [7]:
# Print every entity found in `doc` as "<text> | <label>", one per line.
for entity in doc.ents:
    print(f"{entity.text} | {entity.label_}")

Daniil Medvedev | PERSON
Novak Djokovic | PERSON
Australian Open | ORG
Serb | NORP


In [8]:
# Look up the human-readable description of the "NORP" label.
spacy.explain("NORP")

'Nationalities or religious or political groups'

In [9]:
# Job-posting style text, used next to see what the pretrained pipeline recognizes in it.
sentence = "As a Full Stack Developer, you will develop applications in a very passionate environment being responsible for Front-end and Back-end development. You will perform development and day-to-day maintenance on large applications. You have multiple opportunities to work on cross-system single-page applications."

In [10]:
# Process the job-posting text and render whatever entities the current pipeline finds.
doc = nlp(sentence)
displacy.render(doc, style="ent", jupyter=True)



CREATING CUSTOM PIPELINE FOR JOB_ROLE

Steps to build the custom NER model for detecting the job role in job postings:

1. Annotate the data to train the model.
2. Convert the annotated data into a spaCy `DocBin` object and save it as a `.spacy` file.
3. Generate the config file from the spaCy website.
4. Train the model in the command line.
5. Load and test the saved model.

In [11]:
# Annotated training examples for the custom "job_role" entity.
# Each item is (text, {"entities": [(start_char, end_char, label), ...]}), where
# text[start_char:end_char] must be exactly the job-role phrase (end is exclusive).
# Offsets that stop mid-word are silently shrunk to the previous token boundary by
# doc.char_span(..., alignment_mode="contract"), so they must land on word edges.
trainData = [
    (
        "We are looking for a Full-stack Developer who is motivated to combine the art of design with the art of programming.",
        {"entities": [(21, 41, "job_role")]},  # "Full-stack Developer" (end was 40, cutting the last letter)
    ),
    (
        "As a Full Stack Developer, you will develop applications in a very passionate environment being responsible for Front-end and Back-end development.",
        {"entities": [(5, 25, "job_role")]},  # "Full Stack Developer"
    ),
    (
        "Ivy Mobility is looking for Full-stack Dot net Tech Lead who has the ability to work in a fast-paced environment, on multiple projects concurrently.",
        {"entities": [(28, 56, "job_role")]},  # "Full-stack Dot net Tech Lead"
    ),
    (
        "Python Developer",
        {"entities": [(0, 16, "job_role")]},  # "Python Developer"
    ),
    (
        "Previous experience working as a React Native Developer.",
        {"entities": [(33, 55, "job_role")]},  # "React Native Developer" (end was 51, stopping mid-word)
    ),
    (
        "Yellow Riddle is looking for a front-end Shopify Developer to join our growing team with our increasing volume of Shopify work.",
        {"entities": [(31, 58, "job_role")]},  # "front-end Shopify Developer"
    ),
    (
        "Job Title: Lead / Senior React Native Developer.",
        {"entities": [(25, 47, "job_role")]},  # "React Native Developer"
    ),
    (
        "Job Title: Salesforce Developer (LWC)",
        {"entities": [(11, 31, "job_role")]},  # "Salesforce Developer"
    ),
    (
        "Yours a highly-skilled market analyst with a proven ability to strategize the full lifecycle of product production from conception through release. ",
        {"entities": [(23, 37, "job_role")]},  # "market analyst"
    ),
]

CONVERT THE ANNOTATED DATA INTO THE SPACY BIN OBJECT

In [12]:
# Convert the annotated examples in `trainData` into a DocBin and write it to ./train.spacy.
nlp = spacy.blank("en")  # blank English pipeline; only used here to create Doc objects
db = DocBin()  # collects the annotated docs for serialization
for text, annot in tqdm(trainData):
    doc = nlp.make_doc(text)  # create a Doc from the raw text
    ents = []
    for start, end, label in annot["entities"]:
        annotated = text[start:end]
        # "contract" shrinks offsets that fall inside a token to the tokens fully covered,
        # so a slightly wrong offset yields a shorter span rather than None.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print(f"Skipping entity {annotated!r} ({start}, {end}) in: {text!r}")
            continue
        if span.text != annotated:
            # Surface silently shrunk spans so bad offsets get fixed in the annotations.
            print(
                f"Warning: annotation {annotated!r} ({start}, {end}) was contracted "
                f"to {span.text!r}; check the offsets in: {text!r}"
            )
        ents.append(span)
    try:
        doc.ents = ents  # raises ValueError if spans overlap or are otherwise invalid
    except ValueError as err:
        # Report and leave this example out instead of hiding every possible error.
        print(f"Skipping example with invalid entities ({err}):", text, annot)
        continue
    db.add(doc)
db.to_disk("./train.spacy")  # save the DocBin for `spacy train`

100%|██████████| 9/9 [00:00<00:00, 700.20it/s]


Generate the config file to train via Command line

In [13]:
# Fill base_config.cfg out into a complete training config and save it as config.cfg.
!python -m spacy init fill-config base_config.cfg config.cfg

[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy




In [14]:
# First training attempt: no --output directory is passed to this run.
# NOTE(review): ./dev.spacy is not written by any cell shown in this notebook
# (only ./train.spacy is) — confirm the file exists before running this.
!python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy

[i] No output directory provided
[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'ner']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------


[2022-09-08 11:40:59,352] [INFO] Set up nlp object from config
[2022-09-08 11:40:59,360] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-09-08 11:40:59,363] [INFO] Created vocabulary
[2022-09-08 11:40:59,366] [INFO] Finished initializing nlp object
[2022-09-08 11:40:59,494] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
OMP: Error #15: Initializing libiomp5md.dll, but found libiomp5md.dll already initialized.
OMP: Hint This means that multiple copies of the OpenMP runtime have been linked into the program. That is dangerous, since it can degrade performance or cause incorrect results. The best thing to do is to ensure that only a single OpenMP runtime is linked into the process, e.g. by avoiding static linking of the OpenMP runtime in any library. As an unsafe, unsupported, undocumented workaround you can set the environment variable KMP_DUPLICATE_LIB_OK=TRUE to allow the program to continue to execute, but that may cause crashes or silently produce incorrect results. For more in

In [15]:
# Train and save the pipeline under ./output.
# NOTE(review): the dev path is the training file itself, so the reported scores
# are measured on the training data rather than on held-out examples.
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./train.spacy

[i] Saving to output directory: output
[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'ner']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------


[2022-09-08 11:41:09,430] [INFO] Set up nlp object from config
[2022-09-08 11:41:09,439] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-09-08 11:41:09,442] [INFO] Created vocabulary
[2022-09-08 11:41:09,443] [INFO] Finished initializing nlp object
[2022-09-08 11:41:09,546] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
OMP: Error #15: Initializing libiomp5md.dll, but found libiomp5md.dll already initialized.
OMP: Hint This means that multiple copies of the OpenMP runtime have been linked into the program. That is dangerous, since it can degrade performance or cause incorrect results. The best thing to do is to ensure that only a single OpenMP runtime is linked into the process, e.g. by avoiding static linking of the OpenMP runtime in any library. As an unsafe, unsupported, undocumented workaround you can set the environment variable KMP_DUPLICATE_LIB_OK=TRUE to allow the program to continue to execute, but that may cause crashes or silently produce incorrect results. For more in

In a command prompt (cmd), change to the working directory with `cd Downloads`, then run the following commands:

python -m spacy init fill-config base_config.cfg config.cfg

python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy

python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./train.spacy

In [17]:
# Replace `nlp` with the custom pipeline saved under the training output directory.
nlp = spacy.load("output/model-last/")

In [18]:
# Try the custom job-role pipeline on a new job-posting sentence.
sentence = "We are looking for a Backend Developer who has experience in designing, developing and implementing backend services using Python and Django."

doc = nlp(sentence)

# displacy is already imported in the notebook's first cell, so it is not re-imported here.
displacy.render(doc, style="ent", jupyter=True)