**Install the 'spacy' library and the 'spacy_transformers' library**

In [None]:
!pip install -U spacy
!pip install spacy_transformers

Collecting spacy_transformers
  Downloading spacy_transformers-1.2.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.8/190.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting transformers<4.31.0,>=3.4.0 (from spacy_transformers)
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy_transformers)
  Downloading spacy_alignments-0.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers<4.31.0,>=3.4.0->spacy_transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

**Import necessary libraries**

In [None]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import json

**Load the dataset**

In [None]:
# Read the annotated examples. The context manager closes the file handle as
# soon as parsing finishes, and the encoding is stated explicitly so the
# result does not depend on the machine's locale.
with open('/content/drive/MyDrive/JobDescription/dataset/train_data.json','r',encoding='utf-8') as train_file:
  cv_data=json.load(train_file)

**Define a helper that converts the annotated text data into a spaCy `DocBin`**

In [None]:
def get_spacy_doc(file,data):
  """Convert annotated examples into a spaCy ``DocBin``.

  Args:
    file: Writable text stream used as an error log. A line is written for
      every annotation that cannot be turned into a span and for every
      document that cannot be stored.
    data: Iterable of ``(text, annot)`` pairs, where ``annot['entities']``
      is an iterable of ``(start, end, label)`` character-offset triples.

  Returns:
    A ``DocBin`` holding one tokenized ``Doc`` (with its entity spans) per
    example that could be stored.
  """
  nlp=spacy.blank("en")
  db=DocBin()
  for text,annot in tqdm(data):
    doc=nlp.make_doc(text)
    ents=[]
    # Character offsets already claimed by an earlier annotation of this text.
    # A set keeps the overlap check cheap even for long documents.
    claimed=set()
    for start,end,label in annot['entities']:
      char_range=range(start,end)
      # Keep only the first of any overlapping annotations.
      if not claimed.isdisjoint(char_range):
        continue
      # The range is reserved before the span is built, so an annotation that
      # fails below still blocks later annotations overlapping it.
      claimed.update(char_range)
      try:
        span=doc.char_span(start,end,label=label,alignment_mode='strict')
      except Exception as exc:
        # Skip the malformed annotation but leave a trace instead of hiding it.
        file.write(str([start,end])+"     "+repr(exc)+"     "+str(text)+"\n")
        continue
      if span is None:
        # The span could not be created for these offsets; record them.
        err_data=str([start,end])+"     "+str(text)+"\n"
        file.write(err_data)
      else:
        ents.append(span)
    try:
      doc.ents=ents
      db.add(doc)
    except Exception as exc:
      # Best effort: drop this document and carry on, but log why it was dropped.
      file.write("doc skipped: "+repr(exc)+"     "+str(text)+"\n")
  return db

**Split the 'cv_data' into training and testing sets**

In [None]:
from sklearn.model_selection import train_test_split
# 70/30 split. The fixed seed makes the partition identical on every run, so
# results stay comparable after a kernel restart.
train,test=train_test_split(cv_data,test_size=0.3,random_state=42)

**Load the spaCy model from the specified path**

In [None]:
# Load the spaCy pipeline stored at this path; `nlp` is applied to the text
# extracted from the PDF in a later cell.
# NOTE(review): 'model-best' is presumably the best checkpoint written during training — confirm.
nlp=spacy.load('/content/drive/MyDrive/JdModel/JdModel/output/model-best')



**Install the 'PyMuPDF' library for working with PDF files**

In [None]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.23.3-cp310-none-manylinux2014_x86_64.whl (4.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.3/4.3 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.23.3 (from PyMuPDF)
  Downloading PyMuPDFb-1.23.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.6/30.6 MB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.23.3 PyMuPDFb-1.23.3


**Import the PDF-processing library and open the PDF file**

In [None]:
import sys
import fitz

# Open the PDF found at `fname`; the resulting document is read page by page
# in the following cell.
fname = '/content/drive/MyDrive/python_developer.pdf'
doc = fitz.open(fname)

**Extract text from each page of the PDF and append it to 'text'**

In [None]:
# Concatenate the text of every page in order, keeping the single leading
# space that the accumulated string starts with.
text = " " + "".join(str(page.get_text()) for page in doc)

**Process the extracted text using the spaCy model and print entity text and labels detected by the model**

In [None]:
# Run the loaded pipeline over the extracted text, then list every detected
# entity next to its label.
doc = nlp(text)
separator = "    ->>>>>>>>>>    "
for entity in doc.ents:
  print(entity.text, separator, entity.label_)

Python Developer     ->>>>>>>>>>     JOBPOST
1 years     ->>>>>>>>>>     EXPERIENCE
Django     ->>>>>>>>>>     SKILLS
Python     ->>>>>>>>>>     SKILLS
Bootstrap     ->>>>>>>>>>     SKILLS
HTML     ->>>>>>>>>>     SKILLS
Jquery     ->>>>>>>>>>     SKILLS
CSS     ->>>>>>>>>>     SKILLS
Ajax     ->>>>>>>>>>     SKILLS
Javascript     ->>>>>>>>>>     SKILLS
Bootstrap     ->>>>>>>>>>     SKILLS
GIT     ->>>>>>>>>>     SKILLS
lab     ->>>>>>>>>>     SKILLS
Diploma in Computer Science (College or University)     ->>>>>>>>>>     DEGREE
Docker     ->>>>>>>>>>     SKILLS
Kubernetes     ->>>>>>>>>>     SKILLS
OpenShift     ->>>>>>>>>>     SKILLS
Vue.js     ->>>>>>>>>>     SKILLS
Angular     ->>>>>>>>>>     SKILLS
React     ->>>>>>>>>>     SKILLS
HTML5     ->>>>>>>>>>     SKILLS
CSS3     ->>>>>>>>>>     SKILLS
GIT     ->>>>>>>>>>     SKILLS
Webpack     ->>>>>>>>>>     SKILLS
NPM     ->>>>>>>>>>     SKILLS
