**Install the 'spacy' library and the 'spacy_transformers' library**

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the packages are importable from this notebook without guessing which
# interpreter the shell's `pip` belongs to.
# NOTE(review): versions are not pinned; pin them once the working set is known.
%pip install -U spacy
%pip install spacy_transformers

Collecting spacy_transformers
  Downloading spacy_transformers-1.2.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.8/190.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting transformers<4.31.0,>=3.4.0 (from spacy_transformers)
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy_transformers)
  Downloading spacy_alignments-0.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers<4.31.0,>=3.4.0->spacy_transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

**Import necessary libraries**

In [None]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import json

**Load the dataset**

In [None]:
# Read the annotated examples. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open until garbage collection,
# and the explicit encoding avoids depending on the platform default.
with open('/content/drive/MyDrive/JobDescription/dataset/train_data.json','r',encoding='utf-8') as dataset_file:
  cv_data=json.load(dataset_file)

**Initialize a spaCy configuration file by filling in values from 'base_config.cfg' and save the resulting configuration to 'config.cfg'**

In [None]:
# Expand base_config.cfg (first path) into a complete training config and
# write it to config.cfg (second path); the training cell reads that file.
!python -m spacy init fill-config /content/drive/MyDrive/JobDescription/config/base_config.cfg /content/drive/MyDrive/JobDescription/config/config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/content/drive/MyDrive/JobDescription/config/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


**Process text data using spaCy**

In [None]:
def get_spacy_doc(file,data):
  """Convert character-offset entity annotations into a spaCy ``DocBin``.

  Args:
    file: Writable text handle used as an error log. One line is written for
      every entity that cannot be turned into a span and for every document
      that cannot be stored.
    data: Iterable of ``(text, annot)`` pairs, where ``annot['entities']`` is
      an iterable of ``(start, end, label)`` character offsets into ``text``.

  Returns:
    A ``DocBin`` holding one ``Doc`` per example that was stored successfully.
  """
  nlp=spacy.blank("en")
  db=DocBin()
  for text,annot in tqdm(data):
    doc=nlp.make_doc(text)
    ents=[]
    # Character offsets already claimed by an earlier entity of this text.
    # A set keeps the overlap test proportional to the span length instead of
    # rescanning (and re-copying) an ever-growing list.
    claimed=set()
    for start,end,label in annot['entities']:
      span_chars=range(start,end)
      # First annotation wins: drop any later entity sharing a character.
      if not claimed.isdisjoint(span_chars):
        continue
      # Offsets are claimed before alignment is attempted, so an entity that
      # fails below still blocks later overlapping ones.
      claimed.update(span_chars)
      try:
        span=doc.char_span(start,end,label=label,alignment_mode='strict')
      except Exception as exc:
        # Best effort: a bad annotation must not abort the whole conversion,
        # but it is recorded, and KeyboardInterrupt/SystemExit still propagate.
        file.write(str([start,end])+"     "+repr(exc)+"     "+str(text)+"\n")
        continue
      if span is None:
        # No span could be built for these offsets under strict alignment;
        # log the offsets together with the text for later inspection.
        err_data=str([start,end])+"     "+str(text)+"\n"
        file.write(err_data)
      else:
        ents.append(span)
    try:
      doc.ents=ents
      db.add(doc)
    except Exception as exc:
      # The document is left out of the DocBin; record why instead of
      # dropping it silently.
      file.write("doc skipped: "+repr(exc)+"     "+str(text)+"\n")
  return db

**Split the dataset 'cv_data' into training and testing sets using train_test_split with a test size of 30% (0.3), assigning the results to 'train' (training set) and 'test' (testing set)**

In [None]:
from sklearn.model_selection import train_test_split
# A fixed random_state makes the 70/30 split identical on every run, so the
# converted data and the resulting scores can be reproduced after a restart.
train,test=train_test_split(cv_data,test_size=0.3,random_state=42)

**Load the spaCy model from the specified path**

In [None]:
# Load a previously trained pipeline from disk into the global `nlp`.
# NOTE(review): this path (JdModel/JdModel/output) is not the directory the
# training cell writes to (JobDescription/JobDescriptionModel/output), and this
# cell sits before training in notebook order -- confirm which model is meant.
nlp=spacy.load('/content/drive/MyDrive/JdModel/JdModel/output/model-best')



**Install the 'PyMuPDF' library for working with PDF files**

In [None]:
# %pip installs into the environment of the running kernel, so the package is
# importable from this notebook.
# NOTE(review): version is not pinned; pin it once the working version is known.
%pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.23.3-cp310-none-manylinux2014_x86_64.whl (4.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.3/4.3 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.23.3 (from PyMuPDF)
  Downloading PyMuPDFb-1.23.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.6/30.6 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.23.3 PyMuPDFb-1.23.3


**Open train_file.txt for writing (it serves as the log of entities that could not be aligned to tokens). Preprocess and convert the 'train' and 'test' data into spaCy format, and save them as spaCy binary files**

In [None]:
# train_file.txt is the error log that get_spacy_doc writes to for both splits.
# The context manager guarantees the log is flushed and closed even when the
# conversion or a to_disk call raises; the explicit encoding keeps non-ASCII
# text in the logged examples from failing on a non-UTF-8 default.
with open('/content/drive/MyDrive/JobDescription/JobDescriptionModel/train_file.txt','w',encoding='utf-8') as file:
  db=get_spacy_doc(file,train)
  db.to_disk('/content/drive/MyDrive/JobDescription/JobDescriptionModel/train_data.spacy')
  db=get_spacy_doc(file,test)
  db.to_disk('/content/drive/MyDrive/JobDescription/JobDescriptionModel/test_data.spacy')

100%|██████████| 140/140 [00:01<00:00, 85.55it/s]
100%|██████████| 60/60 [00:01<00:00, 47.13it/s]


**Train the spaCy model using the specified configuration file, the training and development data, and the GPU with ID 0, then save the trained model to the specified output directory**

In [None]:
# Train from config.cfg on GPU 0 and write the pipeline under .../JobDescriptionModel/output.
# --paths.train is train_data.spacy; --paths.dev is test_data.spacy, i.e. the
# 30% split produced earlier is what the trainer evaluates on.
!python -m spacy train /content/drive/MyDrive/JobDescription/config/config.cfg --output /content/drive/MyDrive/JobDescription/JobDescriptionModel/output --paths.train  /content/drive/MyDrive/JobDescription/JobDescriptionModel/train_data.spacy --paths.dev /content/drive/MyDrive/JobDescription/JobDescriptionModel/test_data.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory:
/content/drive/MyDrive/JobDescription/JobDescriptionModel/output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
Yo