# Resume (CV) Parsing using spaCy 3


## NER Training Preparation
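- Download the base `config` file from https://spacy.io/usage/training#quickstart
- Fill in the remaining config settings for this training run (`spacy init fill-config`)
- Prepare the training data (convert the annotations to `.spacy` files)
- Train the model
- Test the final model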

Data

The annotated training data comes from https://github.com/SShadabHussain/Resume-Parsing

In [None]:
pip install spacy_transformers
pip install -U spacy

In [None]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import json

In [None]:
spacy.__version__

In [None]:
git clone https://github.com/SShadabHussain/Resume-Parsing.git

In [None]:
# Load the annotated resumes. The context manager closes the file handle as
# soon as parsing finishes, and the encoding is pinned because JSON is UTF-8.
with open('Resume-Parsing/data/training/train_data.json', 'r',
          encoding='utf-8') as train_file:
  cv_data = json.load(train_file)

In [None]:
len(cv_data)

In [None]:
python -m spacy init fill-config /content/Resume-Parsing/data/training/base_config.cfg /content/Resume-Parsing/data/training/config.cfg

In [None]:
# cv_data[0]

In [None]:
def get_spacy_doc(file, data):
  """Convert annotated texts into a spaCy ``DocBin`` for NER training.

  Args:
    file: Writable text file object. Every span or document that cannot be
      used is reported to it, one entry per problem.
    data: Iterable of ``(text, annot)`` pairs, where ``annot['entities']`` is
      a sequence of ``(start, end, label)`` character-offset triples.

  Returns:
    A ``DocBin`` containing one tokenized ``Doc`` (with its accepted entity
    spans) for every input text whose entities could be assigned.
  """
  nlp = spacy.blank('en')
  db = DocBin()

  for text, annot in tqdm(data):
    doc = nlp.make_doc(text)

    ents = []
    # Character offsets already claimed by an accepted entity. A set keeps the
    # overlap test cheap no matter how many entities a resume has.
    taken = set()

    for start, end, label in annot['entities']:
      span_chars = range(start, end)

      # Entities may not overlap; the first one to claim a character wins.
      if not taken.isdisjoint(span_chars):
        continue

      try:
        span = doc.char_span(start, end, label=label, alignment_mode='strict')
      except Exception as exc:
        # Best effort: report the bad annotation and keep going. Unlike a bare
        # ``except``, this still lets KeyboardInterrupt stop the run.
        file.write(f"span error ({exc}): {[start, end]}    {text}\n")
        continue

      if span is None:
        # The offsets do not line up with token boundaries.
        err_data = str([start, end]) + "    " + str(text) + "\n"
        file.write(err_data)
        continue

      # Reserve the offsets only once the span is usable, so a rejected
      # annotation cannot block a valid one that overlaps it.
      taken.update(span_chars)
      ents.append(span)

    try:
      doc.ents = ents
      db.add(doc)
    except Exception as exc:
      # Leave this document out but record why, instead of dropping it silently.
      file.write(f"document skipped ({exc})    {text}\n")

  return db
      

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the annotated resumes for evaluation. The fixed seed keeps
# the split identical across notebook runs, so scores stay comparable.
train, test = train_test_split(cv_data, test_size=0.3, random_state=42)

In [None]:
len(train), len(test)

In [None]:
# Problems from both conversions go into one log file. The `with` block
# flushes and closes the log even if a conversion raises, and the explicit
# encoding keeps non-ASCII resume text from breaking the write.
with open('error.txt', 'w', encoding='utf-8') as file:
  db = get_spacy_doc(file, train)
  db.to_disk('train_data.spacy')

  # `db` is rebound here, so it refers to the held-out split afterwards.
  db = get_spacy_doc(file, test)
  db.to_disk('test_data.spacy')

In [None]:
# `db` is the DocBin built last, i.e. from the `test` split.
# NOTE(review): DocBin.tokens presumably holds one entry per document, which
# would make this the number of held-out documents - confirm.
len(db.tokens)

In [None]:
python -m spacy train /content/Resume-Parsing/data/training/config.cfg --output ./output --paths.train ./train_data.spacy --paths.dev ./test_data.spacy --gpu-id 0

### Model Test

In [None]:
# Load the trained pipeline from disk. The training command wrote to
# `./output`; this absolute path assumes the working directory was /content.
nlp = spacy.load('/content/output/model-best')

In [None]:
# Sanity check on a hand-written sentence: print each predicted entity
# followed by its label.
sample = 'My name is Syed Shadab Hussain. I am a data scientist. Check out this resume parsing project'
doc = nlp(sample)
for entity in doc.ents:
  print(f"{entity.text}    ->>>>>  {entity.label_}")

In [None]:
pip install PyMuPDF

In [None]:
import sys, fitz

In [None]:
# Sample resume PDF from the cloned repository's test folder.
fname = '/content/Resume-Parsing/data/test/Alice Clark CV.pdf'
# Open the PDF with PyMuPDF (imported as `fitz`).
# NOTE(review): the document is never closed explicitly, and the name `doc`
# is later rebound to the spaCy result.
doc = fitz.open(fname)

In [None]:
# doc = [page.getText() for page in doc]

In [None]:
# Collect the text of every page and join once instead of growing a string
# page by page. The leading space is the original seed value, kept so the
# result is unchanged.
text = " " + "".join(str(page.get_text()) for page in doc)

In [None]:
# Drop leading and trailing whitespace from the extracted text.
text = text.strip()

In [None]:
# Collapse every run of whitespace (spaces, tabs, newlines) into a single
# space, so the text becomes one line.
text = ' '.join(text.split())

In [None]:
text

In [None]:
# Run the trained pipeline on the extracted resume text and print each
# predicted entity followed by its label. `doc` now refers to the spaCy
# result rather than the PDF.
doc = nlp(text)
for entity in doc.ents:
  print(f"{entity.text}    ->>>>>  {entity.label_}")