When training a custom spaCy RoBERTa model, you cannot use JSON files directly as training input. The JSON files must first be converted to spaCy's .spacy format to be compatible with the training process.

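To do this, set the "data_folder" variable to the path of the folder that contains your annotated JSON files. Each file is expected to have an "annotations" list whose items are pairs of the form [text, {"entities": [[start, end, label], ...]}], where start and end are character offsets into text.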

In [2]:
import os
import json
import spacy
from spacy.tokens import DocBin

# Folder that holds the annotated JSON files to convert.
data_folder = "/content/drive/MyDrive/ResumeParser_FinalYear(Annotations)/AnnotatedResumes"

# A blank English pipeline: only its tokenizer is used (via make_doc).
nlp = spacy.blank("en")

# DocBin to store the examples
doc_bin = DocBin()

# Annotations that could not be turned into entities, reported at the end so
# that lost training data does not go unnoticed.
misaligned_spans = 0
overlapping_spans = 0

# Looping over all the JSON files in the folder. sorted() makes the order of
# the documents in the output reproducible (os.listdir order is arbitrary).
for filename in sorted(os.listdir(data_folder)):
    if not filename.endswith(".json"):
        continue

    # Explicit UTF-8: the entity offsets are character offsets, so decoding
    # the text with a different platform default would shift or corrupt them.
    with open(os.path.join(data_folder, filename), "r", encoding="utf-8") as f:
        data = json.load(f)

    # Processing each annotation in the file
    for text, annot in data['annotations']:
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot['entities']:
            span = doc.char_span(start, end, label=label)
            # char_span returns None when the offsets do not line up with
            # token boundaries; such annotations cannot be used as entities.
            if not span:
                misaligned_spans += 1
                continue
            ents.append(span)

        # doc.ents rejects overlapping spans with a ValueError, so resolve
        # overlaps first (filter_spans keeps the longest span of each clash).
        kept = spacy.util.filter_spans(ents)
        overlapping_spans += len(ents) - len(kept)
        doc.ents = kept
        doc_bin.add(doc)

if misaligned_spans or overlapping_spans:
    print(
        f"Skipped {misaligned_spans} misaligned and "
        f"{overlapping_spans} overlapping entity annotations."
    )

# Serialize all collected docs to the file used as training input
doc_bin.to_disk("./train.spacy")
