In [1]:
import json
import os
import json
import spacy
import random
import logging
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from spacy.util import minibatch, compounding

# from spacy.gold import GoldParse
from spacy.scorer import Scorer

In [10]:
# Show the installed spaCy version; the recorded output for this cell is 3.5.1.
print(spacy.__version__)

3.5.1


### convert_data_to_spacy 
Takes the path of a JSON file (one JSON document per line) as input and returns the training data in the format expected for training a spaCy model.

In [2]:
# to convert data 
def convert_data_to_spacy(JSON_FilePath):
    """Read a JSON-lines annotation file and return spaCy-style training tuples.

    Each non-blank line of the file must be a JSON object with a ``content``
    string and an ``annotation`` entry. ``annotation`` is either ``null`` or a
    list of objects carrying ``label`` (a single label or a list of labels)
    and ``points`` (only the first point is used, via its ``start``/``end``).

    Args:
        JSON_FilePath: path of the file to read (opened as UTF-8).

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples,
        or ``None`` if the file could not be read or parsed (the error is
        logged with its traceback).
    """
    try:
        training_data = []
        with open(JSON_FilePath, 'r', encoding='utf-8') as f:
            # Stream the file instead of loading every line into memory first.
            for line in f:
                # A blank line (e.g. a trailing newline) is not a record; skipping
                # it avoids a JSON error that would discard the whole data set.
                if not line.strip():
                    continue

                data = json.loads(line)
                text = data['content']
                entities = []

                # A record without annotations may store null; treat it as
                # "no entities" rather than failing the whole conversion.
                for annotation in data['annotation'] or []:
                    # Only the first point of each annotation is used.
                    point = annotation['points'][0]
                    labels = annotation['label']

                    # Accept either a single label or a list of labels.
                    if not isinstance(labels, list):
                        labels = [labels]

                    for label in labels:
                        # +1 turns the stored end into an exclusive end offset
                        # (assumes the source offsets are inclusive - TODO confirm).
                        entities.append((point['start'], point['end'] + 1, label))

                training_data.append((text, {"entities": entities}))

        return training_data

    except Exception as e:
        # Best-effort contract: report the failure and signal it with None.
        # str() keeps the handler itself from failing on non-str path objects.
        logging.exception("Unable to process " + str(JSON_FilePath) + "\n" + "error = " + str(e))
        return None


In [3]:
# Load the annotated resumes as (text, {"entities": [...]}) tuples.
# NOTE: convert_data_to_spacy returns None when the file cannot be processed.
train_data = convert_data_to_spacy("./input/training/Entity Recognition in Resumes.json")

In [4]:
def check_existing_model(model_name):
    """Report whether a spaCy model can be loaded under ``model_name``.

    The model is loaded once purely as an existence check; the loaded
    pipeline itself is discarded.

    Args:
        model_name: name or path handed to ``spacy.load``.

    Returns:
        ``model_name`` when loading succeeds (after printing
        "Model Exists. Updating the model"), otherwise ``None`` (after
        printing "Model by this name does not exist. Building a new one").
    """
    try:
        spacy.load(model_name)
    except Exception:
        # Any load failure is treated as "no such model".
        print("Model by this name does not exist. Building a new one")
        return None

    print("Model Exists. Updating the model")
    return model_name


In [5]:
# Check whether a model named "nlp_model" can be loaded; None means a new model is built.
# NOTE(review): build_spacy_model saves to "model", not "nlp_model" - confirm which name is intended.
model = check_existing_model("nlp_model")

Model by this name does not exist. Building a new one


In [19]:
# function to train the model
def build_spacy_model(train_data, model, n_iter=2, drop=0.2, output_dir="model"):
    """Train (or continue training) a spaCy v3 NER component and save the pipeline.

    Args:
        train_data: list of ``(text, {"entities": [(start, end, label), ...]})``
            tuples. The list itself is not modified.
        model: name or path handed to ``spacy.load``, or ``None`` to start
            from a blank English pipeline.
        n_iter: number of passes over the training data (default 2).
        drop: dropout rate passed to ``nlp.update`` (default 0.2).
        output_dir: directory the trained pipeline is written to
            (default ``"model"``).

    Returns:
        The trained ``nlp`` pipeline object.

    Raises:
        ValueError: if ``train_data`` is empty/None, or if none of the samples
            could be converted into a training example.
    """
    # Fail early with a clear message instead of an obscure error mid-training.
    if not train_data:
        raise ValueError("train_data is empty or None; nothing to train on")

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # In spaCy v3 components are added by their registered string name;
    # get_pipe only works for components that are already in the pipeline.
    ner_is_new = 'ner' not in nlp.pipe_names
    if ner_is_new:
        ner = nlp.add_pipe("ner", last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Register every label that occurs in the training data.
    for _, annotations in train_data:
        for ent in annotations.get('entities') or []:
            ner.add_label(ent[2])

    # nlp.update in v3 consumes Example objects, not raw texts and dicts.
    examples = _make_ner_examples(nlp, train_data)
    if not examples:
        raise ValueError("none of the training samples could be converted to spaCy examples")

    # Only the NER component is trained; every other pipe is disabled meanwhile.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    with nlp.select_pipes(disable=other_pipes):
        if model is None:
            optimizer = nlp.initialize(lambda: examples)
        else:
            if ner_is_new:
                # A freshly added component has no weights yet; initialise it
                # alone so the loaded components keep their trained weights.
                ner.initialize(lambda: examples, nlp=nlp)
            optimizer = nlp.resume_training()

        for itn in range(n_iter):
            print("Starting iteration " + str(itn))
            # Shuffles the local example list, not the caller's train_data.
            random.shuffle(examples)
            losses = {}
            failed = 0
            for example in examples:
                try:
                    nlp.update(
                        [example],  # batch of one example
                        drop=drop,  # dropout - make it harder to memorise data
                        sgd=optimizer,  # callable to update weights
                        losses=losses)
                except ValueError as err:
                    # Keep training on the remaining samples, but say so.
                    failed += 1
                    logging.warning("Update failed for one sample: %s", err)
            if failed:
                logging.warning("%d of %d updates failed in iteration %d",
                                failed, len(examples), itn)
            print(losses)

    nlp.to_disk(output_dir)
    return nlp


def _make_ner_examples(nlp, train_data):
    """Convert (text, annotations) tuples to spaCy Example objects, skipping invalid samples."""
    # Imported here so this cell does not depend on an edit to the import cell.
    from spacy.training import Example

    examples = []
    skipped = 0
    for text, annotations in train_data:
        try:
            examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        except ValueError as err:
            # Raised for annotations spaCy cannot represent, such as conflicting spans.
            skipped += 1
            logging.warning("Skipping sample with invalid annotations: %s", err)
    if skipped:
        logging.warning("Skipped %d of %d training samples", skipped, len(train_data))
    return examples

In [29]:
# Train (or update) the NER model; build_spacy_model returns the trained pipeline object.
# NOTE(review): this rebinds `model` from a model name/None to that pipeline object, so
# re-running this cell would hand the object to spacy.load - consider a separate name.
model = build_spacy_model(train_data, model)

In [31]:
from tika import parser

# Function for text conversion: extract the text of every PDF found under a directory.
def convert_pdf_to_text(dir):
    """Extract the text of every PDF found under ``dir`` (searched recursively).

    Args:
        dir: root directory to walk.

    Returns:
        A list with one text string per PDF that yielded text, in the order
        the files are visited. Files whose extension is not ``.pdf`` (compared
        case-insensitively) are ignored, and PDFs for which the parser returns
        no content are reported and skipped.
    """
    output = []
    for root, _dirs, files in os.walk(dir):
        print(files)
        for file in files:
            path_to_pdf = os.path.join(root, file)
            ext = os.path.splitext(path_to_pdf)[1]
            # Compare case-insensitively so "resume.PDF" is not silently skipped.
            if ext.lower() != '.pdf':
                continue

            print("Processing " + path_to_pdf)
            pdf_contents = parser.from_file(path_to_pdf, service='text')
            content = pdf_contents.get('content')
            if content is None:
                # Nothing was extracted; a None entry would only break whatever
                # consumes the returned texts, so leave it out.
                print("No text extracted from " + path_to_pdf)
                continue
            output.append(content)
    return output