# Named Entity Recognition

In [1]:

import json
import random
import logging
import spacy
from spacy.training import Example
from sklearn.metrics import classification_report, precision_recall_fscore_support, accuracy_score
from spacy.tokens import Span
import re
#from spacy.training import Scorer
from spacy.training import Example
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report
import fitz  # PyMuPDF



# Train with selected entities 

In [4]:
import spacy
from spacy.training import Example
import json
import re
import random
import logging

def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans."""
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            while valid_start < len(text) and invalid_span_tokens.match(text[valid_start]):
                valid_start += 1
            while valid_end > 1 and invalid_span_tokens.match(text[valid_end - 1]):
                valid_end -= 1
            if valid_start < valid_end:
                valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])

    return cleaned_data

def convert_dataturks_to_spacy(dataturks_JSON_FilePath, selected_entities):
    try:
        training_data = []
        with open(dataturks_JSON_FilePath, 'r') as f:
            lines = f.readlines()

        for line in lines:
            data = json.loads(line)
            text = data['content']
            entities = []
            for annotation in data['annotation']:
                point = annotation['points'][0]
                labels = annotation['label']
                if not isinstance(labels, list):
                    labels = [labels]

                for label in labels:
                    # Filter out only "Skills" and "Name" labels
                    if label in selected_entities:
                        entities.append((point['start'], point['end'] + 1, label))

            training_data.append((text, {"entities": entities}))

        return training_data
    except Exception as e:
        logging.exception("Unable to process " + dataturks_JSON_FilePath + "\n" + "error = " + str(e))
        return None

def keep_longest_entities(entities):
    """Remove overlapping entities by keeping only the longest one"""
    entities = sorted(entities, key=lambda x: (x[0], -(x[1] - x[0])))

    non_overlapping_entities = []
    
    for i, (start, end, label) in enumerate(entities):
        if i == 0:
            non_overlapping_entities.append((start, end, label))
        else:
            prev_start, prev_end, prev_label = non_overlapping_entities[-1]
            if start >= prev_end:
                non_overlapping_entities.append((start, end, label))
            else:
                if (end - start) > (prev_end - prev_start):
                    non_overlapping_entities[-1] = (start, end, label)

    return non_overlapping_entities


def train_spacy(selected_entities):
    TRAIN_DATA = convert_dataturks_to_spacy("../data/traindata.json",selected_entities)
    TRAIN_DATA = trim_entity_spans(TRAIN_DATA)
    nlp = spacy.blank('en')  # create blank Language class
    
    if 'ner' not in nlp.pipe_names:
        ner = nlp.add_pipe('ner', last=True)

    # Add only "Skills" and "Name" labels to the NER component
    for _, annotations in TRAIN_DATA:
        annotations['entities'] = keep_longest_entities(annotations['entities'])
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])  # Only adds "Skills" and "Name" labels

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # Only train NER
        optimizer = nlp.begin_training()
        for itn in range(10):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                nlp.update(
                    [example],
                    drop=0.3,
                    sgd=optimizer,
                    losses=losses
                )
            print(losses)

    return losses, nlp

# Train the model
#'College Name', 'Companies worked at', 'Degree', 'Designation', 'Email Address', 'Graduation Year', 'Location', 'Name', 'Skills', 'UNKNOWN', 'Years of Experience'
selected_entities = ["Skills", "Name"]
losses, nlp = train_spacy(selected_entities)


Starting iteration 0
{'ner': 5214.083728252095}
Starting iteration 1
{'ner': 791.2761714185349}
Starting iteration 2
{'ner': 727.1208322038265}
Starting iteration 3
{'ner': 742.1385025036657}
Starting iteration 4
{'ner': 677.9047111781462}
Starting iteration 5
{'ner': 694.0711231891801}
Starting iteration 6
{'ner': 753.6272058137095}
Starting iteration 7
{'ner': 648.5127004660479}
Starting iteration 8
{'ner': 614.0949430490323}
Starting iteration 9
{'ner': 593.9946133920737}


In [5]:
print("Pipeline components:", nlp.pipe_names)
# Accessing the NER component parameters
ner = nlp.get_pipe('ner')
print("NER labels:", ner.labels)

# You can also inspect the model's config settings
print("NER configuration:", ner.cfg)

Pipeline components: ['ner']
NER labels: ('Name', 'Skills')
NER configuration: {'moves': None, 'update_with_oracle_cut_size': 100, 'multitasks': [], 'min_action_freq': 1, 'learn_tokens': False, 'beam_width': 1, 'beam_density': 0.0, 'beam_update_prob': 0.0, 'incorrect_spans_key': None}


In [6]:
nlp.config

{'paths': {'train': None, 'dev': None, 'vectors': None, 'init_tok2vec': None},
 'system': {'seed': 0, 'gpu_allocator': None},
 'nlp': {'lang': 'en',
  'pipeline': ['ner'],
  'disabled': [],
  'before_creation': None,
  'after_creation': None,
  'after_pipeline_creation': None,
  'batch_size': 1000,
  'tokenizer': {'@tokenizers': 'spacy.Tokenizer.v1'},
  'vectors': {'@vectors': 'spacy.Vectors.v1'}},
 'components': {'ner': {'factory': 'ner',
   'incorrect_spans_key': None,
   'model': {'@architectures': 'spacy.TransitionBasedParser.v2',
    'state_type': 'ner',
    'extra_state_tokens': False,
    'hidden_width': 64,
    'maxout_pieces': 2,
    'use_upper': True,
    'tok2vec': {'@architectures': 'spacy.HashEmbedCNN.v2',
     'pretrained_vectors': None,
     'width': 96,
     'depth': 4,
     'embed_size': 2000,
     'window_size': 1,
     'maxout_pieces': 3,
     'subword_features': True},
    'nO': None},
   'moves': None,
   'scorer': {'@scorers': 'spacy.ner_scorer.v1'},
   'update_wi

In [7]:
# save the model for n interation
# spaCy model and it's stored in `nlp`
output_dir = "../model/NER_NLP_it10"
# Save the trained model to the output directory
nlp.to_disk(output_dir)
print(f"Model saved to {output_dir}")

Model saved to ../model/NER_NLP_it10


# NER training with validation set

In [8]:
# if you want to split a separate validation dataset train with the following code

In [None]:
import random
from sklearn.model_selection import train_test_split

# Function to split training data into train and validation sets
def split_data(data, test_size=0.2):
    train_data, val_data = train_test_split(data, test_size=test_size, random_state=42)
    return train_data, val_data

# Function to calculate validation loss
def evaluate_on_validation(nlp, val_data):
    val_losses = {}
    for text, annotations in val_data:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], drop=0.0, losses=val_losses)
    return val_losses

def train_spacy():
    TRAIN_DATA = convert_dataturks_to_spacy("../data/traindata.json")
    TRAIN_DATA = trim_entity_spans(TRAIN_DATA)

    # Split data into training and validation sets (80% train, 20% validation)
    train_data, val_data = split_data(TRAIN_DATA, test_size=0.1)

    nlp = spacy.blank('en')  # create blank Language class
    if 'ner' not in nlp.pipe_names:
        ner = nlp.add_pipe('ner', last=True)

    # Add labels
    for _, annotations in TRAIN_DATA:
        annotations['entities'] = keep_longest_entities(annotations['entities'])  # Keep only longest entities
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for itn in range(50):
            print(f"Starting iteration {itn}")
            random.shuffle(train_data)
            train_losses = {}
            
            # Training phase
            for text, annotations in train_data:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                nlp.update([example], drop=0.2, sgd=optimizer, losses=train_losses)

            # Validation phase (no training, just forward pass)
            val_losses = evaluate_on_validation(nlp, val_data)
            
            # Print losses for both training and validation sets
            print(f"Iteration {itn}:")
            print(f"Training Loss: {train_losses}")
            print(f"Validation Loss: {val_losses}")

    return nlp, val_losses, train_losses

# Train the model and return it
trained_nlp, val_losses, train_losses = train_spacy()

# Test the performance

In [None]:


def evaluate_model(nlp, test_data, selected_entities=None):
    y_true = []
    y_pred = []
    
    for text, annotations in test_data:
        doc = nlp(text)  # Apply the model to the test text
        example = Example.from_dict(nlp.make_doc(text), annotations)  # Create example using ground truth
        
        # Ensure the example's NER alignment is valid
        aligned_ner = example.get_aligned_ner()
        
        if aligned_ner is None:
            print(f"Warning: NER alignment failed for text: {text}")
            continue
        
        # Create a list of token-based true labels for the text
        true_labels = ["O"] * len(doc)  # Initialize with "O" (no entity)
        
        for start, end, label in annotations["entities"]:
            for token in doc:
                if token.idx >= start and token.idx + len(token) <= end:
                    true_labels[token.i] = label
        
        # Create a list of token-based predicted labels
        pred_labels = ["O"] * len(doc)
        for ent in doc.ents:
            for token in doc:
                if token.idx >= ent.start_char and token.idx + len(token) <= ent.end_char:
                    pred_labels[token.i] = ent.label_
        
        # Add the true and predicted labels to the global lists
        y_true.extend(true_labels)
        y_pred.extend(pred_labels)

    # Filter y_true and y_pred to only include selected entities
    if selected_entities is not None:
        y_true_filtered = [
            label if label in selected_entities or label == "O" else "O" 
            for label in y_true
        ]
        y_pred_filtered = [
            label if label in selected_entities or label == "O" else "O" 
            for label in y_pred
        ]
    else:
        y_true_filtered = y_true
        y_pred_filtered = y_pred

    # Calculate precision, recall, F1-score, and accuracy across only the selected labels
    print(classification_report(y_true_filtered, y_pred_filtered, zero_division=0))

    accuracy = accuracy_score(y_true_filtered, y_pred_filtered)
    print(f"Accuracy: {accuracy * 100:.2f}%")

# Load the trained model
output_dir = "../model/NER_NLP_it10"
nlp = spacy.load(output_dir)
print("Model loaded from", output_dir)

# Define the selected entities that you want to evaluate (e.g., only 'PERSON' and 'ORG')
selected_entities = ["Skills", "Name"]

# Test the model and evaluate it
examples = convert_dataturks_to_spacy("../data/testdata.json", selected_entities)
examples = trim_entity_spans(examples)
for _, annotations in examples:
    annotations['entities'] = keep_longest_entities(annotations['entities'])  # Keep only longest entities
# Call the evaluate_model function with the selected entities
evaluate_model(nlp, examples, selected_entities)
#evaluate_model(nlp, examples)


Model loaded from ../model/NER_NLP_it10


lecturer - oracle tutorials

Mumbai,..." with entities "[(0, 13, 'Name'), (973, 1703, 'Skills')]". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.


              precision    recall  f1-score   support

        Name       0.95      0.97      0.96        39
           O       0.97      0.99      0.98     12285
      Skills       0.86      0.64      0.73      1184

    accuracy                           0.96     13508
   macro avg       0.92      0.87      0.89     13508
weighted avg       0.96      0.96      0.96     13508

Accuracy: 95.88%


In [None]:
# show the testing performance with the testdata.json
with open("../data/testdata.json", 'r') as f:
    test_data = []
    for line in f:
        test_data.append(json.loads(line))
# load the trained model
output_dir = "../model/NER_NLP_it10"
nlp = spacy.load(output_dir)
print("Model loaded from", output_dir)

for text in test_data[1:6]:
    content = text['content']
    #content = ' '.join(content.split())
    doc = nlp(content)  # Apply the model to the test text
    entities = [(ent.label_, ent.text) for ent in doc.ents if ent.label_ == "Skills"]  # Extracting entities and their labels
    print(entities)

Model loaded from ../model/NER_NLP_it10
[('Skills', 'Database (Less than 1 year), HTML (Less than 1 year), Linux. (Less than 1 year), MICROSOFT\nACCESS (Less than 1 year), MICROSOFT WINDOWS (Less than 1 year)')]
[('Skills', 'servicenow (1 year), Mainframe (3 years), cobol (3 years), Jcl (3 years), Teradata (3 years)')]
[('Skills', 'Database (3 years), SQL (3 years), Sql Dba')]
[('Skills', 'SEARCH ENGINE MARKETING (2 years), SEM (2 years), ACCESS (Less than 1 year), AJAX (Less\nthan 1 year), APACHE (Less than 1 year)')]
[('Skills', 'JAVA (1 year), C++ (Less than 1 year), Hadoop (Less than 1 year), HADOOP (Less than 1 year),\nCSS (Less than 1 year)')]


# Test the model with real pdf

In [None]:
from pdfminer.high_level import extract_text
import os
import json

# Function to extract text from a PDF using pdfminer.six
def extract_text_from_pdf(pdf_path):
    try:
        # Use pdfminer.six's extract_text function to get the text from the PDF
        text = extract_text(pdf_path)
        return text.strip()
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""

# Function to format the extracted text in the same format as testdata.json
def format_extracted_text(pdf_text, filename):
    return {"content": pdf_text, "annotation": [{"label": [], "points": []}]}

# Main function to extract text from multiple PDFs in a folder and save to a JSON file
def extract_text_from_pdfs_in_folder(folder_path, output_json_path):
    extracted_data = []
    
    with open(output_json_path, 'w') as output_file:
        for filename in os.listdir(folder_path):
            if filename.endswith(".pdf"):
                pdf_path = os.path.join(folder_path, filename)
                print(f"Extracting text from: {pdf_path}")
                
                # Extract text from the PDF
                pdf_text = extract_text_from_pdf(pdf_path)
                
                if pdf_text:  # Only process if text was extracted successfully
                    formatted_data = format_extracted_text(pdf_text, filename)
                    
                    # Dump each resume as a single line
                    json.dump(formatted_data, output_file, ensure_ascii=False)
                    output_file.write("\n")  # Add a newline after each resume's data
    
    print(f"Text extraction completed. Data saved to {output_json_path}")

In [None]:

extract_text_from_pdfs_in_folder('../data/Resume_data_pdf', '../data/Resume_data_pdf/resume_data.json')

Extracting text from: ../data/Resume_data_pdf/Deemah_Alabdulaali_Resume.pdf
Extracting text from: ../data/Resume_data_pdf/Ali Abuharb's CV.pdf
Extracting text from: ../data/Resume_data_pdf/Whitmore-resume.pdf
Extracting text from: ../data/Resume_data_pdf/resume_juanjosecarin.pdf
Text extraction completed. Data saved to ../data/Resume_data_pdf/resume_data.json


In [None]:
# load the trained model
output_dir = "../model/NER_NLP_it10"
nlp = spacy.load(output_dir)
print("Model loaded from", output_dir)
with open("../data/Resume_data_pdf/resume_data.json", 'r') as f:
    resume_data = []
    for line in f:
        resume_data.append(json.loads(line))
        
# Iterate through the resume entries and print the content
for idx, entry in enumerate(resume_data):
    print(f"Resume {idx + 1}:")
    #print(entry['content'])  # Print the cleaned resume content
    doc = nlp(entry['content'])  # Apply the model to the test text
    entities = [(ent.label_, ent.text) for ent in doc.ents]  # Extracting entities and their labels
    print(entities)
    print("\n")  # Add a blank line between resumes

Model loaded from ../model/NER_NLP_it10
Resume 1:
[('Name', 'Ali Ibrahim'), ('Skills', 'Technical Skills: \n\n•  Deep Learning, Machine Learning, NLP, \n\nComputer Vision \n\n•  Python \n\n•  PySpark \n\n• \n\nSQL, T-SQL, PL-SQL \n\n•  Alteryx \n\n•  Data Quality \n\n•  Data Engineering \n\n•  Prompt Engineering \n\n•  HTML5, CSS3 \n\n•  Google Analytics \n\n•  Minitab \n\n•  Anylogic \n\n•  Microsoft PowerBi, Tableau \n\n•  Back-End Development \n\n•  Data Analytical/Calculation Engines \n\n• \n\nStatistics')]


Resume 2:
[('Name', 'Jonathan Whitmore'), ('Skills', 'Languages Python, SQL (Impala/Hive), R, LATEX, Bash.')]


Resume 3:
[('Name', 'Mountain View')]




In [None]:
# keep only the name, skills, 'Degree' 'Years of Experience' from 
# ('College Name', 'Companies worked at', 'Degree', 'Designation', 'Email Address', 'Graduation Year',
#  'Location', 'Name', 'Skills', 'UNKNOWN', 'Years of Experience')