## LOADING AND PREPROCESSING DATA

In [1]:
import json
import re

# function to convert the raw json format into a more clean and acceptable format
def json_to_spacy(json_path):
    
    file = open(json_path, 'r',encoding='utf8')                       # reading JSON file
    lines = file.readlines()
    training_data = []  
    for line in lines:
        data = json.loads(line)                                       # converting JSON string into a dictionary
        text = data['content'].replace("\n", " ")                     # replacing '\n' with ' '
        entities = []
        data_annotations = data['annotation']
        
        if data_annotations is not None:
            for annotation in data_annotations:
                point = annotation['points'][0]
                labels = annotation['label']
                
                if not isinstance(labels, list):                     # converting labels to list if it is not a list
                    labels = [labels]
                    
                for lname in labels:
                    start = point['start']
                    end = point['end']
                    text = point['text']

                    left_diff = len(text) - len(text.lstrip())        
                    right_diff = len(text) - len(text.rstrip())
                    if left_diff != 0:                              # updating the start and end points based on strip diff.
                        start = start + left_diff
                    if right_diff != 0:
                        end = end - right_diff
                    entities.append((start, end + 1 , lname))
                    
        training_data.append((text, {"entities" : entities}))
        
    return training_data


# function to remove leading and trailing whitespaces from entity spans
def trim_space(data: list) -> list:

    invalid_token = re.compile(r'\s')             # setting whitespce as invalid token using regular expression

    cleaned_data = []                             # array for storing final cleaned data
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            
            #checking for invalid token in the start of entity
            while valid_start < len(text) and invalid_token.match(text[valid_start]):
                valid_start += 1
                
            #checking for invalid token at the end of entity  
            while len(text) > valid_end > 1 and invalid_token.match(text[valid_end - 1]):              
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
            
        cleaned_data.append((text,valid_entities))
    return cleaned_data

In [2]:
data = trim_space(json_to_spacy("Entity Recognition in Resumes.json"))
data[20]

('Govardhana K',
 [[1749, 1755, 'Companies worked at'],
  [1696, 1702, 'Companies worked at'],
  [1417, 1423, 'Companies worked at'],
  [1356, 1793, 'Skills'],
  [1209, 1215, 'Companies worked at'],
  [1136, 1247, 'Skills'],
  [928, 932, 'Graduation Year'],
  [858, 889, 'College Name'],
  [821, 856, 'Degree'],
  [787, 791, 'Graduation Year'],
  [744, 750, 'Companies worked at'],
  [722, 742, 'Designation'],
  [658, 664, 'Companies worked at'],
  [640, 656, 'Designation'],
  [574, 580, 'Companies worked at'],
  [555, 572, 'Designation'],
  [470, 493, 'Companies worked at'],
  [444, 468, 'Designation'],
  [308, 314, 'Companies worked at'],
  [234, 240, 'Companies worked at'],
  [175, 198, 'Companies worked at'],
  [93, 136, 'Email Address'],
  [39, 48, 'Location'],
  [13, 37, 'Designation'],
  [0, 12, 'Name']])

##  CREATING A SPACY OBJECT

In [None]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

def get_spacy_doc(data):
    nlp = spacy.blank("en") # load a new spacy model
    db = DocBin() # create a DocBin object

    for text, annot in tqdm(data): # data in previous format
        doc = nlp(text) # create doc object from text
        ents=[]
        entity_indices=[]
        for start, end, label in annot :
            # add character indexes
            skip_entity=False
            for idx in range(start,end):
                if idx in entity_indices:
                    skip_entity=True
                    break
            if skip_entity==True:
                continue
            entity_indices=entity_indices+list(range(start,end))
            
            try:
                span = doc.char_span(start, end, label=label)
            except:
                continue
            if span is None:
                continue
            else:
                ents.append(span)
            
        try:
            doc.ents=ents
            db.add(doc)
        except:
            pass
    return db

db=get_spacy_doc(data)
db.to_disk("./train.spacy") # save the docbin object

## TRAINING AND LOADING THE MODEL

In [None]:
!python -m spacy init fill-config base_config.cfg config.cfg

In [None]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [None]:
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./train.spacy 

In [3]:
import spacy
nlp = spacy.load("output/model-best/") 

## TESTING THE MODEL ON TRAINING DATA AND PARSING RESUMES

In [3]:
for i,_ in data:
    doc = nlp(i)
    print([(ent.text, ent.label_) for ent in doc.ents])

In [None]:
# Extracting text from pdf
from pdfminer.high_level import extract_text
def text_from_pdf(pdf):
    return extract_text(pdf)

# Taking resume as input
resume = text_from_pdf("Pulkit_Saxena_resume.pdf")


# Removing the punctuations
punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~•'''
for i in resume:
    if i in punctuations:
        resume = resume.replace(i, "")
        
# Obtaing cleaned text from pdf
final_text = str.join(" ", resume.splitlines())

from spacy import displacy
doc = nlp(final_text)
ans = [(ent.text, ent.label_) for ent in doc.ents]
displacy.render(doc,style='ent')

## MATCHING  --  GIVING SCORES TO RESUMES

In [3]:
company_list = open("Companies.txt","r")
company_string = company_list.readlines()[0]
degree_list=open("Degrees.txt","r")
degree_string=degree_list.readlines()[0]
skill_list=open("Skills.txt","r")
skill_string=skill_list.readlines()[0]
skill_string=skill_string.lower()

In [7]:
pref_skillS = 30
pref_degreeS = 10
pref_companiesS = 20

matched_skill=0
matched_degree=0
matched_companies=0
for i in ans:
    if i[1]=='Companies worked at':
        if i[0] in company_string:
            matched_companies+=1
    if i[1]=='Degree':
        if i[0] in degree_string:
            matched_degree+=1
    if i[1]=='Skills':
        if i[0].lower() in skill_string:
            matched_skill+=1


In [8]:
from sklearn.metrics.pairwise import cosine_similarity
similarity=cosine_similarity([[matched_skill,matched_degree,matched_companies]],[[pref_skillS,pref_degreeS,pref_companiesS]])
print(f"The input resume matches {similarity[0][0]*100:.2f} percent with the required job position.")

The input resume matches 53.45 percent with the required job position.
