# Generate own Spacy NER tag categories

In [1]:
# Basic categories

objective = {'profile','career goal','objective','career objective','employment objective','professional objective','summary',
             'career summary', 'professional summary','personal statement'}

work_and_employment=  { 'employment history','work history','work experience','professional experience','professional background',
        'career related experience','employment','career experience','career summary','career history','professional proficiency'}
        
education_and_training= {'academic background','academic experience','programs','courses','related courses','education',
        'educational background','educational qualifications','educational training','education and training','training','academic training',
        'professional training','course project experience','related course projects','internship experience','internships','apprenticeships',
        'college activities','certifications','special training','qualifications','degree'}

skills = {'areas of experience','areas of expertise','areas of knowledge','skills',"other skills",
        "other abilities",'career related skills','professional skills','specialized skills','technical skills','computer skills',
        'personal skills','computer knowledge','technologies','technical experience','proficiencies','languages','language competencies and skills',
        'programming languages','competencies'}

misc={'activities','affiliations','professional affiliations','associations','professional associations',
        'memberships','professional memberships','athletic involvement','community involvement','referee','civic activities',
        'extra-curricular activities','professional activities','volunteer work','additional information',
        'interests','volunteer','volunteering','community'}

accomplishments={'achievement','licenses','presentations','conference presentations','conventions','dissertations','exhibits',
        'papers','publications','professional publications','research','research grants','project','research projects','personal projects',
        'current research interests','thesis','theses','projects'}
linkedin = {'linkedin'}

In [2]:
# create the training data into the right format from the objective list
# takes in the predefined dictionaries from above
def create_training_data(dict_inp, type): 
    
    data = dict_inp
    
    patterns = []
    for item in data:
        pattern = {
                    "label": type,
                    "pattern": item
                    }
        patterns.append(pattern)
    return (patterns)

In [3]:
patterns = create_training_data(objective,'OBJECTIVE')
print(patterns)
patterns_w = create_training_data(work_and_employment,'WORK_AND_EMPLOYMENT')
print(patterns_w)
patterns_e = create_training_data(education_and_training,'EDUCATION_AND_TRAINING')
print(patterns_e)
patterns_sk = create_training_data(skills,'SKILLS')
print(patterns_sk)
patterns_m = create_training_data(misc,'MISC')
print(patterns_m)
patterns_a = create_training_data(accomplishments,'ACCOMPLISHMENTS')
print(patterns_a)
patterns_l = create_training_data(linkedin,'LINKEDIN')
print(patterns_l)

[{'label': 'OBJECTIVE', 'pattern': 'career objective'}, {'label': 'OBJECTIVE', 'pattern': 'objective'}, {'label': 'OBJECTIVE', 'pattern': 'profile'}, {'label': 'OBJECTIVE', 'pattern': 'professional summary'}, {'label': 'OBJECTIVE', 'pattern': 'career summary'}, {'label': 'OBJECTIVE', 'pattern': 'employment objective'}, {'label': 'OBJECTIVE', 'pattern': 'career goal'}, {'label': 'OBJECTIVE', 'pattern': 'summary'}, {'label': 'OBJECTIVE', 'pattern': 'personal statement'}, {'label': 'OBJECTIVE', 'pattern': 'professional objective'}]
[{'label': 'WORK_AND_EMPLOYMENT', 'pattern': 'work history'}, {'label': 'WORK_AND_EMPLOYMENT', 'pattern': 'career history'}, {'label': 'WORK_AND_EMPLOYMENT', 'pattern': 'career related experience'}, {'label': 'WORK_AND_EMPLOYMENT', 'pattern': 'professional background'}, {'label': 'WORK_AND_EMPLOYMENT', 'pattern': 'employment'}, {'label': 'WORK_AND_EMPLOYMENT', 'pattern': 'career summary'}, {'label': 'WORK_AND_EMPLOYMENT', 'pattern': 'professional proficiency'},

In [4]:
combine_dict = patterns+patterns_w+patterns_e+patterns_sk+patterns_m #+patterns_a+patterns_l
print(combine_dict)

[{'label': 'OBJECTIVE', 'pattern': 'career objective'}, {'label': 'OBJECTIVE', 'pattern': 'objective'}, {'label': 'OBJECTIVE', 'pattern': 'profile'}, {'label': 'OBJECTIVE', 'pattern': 'professional summary'}, {'label': 'OBJECTIVE', 'pattern': 'career summary'}, {'label': 'OBJECTIVE', 'pattern': 'employment objective'}, {'label': 'OBJECTIVE', 'pattern': 'career goal'}, {'label': 'OBJECTIVE', 'pattern': 'summary'}, {'label': 'OBJECTIVE', 'pattern': 'personal statement'}, {'label': 'OBJECTIVE', 'pattern': 'professional objective'}, {'label': 'WORK_AND_EMPLOYMENT', 'pattern': 'work history'}, {'label': 'WORK_AND_EMPLOYMENT', 'pattern': 'career history'}, {'label': 'WORK_AND_EMPLOYMENT', 'pattern': 'career related experience'}, {'label': 'WORK_AND_EMPLOYMENT', 'pattern': 'professional background'}, {'label': 'WORK_AND_EMPLOYMENT', 'pattern': 'employment'}, {'label': 'WORK_AND_EMPLOYMENT', 'pattern': 'career summary'}, {'label': 'WORK_AND_EMPLOYMENT', 'pattern': 'professional proficiency'}, 

In [5]:
import spacy
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
import json

# create a new NER pattern based on objective and saves them in resume_ner 
def generate_rules(patterns):
    #nlp = English()
    nlp = spacy.load('en_core_web_lg')
    ruler = EntityRuler(nlp)
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    nlp.to_disk("resume_ner")

In [6]:
nlp_res = spacy.load('resume_ner')

In [7]:
# now generte the rules based on combined dictionary
generate_rules(combine_dict)

In [8]:
# a simple functon to print out the entities in a text string
def print_doc_ents(text):
    
    doc = nlp_res(text.lower()) # incase it isn't already lower case    
    
    #for token in doc:
    #    if(token.pos_print(token.text,'->',token.pos_)
    #ind = {}
    ind_list = []
    for ent in doc.ents:
        if(ent.label_=='OBJECTIVE'):
            print(ent.text, ent.label_,ent.start, ent.end)
            ind_list.append((ent.text,ent.start))
        elif(ent.label_=='WORK_AND_EMPLOYMENT'):
            #print(token.tag_)
            print(ent.text, ent.label_, ent.start,ent.end) #_char-ent.sent.start_char, ent.end_char-ent.sent.start_char)
            ind_list.append((ent.text,ent.start))
        elif(ent.label_=='EDUCATION_AND_TRAINING'):
            print(ent.text, ent.label_, ent.start, ent.end)
            ind_list.append((ent.text,ent.start))
        elif(ent.label_=='SKILLS_HEADER'):
            print(ent.text, ent.label_, ent.start, ent.end)
            ind_list.append((ent.text,ent.start))
        elif(ent.label_=='MISC'):
            print(ent.text, ent.label_, ent.start, ent.end)
            ind_list.append((ent.text,ent.start))
        elif(ent.label_=='ACCOMPLISHMENTS'):
            print(ent.text, ent.label_, ent.start, ent.end)
            ind_list.append((ent.text,ent.start))
                    
    print(sorted(ind_list))

In [9]:
from pdfminer.high_level import extract_text

def extract_text_from_pdf(pdf_path):
    text = extract_text(pdf_path)
    #for line in text:
    #    print(line)
    #print(text)
    return text 


In [10]:
peter_text = extract_text_from_pdf('/home/chris/reesby/reverse_Malih/new_resumes/pdfs/Peter-Ziminovic-Resume.pdf')

In [11]:
peter_text = peter_text.replace('\n',' ')
peter_text = peter_text.replace('\t',' ')
peter_text = peter_text.replace('\uf0b7',' ')
peter_text = peter_text.lower()

In [12]:
peter_text



In [13]:
print_doc_ents(peter_text)

personal statement OBJECTIVE 19 21
education EDUCATION_AND_TRAINING 145 146
degree EDUCATION_AND_TRAINING 179 180
education EDUCATION_AND_TRAINING 232 233
training EDUCATION_AND_TRAINING 249 250
career history WORK_AND_EMPLOYMENT 340 342
employment WORK_AND_EMPLOYMENT 408 409
training EDUCATION_AND_TRAINING 558 559
programs EDUCATION_AND_TRAINING 697 698
community MISC 875 876
referee MISC 880 881
referee MISC 884 885
[('career history', 340), ('community', 875), ('degree', 179), ('education', 145), ('education', 232), ('employment', 408), ('personal statement', 19), ('programs', 697), ('referee', 880), ('referee', 884), ('training', 249), ('training', 558)]


In [14]:
doc = nlp_res(peter_text)

In [15]:
doc[19:21]

personal statement

In [16]:
doc[19:144]

personal statement  i was employed at the bureau of meteorology (bom) for 27 years as an operational  meteorologist and was made redundant in december 2020. i am currently completing a  masters in data science (expected completion august 2021).  i am currently doing a data science internship at reeby.  in my previous role i  provided direct advice to air traffic controllers and airlines on the  current and expected weather at sydney airport. i have extensive knowledge of aviation  meteorology and how that affects airline operations. i also used my extensive computing  knowledge to develop and maintain several oracle databases tables and java based web  applications.   

In [17]:
doc[145:180]

education history demonstrates the drive i have to keep my skills up to date and i wish  to continue learning and challenging myself. i am currently current completing a master’s  degree

In [18]:
doc[232:278]

education   hsc barker college, hornsby, completed 1989     dip met. bureau of meteorology training centre, completed 1994   grad dip computing monash university, completed 2003   master of data science - james cook university – expected august 2021.  

In [19]:
doc[340:558]

career history  reesby  at reesby i have worked on 2 projects. the first was an emotional recognition system that  would split up and interview into the speakers and then perform emotional recognition  based on the separated audio and video of that interview. the current project involves extracting information from a persons resume and tehn using  that information to match with potential employment opportunities. both projects have been developed using the python programming language.  may 2021-present  bureau of meteorology  operational weather forecaster for nsw 1995-2000. antarctic weather forecaster - casey station.          summer 1997 – 1998                   january 1994 – december 2020  
senior forecaster - sydney airport meteorological unit    2000-2020     in my day to day operations, i must provide:  o accurate and timely weather forecasts for sydney airport working on 12-hour  o make decisive and urgent decision-making action, as a result of developing   o talking to custom

In [20]:
doc[558:875]



In [21]:
doc[875:]

community organisations level 3 football referee - kuringai district referee association player/manager - west pymble football club teams  player - gordon golf club            2012-present          2007-present          2007-present  professional references  

# now to load lots of resumes and see if the categorisation works.

In [22]:
from tika import parser
import os,glob
import pandas as pd


def convert_pdf_to_text(dir):
    output = []
    #output = pd.DataFrame()
    #for root, dirs, files in os.walk(dir):
    #    print(files)
    res_list = sorted(glob.glob(dir),key=os.path.getmtime)
    #print('from convert to pdf - ', res_list)
    for res in res_list:
    

#        for file in files:
        #path_to_pdf = os.path.join(root, file)
        [stem, ext] = os.path.splitext(res)#path_to_pdf)
        if ext == ".pdf":
            print("Processing " + res)#  path_to_pdf)
            pdf_contents = parser.from_file(res, service="text") 
            path_to_txt = stem + ".txt"
            pdf_remove_newline = pdf_contents["content"].replace("\n"," ")
            output.append(pdf_remove_newline) #pdf_contents["content"])
    
    df = pd.DataFrame(output, columns=["Resumes"])
    df.to_csv("/home/chris/reesby/reverse_Malih/new_resumes/pdfs/All_new.csv")
    return print_doc_ents(' '.join(output))

In [23]:
convert_pdf_to_text('/home/chris/reesby/reverse_Malih/new_resumes/pdfs/*.*')

Processing /home/chris/reesby/reverse_Malih/new_resumes/pdfs/Resume 2.pdf
Processing /home/chris/reesby/reverse_Malih/new_resumes/pdfs/Emily Loughlin Online Sales Representative Resume.pdf
Processing /home/chris/reesby/reverse_Malih/new_resumes/pdfs/Francesca Purcell Online Sales Representative Resume.pdf
Processing /home/chris/reesby/reverse_Malih/new_resumes/pdfs/Joshua Nicholson Online Sales Representative Resume.pdf
Processing /home/chris/reesby/reverse_Malih/new_resumes/pdfs/Jon-Michael Parr Online Sales Representative Resume.pdf
Processing /home/chris/reesby/reverse_Malih/new_resumes/pdfs/Mathew Milanese Online Sales Representative Resume.pdf
Processing /home/chris/reesby/reverse_Malih/new_resumes/pdfs/Sarah Musgrave Online Sales Representative Resume.pdf
Processing /home/chris/reesby/reverse_Malih/new_resumes/pdfs/Genovev Biddle Online Sales Representative Resume.pdf
Processing /home/chris/reesby/reverse_Malih/new_resumes/pdfs/Martin Howells Online Sales Representative Resume.pd

In [236]:
from spacy import displacy
displacy.render(nlp_res(doc.text), style='ent', jupyter=True)

In [26]:
import torch
from transformers import BertTokenizer, BertForTokenClassification,pipeline

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
#import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt
%matplotlib inline

#model_name = 'manishiitg/distilbert-resume-parts-classify'

# Load pre-trained model tokenizer (vocabulary)
#tokenizer = BertTokenizer.from_pretrained(model)

model_name_or_path = 'manishiitg/distilbert-resume-parts-classify'
tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
model = BertForTokenClassification.from_pretrained(model_name_or_path)  # Pytorch
# model = TFAutoModelForTokenClassification.from_pretrained(model_name_or_path)  # Tensorflow

nlp = pipeline("ner", model=model, tokenizer=tokenizer)

ner_results = nlp(peter_text)

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at manishiitg/distilbert-resume-parts-classify were not used when initializing BertForTokenClassification: ['distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.3.ffn.lin2.weight', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.3.attention.k_lin.bias', 'distilbert.transformer.layer.1.attention.out_lin.weight', 'distilbert.transformer.layer.2.attention.v_lin.weight', 'distilbert.transformer.layer.5.attention.v_lin.bias', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.transformer.layer.0.ffn.lin2.weight', 'distilbert.transformer.layer.4.attention.k_lin.weight', 'distilbert.transformer.layer.0.output_layer_norm.weight', 'distilbert.transformer.layer.4.attention.k_lin.bias', 'distilbert.transformer.layer.5.attention.q_lin

In [27]:
#print(ner_results)
for ent in ner_results:
    print(ent["entity"],ent["word"])#.text, ent.label_)

LABEL_7 date
LABEL_7 :
LABEL_0 20th
LABEL_7 june
LABEL_6 2020
LABEL_5 name
LABEL_10 :
LABEL_7 peter
LABEL_6 z
LABEL_9 ##imo
LABEL_7 ##nov
LABEL_7 ##ic
LABEL_6 email
LABEL_1 :
LABEL_10 stormy
LABEL_0 _
LABEL_7 pete
LABEL_0 _
LABEL_5 1
LABEL_0 @
LABEL_1 yahoo
LABEL_5 .
LABEL_0 com
LABEL_4 phone
LABEL_7 :
LABEL_1 +
LABEL_10 61
LABEL_7 404
LABEL_7 123
LABEL_7 333
LABEL_7 personal
LABEL_7 statement
LABEL_7 i
LABEL_0 was
LABEL_7 employed
LABEL_0 at
LABEL_1 the
LABEL_6 bureau
LABEL_1 of
LABEL_1 meteor
LABEL_4 ##ology
LABEL_7 (
LABEL_0 bo
LABEL_5 ##m
LABEL_6 )
LABEL_7 for
LABEL_7 27
LABEL_0 years
LABEL_4 as
LABEL_9 an
LABEL_7 operational
LABEL_1 meteor
LABEL_0 ##ologist
LABEL_4 and
LABEL_1 was
LABEL_0 made
LABEL_6 redundant
LABEL_6 in
LABEL_7 december
LABEL_6 2020
LABEL_2 .
LABEL_6 i
LABEL_9 am
LABEL_7 currently
LABEL_7 completing
LABEL_4 a
LABEL_0 masters
LABEL_6 in
LABEL_10 data
LABEL_0 science
LABEL_1 (
LABEL_7 expected
LABEL_10 completion
LABEL_4 august
LABEL_6 2021
LABEL_7 )
LABEL_0 .
LAB

In [28]:
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification  # for pytorch
#from transformers import TFAutoModelForTokenClassification  # for tensorflow
from transformers import pipeline


model_name_or_path = 'manishiitg/distilbert-resume-parts-classify'
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForTokenClassification.from_pretrained(model_name_or_path)  # Pytorch

nlp = pipeline("ner", model=model, tokenizer=tokenizer)

ner_results = nlp(peter_text)



Some weights of the model checkpoint at manishiitg/distilbert-resume-parts-classify were not used when initializing DistilBertForTokenClassification: ['pre_classifier.bias', 'pre_classifier.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [29]:
#print(ner_results)
for ent in ner_results:
    print(ent["entity"],ent["word"])#.text, ent.label_)

LABEL_7 date
LABEL_7 :
LABEL_7 20th
LABEL_7 june
LABEL_7 2020
LABEL_7 name
LABEL_7 :
LABEL_3 peter
LABEL_3 z
LABEL_8 ##imo
LABEL_3 ##nov
LABEL_3 ##ic
LABEL_11 email
LABEL_7 :
LABEL_7 stormy
LABEL_7 _
LABEL_7 pete
LABEL_7 _
LABEL_3 1
LABEL_8 @
LABEL_8 yahoo
LABEL_8 .
LABEL_8 com
LABEL_7 phone
LABEL_7 :
LABEL_7 +
LABEL_7 61
LABEL_7 404
LABEL_7 123
LABEL_3 333
LABEL_7 personal
LABEL_11 statement
LABEL_11 i
LABEL_3 was
LABEL_3 employed
LABEL_3 at
LABEL_3 the
LABEL_7 bureau
LABEL_7 of
LABEL_7 meteor
LABEL_7 ##ology
LABEL_3 (
LABEL_7 bo
LABEL_7 ##m
LABEL_11 )
LABEL_3 for
LABEL_3 27
LABEL_7 years
LABEL_7 as
LABEL_7 an
LABEL_3 operational
LABEL_7 meteor
LABEL_7 ##ologist
LABEL_3 and
LABEL_3 was
LABEL_3 made
LABEL_3 redundant
LABEL_7 in
LABEL_3 december
LABEL_3 2020
LABEL_7 .
LABEL_11 i
LABEL_11 am
LABEL_7 currently
LABEL_3 completing
LABEL_3 a
LABEL_7 masters
LABEL_7 in
LABEL_7 data
LABEL_7 science
LABEL_3 (
LABEL_3 expected
LABEL_3 completion
LABEL_7 august
LABEL_3 2021
LABEL_7 )
LABEL_7 .
LA

In [None]:
["awards", "certifications", "education_", "exp_", "extra", "hobbies", "personal_", "projects_", "references", "skills", "summary", "training"]

In [30]:
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification  # for pytorch
from transformers import TFAutoModelForTokenClassification  # for tensorflow
from transformers import pipeline


model_name_or_path = 'manishiitg/resume-ner'
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForTokenClassification.from_pretrained(model_name_or_path)  # Pytorch
# model = TFAutoModelForTokenClassification.from_pretrained(model_name_or_path)  # Tensorflow

nlp = pipeline("ner", model=model, tokenizer=tokenizer)

ner_results = nlp(peter_text)


In [31]:
#ner_results = nlp(peter_text)
for ent in ner_results:
    print(ent["entity"],ent["word"])

Phone 123
ORG bureau
ORG of
ORG ##ology
ORG bo
ORG ##m
ExperianceYears 27
ExperianceYears years
Designation ##ologist
DATE december
DATE 2020
EducationDegree masters
DATE august
DATE 2021
EducationDegree master
ExperianceYears 16
ExperianceYears years
DATE 2021


In [None]:
martin_text = extract_text_from_pdf('/home/chris/reesby/reverse_Malih/new_resumes/pdfs/Martin Howells Online Sales Representative Resume.pdf')
#print(donna_text)
martin_text = martin_text.replace('\n',' ')
martin_text = martin_text.replace('\t',' ')
martin_text = martin_text.lower()

with open('/home/chris/reesby/reverse_Malih/new_resumes/txts/martin_howells.txt','w') as f:
    f.write(martin_text)
f.close()

In [None]:
ner_results = nlp(martin_text)
for ent in ner_results:
    print(ent["entity"],ent["word"])

In [None]:
jon_text = extract_text_from_pdf('/home/chris/reesby/reverse_Malih/new_resumes/pdfs/Jon-Michael Parr Online Sales Representative Resume.pdf')
#print(donna_text)
jon_text = jon_text.replace('\n',' ')
jon_text = jon_text.replace('\t',' ')
jon_text = jon_text.lower()

with open('/home/chris/reesby/reverse_Malih/new_resumes/txts/jon_michael_parr','w') as f:
    f.write(jon_text)
f.close()


In [128]:
donna_text = extract_text_from_pdf('/home/chris/reesby/reverse_Malih/new_resumes/pdfs/Donna Harvey Online Sales Representative Resume.pdf')
#print(donna_text)
donna_text = donna_text.replace('\n',' ')
donna_text = donna_text.replace('\t',' ')
donna_text = donna_text.lower()

with open('/home/chris/reesby/reverse_Malih/new_resumes/txts/donna_harvey.txt','w') as f:
    f.write(donna_text)
f.close()

In [None]:
peter_text = extract_text_from_pdf('/home/chris/reesby/reverse_Malih/new_resumes/pdfs/Peter-Ziminovic-Resume.pdf')
peter_text = peter_text.replace('\n',' ')
peter_text = peter_text.replace('\t',' ')
peter_text = peter_text.replace('\uf0b7',' ')
peter_text = peter_text.lower()

with open('/home/chris/reesby/reverse_Malih/new_resumes/txts/peter_ziminovic.txt','w') as f:
    f.write(peter_text)
f.close()

In [33]:
from tika import parser
import os
import glob
#import emoji
#import pandas as pd

'''
Takes a directory as input and then converts all pdfs to text and saves in .txt files
'''
def convert_pdf_to_text_file(dir):
    output = []
    dir_files = os.path.join(dir,'*.*') # need to add in *.* to search for all files
    res_list = sorted(glob.glob(dir_files),key=os.path.getmtime)
    #print('from convert to pdf - ', res_list)
    for res in res_list:
        [stem, ext] = os.path.splitext(res)    #path_to_pdf)
        #process pdf files with tika 
        if (ext == ".pdf"):
            print("Processing " + res)#  path_to_pdf)
            pdf_contents = parser.from_file(res, service="text") 
            path_to_txt = stem + ".txt"
            pdf_remove_newline = pdf_contents["content"].replace("\n"," ").lower()
            #pdf_remove_newline = emoji.get_emoji_regexp().sub("", pdf_remove_newline)
            # write text to filename.txt
            with open(path_to_txt,'w') as f:                
                print('Text writen to ',path_to_txt)
                f.write(pdf_remove_newline)
                f.close()
        elif (ext ==".doc"):
            # add this in later to convert doc files
            pass 

In [34]:
convert_pdf_to_text_file('/home/chris/reesby/reverse_Malih/new_resumes/pdfs/')

Processing /home/chris/reesby/reverse_Malih/new_resumes/pdfs/Resume 2.pdf
Text writen to  /home/chris/reesby/reverse_Malih/new_resumes/pdfs/Resume 2.txt
Processing /home/chris/reesby/reverse_Malih/new_resumes/pdfs/Emily Loughlin Online Sales Representative Resume.pdf
Text writen to  /home/chris/reesby/reverse_Malih/new_resumes/pdfs/Emily Loughlin Online Sales Representative Resume.txt
Processing /home/chris/reesby/reverse_Malih/new_resumes/pdfs/Francesca Purcell Online Sales Representative Resume.pdf
Text writen to  /home/chris/reesby/reverse_Malih/new_resumes/pdfs/Francesca Purcell Online Sales Representative Resume.txt
Processing /home/chris/reesby/reverse_Malih/new_resumes/pdfs/Joshua Nicholson Online Sales Representative Resume.pdf
Text writen to  /home/chris/reesby/reverse_Malih/new_resumes/pdfs/Joshua Nicholson Online Sales Representative Resume.txt
Processing /home/chris/reesby/reverse_Malih/new_resumes/pdfs/Jon-Michael Parr Online Sales Representative Resume.pdf
Text writen to 

In [35]:
import spacy
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
import json

# create a new NER pattern based on objective and saves them in resume_ner 
def generate_rules(patterns):
    #nlp = English()
    nlp = spacy.load('en_core_web_lg')
    ruler = EntityRuler(nlp)
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    nlp.to_disk("resume_ner")

In [94]:
import json
labeled_data = []
with open(r"/home/chris/reesby/reverse_Malih/Doccano/all.jsonl", "r") as read_file:
#with open(r"/home/chris/reesby/reverse_Malih/python/Resume-Parser-master/Entity_Recognition_in_Resumes.json", "r") as read_file:
    for line in read_file:
        data = json.loads(line)
        labeled_data.append(data)
#print(labeled_data)

In [96]:
TRAINING_DATA = []
for entry in labeled_data:
    
    entities = []
    #print(entry['label'])
    for e in entry["label"]:
        entities.append((e[0], e[1],e[2]))
    spacy_entry = (entry['label'], {"entities": entities})
    TRAINING_DATA.append(spacy_entry)

KeyError: 'points'

In [79]:

def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Args:
    data (list): The data to be cleaned in spaCy JSON format.

    Returns:
    list: The cleaned data.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            # if there's preceding spaces, move the start position to nearest character
            while valid_start < len(text) and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > 1 and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])
    return cleaned_data


In [80]:
trim_entity_spans(TRAINING_DATA)

TypeError: expected string or bytes-like object

In [67]:
print(TRAINING_DATA)

[([[6, 20, 'DATE'], [28, 43, 'PERSON'], [82, 97, 'PHONE'], [1155, 1186, 'EDUCATION'], [1187, 1207, 'INSTITUTION'], [1237, 1240, 'EDUCATION'], [1241, 1255, 'INSTITUTION'], [1285, 1293, 'EDUCATION'], [1294, 1331, 'INSTITUTION'], [1350, 1368, 'EDUCATION'], [1369, 1386, 'INSTITUTION'], [1405, 1427, 'EDUCATION'], [1430, 1451, 'INSTITUTION'], [1650, 1653, 'SKILLS'], [1808, 1814, 'SKILLS'], [1819, 1820, 'SKILLS'], [1861, 1867, 'ADDRESS'], [1872, 1878, 'ADDRESS'], [2354, 2375, 'ADDRESS'], [2377, 2407, 'SKILLS'], [2427, 2455, 'SKILLS'], [2550, 2567, 'SKILLS'], [2570, 2604, 'ADDRESS'], [3067, 3085, 'SKILLS'], [3191, 3214, 'COMPUTING_SKILLS'], [4854, 4878, 'ACCOMPLISHMENTS'], [4881, 4918, 'INSTITUTION'], [4919, 4961, 'ACCOMPLISHMENTS'], [4969, 4994, 'ACCOMPLISHMENTS'], [141, 162, 'EMPLOYER'], [173, 181, 'DATE'], [51, 74, 'EMAIL'], [188, 214, 'SKILLS'], [1209, 1223, 'DATE'], [1333, 1347, 'DATE'], [1266, 1280, 'DATE'], [1388, 1402, 'DATE'], [1463, 1474, 'DATE'], [1560, 1574, 'DATE'], [1540, 1557, '

In [97]:
import json
labeled_data = []
#ith open(r"/home/chris/reesby/reverse_Malih/Doccano/all.jsonl", "r") as read_file:
with open(r"/home/chris/reesby/reverse_Malih/python/Resume-Parser-master/Entity_Recognition_in_Resumes.json", "r") as read_file:
    for line in read_file:
        data = json.loads(line)
        labeled_data.append(data)
#print(labeled_data)

In [115]:
TRAINING_DATA = []
for entry in labeled_data:
    
    entities = []
    #print(entry)
    #for key in entry:
    #    print(key)
    for e in entry["annotation"]:
        print(e)
        #entities.append((e[0], e[1],e[2]))
    spacy_entry = (entry['label'], {"entities": entities})
    TRAINING_DATA.append(spacy_entry)

{'label': ['Companies worked at'], 'points': [{'start': 1749, 'end': 1754, 'text': 'Oracle'}]}
{'label': ['Companies worked at'], 'points': [{'start': 1696, 'end': 1701, 'text': 'Oracle'}]}
{'label': ['Companies worked at'], 'points': [{'start': 1417, 'end': 1422, 'text': 'Oracle'}]}
{'label': ['Skills'], 'points': [{'start': 1356, 'end': 1792, 'text': 'Languages: Core Java, Go Lang, Data Structures & Algorithms, Oracle\nPL-SQL programming, Sales Force with APEX.\nTools: RADTool, Jdeveloper, NetBeans, Eclipse, SQL developer,\nPL/SQL Developer, WinSCP, Putty\nWeb Technologies: JavaScript, XML, HTML, Webservice\n\nOperating Systems: Linux, Windows\nVersion control system SVN & Git-Hub\nDatabases: Oracle\nMiddleware: Web logic, OC4J\nProduct FLEXCUBE: Oracle FLEXCUBE Versions 10.x, 11.x and 12.x'}]}
{'label': ['Companies worked at'], 'points': [{'start': 1209, 'end': 1214, 'text': 'Oracle'}]}
{'label': ['Skills'], 'points': [{'start': 1136, 'end': 1247, 'text': 'APEX. (Less than 1 year), 

KeyError: 'label'

In [39]:
import spacy
import random
import json
nlp = spacy.blank("en")  #spacy.load('resume_ner')#
#nlp = spacy.load('en_core_web_lg')
ner = nlp.create_pipe("ner")

ner.add_label("RESUME")
ner.add_label('PERSON')
ner.add_label('ADDRESS')
ner.add_label('INSTITUTION')
ner.add_label('EDUCATION')
ner.add_label('DATE')
ner.add_label('EMPLOYMENT')
ner.add_label('SKILLS')
ner.add_label('MISC')
ner.add_label('LINKEDIN')
ner.add_label('PROFILE')
ner.add_label('PHONE')
ner.add_label('EMAIL')
ner.add_label('EMPLOYER')
ner.add_label('COMPUTING_SKILLS')
ner.add_label('ACCOMPLISHMENTS')
ner.add_label('REFERENCES')
ner.add_label('GPE')

nlp.add_pipe(ner)
# Start the training
nlp.begin_training()
# Loop for 40 iterations
for itn in range(40):
    # Shuffle the training data
    random.shuffle(TRAINING_DATA)
    losses = {}
# Batch the examples and iterate over them
    for batch in spacy.util.minibatch(TRAINING_DATA, size=2):
        texts = [text for text, entities in batch]
        annotations = [entities for text, entities in batch]
# Update the model
        nlp.update(texts, annotations, losses=losses, drop=0.3)
    print(losses)


  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)


ValueError: [E024] Could not find an optimal move to supervise the parser. Usually, this means that the model can't be updated in a way that's valid and satisfies the correct annotations specified in the GoldParse. For example, are all labels added to the model? If you're training a named entity recognizer, also make sure that none of your annotated entity spans have leading or trailing whitespace or punctuation. You can also use the experimental `debug-data` command to validate your JSON-formatted training data. For details, run:
python -m spacy debug-data --help

In [61]:
def train_spacy(data,iterations):
    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True) 

    #add labels
    for _, annotations in TRAINING_DATA:
          for ent in annotations.get('entities'):
            ner.add_label(ent[2])
          
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAINING_DATA)
            losses = {}
            for text, annotations in TRAINING_DATA:
                try:
                    nlp.update(
                        [text],  
                        [annotations],  
                        drop=0.3,  
                        sgd=optimizer,  
                        losses=losses)
                except Exception as error:
                    print(error)
                    continue
            print(losses)
    return nlp

In [62]:
train_spacy(TRAINING_DATA,1)

Starting iteration 0
{}


<spacy.lang.en.English at 0x7f5aa586bdc0>

In [40]:
ner.labels

('ACCOMPLISHMENTS',
 'ADDRESS',
 'COMPUTING_SKILLS',
 'DATE',
 'EDUCATION',
 'EMAIL',
 'EMPLOYER',
 'EMPLOYMENT',
 'GPE',
 'INSTITUTION',
 'LINKEDIN',
 'MISC',
 'PERSON',
 'PHONE',
 'PROFILE',
 'REFERENCES',
 'RESUME',
 'SKILLS')

In [70]:
import json
#Converting JSON1 files to Spacy tuples format
def convert_doccano_to_spacy(filepath):
    with open(filepath, 'rb') as fp:
        data = fp.readlines()
        training_data = []
        for record in data:
            entities = []
            read_record = json.loads(record)
            text = read_record['data']
            entities_record = read_record['label']
            for start, end, label in entities_record:
                entities.append((start, end, label))
                training_data.append((text, {'entities': entities}))
    return training_data


In [71]:
TRAINING_DATA = convert_doccano_to_spacy("/home/chris/reesby/reverse_Malih/Doccano/all.jsonl")

In [73]:
#TRAINING_DATA

In [74]:
for itn in range(40):
    # Shuffle the training data
    random.shuffle(TRAINING_DATA)
    losses = {}
# Batch the examples and iterate over them
    for batch in spacy.util.minibatch(TRAINING_DATA, size=2):
        texts = [text for text, entities in batch]
        annotations = [entities for text, entities in batch]
# Update the model
        nlp.update(texts, annotations, losses=losses, drop=0.2)
    print(losses)

ValueError: [E024] Could not find an optimal move to supervise the parser. Usually, this means that the model can't be updated in a way that's valid and satisfies the correct annotations specified in the GoldParse. For example, are all labels added to the model? If you're training a named entity recognizer, also make sure that none of your annotated entity spans have leading or trailing whitespace or punctuation. You can also use the experimental `debug-data` command to validate your JSON-formatted training data. For details, run:
python -m spacy debug-data --help

In [None]:
ner = nlp.get_pipe("ner")
ner.add_label("B-PERSON")

In [34]:
for itn in range(40):
				random.shuffle(TRAINING_DATA)
				losses = {}
				# batch up the examples using spaCy's minibatch
				#size=compounding(2.0, batchsize, 1.005)
				batches = minibatch(TRAINING_DATA, 50) #size)
				for batch in batches:
					texts, annotations = zip(*batch)
					nlp.update(
						texts,  # batch of texts
						annotations,  # batch of annotations
						drop=dropout,  # dropout - make it harder to memorise data
						losses=losses,
					)
				count = itn + 1
				print("Losses", losses, " Iteration: ", count, " of ", n_iter, " Fold: " , input_file)
				sys.stdout.flush()

NameError: name 'minibatch' is not defined

In [None]:
# Import generic wrappers
from transformers import AutoModel, AutoTokenizer 


# Define the model repo
model_name = "manishiitg/resume-ner" 


# Download pytorch model
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


# Transform input tokens 
inputs = tokenizer(peter_text[1:100], return_tensors="pt")

# Model apply
outputs = model(**inputs)

In [None]:
print(outputs)

In [116]:
import json
import os
import json
import random
import logging
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from spacy.gold import GoldParse
from spacy.scorer import Scorer


def convert_data_to_spacy(JSON_FilePath):
    try:
        training_data = []
        lines = []
        with open(JSON_FilePath, "r", encoding="utf-8") as f:
            lines = f.readlines()

        for line in lines:
            data = json.loads(line)
            text = data["content"]
            entities = []
            for annotation in data["annotation"]:
                # only a single point in text annotation.
                point = annotation["points"][0]
                labels = annotation["label"]
                # handle both list of labels or a single label.
                if not isinstance(labels, list):
                    labels = [labels]

                for label in labels:
                    entities.append((point["start"], point["end"] + 1, label))

            training_data.append((text, {"entities": entities}))

        return training_data
    except Exception as e:
        logging.exception(
            "Unable to process " + JSON_FilePath + "\n" + "error = " + str(e)
        )
        return None


In [118]:

TRAIN_DATA = convert_data_to_spacy('/home/chris/reesby/reverse_Malih/python/Resume-Parser-master/Entity_Recognition_in_Resumes.json')

In [130]:
TRAINING_DATA = convert_data_to_spacy("/home/chris/reesby/reverse_Malih/Doccano/all.jsonl")

ERROR:root:Unable to process /home/chris/reesby/reverse_Malih/Doccano/all.jsonl
error = 'content'
Traceback (most recent call last):
  File "<ipython-input-116-26ce81338b98>", line 21, in convert_data_to_spacy
    text = data["content"]
KeyError: 'content'


In [205]:
import spacy
import random
import matplotlib.pyplot as plt
import warnings


def build_spacy_model(train, model):
    num_iter = 100
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    TRAIN_DATA = train
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():  # only train NER
        warnings.filterwarnings("ignore", category=UserWarning, module="spacy")
        if model is None:
            optimizer = nlp.begin_training()
        for itn in range(num_iter):
            # train for 50 iteration
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                try:
                    nlp.update(
                        [text],  # batch of texts
                        [annotations],  # batch of annotations
                        drop=0.2,  # dropout - make it harder to memorise data
                        sgd=optimizer,  # callable to update weights
                        losses=losses,
                    )
                except Exception as e:
                    pass
            print(losses)
        #plt.scatter(itn, losses["ner"])
        #plt.ylabel("ner_loss")
        #plt.xlabel("Iterations")
        #plt.show()

    nlp.to_disk("nlp_model")
    # plt.savefig("loss.png")
    return nlp

In [206]:
build_spacy_model(TRAIN_DATA,None)

Created blank 'en' model
Starting iteration 0
{'ner': 14208.397722794278}
Starting iteration 1
{'ner': 10290.876100309388}
Starting iteration 2
{'ner': 12019.649179039468}
Starting iteration 3
{'ner': 11041.912965121322}
Starting iteration 4
{'ner': 7556.105870439613}
Starting iteration 5
{'ner': 6998.120160554286}
Starting iteration 6
{'ner': 6092.6563845245055}
Starting iteration 7
{'ner': 7232.370569138731}
Starting iteration 8
{'ner': 4746.564893755847}
Starting iteration 9
{'ner': 5850.876040689234}
Starting iteration 10
{'ner': 4578.469948753975}
Starting iteration 11
{'ner': 4388.211800535973}
Starting iteration 12
{'ner': 7934.905154772421}
Starting iteration 13
{'ner': 4732.7523837081735}
Starting iteration 14
{'ner': 3960.874605380123}
Starting iteration 15
{'ner': 4568.806123594107}
Starting iteration 16
{'ner': 4125.741895400983}
Starting iteration 17
{'ner': 3929.5414740234146}
Starting iteration 18
{'ner': 3574.1447494540366}
Starting iteration 19
{'ner': 3288.69540465225

<spacy.lang.en.English at 0x7f59db65bf40>

In [1]:
#from spacy import displacy
#nlp_res_model = spacy.load('nlp_model')
#displacy.render(nlp_res_model(donna_text), style='ent', jupyter=True)

In [196]:
from doccano_transformer.datasets import NERDataset
from doccano_transformer.utils import read_jsonl

dataset = read_jsonl(filepath='/home/chris/reesby/reverse_Malih/Doccano/all.jsonl', dataset=NERDataset, encoding='utf-8')

TRAINING_DATA =  dataset.to_spacy(tokenizer=str.split)

In [197]:
print(TRAINING_DATA)

<generator object NERDataset.to_spacy at 0x7f5a4e5329e0>


In [202]:
build_spacy_model(TRAINING_DATA,None)

Created blank 'en' model
Starting iteration 0
{}
Starting iteration 1
{}
Starting iteration 2
{}
Starting iteration 3
{}
Starting iteration 4
{}
Starting iteration 5
{}
Starting iteration 6
{}
Starting iteration 7
{}
Starting iteration 8
{}
Starting iteration 9
{}
Starting iteration 10
{}
Starting iteration 11
{}
Starting iteration 12
{}
Starting iteration 13
{}
Starting iteration 14
{}
Starting iteration 15
{}
Starting iteration 16
{}
Starting iteration 17
{}
Starting iteration 18
{}
Starting iteration 19
{}
Starting iteration 20
{}
Starting iteration 21
{}
Starting iteration 22
{}
Starting iteration 23
{}
Starting iteration 24
{}
Starting iteration 25
{}
Starting iteration 26
{}
Starting iteration 27
{}
Starting iteration 28
{}
Starting iteration 29
{}
Starting iteration 30
{}
Starting iteration 31
{}
Starting iteration 32
{}
Starting iteration 33
{}
Starting iteration 34
{}
Starting iteration 35
{}
Starting iteration 36
{}
Starting iteration 37
{}
Starting iteration 38
{}
Starting i

<spacy.lang.en.English at 0x7f59d0eeef10>

In [259]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
  
tokenizer = AutoTokenizer.from_pretrained("manishiitg/distilbert-resume-parts-classify")

model = AutoModelForSequenceClassification.from_pretrained("manishiitg/distilbert-resume-parts-classify")

In [264]:
tokens = tokenizer(peter_text, padding=True, truncation=True, return_tensors="pt")


In [265]:
output = model(**tokens)

In [266]:
print(output)

SequenceClassifierOutput(loss=None, logits=tensor([[-0.3887,  3.6951, -0.2164, -1.5175, -1.0144, -0.4429, -1.0947, -2.0734,
         -0.5574, -2.3246, -2.7552,  0.8744]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)


# Investigations into huggingface libraries

In [319]:
from transformers import pipeline

# Initialize the NER pipeline
ner_res = pipeline("ner", model=   "manishiitg/resume-ner") #distilbert-resume-parts-classify") 
#"nikunjbjj/jd-resume-model")# 

# NER task
ner_result = ner(peter_text)

# Print result
print(ner_result)

[{'entity_group': 'ORG', 'score': 0.787362, 'word': 'met', 'start': 151, 'end': 154}, {'entity_group': 'ORG', 'score': 0.7318672, 'word': '##ology', 'start': 157, 'end': 162}, {'entity_group': 'LOC', 'score': 0.9200826, 'word': '##ney', 'start': 532, 'end': 535}]


In [320]:
print(ner(peter_text))

[{'entity_group': 'ORG', 'score': 0.787362, 'word': 'met', 'start': 151, 'end': 154}, {'entity_group': 'ORG', 'score': 0.7318672, 'word': '##ology', 'start': 157, 'end': 162}, {'entity_group': 'LOC', 'score': 0.9200826, 'word': '##ney', 'start': 532, 'end': 535}]


In [321]:
doc = ner_res(peter_text) # incase it isn't already lower case    
for ent in doc:
    #if(ent['entity']=='LABEL_9'):
    print(ent['entity'],ent['word'])

Phone 123
ORG bureau
ORG of
ORG ##ology
ORG bo
ORG ##m
ExperianceYears 27
ExperianceYears years
Designation ##ologist
DATE december
DATE 2020
EducationDegree masters
DATE august
DATE 2021
EducationDegree master
ExperianceYears 16
ExperianceYears years
DATE 2021


In [2]:
import pandas as pd
df = pd.read_csv("/home/chris/reesby/reverse_Malih/python/resume_parser-master/resume_parser/University_Courses.csv", header=None)
            #print(df[2])
            
degree_name = []
# column 2 is the degree names - need to only have words no special characters that may affect the regex
degree_uni = [i.lower() for i in df[2].str.replace('\W', ' ')]
            
print(pd.unique(degree_uni))

['diploma of higher education  health  bachelor of nursing science   pre registration'
 'bachelor of medicine  bachelor of surgery'
 'bachelor of education  primary   online ' ... 'm tech ' 'pharm d '
 'phd ']


  degree_uni = [i.lower() for i in df[2].str.replace('\W', ' ')]
