In [1]:
#import dependencies
import spacy
import pickle
import random

In [2]:
#Latest Version
print(spacy.__version__)

2.2.4


In [3]:
#To support codes of all versions of spacy 
!python -m spacy download nl
!pip install pyldavis

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('nl_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/nl_core_news_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/nl
You can now load the model via spacy.load('nl')


In [4]:
#Training Data:
#The data consists of the contents of the resume which is extracted from a PDF file, 
#followed by a dictionary consisting of a label and the start and end index of the value in the resume.

In [5]:
#Import the data in pickle form:
train_data = pickle.load(open('/content/drive/MyDrive/train_data (1).pkl', 'rb'))
train_data[0]

('Govardhana K Senior Software Engineer  Bengaluru, Karnataka, Karnataka - Email me on Indeed: indeed.com/r/Govardhana-K/ b2de315d95905b68  Total IT experience 5 Years 6 Months Cloud Lending Solutions INC 4 Month • Salesforce Developer Oracle 5 Years 2 Month • Core Java Developer Languages Core Java, Go Lang Oracle PL-SQL programming, Sales Force Developer with APEX.  Designations & Promotions  Willing to relocate: Anywhere  WORK EXPERIENCE  Senior Software Engineer  Cloud Lending Solutions -  Bangalore, Karnataka -  January 2018 to Present  Present  Senior Consultant  Oracle -  Bangalore, Karnataka -  November 2016 to December 2017  Staff Consultant  Oracle -  Bangalore, Karnataka -  January 2014 to October 2016  Associate Consultant  Oracle -  Bangalore, Karnataka -  November 2012 to December 2013  EDUCATION  B.E in Computer Science Engineering  Adithya Institute of Technology -  Tamil Nadu  September 2008 to June 2012  https://www.indeed.com/r/Govardhana-K/b2de315d95905b68?isid=rex-

In [6]:
#Model Building
nlp = spacy.blank('en')

def train_model(train_data):
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last = True)
    
    for _, annotation in train_data:
        for ent in annotation['entities']:
            ner.add_label(ent[2])
            
    
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(10):
            print("Statring iteration " + str(itn))
            random.shuffle(train_data)
            losses = {}
            index = 0
            for text, annotations in train_data:
                try:
                    nlp.update(
                        [text],  # batch of texts
                        [annotations],  # batch of annotations
                        drop=0.2,  # dropout - make it harder to memorise data
                        sgd=optimizer,  # callable to update weights
                        losses=losses)
                except Exception as e:
                    pass
                
            print(losses)

In [7]:
train_model(train_data)

Statring iteration 0
{'ner': 9662.9684827991}
Statring iteration 1
{'ner': 10881.860874334552}
Statring iteration 2
{'ner': 9231.693189548072}
Statring iteration 3
{'ner': 6978.71988298404}
Statring iteration 4
{'ner': 6859.903828134332}
Statring iteration 5
{'ner': 7585.600888044513}
Statring iteration 6
{'ner': 7011.657694526}
Statring iteration 7
{'ner': 5093.812266804907}
Statring iteration 8
{'ner': 5391.867579339394}
Statring iteration 9
{'ner': 4668.503320905883}


In [17]:
nlp.to_disk('nlp_model')

In [18]:
nlp_model = spacy.load('nlp_model')

In [15]:
#Test the trained model with existing data:
train_data[0][0]

"Srinivas VO Sr. Test Manager  Mumbai, Maharashtra - Email me on Indeed: indeed.com/r/Srinivas-VO/39c80e42cb6bc97f  A Test Manager, with a track record of 15+Yrs ( 4yrs UK onsite) delivering major test solutions for global projects ($40m) on behalf of leading blue chip organisations. Delivering IT solutions, ranging from simple to complex and challenging projects and programs, establishing an enviable record of on-time, high quality & added value delivery. ● Testing capabilities to existing customers and prospective customers during client visits / at customer location. ● Own and Support RFI/RFPs, proposal walkthroughs and presentations and Transition knowledge from pre-sales to delivery, in case of project win ● Analyze proposal requirements in direct relation with clients, and provide innovative solutions, as part of proposals ● Develop proof of concepts to prospects during pre-sales phase, Provide test consulting services, on demand & Collate repository from delivery team, along wit

In [13]:
train_data[0][1]

{'entities': [(11201, 11409, 'Skills'),
  (11172, 11191, 'College Name'),
  (11147, 11171, 'Degree'),
  (11125, 11145, 'College Name'),
  (11100, 11123, 'Degree'),
  (6954, 6970, 'Designation'),
  (5332, 5338, 'Location'),
  (5307, 5317, 'Designation'),
  (3111, 3152, 'Email Address'),
  (72, 113, 'Email Address'),
  (30, 36, 'Location'),
  (12, 28, 'Designation'),
  (0, 11, 'Name')]}

In [19]:
doc = nlp_model(train_data[0][0])
for ent in doc.ents:
    print(f'{ent.label_.upper():{30}}- {ent.text}')

NAME                          - Srinivas VO
DESIGNATION                   - Sr. Test Manager
LOCATION                      - Mumbai
EMAIL ADDRESS                 - indeed.com/r/Srinivas-VO/39c80e42cb6bc97f
DESIGNATION                   - QA Manager
DEGREE                        - MSC in Computer science
COLLEGE NAME                  - Nagarjuna University
DEGREE                        - BSC in Computer science
COLLEGE NAME                  - Kakatiya University
SKILLS                        - Testing (10+ years), Program Management (10+ years), Automation Testing (10+ years), Selenium Webdriver (4 years), Project Management (10+ years), Java (10+ years), AWS (10+ years), Cloud Computing (4 years)


In [20]:
#To convert reumes to text
!pip install PyMuPDF

Collecting PyMuPDF
[?25l  Downloading https://files.pythonhosted.org/packages/ef/43/84a502a902f5045108b6264a054ea1b50b479f247879c7a66c0d190be44e/PyMuPDF-1.18.14-cp37-cp37m-manylinux2010_x86_64.whl (6.4MB)
[K     |████████████████████████████████| 6.4MB 13.7MB/s 
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.18.14


In [21]:
#Try new resume 
import sys, fitz
fname = '/content/drive/MyDrive/Sajan One Page.pdf'
doc = fitz.open(fname)
text = ""
for page in doc:
    text = text + str(page.getText())

tx = " ".join(text.split('\n'))
print(tx)

                        PON SUDHIR SAJAN S.S  RESUME OBJECTIVE    sssajanaero89@gmail.com  +91-9080980219  28, Pallivilai, Vetturnimadam,  Nagercoil-629003  PROFESSIONAL EXPERIENCE  pon-sudhir-sajan-s-s-0b2687188  RESEARCH INTEREST    •  Machine Learning  •  Deep Learning  •  Natural Language Processing  •  Computer Vision  •  Time Series Forecasting  •  Aerospace AI Applications    SKILLS  <Python>  ✓ TensorFlow  ✓ Keras  ✓  PyTorch  ✓ Scikit-learn  ✓ Numpy , Pandas, Matplotlib       CERTIFICATION  “Certification on PyTorch”  2x3Gt11L2S262NC749    “Deep Convolutional Neural Networks”  8uUZie246Y127h7j30    “Encoder Decoder Models”  4l60zM756105L422Pb    ”Object Detection”  8073Dj6h295xf2Z135    “Training Feedforward Neural  Networks”  7ng21568620U21h06G    @GUVI Geek Networks, IITM Research Park  REFERENCE         Mr.K.Samuel  (Emp.id:399)  Software Engineer (ML)  CapeStart   samuel-k-6a609253   +91-9789254242    Experienced Faculty in the domain of Aerospace Engineering and Aviation 

In [22]:
doc = nlp_model(tx)
for ent in doc.ents:
    print(f'{ent.label_.upper():{30}}- {ent.text}')

NAME                          - PON SUDHIR SAJAN
DESIGNATION                   - Software Engineer
DEGREE                        - B.E /Aeronautical Engineering (2006
COLLEGE NAME                  - Nagercoil
DEGREE                        - M.B.A / Airport and Aviation Management (2010
COLLEGE NAME                  - Anna University
SKILLS                        - M.E /Aeronautical Engineering (2012
GRADUATION YEAR               - 2014


Results are not doing outstanding with complex resumes ..but doing fine with the resumes of simplest layouts.