In [1]:
!pip install spacy



In [2]:
# Download the large English pipeline package.
# NOTE(review): presumably required for the static vectors referenced by the
# accuracy-optimised config generated later in this notebook - confirm against
# base_config.cfg. The command output asks for a kernel restart afterwards.
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [1]:
!pip install srsly



In [2]:
import json

# Load the annotated resumes. Later cells unpack each record as a
# (text, annotations) pair. The encoding is given explicitly because JSON
# text is UTF-8, while open() would otherwise fall back to the locale default.
with open("spacy_training_data.json", "r", encoding="utf-8") as f:
    resume_data = json.load(f)

In [3]:
# Number of annotated records loaded from the JSON file.
len(resume_data)

2961

In [4]:
# Inspect the first record to check the structure of the data.
# NOTE(review): the rendered output contains a real name, email address and
# phone number - clear or redact this cell's output before sharing the notebook.
resume_data[0]

['Ankita Nagendra Babar babarankita112@gmail.com | 8329004204   Strong proficiency in HTML5, CSS3 and JavaScript   Strong proficiency in JavaScript, including DOM manipulation and JavaScript object model   Hands-on knowledge on ReactJs, Redux   Familiarity with newer specifications of ECMAScript 6   Ability to understand business requirements and translate them into technical requirements   A knack for benchmarking and optimization   Familiarity with code versioning tools (Git)   Understanding of responsive web development with Bootstrap.   Strong verbal and written communication skills. WORK EXPERIENCE O2 Soft Solutions, Pune Nov 2019   Dec2021 Successfully developed and maintained School ERP. Was responsible for developing modules, maintaining and testing them. EDUCATION University of Pune, Pune.   M.E. (Computer Engineering) 2016 First Class(7.5 SGPA) from TSSM s Bhivarabai Sawant College Of Engineering and research, Narhe, Pune   B.E. (Computer Engineering) 2014 Higher Second Class

In [5]:
import random
from sklearn.model_selection import train_test_split

In [6]:
# De-duplicate resumes on their stripped, lower-cased text. The first record
# seen for a given key is kept, with its original text and annotation.
unique_data = {}
for text, ann in resume_data:
    key = text.strip().lower()
    unique_data.setdefault(key, (text, ann))

deduped_data = [*unique_data.values()]

# Seed the global RNG so the in-place shuffle is repeatable.
random.seed(42)
random.shuffle(deduped_data)

In [7]:
# Number of records left after removing duplicate resume texts.
len(deduped_data)

2701

In [9]:
# Hold out 20% of the de-duplicated resumes as the dev set, with a fixed seed.
train_data, dev_data = train_test_split(
    deduped_data,
    test_size=0.2,
    random_state=42,
)


In [11]:
# Size of the training split.
len(train_data)

2160

In [12]:
# Size of the dev split.
len(dev_data)

541

In [13]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

In [14]:
# Blank English pipeline; the conversion helper below uses it only to
# tokenize text through nlp.make_doc.
nlp = spacy.blank("en")

In [15]:
def save_data_to_spacy_file(data, nlp, output_path, log_path=None):
    """Convert character-offset NER annotations into a serialized ``DocBin``.

    Args:
        data: Iterable of ``(text, annot)`` pairs, where ``annot["entities"]``
            yields ``(start, end, label)`` character-offset triples.
        nlp: spaCy language object; only ``nlp.make_doc`` (tokenization) is used.
        output_path: Destination handed to ``DocBin.to_disk``.
        log_path: Optional text file that receives one line per entity that
            could not be turned into a token-aligned span. When falsy, those
            messages are discarded.

    Entities are processed in the order given. An entity whose character
    range overlaps one that was already accepted is dropped silently; an
    entity that cannot be aligned to tokens is logged and skipped.
    """
    import os  # local import so the cell does not depend on another cell's imports

    doc_bin = DocBin()
    # os.devnull is the portable spelling of "/dev/null" (which does not exist
    # on Windows). UTF-8 guarantees any resume text can be written to the log.
    with open(log_path or os.devnull, "w", encoding="utf-8") as log_file:
        for text, annot in tqdm(data):
            doc = nlp.make_doc(text)
            ents = []
            entity_indices = set()  # character positions already claimed by an entity
            for start, end, label in annot["entities"]:
                if start >= end:
                    # Empty or inverted offsets can never form an entity span.
                    log_file.write(f"Skipping: {text[start:end]} ({start}-{end}) [{label}]\n")
                    continue
                if any(i in entity_indices for i in range(start, end)):
                    continue
                # "contract" shrinks the span to the tokens fully inside the
                # character range; None means no token fits.
                span = doc.char_span(start, end, label=label, alignment_mode="contract")
                if span is None:
                    log_file.write(f"Skipping: {text[start:end]} ({start}-{end}) [{label}]\n")
                    continue
                ents.append(span)
                entity_indices.update(range(start, end))
            doc.ents = ents
            doc_bin.add(doc)
    doc_bin.to_disk(output_path)

In [16]:
# Serialize both splits; each one gets its own log of entities that were skipped.
for split_records, spacy_path, skipped_log in (
    (train_data, "train.spacy", "skipped_train.txt"),
    (dev_data, "dev.spacy", "skipped_dev.txt"),
):
    save_data_to_spacy_file(split_records, nlp, spacy_path, log_path=skipped_log)

100%|██████████| 2160/2160 [00:14<00:00, 146.64it/s]
100%|██████████| 541/541 [00:03<00:00, 149.69it/s]


In [17]:
# Generate base_config.cfg for an English, NER-only pipeline optimised for
# accuracy. The command output reports a CPU, non-transformer config.
!python -m spacy init config base_config.cfg --lang en --pipeline ner --optimize accuracy --force

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: accuracy
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
base_config.cfg
You can now add your data and train your pipeline:
python -m spacy train base_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [19]:
spacy.require_gpu()

!python -m spacy train /content/base_config.cfg --output ./output \
    --paths.train /content/train.spacy --paths.dev /content/dev.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    807.40    0.00    0.00    0.00    0.00
  0     200       2000.08  22288.26   26.91   33.22   22.62    0.27
  0     400       1297.03  14644.01   24.44   44.33   16.87    0.27
  0     600        474.78  12218.41   14.71   45.51    8.77    0.20
  0     800        284.73  10959.60   33.46   52.43   24.57    0.35
  0    1000        728.63  12732.29   24.22   56.31   15.43    0.29
  0    1200        250.57  12569.83   27.42   45.79   19.57    0.30
  0    1400        267.30  11807.74   31.51   56.74   21.81    0.35
  0    1600        297.43  12170.67   35.27   48.44   27.73    0.36
  0    1800        489.86  11635.44   32.78

In [20]:
# Score the best checkpoint on dev.spacy. This is the same file that was
# passed as --paths.dev during training, so the numbers below do not come
# from an independent held-out test set.
!python -m spacy evaluate --gpu-id 0 output/model-best dev.spacy

[38;5;4mℹ Using GPU: 0[0m
[1m

TOK     100.00
NER P   45.80 
NER R   62.27 
NER F   52.78 
SPEED   52603 

[1m

                       P       R       F
PERSON             77.57   73.13   75.28
PHONE              85.10   96.23   90.32
SKILL              40.69   63.91   49.73
EMAIL              90.10   98.24   93.99
ORG                56.21   61.64   58.80
DATE               73.64   78.73   76.10
PROJECT_ORG        31.40   42.68   36.18
PROJECT_TITLE      60.57   45.57   52.01
DEGREE             75.71   79.64   77.63
LOC                56.84   17.25   26.47
CERTIFICATION      47.41   22.46   30.48
PROJECT_DURATION   58.21   47.33   52.21



In [21]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [22]:
import spacy
import re
from PyPDF2 import PdfReader

# Load the best checkpoint written by the training run. This rebinds `nlp`,
# which earlier referred to the blank tokenizer-only pipeline.
nlp = spacy.load("output/model-best")

def clean_resume(text):
    """Normalise raw resume text before it is passed to the NER model.

    Every run of whitespace becomes a single space, characters that cannot
    be encoded as ASCII are then dropped, and finally both ends are trimmed.
    Because the ASCII filter runs after the whitespace collapse, removing a
    non-ASCII character that sat between two spaces leaves both spaces.
    """
    single_spaced = re.sub(r'\s+', ' ', text)
    ascii_bytes = single_spaced.encode('ascii', errors='ignore')
    return ascii_bytes.decode().strip()

# Resume PDF to run through the trained model (uploaded to the Colab workspace).
pdf_path = "/content/Aakash_Nihalani_Resume_22-05-2022-23-00-20 (1).pdf"

reader = PdfReader(pdf_path)
# Extract each page exactly once (extraction is the expensive step) and join
# non-empty pages with a newline, so the last word of one page is not glued
# to the first word of the next. clean_resume() later collapses the newline
# into a single space.
page_texts = [page.extract_text() for page in reader.pages]
extracted_text = "\n".join(page_text for page_text in page_texts if page_text)

cleaned_resume = clean_resume(extracted_text)

doc = nlp(cleaned_resume)

# NOTE(review): the printed entities include personal data (name, phone,
# email) - clear this cell's output before sharing the notebook.
print("Entities found in Resume:")
print("="*30)
for ent in doc.ents:
    print(f"{ent.text} --> {ent.label_}")

Entities found in Resume:
Aakash Nihalani --> PERSON
Bangalore, India --> LOC
+91-9737604171 --> PHONE
aakashnihalani26@gmail.com --> EMAIL
Python --> SKILL
Django --> SKILL
React --> SKILL
Golang --> SKILL
Bachelor of Technology, Computer Science --> DEGREE
Nirma University --> ORG
Python --> SKILL
Javascript --> SKILL
Typescript --> SKILL
SQL --> SKILL
Django --> SKILL
Airflow --> SKILL
git --> SKILL
MySQL --> SKILL
Splunk --> SKILL
CircleCI --> SKILL
Redis --> SKILL
Nginx --> SKILL
Blockchain --> SKILL
Bitcoin --> SKILL
Django --> SKILL
MySQL --> SKILL
Celery --> SKILL
Router --> SKILL
Splunk --> SKILL
Virtual Report Analyzer --> PROJECT_TITLE
Medical Assistant Application --> PROJECT_TITLE


In [24]:
import spacy
import re
from PyPDF2 import PdfReader

# Load the best checkpoint written by the training run. The previous
# inference cell loads the same path, so this is only needed when this cell
# is run on its own.
nlp = spacy.load("output/model-best")

def clean_resume(text):
    """Normalise raw resume text before it is passed to the NER model.

    Steps, in order: collapse each whitespace run to one space, discard any
    character that is not ASCII, then strip leading and trailing whitespace.
    Spaces on either side of a discarded character are not merged again.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    ascii_only = collapsed.encode('ascii', errors='ignore').decode()
    return ascii_only.strip()

# Resume PDF to run through the trained model (uploaded to the Colab workspace).
pdf_path = "/content/Abhishek Sahu Python Developer 6.2.pdf"

reader = PdfReader(pdf_path)
# Extract each page exactly once (extraction is the expensive step) and join
# non-empty pages with a newline, so the last word of one page is not glued
# to the first word of the next. clean_resume() later collapses the newline
# into a single space.
page_texts = [page.extract_text() for page in reader.pages]
extracted_text = "\n".join(page_text for page_text in page_texts if page_text)

cleaned_resume = clean_resume(extracted_text)

doc = nlp(cleaned_resume)

# NOTE(review): the printed entities include personal data (names, phone
# numbers, email) - clear this cell's output before sharing the notebook.
print("Entities found in Resume:")
print("="*30)
for ent in doc.ents:
    print(f"{ent.text} --> {ent.label_}")

Entities found in Resume:
Abhishek Sahu --> PERSON
Python --> SKILL
Python --> SKILL
Postgresql --> SKILL
Pandas --> SKILL
Django --> SKILL
NumPy --> SKILL
AWS --> SKILL
Python --> SKILL
Capgemini Private Limited --> PROJECT_ORG
Pandas --> SKILL
Python --> SKILL
Resources Global Professionals --> PROJECT_ORG
13 April 2021 - Present --> PROJECT_DURATION
abhisheksahu92@outlook.com --> EMAIL
Amity University --> ORG
Bachelor of Technology in Computer Science --> DEGREE
04/2011-03/2015 --> DATE
Tata Consultancy Services --> PROJECT_ORG
Pandas --> SKILL
Python --> SKILL
AWS Fundamentals: Building Serverless ApplicationsPython --> CERTIFICATION
Python --> SKILL
RERERENCES Suzanna Williams --> ORG
+91-974-036-8585 --> PHONE
+91-998-784-3607 --> PHONE
Deloitte --> PROJECT_ORG
+91-917-688-4492 --> PHONE


In [25]:
import shutil

# Bundle the best checkpoint directory into model-best.zip. The archive path
# returned by make_archive is displayed as the cell's result.
model_path = "output/model-best"
zip_path = "model-best.zip"

shutil.make_archive(base_name="model-best", format="zip", root_dir=model_path)

'/content/model-best.zip'

In [26]:
from google.colab import files
# Trigger a browser download of the archive created by the previous cell
# (Colab-only helper; relies on model-best.zip existing in the working directory).
files.download("model-best.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>