## Download data

In [1]:
import pandas as pd
import kagglehub
import os

resumes_df_path = "data/UpdatedResumeDataSet.csv"
jobs_df_path = "data/data job posts.csv"

if not os.path.exists(resumes_df_path):
    print(f"Resumes dataset not found at {resumes_df_path}")
    print(f"Downloading from kaggle...")
    path = kagglehub.dataset_download("gauravduttakiit/resume-dataset")
    !mkdir -p data
    !mv $path/* ./data

if not os.path.exists(jobs_df_path):
    print(f"Jobs dataset not found at {jobs_df_path}")
    print(f"Downloading from kaggle...")
    path = kagglehub.dataset_download("madhab/jobposts")
    !mkdir -p data
    !mv $path/* ./data


resumes_df = pd.read_csv(resumes_df_path)
jobs_df = pd.read_csv(jobs_df_path)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
resumes_df.describe()

Unnamed: 0,Category,Resume
count,962,962
unique,25,166
top,Java Developer,"Technical Skills Web Technologies: Angular JS,..."
freq,84,18


In [5]:
resumes_df.loc[100]["Category"]

'Advocate'

In [6]:
jobs_df = pd.read_csv("data/data job posts.csv")

In [7]:
jobs_df.head()

Unnamed: 0,jobpost,date,Title,Company,AnnouncementCode,Term,Eligibility,Audience,StartDate,Duration,...,Salary,ApplicationP,OpeningDate,Deadline,Notes,AboutC,Attach,Year,Month,IT
0,AMERIA Investment Consulting Company\r\nJOB TI...,"Jan 5, 2004",Chief Financial Officer,AMERIA Investment Consulting Company,,,,,,,...,,"To apply for this position, please submit a\r\...",,26 January 2004,,,,2004,1,False
1,International Research & Exchanges Board (IREX...,"Jan 7, 2004",Full-time Community Connections Intern (paid i...,International Research & Exchanges Board (IREX),,,,,,3 months,...,,Please submit a cover letter and resume to:\r\...,,12 January 2004,,The International Research & Exchanges Board (...,,2004,1,False
2,Caucasus Environmental NGO Network (CENN)\r\nJ...,"Jan 7, 2004",Country Coordinator,Caucasus Environmental NGO Network (CENN),,,,,,Renewable annual contract\r\nPOSITION,...,,Please send resume or CV toursula.kazarian@......,,20 January 2004\r\nSTART DATE: February 2004,,The Caucasus Environmental NGO Network is a\r\...,,2004,1,False
3,Manoff Group\r\nJOB TITLE: BCC Specialist\r\n...,"Jan 7, 2004",BCC Specialist,Manoff Group,,,,,,,...,,Please send cover letter and resume to Amy\r\n...,,23 January 2004\r\nSTART DATE: Immediate,,,,2004,1,False
4,Yerevan Brandy Company\r\nJOB TITLE: Software...,"Jan 10, 2004",Software Developer,Yerevan Brandy Company,,,,,,,...,,Successful candidates should submit\r\n- CV; \...,,"20 January 2004, 18:00",,,,2004,1,True


In [None]:
sample_5 = jobs_df.sample(5)["jobpost"]
for i in sample_5:
    print(i)
    print("\n\n")

"Arge Business" LLC
TITLE:  Warehouse Manager
START DATE/ TIME:  10 June 2007
DURATION:  Long term with three months probation period.
LOCATION:  Yerevan, Armenia
JOB DESCRIPTION:  N/A
JOB RESPONSIBILITIES:
- Goods' rotation, effective and optimal arrangement duly, properly and
qualitatively provision inside warehouse, orders' preparation;
- Supervision and active enrolment in goods' order preparation process;
- Organization of transit goods preparation;
- Duly calculation and inventory of goods;
- Organization and supervision of the good receiving process in
warehouse;
- Organization and control of giving out goods in proper time and
quality;
- Keeping vigilant watch over goods disposal in frame of accepted
standards.
REQUIRED QUALIFICATIONS:
- Secondary education, high education will be considered as advantage;
- Minimum 2 years of experience working in warehouse with 1 year on a
supervising position;
- Knowledge of specifications of warehouse procedures; 
- Knowledge of Armenian and

Observation:
All jobpost are devided into section with capitalized titles, like TITLE, AGE, JOB DESCRIPTION etc...

If some of those section are more or less consistant across the dataset, we can peform some structuring

## Let's perform named entity recognition (NER) on resumes, to extract important information.

In [None]:
!python -m spacy download en_core_web_lg

In [21]:
import spacy
import pandas as pd
from collections import defaultdict

def extract_resume_entities(df, text_column='Resume'):
    """
    Extract relevant entities from resume text using spaCy.
    
    Parameters:
    df (pandas.DataFrame): DataFrame containing resumes
    text_column (str): Name of the column containing resume text
    
    Returns:
    pandas.DataFrame: DataFrame with original data and extracted entities
    """
    # Load English language model with parser and NER components
    nlp = spacy.load("en_core_web_lg")
    
    # Custom patterns for skills and education-related terms
    # TODO: move patterns to a separate file and add way more of them
    ruler = nlp.add_pipe("entity_ruler", before="ner")
    patterns = [
        {"label": "SKILL", "pattern": [{"LOWER": {"IN": ["python", "java", "javascript", "sql", "r", "C", "C++",  "tensorflow",
                                                        "pytorch", "machine learning", "deep learning", "nlp",
                                                        "data analysis", "statistical analysis"]}}]},
        {"label": "DEGREE", "pattern": [{"LOWER": {"IN": ["phd", "master's", "bachelor's", "mba", "ms", "bs", "ba"]}}]}
    ]
    ruler.add_patterns(patterns)
    
    def process_text(text):
        if pd.isna(text):
            return {
                'organizations': [],
                'locations': [],
                'dates': [],
                'people': [],
                'skills': [],
                'degrees': [],
                'languages': [],
                'skill_phrases': []
            }
            
        # Process text through spaCy pipeline
        doc = nlp(text)
        
        # Initialize containers for different entity types
        entities = defaultdict(list)
        
        # Extract standard named entities
        for ent in doc.ents:
            if ent.label_ in ['ORG', 'GPE', 'DATE', 'PERSON']:
                entities[ent.label_].append(ent.text)
            
        # Extract custom entities (skills and degrees)
        for ent in doc.ents:
            if ent.label_ in ['SKILL', 'DEGREE']:
                entities[ent.label_].append(ent.text)
        
        # Extract noun chunks as potential skill phrases
        skill_phrases = [chunk.text for chunk in doc.noun_chunks 
                        if any(tech in chunk.text.lower() 
                              for tech in ["python", "java", "analysis", "engineering", "development", "computer"])]
        
        return {
            'organizations': list(set(entities['ORG'])),
            'locations': list(set(entities['GPE'])),
            'dates': list(set(entities['DATE'])),
            'people': list(set(entities['PERSON'])),
            'skills': list(set(entities['SKILL'])),
            'degrees': list(set(entities['DEGREE'])),
            'languages': list(set(entities['LAUGUAGE'])),
            'skill_phrases': list(set(skill_phrases))
        }
    
    # Create a list to store results
    results = []
    
    # Process each resume
    for _, row in df.iterrows():
        results.append(process_text(row[text_column]))
    
    # Convert results to DataFrame
    entities_df = pd.DataFrame(results)
    
    # Combine with original DataFrame
    result_df = pd.concat([df.reset_index(drop=True), entities_df], axis=1)
    
    return result_df

In [22]:
resumes_sample_df = resumes_df.sample(5)
enriched_df = extract_resume_entities(resumes_sample_df)

In [15]:
resumes_sample_df

Unnamed: 0,Category,Resume
481,Electrical Engineering,Skills: 1) MC Office 2) AutoCAD 2016 3) Introd...
601,DevOps Engineer,CORE COMPETENCIES ~ Ant ~ Maven ~ GIT ~ Bitbuc...
605,DevOps Engineer,"TECHNICAL SKILLS â¢ HP ALM, RTC and JIRA â¢ ..."
201,Mechanical Engineer,SKILLS: â¢ Knowledge of software / computer: ...
110,Arts,â¢ Good communication skill â¢ Quick learner...


In [23]:
enriched_df

Unnamed: 0,Category,Resume,organizations,locations,dates,people,skills,degrees,languages,skill_phrases
0,Operations Manager,Education Details \r\n BCA Vinayaka Missions...,"[BNY Mellon PMO, Supporting Departments, Treas...","[Brooklyn, UK, Syracuse, Invoiced, Pittsburgh,...","[Sept 2009-, Annual, daily, semiannual, quarte...","[-PMO\r\n, RM, KYC, Exprience - Less than]",[SQL],[],[],"[Performed skill-gap analysis, Development, em..."
1,Java Developer,Education Details \r\nJanuary 2013 Master of E...,"[Database, S.S.C Pusad, JSF, Oracle, Tata Pow...","[Hibernate, Pusad, Maharashtra K.D. High-Schoo...","[January 1999, January 2013, January 2001, 201...",[AD],"[SQL, Java]",[MS],[],[Maharashtra K.D. High-School\r\nJava Develope...
2,Hadoop,"Skill Set: Hadoop, Map Reduce, HDFS, Hive, Sqo...","[HDFS, Hive, Combines Enterprise, Solution, De...",[Exprience],"[49 months, 2016 to 2017]",[Scala],"[Java, java]",[],[],"[Core Java, java]"
3,SAP Developer,Competencies: SAP Business Intelligence Versio...,"[Nordea, Bangalore\r\nEnvironment SAP BO 4.1, ...","[Exprience, HANA, bex]","[36 months, 72 months]",[Marvin Pictures],[],[BA],[],"[views development, development, clear analysi..."
4,Blockchain,"SKILLS Bitcoin, Ethereum Solidity Hyperledger,...","[Maharastra State Government Hackthon, Relianc...","[Exprience, Maharashtra IIT, Mumbai, Rome]","[6 months, January 2018, January 2011, Nov 201...",[Beginner Tendermint],[Java],[],[],"[Java, Brain Computer Interface, Engineering, ..."


We can extract some entities like language, skills, education etc with acceptable accuracy. But for that to work we need better custom patterns.

### Let's now explore entities and ways to extract them in jobposts

In [41]:
jobs_df_sample = jobs_df.sample(5)
enriched_jobs_df = extract_resume_entities(jobs_df_sample, text_column='jobpost')
enriched_jobs_df

text type is <class 'str'>
text type is <class 'str'>
text type is <class 'str'>
text type is <class 'str'>
text type is <class 'str'>


Unnamed: 0,jobpost,date,Title,Company,AnnouncementCode,Term,Eligibility,Audience,StartDate,Duration,...,Month,IT,organizations,locations,dates,people,skills,degrees,languages,skill_phrases
0,Armenian Datacom Company CJSC\r\nTITLE: Marke...,Jun 7 2:09 AM,Marketing Manager,Armenian Datacom Company CJSC,,Termless,,,As soon as possible,,...,6,False,"[TIME, ADC, Termless\r\nSTART DATE/, Armenian ...","[Yerevan, REMUNERATION/, Armenia]","[2006, 5 years, 14 June 2007, 06 June 2007]",[],[],[],[],[]
1,Children of Armenia Fund (COAF)\r\nTITLE: Hea...,"Feb 22, 2010",Health and Social Programs Manager,Children of Armenia Fund (COAF),,,,,,1 year with annual extension; the first 3 mont...,...,2,False,"[Social Program, COAF, Children of Armenia Fun...","[Armenia, REMUNERATION/, Armenias]","[2000, annual, 2004, 22 February 2010, 1 year,...","[Armavir Marz, Serob Khachatryan]",[],[MS],[],"[computer literacy, Economic\r\nDevelopment Pr..."
2,USAID Enterprise Development and Market Compet...,"Mar 5, 2013",Senior Financial Sector Expert,USAID Enterprise Development and Market Compet...,PA-ATF-032,Full-Time,,,March 2013,,...,3,False,[The USAID Enterprise Development and Market\r...,"[Yerevan, PA, Armenia]","[05 March 2013, 13 March 2013, March 2013]",[],[],[],[],"[analysis, major computer applications, USAID ..."
3,ArmenTel CJSC\r\nTITLE: Head of Division on I...,"Jul 31, 2008",Head of Division on Interaction with Construct...,ArmenTel CJSC,HDICC/08,,All interested candidates.,,,,...,7,False,"[CV/ Resume, CJSC, LAN, Company, Technical Dir...","[Yerevan, REMUNERATION/, Armenia]","[29 August 2008, 31 July 2008, 0014]","[hrm@, HDICC/08]",[],[MS],[],"[analysis, development, a network development ..."
4,Counterpart International/Armenia\r\nTITLE: L...,"Oct 1, 2010",Local Government Technical Advisor on Municipa...,Counterpart International/Armenia,,,,,Fall 2010,1 year contract with the possibility of multi-...,...,10,False,"[USAID, Municipal\r\nServices, Counterpart Int...","[Yerevan, Washington, Armenia, DC]","[07 October 2010, at least 3 years, multi-year...","[Jrashat, Zarubyan]",[],[MS],[],"[the analysis, organizational development, Sol..."


## Testing proposed approach

### 1. Preprocess text.

In [15]:
import re
import unicodedata

class TextPreprocessor:
    def __init__(self):
       pass


    def normalize(self, text: str) -> str:
        """
        Normalize text by performing the following operations:
        - Convert to lowercase
        - Remove extra whitespace
        - Handle unicode characters
        - Remove multiple spaces
        - Remove email addresses (optional)
        - Remove URLs (optional)
        - Remove phone numbers (optional)
        
        Args:
            text (str): Input text to normalize
            
        Returns:
            str: Normalized text
        """
        # Convert to lowercase
        text = text.lower()
        
        # Handle unicode characters
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
        
        # Remove URLs
        text = re.sub(r'http\S+|www.\S+', '', text)
        
        # Remove email addresses
        text = re.sub(r'\S+@\S+', '', text)
        
        # Remove phone numbers
        text = re.sub(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '', text)
        
        # Remove extra whitespace
        text = ' '.join(text.split())
        
        return text


In [38]:
import spacy
import pandas as pd
from collections import defaultdict
import json


class EntityExtractor:
    def __init__(self):
        self.nlp = spacy.load("en_core_web_lg")
        self.ruler = self.nlp.add_pipe("entity_ruler", before="ner")
        self.entity_labels = []
        self.phrases = {}


    def load_patterns(self, patters_file: str = None):
        with open(patters_file) as f:
            patters_json = json.load(f)
            
        self.patterns = [
            {"label": entity_label, "pattern": [{"LOWER": {"IN": entities}}]} for entity_label, entities in patters_json.items()    
        ]
        self.entity_labels = list(patters_json.keys())

        self.ruler.add_patterns(self.patterns)

    def load_phrases(self, phrases_file: str = None):
        with open(phrases_file) as f:
            self.phrases = json.load(f)


    def extract(self, text):
        if pd.isna(text):
            return {
                'organizations': [],
                'locations': [],
                'dates': [],
                'people': [],
                'skills': [],
                'degrees': [],
                'languages': [],
                'skill_phrases': []
            }
            
        doc = self.nlp(text)
        
        entities = defaultdict(list)
        
        # Standard named entities
        for ent in doc.ents:
            if ent.label_ in ['LAUGUAGE']:
                entities[ent.label_].append(ent.text)
        
        # Custom entities
        for ent in doc.ents:
            if ent.label_ in self.entity_labels:
                entities[ent.label_].append(ent.text)
        
        # Extract noun chunks as potential skill phrases
        doc_phrases = {}

        for phrase_label, phrase_list in self.phrases.items():
            doc_phrases[phrase_label] = [chunk.text for chunk in doc.noun_chunks 
                            if any(tech in chunk.text.lower() 
                                for tech in phrase_list)]

        return dict(entities) | doc_phrases
        

In [42]:
entity_extractor = EntityExtractor()
entity_extractor.load_patterns("data/patterns.json")
entity_extractor.load_phrases("data/phrases.json")

text_preprocessor = TextPreprocessor()
# text = resumes_sample_df["Resume"].iloc[0]
text = jobs_df_sample["jobpost"].iloc[0]
text = text_preprocessor.normalize(text)

print(f"[INPUT TEXT]\n {text}\n\n") 
entities = entity_extractor.extract(text)
entities

[INPUT TEXT]
 armenian datacom company cjsc title: marketing manager term: termless start date/ time: as soon as possible location: yerevan, armenia job description: armenian datacom company cjsc is seeking a qualified person for the position of marketing manager. the position reports to the general manager. job responsibilities: - responsible for marketing planning, media contact, brand building, pr activities, market research and statistical analysis; - actively and independently work to make the best plan and strategy to promote the company's image and profile in the market, and present to management a media plan for proposed activities. required qualifications: - experienced marketeer with previous management positions; - academic as well as work experience within the required fields listed in job responsibilities; - minimum of 5 years of relevant marketing experience; - experience from the telecommunication sector can be an advantage. remuneration/ salary: competitive application 

{'SKILL': ['go'],
 'SKILL-PHRASES': ['marketing manager term',
  'marketing manager',
  'marketing planning',
  ', media contact',
  'market research',
  'statistical analysis',
  'strategy',
  'a media plan',
  'previous management positions',
  'relevant marketing experience',
  'the telecommunication sector',
  'competitive application procedures',
  'applications',
  'your application letter',
  '06 june 2007 application deadline',
  'telecommunications services']}

In [28]:
resumes_sample_df = resumes_df.sample(1)

In [8]:
augmented_resume_df = extract_resume_entities(resumes_sample_df)

text type is <class 'str'>


## PDF recognition

In [48]:
import PyPDF2
import pdfplumber

def extract_with_pypdf2(pdf_path):
    """
    Extract text from PDF using PyPDF2.
    Better for simple PDFs with basic text.
    """
    text = ""
    with open(pdf_path, 'rb') as file:
        # Create PDF reader object
        pdf_reader = PyPDF2.PdfReader(file)
        
        # Get number of pages
        num_pages = len(pdf_reader.pages)
        
        # Extract text from each page
        for page_num in range(num_pages):
            # Get page object
            page = pdf_reader.pages[page_num]
            
            # Extract text from page
            text += page.extract_text() + "\n"
    
    return text

def extract_with_pdfplumber(pdf_path):
    """
    Extract text from PDF using pdfplumber.
    Better for complex PDFs with tables and formatted text.
    """
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        # Extract text from each page
        for page in pdf.pages:
            text += page.extract_text() + "\n"
            
    return text

def extract_with_tables(pdf_path):
    """
    Extract both text and tables from PDF using pdfplumber.
    Returns a tuple of (text, tables).
    """
    text = ""
    all_tables = []
    
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract text
            text += page.extract_text() + "\n"
            
            # Extract tables
            tables = page.extract_tables()
            if tables:
                all_tables.extend(tables)
    
    return text, all_tables

In [46]:
resume_text = extract_with_pdfplumber("data/Resume.pdf")

normalized_text = text_preprocessor.normalize(resume_text)
entities = entity_extractor.extract(normalized_text)
entities

{'DEGREE': ['bachelor'],
 'SKILL': ['c', 'c++', 'python'],
 'SKILL-PHRASES': ['naumenko research engineer',
  'a computer vision engineer',
  'gpu-accelerated computer graphics',
  'systems engineering',
  'real-time applications',
  'embedded hardware',
  'education undergraduate',
  'software engineering',
  'computer science',
  'leadership',
  'problem-solving languages',
  'upper-intermediate) other computer vision',
  'operating systems',
  'ood experience research engineer',
  'convolutional neural network inference',
  'embedded platform',
  'a visual-inertial odom- etry application',
  'software-hardware integration',
  'optimization',
  'specific hardware',
  'developed tools',
  'the rendering system',
  'trainee software developer',
  'research department',
  'uav-specialized onboard computers',
  'small mamba language model',
  'tookpartinresearchofacompletelydifferentapproachforbuildingfundamentalmodels']}

In [None]:
pypdf2_text = extract_with_pypdf2("data/Resume.pdf")

ROMAN NAUMENKO
Research Engineer
+(380) 63-036-5711 ⋄ Lviv, Ukraine
gmail ⋄ linkedin ⋄ github
ABOUT MYSELF
I’m a computer vision engineer with extensive experience in GPU-accelerated computer graphics. I have strong
expertise in systems engineering, integration, and machine learning. My current work focuses on optimizing AI
algorithms for real-time applications on embedded hardware.
EDUCATION
Undergraduate of Software Engineering, National Technical University of Ukraine, Kyiv 2020-2022
Bachelor of Computer Science, Ukrainian Catholic University, Lviv Expected 2025
SKILLS
Technical Skills C/C++, Python, PyTorch, NumPy, Pandas,
Linux, DirectX11/Vulkan/OpenGL, GLSL/HLSL
Soft skills Attention to detail, teamwork, leadership, problem-solving
Languages Ukrainian (native), English (upper-intermediate)
Other Computer vision, machine learning, multithreading, operating systems, modern
rendering techinques, compute shaders, BVHs, OOP, OOD
EXPERIENCE
Research Engineer May 2024 - Now
FoxFour & UC