## Download data

In [1]:
import pandas as pd
import kagglehub
import os

resumes_df_path = "data/UpdatedResumeDataSet.csv"
jobs_df_path = "data/data job posts.csv"

if not os.path.exists(resumes_df_path):
    print(f"Resumes dataset not found at {resumes_df_path}")
    print(f"Downloading from kaggle...")
    path = kagglehub.dataset_download("gauravduttakiit/resume-dataset")
    !mkdir -p data
    !mv $path/* ./data

if not os.path.exists(jobs_df_path):
    print(f"Jobs dataset not found at {jobs_df_path}")
    print(f"Downloading from kaggle...")
    path = kagglehub.dataset_download("madhab/jobposts")
    !mkdir -p data
    !mv $path/* ./data


resumes_df = pd.read_csv(resumes_df_path)
jobs_df = pd.read_csv(jobs_df_path)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
resumes_df.describe()

Unnamed: 0,Category,Resume
count,962,962
unique,25,166
top,Java Developer,"Technical Skills Web Technologies: Angular JS,..."
freq,84,18


In [5]:
resumes_df.loc[100]["Category"]

'Advocate'

In [6]:
jobs_df = pd.read_csv("data/data job posts.csv")

In [7]:
jobs_df.head()

Unnamed: 0,jobpost,date,Title,Company,AnnouncementCode,Term,Eligibility,Audience,StartDate,Duration,...,Salary,ApplicationP,OpeningDate,Deadline,Notes,AboutC,Attach,Year,Month,IT
0,AMERIA Investment Consulting Company\r\nJOB TI...,"Jan 5, 2004",Chief Financial Officer,AMERIA Investment Consulting Company,,,,,,,...,,"To apply for this position, please submit a\r\...",,26 January 2004,,,,2004,1,False
1,International Research & Exchanges Board (IREX...,"Jan 7, 2004",Full-time Community Connections Intern (paid i...,International Research & Exchanges Board (IREX),,,,,,3 months,...,,Please submit a cover letter and resume to:\r\...,,12 January 2004,,The International Research & Exchanges Board (...,,2004,1,False
2,Caucasus Environmental NGO Network (CENN)\r\nJ...,"Jan 7, 2004",Country Coordinator,Caucasus Environmental NGO Network (CENN),,,,,,Renewable annual contract\r\nPOSITION,...,,Please send resume or CV toursula.kazarian@......,,20 January 2004\r\nSTART DATE: February 2004,,The Caucasus Environmental NGO Network is a\r\...,,2004,1,False
3,Manoff Group\r\nJOB TITLE: BCC Specialist\r\n...,"Jan 7, 2004",BCC Specialist,Manoff Group,,,,,,,...,,Please send cover letter and resume to Amy\r\n...,,23 January 2004\r\nSTART DATE: Immediate,,,,2004,1,False
4,Yerevan Brandy Company\r\nJOB TITLE: Software...,"Jan 10, 2004",Software Developer,Yerevan Brandy Company,,,,,,,...,,Successful candidates should submit\r\n- CV; \...,,"20 January 2004, 18:00",,,,2004,1,True


In [None]:
sample_5 = jobs_df.sample(5)["jobpost"]
for i in sample_5:
    print(i)
    print("\n\n")

"Arge Business" LLC
TITLE:  Warehouse Manager
START DATE/ TIME:  10 June 2007
DURATION:  Long term with three months probation period.
LOCATION:  Yerevan, Armenia
JOB DESCRIPTION:  N/A
JOB RESPONSIBILITIES:
- Goods' rotation, effective and optimal arrangement duly, properly and
qualitatively provision inside warehouse, orders' preparation;
- Supervision and active enrolment in goods' order preparation process;
- Organization of transit goods preparation;
- Duly calculation and inventory of goods;
- Organization and supervision of the good receiving process in
warehouse;
- Organization and control of giving out goods in proper time and
quality;
- Keeping vigilant watch over goods disposal in frame of accepted
standards.
REQUIRED QUALIFICATIONS:
- Secondary education, high education will be considered as advantage;
- Minimum 2 years of experience working in warehouse with 1 year on a
supervising position;
- Knowledge of specifications of warehouse procedures; 
- Knowledge of Armenian and

Observation:
All jobpost are devided into section with capitalized titles, like TITLE, AGE, JOB DESCRIPTION etc...

If some of those section are more or less consistant across the dataset, we can peform some structuring

## Let's perform named entity recognition (NER) on resumes, to extract important information.

In [None]:
!python -m spacy download en_core_web_lg

In [21]:
import spacy
import pandas as pd
from collections import defaultdict

def extract_resume_entities(df, text_column='Resume'):
    """
    Extract relevant entities from resume text using spaCy.
    
    Parameters:
    df (pandas.DataFrame): DataFrame containing resumes
    text_column (str): Name of the column containing resume text
    
    Returns:
    pandas.DataFrame: DataFrame with original data and extracted entities
    """
    # Load English language model with parser and NER components
    nlp = spacy.load("en_core_web_lg")
    
    # Custom patterns for skills and education-related terms
    # TODO: move patterns to a separate file and add way more of them
    ruler = nlp.add_pipe("entity_ruler", before="ner")
    patterns = [
        {"label": "SKILL", "pattern": [{"LOWER": {"IN": ["python", "java", "javascript", "sql", "r", "C", "C++",  "tensorflow",
                                                        "pytorch", "machine learning", "deep learning", "nlp",
                                                        "data analysis", "statistical analysis"]}}]},
        {"label": "DEGREE", "pattern": [{"LOWER": {"IN": ["phd", "master's", "bachelor's", "mba", "ms", "bs", "ba"]}}]}
    ]
    ruler.add_patterns(patterns)
    
    def process_text(text):
        if pd.isna(text):
            return {
                'organizations': [],
                'locations': [],
                'dates': [],
                'people': [],
                'skills': [],
                'degrees': [],
                'languages': [],
                'skill_phrases': []
            }
            
        # Process text through spaCy pipeline
        doc = nlp(text)
        
        # Initialize containers for different entity types
        entities = defaultdict(list)
        
        # Extract standard named entities
        for ent in doc.ents:
            if ent.label_ in ['ORG', 'GPE', 'DATE', 'PERSON']:
                entities[ent.label_].append(ent.text)
            
        # Extract custom entities (skills and degrees)
        for ent in doc.ents:
            if ent.label_ in ['SKILL', 'DEGREE']:
                entities[ent.label_].append(ent.text)
        
        # Extract noun chunks as potential skill phrases
        skill_phrases = [chunk.text for chunk in doc.noun_chunks 
                        if any(tech in chunk.text.lower() 
                              for tech in ["python", "java", "analysis", "engineering", "development", "computer"])]
        
        return {
            'organizations': list(set(entities['ORG'])),
            'locations': list(set(entities['GPE'])),
            'dates': list(set(entities['DATE'])),
            'people': list(set(entities['PERSON'])),
            'skills': list(set(entities['SKILL'])),
            'degrees': list(set(entities['DEGREE'])),
            'languages': list(set(entities['LAUGUAGE'])),
            'skill_phrases': list(set(skill_phrases))
        }
    
    # Create a list to store results
    results = []
    
    # Process each resume
    for _, row in df.iterrows():
        results.append(process_text(row[text_column]))
    
    # Convert results to DataFrame
    entities_df = pd.DataFrame(results)
    
    # Combine with original DataFrame
    result_df = pd.concat([df.reset_index(drop=True), entities_df], axis=1)
    
    return result_df

In [22]:
resumes_sample_df = resumes_df.sample(5)
enriched_df = extract_resume_entities(resumes_sample_df)

In [15]:
resumes_sample_df

Unnamed: 0,Category,Resume
481,Electrical Engineering,Skills: 1) MC Office 2) AutoCAD 2016 3) Introd...
601,DevOps Engineer,CORE COMPETENCIES ~ Ant ~ Maven ~ GIT ~ Bitbuc...
605,DevOps Engineer,"TECHNICAL SKILLS â¢ HP ALM, RTC and JIRA â¢ ..."
201,Mechanical Engineer,SKILLS: â¢ Knowledge of software / computer: ...
110,Arts,â¢ Good communication skill â¢ Quick learner...


In [23]:
enriched_df

Unnamed: 0,Category,Resume,organizations,locations,dates,people,skills,degrees,languages,skill_phrases
0,Operations Manager,Education Details \r\n BCA Vinayaka Missions...,"[BNY Mellon PMO, Supporting Departments, Treas...","[Brooklyn, UK, Syracuse, Invoiced, Pittsburgh,...","[Sept 2009-, Annual, daily, semiannual, quarte...","[-PMO\r\n, RM, KYC, Exprience - Less than]",[SQL],[],[],"[Performed skill-gap analysis, Development, em..."
1,Java Developer,Education Details \r\nJanuary 2013 Master of E...,"[Database, S.S.C Pusad, JSF, Oracle, Tata Pow...","[Hibernate, Pusad, Maharashtra K.D. High-Schoo...","[January 1999, January 2013, January 2001, 201...",[AD],"[SQL, Java]",[MS],[],[Maharashtra K.D. High-School\r\nJava Develope...
2,Hadoop,"Skill Set: Hadoop, Map Reduce, HDFS, Hive, Sqo...","[HDFS, Hive, Combines Enterprise, Solution, De...",[Exprience],"[49 months, 2016 to 2017]",[Scala],"[Java, java]",[],[],"[Core Java, java]"
3,SAP Developer,Competencies: SAP Business Intelligence Versio...,"[Nordea, Bangalore\r\nEnvironment SAP BO 4.1, ...","[Exprience, HANA, bex]","[36 months, 72 months]",[Marvin Pictures],[],[BA],[],"[views development, development, clear analysi..."
4,Blockchain,"SKILLS Bitcoin, Ethereum Solidity Hyperledger,...","[Maharastra State Government Hackthon, Relianc...","[Exprience, Maharashtra IIT, Mumbai, Rome]","[6 months, January 2018, January 2011, Nov 201...",[Beginner Tendermint],[Java],[],[],"[Java, Brain Computer Interface, Engineering, ..."


We can extract some entities like language, skills, education etc with acceptable accuracy. But for that to work we need better custom patterns.

### Let's now explore entities and ways to extract them in jobposts

In [24]:
jobs_df_sample = jobs_df.sample(5)
enriched_jobs_df = extract_resume_entities(jobs_df_sample, text_column='jobpost')
enriched_jobs_df

Unnamed: 0,jobpost,date,Title,Company,AnnouncementCode,Term,Eligibility,Audience,StartDate,Duration,...,Month,IT,organizations,locations,dates,people,skills,degrees,languages,skill_phrases
0,ArmenTel CJSC\r\nTITLE: Technical Manager of ...,"Jul 3, 2008",Technical Manager of IP Products,ArmenTel CJSC,TMIPP/08,,All interested candidates.,,,,...,7,False,"[CV/Resume, TMIPP/08, ADSL, CJSC]","[REMUNERATION/, Yerevan, Armenia]","[25 July 2008, 0014]",[hrm@],[],[MS],[],[- Advanced computer skills]
1,Chemonics International\r\nTITLE: Regulatory ...,Jun 29 5:18 AM,Regulatory and Institutional Framework Compone...,Chemonics International,,,,,,,...,6,False,[Regulatory and Institutional Framework Compon...,"[Yerevan, Armenia]","[29 June 2012, July 8, 2012, 08 July 2012]",[],[],[],[],"[professional\r\ndevelopment, professional dev..."
2,World Vision Armenia\r\nTITLE: Sponsorship Co...,"Feb 27, 2014",Sponsorship Coordinator,World Vision Armenia,,,,,,Open ended,...,2,False,"[Organization, Public Relations, Education, th...","[WV Armenia, Amasia, Yerevan, Armenia]","[Annual, the 3rd of every month, 10-15 years, ...",[Shirak\r\nmarz],[],[],[],"[economic development, development, the Area D..."
3,Dynamic System LTD\r\nTITLE: Assistant to Dir...,"Jan 25, 2010",Assistant to Director,Dynamic System LTD,,,All qualified individuals,,ASAP,"Long term, with probation period.",...,1,False,"[TIME, Dynamic System LTD]","[REMUNERATION/, Yerevan, Armenia]","[25 January 2010, 15 February 2010, daily]",[Max],[],[],[],"[computer software, an engineering company]"
4,Armenian Red Cross Society (ARCS)\r\nTITLE: C...,"Oct 5, 2012",Community Technology Access Consultant,Armenian Red Cross Society (ARCS),,,,,,Up to 3 months,...,10,False,"[ToT, Business Administration, Community Techn...","[Yerevan, Armenia]","[05 October 2012, 3 months, 11 October 2012]",[],[],[],[],"[future CTA\r\ndevelopment, computer literacy]"
