In [29]:
import pandas as pd
import numpy as np

In [30]:
df = pd.read_csv("/kaggle/input/resume-dataset-for-resume-ranking-group-10/resume_data.csv")

In [31]:
# Flatten the list of degrees
all_req = df['educationaL_requirements'].dropna().explode()

unique_req = all_req.unique()

unique_req = sorted(unique_req)

print(unique_req)

['B.Sc in Civil Engineering from a reputed university.', 'B.Sc in Computer Science & Engineering from a reputed university.', 'BBA in Accounting and Finance', 'Bachelor of Science (BSc)', 'Bachelor of Science (BSc) in Civil Engineering', 'Bachelor of Science (BSc) in Computer Science', 'Bachelor of Science (BSc) in Computer Science & Engineering', 'Bachelor of Science (BSc) in Mechanical Engineering, Diploma in Mechanical', 'Bachelor/Honors', 'Bachelors or Masters degree in Computer Science, Engineering, or a related field.', 'Bachelor’s degree in Mechanical Engineering from a reputed institute.', 'Diploma, Bachelor/Honors', 'Fresh graduates with a Bachelor’s degree in Mechanical Engineering or a related field.', 'M.Sc in Computer Science & Engineering or in any relevant discipline from a reputed University', 'Master of Business Administration (MBA)', 'Master of Commerce (MCom) in Accounting, Must have CACC certification from any CA firm', 'Masters degree in any discipline, Bachelor of

In [32]:
import re

# Mapping canonical degree to various common forms
EDUCATION_ALIASES = {
    "phd": ["doctor of philosophy", "ph.d", "ph.d.", "phd", "doctorate", "ph.d. in", "phd candidate"],
    "mba": ["master of business administration", "mba executive", "executive mba", "mba", "masters of business administration"],
    "msc": ["master of science", "m.sc", "m.s", "masters of science", "msc", "masters in science", "m.sc."],
    "ma": ["master of arts", "m.a", "m.a.", "masters of arts"],
    "mcom": ["master of commerce", "m.com", "mcom"],
    "me": ["master of engineering", "m.e", "m.eng", "m.e.", "m.engg"],
    "mtech": ["master of technology", "m.tech", "mtech", "mtech integrated"],
    "bsc": ["bachelor of science", "b.sc", "b.s", "bsc", "b.sc.", "b.s.", "honours bachelor of science", "bachelors of science"],
    "ba": ["bachelor of arts", "b.a", "ba", "b.a.", "bachelors of arts"],
    "bcom": ["bachelor of commerce", "b.com", "bcom"],
    "be": ["bachelor of engineering", "b.e", "b.e.", "b.eng", "b.engg", "bachelor of engineering (b.e"],
    "btech": ["bachelor of technology", "b.tech", "b.tech.", "btech", "b.tech(computers)", "dual degree (b.tech + m.tech)", "integrated b.tech & m.tech"],
    "bba": ["bachelor of business administration", "b.b.a", "bba", "bba - accounting", "bba - finance", "bachelor business administration"],
    "bca": ["bachelor of computer applications", "b.c.a", "bca"],
    "mca": ["master of computer applications", "m.c.a", "mca"],
    "bs": ["bs", "b.s", "b.s.", "b.s in", "bachelor's degree in science", "bachelor's in science"],
    "ms": ["ms", "m.s", "m.s.", "master in computer science", "masters of science in information technology"],
    "aa": ["associate of arts", "a.a", "aa"],
    "aas": ["associate of applied science", "a.a.s", "aas"],
    "as": ["associate of science", "a.s", "as", "associate of science degree"],
    "associate": ["associate's degree", "associate degree", "associates degree", "associates", "associate"],
    "diploma": ["technical diploma", "associate diploma", "polytechnic diploma", "diploma", "general diploma", "pg diploma", "master's diploma"],
    "high school": ["high school diploma", "ged", "grade 12", "xii", "x", "kcse"],
    "certificate": ["certificate of completion", "graduate certificate", "business certification", "epa certification", "aws brazing certification", "skills", "course", "certification", "minor", "training", "coaching"],
    "others": ["n/a", "select one", "attending", "testing computer software", "general courses"],

    # Education levels that are more common in the Sri Lankan context
    "al": ["advanced level", "a/l", "a.l", "gce a/l", "gce advanced level", "gce (a/l)", "gce(al)", "gce-a/l"],
    "ol": ["ordinary level", "o/l", "o.l", "gce o/l", "gce ordinary level", "gce (o/l)", "gce(ol)", "gce-o/l"],
    "nvq": ["nvq", "nvq level 3", "nvq level 4", "nvq level 5", "nvq level 6", "national vocational qualification", "nvq diploma"],
    "hnd": ["hnd", "higher national diploma", "hnd in", "higher national diploma in"],
    "cima": ["cima", "chartered institute of management accountants", "cima qualification"],
    "acca": ["acca", "association of chartered certified accountants"],
    "ca": ["chartered accountant", "institute of chartered accountants of sri lanka", "ica", "ca sri lanka"],
    "slim": ["slim", "slim diploma", "sri lanka institute of marketing", "slim pgd"],
    "nibt": ["nibt", "national institute of business & technology", "nibt diploma"],
    "bit": ["bit", "bachelor of information technology", "bit degree", "bit (colombo university)"]

}

# Ranking the levels (higher number = higher qualification)
# EDUCATION_RANKS = {
EDUCATION_RANKS = {
    "others": 0,
    
    "high school": 1,
    "certificate": 1,
    "ol": 1,
    
    "al": 2,
    
    "diploma": 3,
    "associate": 3,
    "nvq": 3,
    "hnd": 3,
    "aa": 3,
    "aas": 3,
    "as": 3,
    "slim": 3,  
    "nibt": 3,   
    
    "bsc": 4,
    "bs": 4,
    "ba": 4,
    "be": 4,
    "btech": 4,
    "bit": 4,    
    "cima": 4,   
    "acca": 4,      
    "bcom": 4,
    "bba": 4,
    "bca": 4,
    
    "msc": 5,
    "ms": 5,
    "ma": 5,
    "me": 5,
    "mtech": 5,
    "mcom": 5,
    "mba": 5,
    "mca": 5,
    "ca": 5,
    
    "phd": 6
}

In [33]:
# Flattenning the aliases for easy reverse lookup
FLATTENED_EDU_ALIASES = {
    synonym: canonical
    for canonical, synonyms in EDUCATION_ALIASES.items()
    for synonym in synonyms
}

In [42]:
def encode_ed_req(text):
    if not isinstance(text, str) or not text.strip():
        return EDUCATION_RANKS["others"]

    best_rank = EDUCATION_RANKS["others"]

    # Normalize and split if needed (e.g., multiple degrees mentioned with comma, slash, newline, etc.)
    parts = re.split(r'[,\n/•;]', text.lower())

    for part in parts:
        part_clean = re.sub(r'[^\w\s]', '', part.strip())  # remove punctuation

        # Match from aliases
        for synonym, canonical in FLATTENED_EDU_ALIASES.items():
            if synonym in part_clean:
                rank = EDUCATION_RANKS.get(canonical, 0)
                best_rank = max(best_rank, rank)

        # Fallback direct canonical match
        for canonical, rank in EDUCATION_RANKS.items():
            if canonical in part_clean:
                best_rank = max(best_rank, rank)

    return best_rank

In [43]:
df['ed_req_encoded'] = df['educationaL_requirements'].apply(encode_ed_req)

In [44]:
df[['ed_req_encoded', 'educationaL_requirements']]

Unnamed: 0,ed_req_encoded,educationaL_requirements
0,4,B.Sc in Computer Science & Engineering from a ...
1,5,M.Sc in Computer Science & Engineering or in a...
2,5,Master of Business Administration (MBA)
3,4,Bachelor/Honors
4,4,Bachelor of Science (BSc) in Computer Science
...,...,...
9539,4,Bachelor of Science (BSc)
9540,4,Bachelor of Science (BSc) in Computer Science ...
9541,4,BBA in Accounting and Finance
9542,4,Bachelor/Honors


In [45]:
ed_req_counts = df["educationaL_requirements"].value_counts()

print(ed_req_counts)

educationaL_requirements
Bachelor/Honors                                                                                                                     2044
Bachelor of Science (BSc) in Computer Science                                                                                        681
Bachelor of Science (BSc) in Computer Science & Engineering                                                                          680
Bachelor of Science (BSc)                                                                                                            680
Bachelor of Science (BSc) in Civil Engineering                                                                                       342
B.Sc in Civil Engineering from a reputed university.                                                                                 342
•Bachelor of Business Administration (BBA) in Human Resource Management\n•BBA/MBA in HRM/Management from any reputed university.     342
•Bachelor of Bus

In [46]:
ed_req_encoded = df["ed_req_encoded"].value_counts()

print(ed_req_encoded)

ed_req_encoded
4    5451
5    4093
Name: count, dtype: int64
