In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("/kaggle/input/resume-dataset-for-resume-ranking-group-10/resume_data.csv")

In [None]:
null_summary = pd.DataFrame({
    'Column Name': df.columns,
    'Null Count': df.isnull().sum().values
})

null_summary = null_summary[null_summary['Null Count'] > 0]

print(null_summary)

In [None]:
cols = [
    'degree_names',
    'major_field_of_studies',
    'start_dates',
    'end_dates',
]

df = df.dropna(subset=cols)

In [None]:
df.shape

# Converting the degree_names into desired format

In [3]:
import re

EDUCATION_ALIASES = {
    "phd": ["doctor of philosophy", "ph.d", "ph.d.", "phd", "doctorate", "ph.d. in", "phd candidate"],
    "mba": ["master of business administration", "mba executive", "executive mba", "mba", "masters of business administration"],
    "msc": ["master of science", "m.sc", "m.s", "masters of science", "msc", "masters in science", "m.sc."],
    "ma": ["master of arts", "m.a", "m.a.", "masters of arts"],
    "mcom": ["master of commerce", "m.com", "mcom"],
    "me": ["master of engineering", "m.e", "m.eng", "m.e.", "m.engg"],
    "mtech": ["master of technology", "m.tech", "mtech", "mtech integrated"],
    "bsc": ["bachelor of science", "b.sc", "b.s", "bsc", "b.sc.", "b.s.", "honours bachelor of science", "bachelors of science"],
    "ba": ["bachelor of arts", "b.a", "ba", "b.a.", "bachelors of arts"],
    "bcom": ["bachelor of commerce", "b.com", "bcom"],
    "be": ["bachelor of engineering", "b.e", "b.e.", "b.eng", "b.engg", "bachelor of engineering (b.e"],
    "btech": ["bachelor of technology", "b.tech", "b.tech.", "btech", "b.tech(computers)", "dual degree (b.tech + m.tech)", "integrated b.tech & m.tech"],
    "bba": ["bachelor of business administration", "b.b.a", "bba", "bba - accounting", "bba - finance", "bachelor business administration"],
    "bca": ["bachelor of computer applications", "b.c.a", "bca"],
    "mca": ["master of computer applications", "m.c.a", "mca"],
    "bs": ["bs", "b.s", "b.s.", "b.s in", "bachelor's degree in science", "bachelor's in science"],
    "ms": ["ms", "m.s", "m.s.", "master in computer science", "masters of science in information technology"],
    "aa": ["associate of arts", "a.a", "aa"],
    "aas": ["associate of applied science", "a.a.s", "aas"],
    "as": ["associate of science", "a.s", "as", "associate of science degree"],
    "associate": ["associate's degree", "associate degree", "associates degree", "associates", "associate"],
    "diploma": ["technical diploma", "associate diploma", "polytechnic diploma", "diploma", "general diploma", "pg diploma", "master's diploma"],
    "high school": ["high school diploma", "ged", "grade 12", "xii", "x", "kcse"],
    "certificate": ["certificate of completion", "graduate certificate", "business certification", "epa certification", "aws brazing certification", "skills", "course", "certification", "minor", "training", "coaching"],
    "others": ["n/a", "select one", "attending", "testing computer software", "general courses"],

    # Education levels that are more common in the Sri Lankan context
    "al": ["advanced level", "a/l", "a.l", "gce a/l", "gce advanced level", "gce (a/l)", "gce(al)", "gce-a/l"],
    "ol": ["ordinary level", "o/l", "o.l", "gce o/l", "gce ordinary level", "gce (o/l)", "gce(ol)", "gce-o/l"],
    "nvq": ["nvq", "nvq level 3", "nvq level 4", "nvq level 5", "nvq level 6", "national vocational qualification", "nvq diploma"],
    "hnd": ["hnd", "higher national diploma", "hnd in", "higher national diploma in"],
    "cima": ["cima", "chartered institute of management accountants", "cima qualification"],
    "acca": ["acca", "association of chartered certified accountants"],
    "ca": ["chartered accountant", "institute of chartered accountants of sri lanka", "ica", "ca sri lanka"],
    "slim": ["slim", "slim diploma", "sri lanka institute of marketing", "slim pgd"],
    "nibt": ["nibt", "national institute of business & technology", "nibt diploma"],
    "bit": ["bit", "bachelor of information technology", "bit degree", "bit (colombo university)"]

}

EDUCATION_RANKS = {
    "others": 0,
    
    "high school": 1,
    "certificate": 1,
    "ol": 1,
    
    "al": 2,
    
    "diploma": 3,
    "associate": 3,
    "nvq": 3,
    "hnd": 3,
    "aa": 3,
    "aas": 3,
    "as": 3,
    "slim": 3,  
    "nibt": 3,   
    
    "bsc": 4,
    "bs": 4,
    "ba": 4,
    "be": 4,
    "btech": 4,
    "bit": 4,    
    "cima": 4,   
    "acca": 4,      
    "bcom": 4,
    "bba": 4,
    "bca": 4,
    
    "msc": 5,
    "ms": 5,
    "ma": 5,
    "me": 5,
    "mtech": 5,
    "mcom": 5,
    "mba": 5,
    "mca": 5,
    "ca": 5,
    
    "phd": 6
}

# Flattenning the aliases for easy reverse lookup
FLATTENED_ALIASES = {}
for canonical, synonyms in EDUCATION_ALIASES.items():
    for synonym in synonyms:
        FLATTENED_ALIASES[synonym.lower()] = canonical


def get_highest_education(degree_entries):
    if not isinstance(degree_entries, list):
        degree_entries = [degree_entries]

    best_match = ("unknown", -1)

    for entry in degree_entries:
        if not isinstance(entry, str):
            continue

        # Clean and normalize text
        text = re.sub(r'[^\w\s]', '', entry.lower())

        # Try matching from aliases
        for synonym, canonical in FLATTENED_ALIASES.items():
            if synonym in text:
                rank = EDUCATION_RANKS.get(canonical, -1)
                if rank > best_match[1]:
                    best_match = (canonical, rank)

        # Fallback to direct canonical keyword matching
        for canonical, rank in EDUCATION_RANKS.items():
            if canonical in text:
                if rank > best_match[1]:
                    best_match = (canonical, rank)

    return best_match[1]

# Educational Requirements

In [4]:
import re

EDUCATION_ALIASES = {
    "phd": ["doctor of philosophy", "ph.d", "ph.d.", "phd", "doctorate", "ph.d. in", "phd candidate"],
    "mba": ["master of business administration", "mba executive", "executive mba", "mba", "masters of business administration"],
    "msc": ["master of science", "m.sc", "m.s", "masters of science", "msc", "masters in science", "m.sc."],
    "ma": ["master of arts", "m.a", "m.a.", "masters of arts"],
    "mcom": ["master of commerce", "m.com", "mcom"],
    "me": ["master of engineering", "m.e", "m.eng", "m.e.", "m.engg"],
    "mtech": ["master of technology", "m.tech", "mtech", "mtech integrated"],
    "bsc": ["bachelor of science", "b.sc", "b.s", "bsc", "b.sc.", "b.s.", "honours bachelor of science", "bachelors of science"],
    "ba": ["bachelor of arts", "b.a", "ba", "b.a.", "bachelors of arts"],
    "bcom": ["bachelor of commerce", "b.com", "bcom"],
    "be": ["bachelor of engineering", "b.e", "b.e.", "b.eng", "b.engg", "bachelor of engineering (b.e"],
    "btech": ["bachelor of technology", "b.tech", "b.tech.", "btech", "b.tech(computers)", "dual degree (b.tech + m.tech)", "integrated b.tech & m.tech"],
    "bba": ["bachelor of business administration", "b.b.a", "bba", "bba - accounting", "bba - finance", "bachelor business administration"],
    "bca": ["bachelor of computer applications", "b.c.a", "bca"],
    "mca": ["master of computer applications", "m.c.a", "mca"],
    "bs": ["bs", "b.s", "b.s.", "b.s in", "bachelor's degree in science", "bachelor's in science"],
    "ms": ["ms", "m.s", "m.s.", "master in computer science", "masters of science in information technology"],
    "aa": ["associate of arts", "a.a", "aa"],
    "aas": ["associate of applied science", "a.a.s", "aas"],
    "as": ["associate of science", "a.s", "as", "associate of science degree"],
    "associate": ["associate's degree", "associate degree", "associates degree", "associates", "associate"],
    "diploma": ["technical diploma", "associate diploma", "polytechnic diploma", "diploma", "general diploma", "pg diploma", "master's diploma"],
    "high school": ["high school diploma", "ged", "grade 12", "xii", "x", "kcse"],
    "certificate": ["certificate of completion", "graduate certificate", "business certification", "epa certification", "aws brazing certification", "skills", "course", "certification", "minor", "training", "coaching"],
    "others": ["n/a", "select one", "attending", "testing computer software", "general courses"],

    # Education levels that are more common in the Sri Lankan context
    "al": ["advanced level", "a/l", "a.l", "gce a/l", "gce advanced level", "gce (a/l)", "gce(al)", "gce-a/l"],
    "ol": ["ordinary level", "o/l", "o.l", "gce o/l", "gce ordinary level", "gce (o/l)", "gce(ol)", "gce-o/l"],
    "nvq": ["nvq", "nvq level 3", "nvq level 4", "nvq level 5", "nvq level 6", "national vocational qualification", "nvq diploma"],
    "hnd": ["hnd", "higher national diploma", "hnd in", "higher national diploma in"],
    "cima": ["cima", "chartered institute of management accountants", "cima qualification"],
    "acca": ["acca", "association of chartered certified accountants"],
    "ca": ["chartered accountant", "institute of chartered accountants of sri lanka", "ica", "ca sri lanka"],
    "slim": ["slim", "slim diploma", "sri lanka institute of marketing", "slim pgd"],
    "nibt": ["nibt", "national institute of business & technology", "nibt diploma"],
    "bit": ["bit", "bachelor of information technology", "bit degree", "bit (colombo university)"]

}

# EDUCATION_RANKS = {
EDUCATION_RANKS = {
    "others": 0,
    
    "high school": 1,
    "certificate": 1,
    "ol": 1,
    
    "al": 2,
    
    "diploma": 3,
    "associate": 3,
    "nvq": 3,
    "hnd": 3,
    "aa": 3,
    "aas": 3,
    "as": 3,
    "slim": 3,  
    "nibt": 3,   
    
    "bsc": 4,
    "bs": 4,
    "ba": 4,
    "be": 4,
    "btech": 4,
    "bit": 4,    
    "cima": 4,   
    "acca": 4,      
    "bcom": 4,
    "bba": 4,
    "bca": 4,
    
    "msc": 5,
    "ms": 5,
    "ma": 5,
    "me": 5,
    "mtech": 5,
    "mcom": 5,
    "mba": 5,
    "mca": 5,
    "ca": 5,
    
    "phd": 6
}

In [5]:
# Flattenning the aliases for easy reverse lookup
FLATTENED_EDU_ALIASES = {
    synonym: canonical
    for canonical, synonyms in EDUCATION_ALIASES.items()
    for synonym in synonyms
}

def encode_ed_req(text):
    if not isinstance(text, str) or not text.strip():
        return EDUCATION_RANKS["others"]

    best_rank = EDUCATION_RANKS["others"]

    parts = re.split(r'[,\n/•;]', text.lower())

    for part in parts:
        part_clean = re.sub(r'[^\w\s]', '', part.strip())  # remove punctuation

        # Match from aliases
        for synonym, canonical in FLATTENED_EDU_ALIASES.items():
            if synonym in part_clean:
                rank = EDUCATION_RANKS.get(canonical, 0)
                best_rank = max(best_rank, rank)

        # Fallback direct canonical match
        for canonical, rank in EDUCATION_RANKS.items():
            if canonical in part_clean:
                best_rank = max(best_rank, rank)

    return best_rank

# Cleaning major_field_of_study

In [6]:
def clean_major_fields(df, column_name='major_field_of_studies'):
    df_cleaned = df.copy()

    abbreviations = {
        "cse": "computer science",
        "cs": "computer science",
        "it": "information technology",
        "computer": "computer science",
        "computers": "computer science",
        "ai": "artificial intelligence",
        "ml": "machine learning",
        "ds": "data science",
        "ece": "electronics engineering",
        "eee": "electrical engineering",
        "ee": "electrical engineering",
        "electrical": "electrical engineering",
        "electronics": "electronics engineering",
        "me": "mechanical engineering",
        "ce": "civil engineering",
        "che": "chemical engineering",
        "mechanical": "mechanical engineering",
        "civil": "civil engineering",
        "chemical": "chemical engineering",
        "finance": "finance",
        "accounting": "accounting",
        "business": "business administration",
        "management": "business administration",
        "marketing": "marketing",
        "statistics": "statistics",
        "economics": "economics",
        "biology": "biology",
        "chemistry": "chemistry",
        "physics": "physics",
        "math": "maths",
        "mathematics": "maths"
    }

    split_pattern = re.compile(r"[\/,&;|\s]+")

    empty_values = {"n/a", "none", "na", "null", "", "nan", "n, a", "n,a", "n", "a"}

    def process_value(value):
        if pd.isna(value) or value is None:
            return []
        
        value_str = str(value).lower().strip()

        if value_str in empty_values:
            return []

        value_str = value_str.strip("[]\"'")

        items = [item.strip() for item in split_pattern.split(value_str) if item.strip()]

        cleaned_items = []
        for item in items:
            if item in empty_values:
                continue
            cleaned_item = abbreviations.get(item, item)
            if cleaned_item:
                cleaned_items.append(cleaned_item)

        seen = set()
        unique_items = [x for x in cleaned_items if not (x in seen or seen.add(x))]

        return unique_items if unique_items else []

    df_cleaned[column_name] = df_cleaned[column_name].apply(process_value)
    
    return df_cleaned

# Cleaning experience_requirements

In [7]:
def clean_experience_min_only(df, column_name='experience', default_value=0):
    """
    Cleans experience column to extract the minimum number of years mentioned.
    Examples:
        'At least 3 years' → 3
        '3 to 5 years' → 3
        '1 to 2 years' → 1
        NaN or invalid → default_value (e.g., 0)

    Args:
        df (pd.DataFrame): Input DataFrame
        column_name (str): The column name to clean
        default_value (int): Value to assign to missing/invalid entries

    Returns:
        pd.DataFrame: DataFrame with cleaned experience column
    """
    def extract_min_years(value):
        if pd.isna(value):
            return default_value
        # Find all numbers in the string
        numbers = re.findall(r'\d+', str(value))
        if numbers:
            return int(numbers[0])  # take the minimum
        return default_value

    df_cleaned = df.copy()
    df_cleaned[column_name] = df_cleaned[column_name].apply(extract_min_years)
    return df_cleaned

# Job Experience

In [8]:
import ast

def remove_na_and_none_from_list(col):
    def clean(entry):
        try:
            # if already a list, skip parsing
            if isinstance(entry, list):
                items = entry
            else:
                items = ast.literal_eval(entry)
            return [i for i in items if str(i).strip().upper() not in ['N/A', 'NONE'] and i is not None]
        except:
            return entry  # return original if parsing fails
    return col.apply(clean)

df['start_dates'] = remove_na_and_none_from_list(df['start_dates'])
df['end_dates'] = remove_na_and_none_from_list(df['end_dates'])

In [9]:
import re

def replace_seasons_with_months(col):
    # Mapping of season to month
    season_to_month = {
        'spring': '03',
        'summer': '06',
        'fall': '09',
        'autumn': '09',
        'winter': '12',
    }

    def replace_season(entry):
        try:
            items = ast.literal_eval(entry) if isinstance(entry, str) else entry
            updated_items = []
            for item in items:
                if item is None:
                    updated_items.append(item)
                    continue
                # Match season followed by year, e.g., "Summer 2013"
                match = re.match(r'(?i)\b(spring|summer|fall|autumn|winter)\b\s+(\d{4})', str(item).strip())
                if match:
                    season = match.group(1).lower()
                    year = match.group(2)
                    month = season_to_month.get(season)
                    updated_items.append(f"{month}/{year}")
                else:
                    updated_items.append(item)
            return updated_items
        except:
            return entry  # keep original if parsing fails

    return col.apply(replace_season)

df['start_dates'] = replace_seasons_with_months(df['start_dates'])
df['end_dates'] = replace_seasons_with_months(df['end_dates'])

In [10]:
from datetime import datetime
import ast

def replace_current_terms_with_today(col):
    today_str = datetime.today().strftime('%b %d %Y')  # Example: 'Apr 22 2025'
    keywords = {'till date', 'current', 'ongoing', 'present', '∞'}

    def clean(entry):
        try:
            items = ast.literal_eval(entry) if isinstance(entry, str) else entry
            return [
                today_str if isinstance(i, str) and i.strip().lower() in keywords else i
                for i in items
            ]
        except:
            return entry  # return as-is if parsing fails

    return col.apply(clean)

# Apply it
df['start_dates'] = replace_current_terms_with_today(df['start_dates'])
df['end_dates'] = replace_current_terms_with_today(df['end_dates'])

filter and display rows where either start_dates or end_dates columns contains any item with the substring"20XX"

In [11]:
import ast

def contains_20xx(entry):
    try:
        items = ast.literal_eval(entry) if isinstance(entry, str) else entry
        return any(isinstance(i, str) and '20xx' in i.lower() for i in items)
    except:
        return False

mask = df['start_dates'].apply(contains_20xx) | df['end_dates'].apply(contains_20xx)
df_with_20xx = df.loc[mask, ['start_dates', 'end_dates']]

print(df_with_20xx)

                 start_dates                         end_dates
81    [May 20XX, April 20XX]           [August 20XX, May 20XX]
180       [20xx, 20xx, 20xx]  [May 01 2025, May 01 2025, 20xx]
222   [May 20XX, April 20XX]           [August 20XX, May 20XX]
548       [20xx, 20xx, 20xx]  [May 01 2025, May 01 2025, 20xx]
632         [September 20XX]                      [April 20XX]
...                      ...                               ...
9267        [September 20XX]                      [April 20XX]
9324        [September 20XX]                      [April 20XX]
9373  [May 20XX, April 20XX]           [August 20XX, May 20XX]
9453        [September 20XX]                      [April 20XX]
9508      [20xx, 20xx, 20xx]  [May 01 2025, May 01 2025, 20xx]

[84 rows x 2 columns]


In [12]:
import ast

def replace_array_with_unknown(entry):
    try:
        items = ast.literal_eval(entry) if isinstance(entry, str) else entry
        if any(isinstance(i, str) and '20xx' in i.lower() for i in items):
            return 'unknown'  # Replace entire array with the string 'unknown'
        return items
    except:
        return entry

df['start_dates'] = df['start_dates'].apply(replace_array_with_unknown)
df['end_dates'] = df['end_dates'].apply(replace_array_with_unknown)

replace each record in start_dates and end_dates with 'unknown' if either one of them contains an empty array.

In [13]:
import ast

def replace_empty_array_with_unknown(row):
    try:
        # Parse the entries if they are string representations of lists
        start = ast.literal_eval(row['start_dates']) if isinstance(row['start_dates'], str) else row['start_dates']
        end = ast.literal_eval(row['end_dates']) if isinstance(row['end_dates'], str) else row['end_dates']
        
        # Check if either start or end is an empty list
        if isinstance(start, list) and len(start) == 0 or isinstance(end, list) and len(end) == 0:
            return pd.Series({'start_dates': 'unknown', 'end_dates': 'unknown'})
        else:
            return pd.Series({'start_dates': row['start_dates'], 'end_dates': row['end_dates']})
    except:
        # In case of any parsing error, return the original values
        return pd.Series({'start_dates': row['start_dates'], 'end_dates': row['end_dates']})

# Apply the function to each row
df[['start_dates', 'end_dates']] = df.apply(replace_empty_array_with_unknown, axis=1)

converting all dates in string format to YYYY-MM

In [14]:
import numpy as np
import pandas as pd
from dateutil import parser
import ast

def standardize_date_array(entry):
    if isinstance(entry, str) and entry == 'unknown':
        return entry
    if isinstance(entry, float) and pd.isna(entry):
        return entry

    try:
        items = ast.literal_eval(entry) if isinstance(entry, str) else entry
        if not isinstance(items, list):  # guard clause
            return entry

        cleaned = []
        for item in items:
            if isinstance(item, str):
                try:
                    parsed = parser.parse(item, fuzzy=True)
                    cleaned.append(parsed.strftime('%Y-%m'))
                except:
                    cleaned.append(item)
            else:
                cleaned.append(item)
        return cleaned
    except:
        return entry
    
df['start_dates'] = df['start_dates'].apply(standardize_date_array)
df['end_dates'] = df['end_dates'].apply(standardize_date_array)

check if there are arrays with unequal length

In [15]:
import ast

def count_unequal_length_date_arrays(df):
    count = 0
    for start, end in zip(df['start_dates'], df['end_dates']):
        if isinstance(start, list) and isinstance(end, list):
            if len(start) != len(end):
                count += 1
    return count

unequal_length_count = count_unequal_length_date_arrays(df)
print("Total records with unequal-length date arrays:", unequal_length_count)

Total records with unequal-length date arrays: 452


replace the entries having unequal array length with string 'unknown'.

In [16]:
def replace_unequal_length_records(df):
    for idx, (start, end) in enumerate(zip(df['start_dates'], df['end_dates'])):
        if isinstance(start, list) and isinstance(end, list):
            if len(start) != len(end):
                df.at[idx, 'start_dates'] = 'unknown'
                df.at[idx, 'end_dates'] = 'unknown'
    return df

# Apply the function to your dataframe
df = replace_unequal_length_records(df)

sorting the dates arrays in accending order in both columns

In [17]:
from datetime import datetime

def sort_dates_in_array(entry):
    try:
        # Convert string dates to datetime objects for sorting
        items = ast.literal_eval(entry) if isinstance(entry, str) else entry
        # Sort the items based on datetime conversion
        sorted_items = sorted(items, key=lambda x: datetime.strptime(x, '%b %Y') if ' ' in x else datetime.strptime(x, '%m/%Y') if '/' in x else datetime.strptime(x, '%Y-%m'))
        return sorted_items
    except:
        return entry  # If any error occurs, return the original entry

# Apply the sorting function to both start_dates and end_dates
df['start_dates'] = df['start_dates'].apply(sort_dates_in_array)
df['end_dates'] = df['end_dates'].apply(sort_dates_in_array)
from datetime import datetime

def sort_dates_in_array(entry):
    try:
        # Convert string dates to datetime objects for sorting
        items = ast.literal_eval(entry) if isinstance(entry, str) else entry
        # Sort the items based on datetime conversion
        sorted_items = sorted(items, key=lambda x: datetime.strptime(x, '%b %Y') if ' ' in x else datetime.strptime(x, '%m/%Y') if '/' in x else datetime.strptime(x, '%Y-%m'))
        return sorted_items
    except:
        return entry  # If any error occurs, return the original entry

# Apply the sorting function to both start_dates and end_dates
df['start_dates'] = df['start_dates'].apply(sort_dates_in_array)
df['end_dates'] = df['end_dates'].apply(sort_dates_in_array)

In [18]:
def calculate_experience(start_dates, end_dates):
    # Check for 'unknown' in strings
    if isinstance(start_dates, str) and start_dates.lower() == 'unknown':
        return 'unknown'
    if isinstance(end_dates, str) and end_dates.lower() == 'unknown':
        return 'unknown'

    # Check for 'unknown' inside lists
    if isinstance(start_dates, list) and any(str(d).lower() == 'unknown' for d in start_dates):
        return 'unknown'
    if isinstance(end_dates, list) and any(str(d).lower() == 'unknown' for d in end_dates):
        return 'unknown'

    # Check for NaNs safely
    if isinstance(start_dates, list):
        if any(pd.isna(d) for d in start_dates):
            return np.nan
    elif pd.isna(start_dates):
        return np.nan

    if isinstance(end_dates, list):
        if any(pd.isna(d) for d in end_dates):
            return np.nan
    elif pd.isna(end_dates):
        return np.nan

    # Calculate total years of experience
    total_years = 0
    for start, end in zip(start_dates, end_dates):
        try:
            start_dt = datetime.strptime(str(start).strip(), '%Y-%m')
            end_dt = datetime.strptime(str(end).strip(), '%Y-%m')
            total_years += (end_dt - start_dt).days / 365.25
        except Exception:
            return np.nan  # Invalid date format

    return round(total_years, 2) if total_years != 0 else np.nan

df['experience_years'] = df.apply(
    lambda row: calculate_experience(row['start_dates'], row['end_dates']),
    axis=1
)

In [19]:
# Count how many 'unknown' strings are in the experience_years column
unknown_experience_count = (df['experience_years']=='unknown').sum()

print("Total 'unknown' in experience_years:", unknown_experience_count)

Total 'unknown' in experience_years: 788


In [20]:
df = df.drop(['start_dates', 'end_dates'], axis=1)

total 'unknown' s are 788 and total 'nan's 112.

# Cleaning Skills

In [21]:
import ast
import re

def clean_and_merge_skills(df, col1, col2, col3, new_col='merged_skills'):
    # Helper: Parse and clean a single cell
    def parse_and_clean(cell):
        try:
            parsed = ast.literal_eval(cell)
        except Exception:
            return []

        def flatten(x):
            if isinstance(x, list):
                return [item for sub in x for item in flatten(sub)]
            return [x]

        flat = flatten(parsed)
        cleaned = [s.strip().lower() for s in flat if isinstance(s, str)]
        return cleaned

    # Helper: Check if the list only contains generic skill patterns
    def is_generic_skills_list(skill_list):
        pattern = re.compile(r'^skill\s*\d+$')
        return all(pattern.match(s) for s in skill_list) and len(skill_list) > 0

    # Clean both columns
    cleaned_col1 = df[col1].apply(parse_and_clean)
    cleaned_col2 = df[col2].apply(parse_and_clean)
    cleaned_col3 = df[col3].apply(parse_and_clean)

    # Merge and deduplicate
    df[new_col] = [
        list(dict.fromkeys(c1 + c2 + c3)) for c1, c2, c3 in zip(cleaned_col1, cleaned_col2, cleaned_col3)
    ]

    # Remove generic skill lists
    df[new_col] = df[new_col].apply(lambda skills: [] if is_generic_skills_list(skills) else skills)

    # Drop original columns
    df.drop(columns=[col1, col2, col3], inplace=True)

    return df

clean_and_merge_skills(df, 'skills', 'related_skils_in_job', 'certification_skills', new_col='merged_skills')

Unnamed: 0,address,career_objective,educational_institution_name,degree_names,passing_years,educational_results,result_types,major_field_of_studies,professional_company_names,company_urls,...,expiry_dates,﻿job_position_name,educationaL_requirements,experiencere_requirement,age_requirement,responsibilities.1,skills_required,matched_score,experience_years,merged_skills
0,,Big data analytics working and database wareho...,['The Amity School of Engineering & Technology...,['B.Tech'],['2019'],['N/A'],[None],['Electronics'],['Coca-COla'],[None],...,,Senior Software Engineer,B.Sc in Computer Science & Engineering from a ...,At least 1 year,,Technical Support\nTroubleshooting\nCollaborat...,,0.850000,5.5,"[big data, hadoop, hive, python, mapreduce, sp..."
1,,Fresher looking to join as a data analyst and ...,"['Delhi University - Hansraj College', 'Delhi ...","['B.Sc (Maths)', 'M.Sc (Science) (Statistics)']","['2015', '2018']","['N/A', 'N/A']","['N/A', 'N/A']","['Mathematics', 'Statistics']",['BIB Consultancy'],['N/A'],...,,Machine Learning (ML) Engineer,M.Sc in Computer Science & Engineering or in a...,At least 5 year(s),,Machine Learning Leadership\nCross-Functional ...,,0.750000,5.66,"[data analysis, data analytics, business analy..."
2,,,"['Birla Institute of Technology (BIT), Ranchi']",['B.Tech'],['2018'],['N/A'],['N/A'],['Electronics/Telecommunication'],['Axis Bank Limited'],['N/A'],...,,"Executive/ Senior Executive- Trade Marketing, ...",Master of Business Administration (MBA),At least 3 years,,"Trade Marketing Executive\nBrand Visibility, S...",Brand Promotion\nCampaign Management\nField Su...,0.416667,6.92,"[software development, machine learning, deep ..."
3,,To obtain a position in a fast-paced business ...,"['Martinez Adult Education, Business Training ...",['Computer Applications Specialist Certificate...,['2008'],[None],[None],['Computer Applications'],"['Company Name ï¼ City , State', 'Company Name...","[None, None, None, None, None, None]",...,,Business Development Executive,Bachelor/Honors,1 to 3 years,Age 22 to 30 years,Apparel Sourcing\nQuality Garment Sourcing\nRe...,Fast typing skill\nIELTSInternet browsing & on...,0.760000,13.83,"[accounts payables, accounts receivables, acco..."
4,,Professional accountant with an outstanding wo...,['Kent State University'],['Bachelor of Business Administration'],[None],['3.84'],[None],['Accounting'],"['Company Name', 'Company Name', 'Company Name...","[None, None, None, None, None]",...,"['February 15, 2021']",Senior iOS Engineer,Bachelor of Science (BSc) in Computer Science,At least 4 years,,iOS Lifecycle\nRequirement Analysis\nNative Fr...,iOS\niOS App Developer\niOS Application Develo...,0.650000,17.33,"[analytical reasoning, compliance testing know..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9539,,,['Sanghvi College of Engineering'],['B.Tech'],['2019'],['N/A'],['N/A'],['N/A'],['BPM Foundation'],['N/A'],...,,Data Engineer,Bachelor of Science (BSc),5 to 8 years,,Data Platform Design\nData Pipeline Developmen...,Azure\nBig Data\nData Analytics\nETL Tools\nPo...,0.683333,4.91,"[mathematical modelling, machine learning, pre..."
9540,,Expertise EDA modeler. I like to learn what my...,"['KVoCT, Pune', 'KVoCT, Pune']","['B.CA', 'M.CA']","['2018', '2020']","[None, None]","[None, None]","[None, None]",['Passionate Solution'],[None],...,,Executive/ Sr. Executive -IT,Bachelor of Science (BSc) in Computer Science ...,3 to 5 years,Age at most 40 years,Hardware & Software Installation\nSystem Monit...,,0.650000,5.25,"[data analysis, business analysis, machine lea..."
9541,,Looking for roles related to application devel...,['PGG College Mysore'],['B.BA'],['2019'],['N/A'],['N/A'],['N/A'],['ZigSAW'],['N/A'],...,,Executive - VAT,BBA in Accounting and Finance,1 to 3 years,,Mushak Forms Maintenance\nVAT Software & MS Of...,VAT and Tax,0.650000,1.67,"[business analyst, data analytics, data cleans..."
9542,,,"['Rajiv Gandhi Memorial University, Delhi']",['B.TECH'],['2020'],['N/A'],['N/A'],['Electrical'],['Zynta Labs'],['N/A'],...,[None],Asst. Manager/ Manger (Administrative),Bachelor/Honors,At least 5 years,Age at least 28 years,Administrative Support\nScheduling\nFiling & D...,•Administration\n•Health Safety and Environmen...,0.650000,0.59,"[machine learning, natural language processing..."


# Skills Required

In [22]:
def preprocess_skills_required_column(df):
    
    def process_skill_entry(entry):
        if pd.isna(entry) or entry == "":
            return []
        
        # Split by newlines and filter out empty strings
        skills = [s.strip() for s in entry.split('\n') if s.strip()]
        
        # Clean each skill
        cleaned_skills = []
        for skill in skills:
            # Remove bullet points and other unwanted characters
            skill = skill.replace('•', '').strip()
            if skill:  # Only add non-empty skills
                cleaned_skills.append(skill)
        
        # Remove duplicates while preserving order
        seen = set()
        unique_skills = [x for x in cleaned_skills if not (x in seen or seen.add(x))]
        
        return unique_skills
    
    # Apply the processing function to the skills_required column
    df['skills_required_list'] = df['skills_required'].apply(process_skill_entry)

    # Drop the original column
    df.drop(columns=['skills_required'], inplace=True)
    
    return df

df = preprocess_skills_required_column(df)

# Get the cosine similarity between Skills and Skills Required

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compute_skill_cosine_similarity(df):
    # 1. Convert lists of skills to space-separated strings
    df['merged_skills_str'] = df['merged_skills'].apply(lambda lst: " ".join(lst))
    df['skills_required_str'] = df['skills_required_list'].apply(lambda lst: " ".join(lst))

    # 2. Prepare and fit a TF-IDF vectorizer on all skill-texts
    vectorizer = TfidfVectorizer()
    all_skill_texts = df['merged_skills_str'].tolist() + df['skills_required_str'].tolist()
    vectorizer.fit(all_skill_texts)

    # 3. Transform each column of skill-strings into TF-IDF vectors
    tfidf_merged = vectorizer.transform(df['merged_skills_str'])
    tfidf_required = vectorizer.transform(df['skills_required_str'])

    # 4. Compute cosine similarity for each row
    similarity_scores = []
    for i in range(len(df)):
        vec1 = tfidf_merged[i]
        vec2 = tfidf_required[i]
        sim_score = cosine_similarity(vec1, vec2)[0][0]
        similarity_scores.append(sim_score)

    # Add the similarity scores as a new column in the DataFrame
    df['cosine_similarity'] = similarity_scores

    # Drop intermediate columns
    df = df.drop(['merged_skills_str', 'skills_required_str'], axis=1)

    return df

df = compute_skill_cosine_similarity(df)

# Cleaning the Training Dataset

In [28]:
df['highest_degree'] = df['degree_names'].apply(get_highest_education)
df['ed_req_encoded'] = df['educationaL_requirements'].apply(encode_ed_req)

df_cleaned = clean_major_fields(df, column_name='major_field_of_studies')
df['major_field_encoded'] = df_cleaned['major_field_of_studies']

df_cleaned = clean_experience_min_only(df_cleaned, column_name='experiencere_requirement')
df['exp_req_encoded'] = df_cleaned['experiencere_requirement']

In [29]:
df.columns

Index(['address', 'career_objective', 'educational_institution_name',
       'degree_names', 'passing_years', 'educational_results', 'result_types',
       'major_field_of_studies', 'professional_company_names', 'company_urls',
       'positions', 'locations', 'responsibilities',
       'extra_curricular_activity_types',
       'extra_curricular_organization_names',
       'extra_curricular_organization_links', 'role_positions', 'languages',
       'proficiency_levels', 'certification_providers', 'online_links',
       'issue_dates', 'expiry_dates', '﻿job_position_name',
       'educationaL_requirements', 'experiencere_requirement',
       'age_requirement', 'responsibilities.1', 'matched_score',
       'experience_years', 'merged_skills', 'skills_required_list',
       'cosine_similarity', 'highest_degree', 'ed_req_encoded',
       'major_field_encoded', 'exp_req_encoded'],
      dtype='object')

In [32]:
df = df.drop([
    'address', 'career_objective', 'educational_institution_name', 'degree_names', 'passing_years', 
    'educational_results', 'result_types', 'major_field_of_studies', 'professional_company_names', 
    'company_urls','positions', 'locations', 'responsibilities', 'extra_curricular_activity_types', 
    'extra_curricular_organization_names', 'extra_curricular_organization_links', 'role_positions', 
    'languages', 'proficiency_levels', 'certification_providers', 'online_links', 'issue_dates', 
    'expiry_dates', '﻿job_position_name', 'educationaL_requirements', 'experiencere_requirement', 
    'age_requirement', 'responsibilities.1', 'merged_skills', 'skills_required_list'
], axis=1)

In [33]:
df.head()

Unnamed: 0,matched_score,experience_years,merged_skills,skills_required_list,cosine_similarity,highest_degree,ed_req_encoded,major_field_encoded,exp_req_encoded
0,0.85,5.5,"[big data, hadoop, hive, python, mapreduce, sp...",[],0.0,4,4,[electronics engineering],1
1,0.75,5.66,"[data analysis, data analytics, business analy...",[],0.0,5,5,"[mathematics', 'statistics]",5
2,0.416667,6.92,"[software development, machine learning, deep ...","[Brand Promotion, Campaign Management, Field S...",0.0,4,5,"[electronics engineering, telecommunication]",3
3,0.76,13.83,"[accounts payables, accounts receivables, acco...","[Fast typing skill, IELTSInternet browsing & o...",0.0,5,4,"[computer science, applications]",1
4,0.65,17.33,"[analytical reasoning, compliance testing know...","[iOS, iOS App Developer, iOS Application Devel...",0.0,4,4,[accounting],4
