In [70]:
# Importing Dependencies
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt

# **Data Cleaning**
### **Data Ingestion**


In [71]:
# Read the two raw CSV inputs (paths are relative to this notebook's folder)
postings_csv = '../../../data/job_postings.csv'
skills_csv = '../../../data/job_skills.csv'
job_postings = pd.read_csv(postings_csv)
job_skills = pd.read_csv(skills_csv)

## **Title Classification (input for ticket 1.1)**
### **Target Jobs Classification Regex**
#### Input job_postings.csv dataframe with job_title column

In [72]:
# Target job-title patterns, keyed by job classification.
# Every pattern has the same shape: a case-insensitive group of alternative
# phrases followed by \w*[-\s]?, which lets a hit absorb a trailing word suffix
# (e.g. a plural "s") plus one hyphen or whitespace character.
def _title_pattern(*alternatives):
    # Join the alternatives into a single case-insensitive regex.
    return r"(?i)(" + "|".join(alternatives) + r")\w*[-\s]?"


# Dict order is preserved on purpose: it is the order in which a consumer
# iterating over .items() will try the classifications.
target_job_titles_regex = {
    "MLOps Engineer": _title_pattern(
        "MLOps",
        "Machine Learning Operations",
        "Machine Learning Infrastructure Engineer",
        "ML Infrastructure",
        "ML Platform",
        "ML Systems",
        "ML Platform Engineer",
        "AIML Ops Engineer",
        "Machine Learning Software Developer",
    ),
    "Machine Learning Engineer": _title_pattern(
        "Machine Learning Engineer",
        "ML Engineer",
        "Machine Learning Engineering",
        "ML Developer",
        "Machine Learning Software Engineer",
        "AIML Engineer",
        "AIML Data Scientist",
        "AI Data Science Lead",
    ),
    "Data Architect": _title_pattern(
        "Data Architect",
        "Senior Data Architect",
        "Cloud Data Architect",
        "Big Data Architect",
        "Enterprise Data Architect",
        "Principal Data Architect",
        "Lead Data Architect",
        "Data Warehouse Architect",
        "Data Architecture",
        "Data Lake Architect",
        "Data Streaming Architect",
    ),
    "Database Engineer / Administrator": _title_pattern(
        "Database",
        "Database Architect",
        r"DBA\b",
        "Cloud Database",
        "Azure Database",
        "AWS Database",
        "Databases",
        "GCP Database",
        "Oracle Database Engineer",
    ),
    "Data Engineer": _title_pattern(
        "Data Engineer",
        "Senior Data Engineer",
        "Lead Data Engineer",
        "Big Data Engineer",
        "Data Engineering",
        "Data Engineering Manager",
        "Data Engineering Architect",
        "Data Pipeline Engineer",
        "Big Data Developer",
        "Data Engineers",
        "Data Integrations",
        "Data Infrastructure",
        "ETL Developer",
    ),
    "Data Governance & Security": _title_pattern(
        "Data Governance",
        "Data Privacy",
        "Data Steward",
        "Data Protection",
        "Data Security",
        "Master Data Management",
        "Data Governance Manager",
        "Data Compliance",
        "Data Lifecycle Manager",
    ),
    "Data Operations & Management": _title_pattern(
        "Data Manager",
        "Enterprise Data Manager",
        "Data Operations",
        "Data Operations Manager",
        "Data Operations Analyst",
        "Data Management Engineer",
        "Data Strategy Manager",
        "Data Solution Architect",
        "Data Deployment",
        "Data Conversion",
        "Data Replication Engineer",
        "DevOps Engineer",
        "Distributed Systems",
        "Storage",
    ),
    "Data Modeling & Warehousing": _title_pattern(
        "Data Modeling",
        "Data Warehouse",
        "Big Data Developer",
        "Data Warehouse Architect",
        "Cloud Datawarehouse",
        "Data Platform Developer",
    ),
    "Data Specialist": _title_pattern(
        "Data Specialist",
        "Data Processing",
        "Data Consultant",
        "Data Quality Manager",
        "Data Coordinator",
        "Data Entry Specialist",
    ),
    "Data Scientist": _title_pattern(
        "Data Scientist",
        "Data Scientists",
        "Data Science Engineer",
        "Data Science Manager",
        "Data Science Analyst",
        "Data Science Practitioner",
        "Customer Data Scientist",
    ),
    "Data Analyst": _title_pattern(
        "Data Analyst",
        "Data Analysts",
        "Financial Data Analyst",
        "Business Intelligence",
        "BI Analyst",
        "Data Business Analyst",
        "Data Insights Analyst",
    ),
    "Software & Platform Engineering": _title_pattern(
        "Software Engineer",
        "Software Engineering",
        "Software Developer",
        "Software Engineer Data Science",
        "Software Engineer Data Platforms",
        "Platform Engineer",
        "Application Developer",
        "Backend Engineer",
        "Systems Developer",
    ),
    "Cloud & Infrastructure Engineering": _title_pattern(
        "Cloud Data",
        "Cloud Data Architect",
        "Azure Data",
        "AWS Data",
        "Azure Databricks",
        "AWS Databricks",
        "Cloud Engineer",
        "Cloud Platform Engineer",
        "Infrastructure Engineer",
        "Datacenter Technician",
        "Datacenter Engineer",
        "Datacenter Network Engineer",
        "Datacenter Engineering",
        "Site Reliability Engineer",
        "SRE",
    ),
    "Risk & Compliance Analytics": _title_pattern(
        "Risk Analyst",
        r"AML\b",
        "BSA",
        "Risk Modeling",
        "Financial Analyst",
        "Hedge Fund",
        "Data Loss Prevention",
        "DLP",
    ),
}


### **Classification Function**

In [73]:
# Function to Classify Job Titles
def classify(job_title, keywords_list=None):
    """Return the first category whose regex matches ``job_title``.

    Parameters
    ----------
    job_title : object
        Title to classify; it is converted with ``str()`` before matching.
    keywords_list : dict, optional
        Mapping of category label -> regex pattern, tried in insertion order.
        When omitted, the module-level ``target_job_titles_regex`` is looked
        up at call time, so re-running the cell that defines the patterns
        takes effect without having to re-define this function.

    Returns
    -------
    tuple of (str, str)
        ``(category, keyword)``, where ``keyword`` is the matched text with
        every character other than ASCII letters and whitespace removed, then
        stripped and title-cased. ``("unclassified", "unclassified")`` is
        returned when no pattern matches.
    """
    if keywords_list is None:
        # Late binding: avoids freezing whichever dict existed when the
        # function was defined.
        keywords_list = target_job_titles_regex

    title_text = str(job_title)
    for category, pattern in keywords_list.items():
        match = re.search(pattern, title_text)
        if match:
            # Report the text that actually matched, not the regex pattern.
            matched_keyword = re.sub(r'[^a-zA-Z\s]', '', match.group()).strip().title()
            return category, matched_keyword
    return "unclassified", "unclassified"


In [74]:
# Work on a copy so the raw postings frame stays untouched, then classify every title
classified_job_titles = job_postings.copy()
title_matches = classified_job_titles['job_title'].apply(classify)
# Each element is a (classification, keyword) pair; split the pairs into two columns
title_labels, title_keywords = zip(*title_matches)
classified_job_titles['job_classification'] = title_labels
classified_job_titles['job_keyword'] = title_keywords

### **Seniority Level Classification Regex**

In [75]:
# Seniority-level patterns, keyed by seniority label.
# Every pattern has the same shape: a case-insensitive group of alternatives
# followed by \w*[-\s]?, which lets a hit absorb a trailing word suffix plus
# one hyphen or whitespace character.
def _seniority_pattern(*alternatives):
    # Join the alternatives into a single case-insensitive regex.
    return r"(?i)(" + "|".join(alternatives) + r")\w*[-\s]?"


# Insertion order is the matching priority: classify() returns on the first
# pattern that hits, so a tier listed earlier shadows every tier below it.
seniority_levels_regex = {
    # Principal / staff tier - tried first
    "Principal / Staff-Level": _seniority_pattern(
        "Principal",
        "Staff",
        r"Sr[-\s]?Staff",
        "Distinguished",
        "Fellow",
        "Master",
        "L4",
        "Level 4",
        r"Chief[-\s]?Architect",
        r"Chief[-\s]?Scientist",
    ),
    # Lead / supervisor tier - tried before Senior
    "Lead": _seniority_pattern(
        "Lead",
        r"Tech[-\s]?Lead",
        r"Team[-\s]?Lead",
        "Supervisor",
        r"Group[-\s]?Lead",
        r"Project[-\s]?Lead",
        r"Engineering[-\s]?Lead",
        r"Squad[-\s]?Lead",
        r"Chapter[-\s]?Lead",
        "Manager",
        r"Head[-\s]?of[-\s]?Team",
    ),
    # Senior tier - tried before Mid-Level
    "Senior-Level": _seniority_pattern(
        "Senior",
        r"Sr\.?",
        "SNR",
        "SEN",
        "L3",
        "Level 3",
        "Expert",
        "Specialist",
        "Advanced",
        "Seasoned",
        "Experienced",
    ),
    # Mid tier - tried before Junior
    "Mid-Level": _seniority_pattern(
        r"Mid[-\s]?Level",
        "Intermediate",
        "Mid",
        "L2",
        "Level 2",
        "Professional",
        "Regular",
    ),
    # Entry / junior tier - tried after the Principal, Lead, Senior and Mid tiers
    "Entry-Level / Junior": _seniority_pattern(
        "Junior",
        r"Jr\.?",
        r"Entry[-\s]?Level",
        "Associate",
        "Graduate",
        "Trainee",
        "Fresher",
        "New Grad",
        r"Early[-\s]?Career",
        "L1",
        "Level 1",
    ),
    # Intern tier.
    # NOTE(review): "Trainee" is also listed under "Entry-Level / Junior",
    # which is tried first, so a "Trainee" hit never resolves to "Intern".
    "Intern": _seniority_pattern(
        "Intern",
        "Internship",
        r"Co[-\s]?Op",
        "Apprentice",
        "Trainee",
    ),
    # Director / executive tier - tried last, so it only applies when no tier
    # above matched (a title containing both "Director" and "Manager"
    # resolves to "Lead").
    "Director / Executive": _seniority_pattern(
        "Director",
        "Head",
        "VP",
        r"Vice[-\s]?President",
        "CIO",
        "CTO",
        "CISO",
        "CEO",
        "Chief",
        "Executive",
        "C[-]?Level",
        r"Managing[-\s]?Director",
        r"Global[-\s]?Head",
        "President",
        "Founder",
        "Partner",
    ),
}


### **Seniority Level Classification Function**

In [76]:
# Single-argument wrapper so the seniority patterns can be used with Series.apply
def classify_seniority_level(job_title):
    """Classify ``job_title`` against ``seniority_levels_regex``.

    Returns whatever ``classify`` returns for that pattern table.
    """
    seniority_result = classify(job_title, keywords_list=seniority_levels_regex)
    return seniority_result

In [77]:
# Run the seniority classifier over every title on the copied dataframe
seniority_matches = classified_job_titles['job_title'].apply(classify_seniority_level)
# Each element is a (level, keyword) pair; split the pairs into two columns
seniority_labels, seniority_keywords = zip(*seniority_matches)
classified_job_titles['seniority_level'] = seniority_labels
classified_job_titles['seniority_level_keyword'] = seniority_keywords

# Check the results
# classified_job_titles.head()

### **Classify the Unclassified Seniority Titles**

In [78]:
# Fallback for titles that carry no explicit seniority word: infer a level
# from the job keyword instead. Rows whose keyword is not listed below stay
# 'unclassified'.
#
# Tiers are listed in the order they are tested; if a keyword were ever listed
# under two tiers, the earlier tier wins.
seniority_fallback_tiers = [
    ("Entry-Level / Junior", [
        'Data Analyst', 'Data Security', 'Database', 'Cloud Engineer',
        'Financial Data Analyst', 'Bsa', 'Machine Learning Engineer', 'Data Processing',
        'Backend Engineer', 'Ml Engineering', 'Data Governance', 'Big Data Engineer',
        'Aml', 'Data Privacy', 'Data Business Analyst', 'Data Engineers',
        'Data Engineer', 'Infrastructure Engineer', 'Datacenter Technician',
        'Data Operations', 'Data Science Engineer', 'Data Consultant',
        'Software Developer', 'Data Science Analyst', 'Bi Analyst',
        'Ml Developer', 'Ml Engineer', 'Datacenter Engineer', 'Platform Engineer',
        'Cloud Data', 'Etl Developer', 'Dba', 'Databases',
        'Financial Analyst', 'Devops Engineer', 'Data Insights Analyst',
        'Risk Analyst', 'Data Analysts', 'Cloud Database',
        'Site Reliability Engineer', 'Data Analystat', 'Data Pipeline Engineer',
        'Big Data Engineering',
    ]),
    ("Mid-Level", [
        # NOTE(review): 'MLOps Engineer' is kept from the original list, but
        # classify() title-cases its keyword, so this spelling looks
        # unreachable ('Mlops' below is the title-cased form) - confirm.
        'Data Scientist', 'Data Engineering', 'MLOps Engineer',
        'Business Intelligence', 'Data Coordinator', 'Data Steward',
        'Machine Learning Infrastructure Engineer', 'Machine Learning Software Developer', 'Software Engineer',
        'Customer Data Scientist', 'Data Warehouse Architect', 'Ml Systems',
        'Data Compliance', 'Big Data Architect', 'Aws Databricks',
        'Big Data Developer', 'Azure Data', 'Data Replication Engineer',
        'Data Science Practitioner', 'Data Integrations', 'Data Modeling',
        'Machine Learning Operations', 'Mlops', 'Data Loss Prevention',
        'Ml Infrastructure', 'Machine Learning Software Engineer', 'Data Deployment',
        'Data Architecture', 'Datacenter Network Engineer', 'Azure Databricks',
        'Data Stewardship', 'Ml Platform', 'Data Conversion',
        'Data Management Engineer',
    ]),
    ("Senior-Level", [
        'Data Architect', 'Data Warehouse', 'Cloud Data Architect',
        'Data Protection', 'Data Lake Architect', 'Enterprise Data Architect',
        'Data Solution Architect', 'Data Streaming Architect',
    ]),
]

# Flatten the tiers into a keyword -> level lookup (first listing wins).
keyword_to_seniority = {}
for tier_label, tier_keywords in seniority_fallback_tiers:
    for tier_keyword in tier_keywords:
        keyword_to_seniority.setdefault(tier_keyword, tier_label)

# Vectorised replacement for the former row-by-row loop: look up a fallback
# level for every row, then apply it only where the seniority is still
# 'unclassified' and a fallback actually exists.
current_seniority = classified_job_titles['seniority_level']
fallback_seniority = classified_job_titles['job_keyword'].map(keyword_to_seniority)
needs_fallback = current_seniority.eq('unclassified') & fallback_seniority.notna()
classified_job_titles['seniority_level'] = current_seniority.mask(needs_fallback, fallback_seniority)



### **Check Group**

In [79]:
# Job classification label to spot-check; change it to inspect another group.
job_check = 'Cloud & Infrastructure Engineering'
# Uncomment to preview the first rows classified under `job_check`:
# pd.DataFrame(classified_job_titles[classified_job_titles['job_classification'] == job_check].head())

### **Drop unclassified rows**
#### Title Classification Section - Outputs the Classified DataFrame with Added Job Classification & Seniority Level Columns


In [80]:
# Keep only the postings whose title matched one of the target job classifications.
# .copy() makes the filtered result an independent frame, so assigning columns
# to it later cannot trigger pandas' SettingWithCopyWarning.
has_job_classification = classified_job_titles['job_classification'] != 'unclassified'
classified_job_titles = classified_job_titles[has_job_classification].copy()
# The equivalent filter on seniority_level is currently disabled:
# classified_job_titles = classified_job_titles[classified_job_titles['seniority_level'] != 'unclassified']

# check results
classified_job_titles

Unnamed: 0,job_link,last_processed_time,last_status,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type,job_classification,job_keyword,seniority_level,seniority_level_keyword
0,https://www.linkedin.com/jobs/view/senior-mach...,2024-01-21 08:08:48.031964+00,Finished NER,t,t,f,Senior Machine Learning Engineer,Jobs for Humanity,"New Haven, CT",2024-01-14,East Haven,United States,Agricultural-Research Engineer,Mid senior,Onsite,Machine Learning Engineer,Machine Learning Engineer,Senior-Level,Senior
1,https://www.linkedin.com/jobs/view/principal-s...,2024-01-20 04:02:12.331406+00,Finished NER,t,t,f,"Principal Software Engineer, ML Accelerators",Aurora,"San Francisco, CA",2024-01-14,El Cerrito,United States,Set-Key Driver,Mid senior,Onsite,Software & Platform Engineering,Software Engineer,Principal / Staff-Level,Principal
2,https://www.linkedin.com/jobs/view/senior-etl-...,2024-01-21 08:08:31.941595+00,Finished NER,t,t,f,Senior ETL Data Warehouse Specialist,Adame Services LLC,"New York, NY",2024-01-14,Middletown,United States,Technical Support Specialist,Associate,Onsite,Data Modeling & Warehousing,Data Warehouse,Senior-Level,Senior
3,https://www.linkedin.com/jobs/view/senior-data...,2024-01-20 15:30:55.796572+00,Finished NER,t,t,f,Senior Data Warehouse Developer / Architect,Morph Enterprise,"Harrisburg, PA",2024-01-12,Lebanon,United States,Architect,Mid senior,Onsite,Data Modeling & Warehousing,Data Warehouse,Senior-Level,Senior
4,https://www.linkedin.com/jobs/view/lead-data-e...,2024-01-21 08:08:58.312124+00,Finished NER,t,t,f,Lead Data Engineer,Dice,"Plano, TX",2024-01-14,McKinney,United States,Maintenance Data Analyst,Mid senior,Onsite,Data Engineer,Lead Data Engineer,Lead,Lead
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12209,https://www.linkedin.com/jobs/view/data-archit...,2024-01-21 08:08:07.523737+00,Finished NER,t,t,f,Data Architect,General Dynamics Information Technology,"St Louis, MO",2024-01-14,Collinsville,United States,Interior Designer,Mid senior,Onsite,Data Architect,Data Architect,Senior-Level,unclassified
12211,https://ca.linkedin.com/jobs/view/senior-data-...,2024-01-20 05:10:03.58781+00,Finished NER,t,t,f,Senior Data Insights Analyst,CARFAX Canada,"London, Ontario, Canada",2024-01-14,London,Canada,Data Entry Clerk,Mid senior,Onsite,Data Analyst,Data Insights Analyst,Senior-Level,Senior
12213,https://www.linkedin.com/jobs/view/corporate-a...,2024-01-19 15:10:41.177008+00,Finished NER,t,t,f,Corporate AML Alert Investigation Specialist,"Glacier Bancorp, Inc.","Kalispell, MT",2024-01-14,Montana,United States,Teller,Mid senior,Onsite,Risk & Compliance Analytics,Aml,Senior-Level,Specialist
12214,https://www.linkedin.com/jobs/view/senior-data...,2024-01-20 15:20:19.036168+00,Finished NER,t,t,f,Senior Data Scientist,Highnote,"San Francisco, CA",2024-01-16,San Rafael,United States,Mathematician,Mid senior,Onsite,Data Scientist,Data Scientist,Senior-Level,Senior


In [81]:
# List the rows whose seniority is still unresolved, limited to the review columns
seniority_is_unclassified = classified_job_titles['seniority_level'] == 'unclassified'
unclassified_seniority = classified_job_titles[seniority_is_unclassified]
review_columns = [
    'job_title',
    'job_classification',
    'job_keyword',
    'seniority_level',
    'seniority_level_keyword',
]
unclassified_seniority_df = pd.DataFrame(unclassified_seniority[review_columns])
unclassified_seniority_df

Unnamed: 0,job_title,job_classification,job_keyword,seniority_level,seniority_level_keyword


### **Create job_classification and job_keyword ids**

In [82]:
# Assign integer ids to every distinct job classification and job keyword.
# The 'job_classification' / 'job_keyword' columns were already populated when
# the titles were first classified, so they are reused here rather than
# recomputed over every row.

# Drop ids left over from a previous run of this cell; without this, running
# the cell twice would merge onto existing id columns and yield suffixed
# duplicates (title_id_x / title_id_y, keyword_id_x / keyword_id_y).
titles_without_ids = classified_job_titles.drop(columns=['title_id', 'keyword_id'], errors='ignore')

# Get unique job classifications and assign title_ids (1-based, in order of first appearance)
unique_classifications = titles_without_ids[['job_classification']].drop_duplicates().reset_index(drop=True)
unique_classifications['title_id'] = range(1, len(unique_classifications) + 1)

# Get unique job keywords and assign keyword_ids (1-based, in order of first appearance)
unique_keywords = titles_without_ids[['job_keyword']].drop_duplicates().reset_index(drop=True)
unique_keywords['keyword_id'] = range(1, len(unique_keywords) + 1)

# Attach both ids to the postings; the left joins keep every posting row
classified_job_titles = (
    titles_without_ids
    .merge(unique_classifications, on='job_classification', how='left')
    .merge(unique_keywords, on='job_keyword', how='left')
)

# Check the results
classified_job_titles.head(10)

Unnamed: 0,job_link,last_processed_time,last_status,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,...,search_country,search_position,job_level,job_type,job_classification,job_keyword,seniority_level,seniority_level_keyword,title_id,keyword_id
0,https://www.linkedin.com/jobs/view/senior-mach...,2024-01-21 08:08:48.031964+00,Finished NER,t,t,f,Senior Machine Learning Engineer,Jobs for Humanity,"New Haven, CT",2024-01-14,...,United States,Agricultural-Research Engineer,Mid senior,Onsite,Machine Learning Engineer,Machine Learning Engineer,Senior-Level,Senior,1,1
1,https://www.linkedin.com/jobs/view/principal-s...,2024-01-20 04:02:12.331406+00,Finished NER,t,t,f,"Principal Software Engineer, ML Accelerators",Aurora,"San Francisco, CA",2024-01-14,...,United States,Set-Key Driver,Mid senior,Onsite,Software & Platform Engineering,Software Engineer,Principal / Staff-Level,Principal,2,2
2,https://www.linkedin.com/jobs/view/senior-etl-...,2024-01-21 08:08:31.941595+00,Finished NER,t,t,f,Senior ETL Data Warehouse Specialist,Adame Services LLC,"New York, NY",2024-01-14,...,United States,Technical Support Specialist,Associate,Onsite,Data Modeling & Warehousing,Data Warehouse,Senior-Level,Senior,3,3
3,https://www.linkedin.com/jobs/view/senior-data...,2024-01-20 15:30:55.796572+00,Finished NER,t,t,f,Senior Data Warehouse Developer / Architect,Morph Enterprise,"Harrisburg, PA",2024-01-12,...,United States,Architect,Mid senior,Onsite,Data Modeling & Warehousing,Data Warehouse,Senior-Level,Senior,3,3
4,https://www.linkedin.com/jobs/view/lead-data-e...,2024-01-21 08:08:58.312124+00,Finished NER,t,t,f,Lead Data Engineer,Dice,"Plano, TX",2024-01-14,...,United States,Maintenance Data Analyst,Mid senior,Onsite,Data Engineer,Lead Data Engineer,Lead,Lead,4,4
5,https://www.linkedin.com/jobs/view/senior-data...,2024-01-21 07:14:11.378097+00,Finished NER,t,t,f,Senior Data Engineer,University of Chicago,"Chicago, IL",2024-01-14,...,United States,Data Base Administrator,Mid senior,Onsite,Data Engineer,Senior Data Engineer,Senior-Level,Senior,4,5
6,https://www.linkedin.com/jobs/view/principal-a...,2024-01-21 07:39:58.478064+00,Finished NER,t,t,f,"Principal Associate, Data Loss Prevention (DLP...",Jobs for Humanity,"Scranton, PA",2024-01-14,...,United States,Architect,Mid senior,Onsite,Risk & Compliance Analytics,Data Loss Prevention,Principal / Staff-Level,Principal,5,6
7,https://www.linkedin.com/jobs/view/senior-fina...,2024-01-21 07:14:50.991803+00,Finished NER,t,t,f,Senior Financial Data Analyst,The Walt Disney Company,"Lake Buena Vista, FL",2024-01-15,...,United States,Budget Officer,Mid senior,Onsite,Data Analyst,Financial Data Analyst,Senior-Level,Senior,6,7
8,https://www.linkedin.com/jobs/view/machine-lea...,2024-01-21 07:40:40.017291+00,Finished NER,t,t,f,Machine Learning Infrastructure Engineer,L&T Technology Services,"Sunnyvale, CA",2024-01-14,...,United States,Test Fixture Designer,Mid senior,Onsite,MLOps Engineer,Machine Learning Infrastructure Engineer,Mid-Level,unclassified,7,8
9,https://www.linkedin.com/jobs/view/sr-workforc...,2024-01-21 07:15:01.601597+00,Finished NER,t,t,f,Sr. Workforce Management Data Analyst,"Wisemen Multimedia, LLC","Atlanta, GA",2024-01-13,...,United States,Management Analyst,Mid senior,Onsite,Data Analyst,Data Analyst,Senior-Level,Sr,6,9
