In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
ds_df_posts = pd.read_csv('data/job_postings.csv')
ds_df_skills = pd.read_csv('data/job_skills.csv')
ds_df_summary = pd.read_csv('data/job_summary.csv')
ml_df = pd.read_csv('data/1000_ml_jobs_us.csv')

In [4]:
df = ds_df_posts.merge(ds_df_skills, on = 'job_link').merge(ds_df_summary, on = 'job_link')

In [5]:
df.head()

Unnamed: 0,job_link,last_processed_time,last_status,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type,job_skills,job_summary
0,https://www.linkedin.com/jobs/view/senior-mach...,2024-01-21 08:08:48.031964+00,Finished NER,t,t,f,Senior Machine Learning Engineer,Jobs for Humanity,"New Haven, CT",2024-01-14,East Haven,United States,Agricultural-Research Engineer,Mid senior,Onsite,"Machine Learning, Programming, Python, Scala, ...",Company Description\nJobs for Humanity is part...
1,https://www.linkedin.com/jobs/view/principal-s...,2024-01-20 04:02:12.331406+00,Finished NER,t,t,f,"Principal Software Engineer, ML Accelerators",Aurora,"San Francisco, CA",2024-01-14,El Cerrito,United States,Set-Key Driver,Mid senior,Onsite,"C++, Python, PyTorch, TensorFlow, MXNet, CUDA,...",Who We Are\nAurora (Nasdaq: AUR) is delivering...
2,https://www.linkedin.com/jobs/view/senior-etl-...,2024-01-21 08:08:31.941595+00,Finished NER,t,t,f,Senior ETL Data Warehouse Specialist,Adame Services LLC,"New York, NY",2024-01-14,Middletown,United States,Technical Support Specialist,Associate,Onsite,"ETL, Data Integration, Data Transformation, Da...","Location: New York City, NY\nPosition Summary\..."
3,https://www.linkedin.com/jobs/view/senior-data...,2024-01-20 15:30:55.796572+00,Finished NER,t,t,f,Senior Data Warehouse Developer / Architect,Morph Enterprise,"Harrisburg, PA",2024-01-12,Lebanon,United States,Architect,Mid senior,Onsite,"Data Lakes, Data Bricks, Azure Data Factory Pi...",Responsibilities:\nCandidate must have signifi...
4,https://www.linkedin.com/jobs/view/lead-data-e...,2024-01-21 08:08:58.312124+00,Finished NER,t,t,f,Lead Data Engineer,Dice,"Plano, TX",2024-01-14,McKinney,United States,Maintenance Data Analyst,Mid senior,Onsite,"Java, Scala, Python, RDBMS, NoSQL, Redshift, S...",Dice is the leading career destination for tec...


In [6]:
df['job_title'].value_counts()

job_title
Senior Data Engineer                                       285
Senior Data Analyst                                        163
Data Engineer                                              149
Senior MLOps Engineer                                      138
Data Analyst                                               137
                                                          ... 
Medical Scientist/MLT - Cranberry (FT and PT available)      1
IMLS Graduate Fellow in Objects Conservation                 1
SCHOOL DATA MANAGER II 2023-2024 EOY                         1
Sr Systems Administrator (Data Management)                   1
Medical Technologist, MLS or MLT                             1
Name: count, Length: 6484, dtype: int64

In [7]:
ml_df.head()

Unnamed: 0.1,Unnamed: 0,job_posted_date,company_address_locality,company_address_region,company_name,company_website,company_description,job_description_text,seniority_level,job_title
0,0,2024-10-31,Indianapolis,Indiana,Upper Hand,https://upperhand.com,Upper Hand is the leading provider of full-sui...,OverviewUpper Hand is embarking on an exciting...,Internship,Internship - Machine Learning Engineer & Data ...
1,1,2025-03-14,San Francisco,California,Ikigai,https://www.ikigailabs.io,"Built upon years of MIT research, Ikigai is a ...",Company DescriptionThe Ikigai platform unlocks...,Mid-Senior level,Machine Learning Engineer
2,2,2025-04-09,San Jose,CA,Adobe,http://www.adobe.com,Adobe is the global leader in digital media an...,Our CompanyChanging the world through digital ...,Entry level,Machine Learning Engineer
3,3,2025-03-22,Mountain View,California,Waymo,https://waymo.com/careers/,On the journey to be the world's most trusted ...,Waymo is an autonomous driving technology comp...,Entry level,"Machine Learning Engineer, Training"
4,4,2025-03-28,Boston,Massachusetts,HMH,http://www.hmhco.com,We are an adaptive learning company that empow...,Job Title: Machine Learning EngineerLocation: ...,Mid-Senior level,Machine Learning Engineer


In [8]:
ml_df['job_title'].value_counts()

job_title
Machine Learning Engineer                                        243
Data Scientist                                                    53
Software Engineer, Machine Learning                               30
Senior Machine Learning Engineer                                  22
Software Engineer, Machine Learning (Multiple Levels) - Slack      9
                                                                ... 
Machine Learning Engineer, GenAI Platform                          1
Machine Learning Engineer - NLP, TikTok Business Integrity         1
Lead Machine Learning Engineer - Python                            1
Mid-Career Machine Learning Engineer - Recommendation Systems      1
Robotics Machine Learning Engineer                                 1
Name: count, Length: 450, dtype: int64

In [9]:
def clean_job_titles(df, column='job_title'):
    """
    Standardizes job titles into broader categories.
    Removes prefixes (Senior, Lead, Manager, etc.)
    Cleans leftover commas/spaces.
    Maps common variants into core roles.
    Everything else → "Other".
    """

    df[column] = df[column].str.lower()

    # Remove common prefixes
    df[column] = df[column].str.replace(
        r'\b(senior|sr\.|lead|staff|principal|junior|jr\.|entry-level|intern|associate|manager|director|distinguished)\b',
        '',
        regex=True
    )

    # Remove leftover commas and extra whitespace
    df[column] = df[column].str.replace(r'^[,\s]+', '', regex=True).str.strip()

    conditions = [
        # ML/AI Engineers
        df[column].str.contains(
            r'\b(machine learning|mlops|ml engineer|mls engineer|ai/ml|ai software engineer)\b',
            regex=True
        ),

        # Data Scientists
        df[column].str.contains(
            r'\b(data scientist|applied scientist|data science)\b',
            regex=True
        ),

        # Software Engineers
        df[column].str.contains(
            r'\b(software engineer|software developer|swe)\b',
            regex=True
        ),

        # Data Engineers (catch wide variants: architect, infra, db, big data, cloud, azure, etc.)
        df[column].str.contains(
            r'\b(data engineer|data engineering|big data engineer|azure data engineer|python data engineer|'
            r'data infrastructure|data architecture|data architect|enterprise data architect|solutions architect - data|'
            r'database( engineer| administrator| developer)?|data center engineer|datacenter technician|data cabling engineer|'
            r'ontology|data modeling|data steward|data management|data governance|card tech data|platform engineer|teradata architect)\b',
            regex=True
        ),

        # Data Analysts
        df[column].str.contains(
            r'\b(data analyst|data analytics|data analysis|data reporting analyst|data quality analyst|data quality engineer'
            r'analytics specialist|bi analyst|business intelligence|analytics engineer|data entry|data modeler|'
            r'research analysis|data collector|^data$|clinical data|data & analytics|data security analyst)\b',
            regex=True
        ),
    ]

    choices = [
        'Machine Learning Engineer',
        'Data Scientist',
        'Software Engineer',
        'Data Engineer',
        'Data Analyst'
    ]
    return np.select(conditions, choices, default='Other')

In [10]:
ml_df['job_title'] = clean_job_titles(ml_df)

  df[column].str.contains(
  df[column].str.contains(
  df[column].str.contains(
  df[column].str.contains(
  df[column].str.contains(


In [11]:
ml_df['job_title'].value_counts()

job_title
Machine Learning Engineer    762
Data Scientist               125
Software Engineer             99
Data Engineer                  6
Other                          4
Data Analyst                   1
Name: count, dtype: int64

In [12]:
df['job_title'] = clean_job_titles(df)

  df[column].str.contains(
  df[column].str.contains(
  df[column].str.contains(
  df[column].str.contains(
  df[column].str.contains(


In [13]:
df['job_title'].value_counts().head(40)

job_title
Other                        3961
Data Engineer                3439
Data Analyst                 2556
Machine Learning Engineer    1092
Data Scientist                993
Software Engineer             176
Name: count, dtype: int64

In [14]:
df

Unnamed: 0,job_link,last_processed_time,last_status,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type,job_skills,job_summary
0,https://www.linkedin.com/jobs/view/senior-mach...,2024-01-21 08:08:48.031964+00,Finished NER,t,t,f,Machine Learning Engineer,Jobs for Humanity,"New Haven, CT",2024-01-14,East Haven,United States,Agricultural-Research Engineer,Mid senior,Onsite,"Machine Learning, Programming, Python, Scala, ...",Company Description\nJobs for Humanity is part...
1,https://www.linkedin.com/jobs/view/principal-s...,2024-01-20 04:02:12.331406+00,Finished NER,t,t,f,Software Engineer,Aurora,"San Francisco, CA",2024-01-14,El Cerrito,United States,Set-Key Driver,Mid senior,Onsite,"C++, Python, PyTorch, TensorFlow, MXNet, CUDA,...",Who We Are\nAurora (Nasdaq: AUR) is delivering...
2,https://www.linkedin.com/jobs/view/senior-etl-...,2024-01-21 08:08:31.941595+00,Finished NER,t,t,f,Other,Adame Services LLC,"New York, NY",2024-01-14,Middletown,United States,Technical Support Specialist,Associate,Onsite,"ETL, Data Integration, Data Transformation, Da...","Location: New York City, NY\nPosition Summary\..."
3,https://www.linkedin.com/jobs/view/senior-data...,2024-01-20 15:30:55.796572+00,Finished NER,t,t,f,Other,Morph Enterprise,"Harrisburg, PA",2024-01-12,Lebanon,United States,Architect,Mid senior,Onsite,"Data Lakes, Data Bricks, Azure Data Factory Pi...",Responsibilities:\nCandidate must have signifi...
4,https://www.linkedin.com/jobs/view/lead-data-e...,2024-01-21 08:08:58.312124+00,Finished NER,t,t,f,Data Engineer,Dice,"Plano, TX",2024-01-14,McKinney,United States,Maintenance Data Analyst,Mid senior,Onsite,"Java, Scala, Python, RDBMS, NoSQL, Redshift, S...",Dice is the leading career destination for tec...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12212,https://uk.linkedin.com/jobs/view/data-reporti...,2024-01-21 07:11:22.099082+00,Finished NER,t,t,f,Other,Guardian Jobs,"Wembley, England, United Kingdom",2024-01-16,High Wycombe,United Kingdom,Manager Forms Analysis,Mid senior,Onsite,"Dashboard development, Reporting, Power BI, SQ...","FOOTBALL ASSOCIATION\nMake an impact, on and o..."
12213,https://www.linkedin.com/jobs/view/corporate-a...,2024-01-19 15:10:41.177008+00,Finished NER,t,t,f,Other,"Glacier Bancorp, Inc.","Kalispell, MT",2024-01-14,Montana,United States,Teller,Mid senior,Onsite,"Investigation, Antimoney laundering, Fraud, Ba...",About The Role\nPlease note: review of applica...
12214,https://www.linkedin.com/jobs/view/senior-data...,2024-01-20 15:20:19.036168+00,Finished NER,t,t,f,Data Scientist,Highnote,"San Francisco, CA",2024-01-16,San Rafael,United States,Mathematician,Mid senior,Onsite,"Data Science, Quantitative Modeling, SQL, Data...",About Highnote\nFounded in 2020 by a team of l...
12215,https://www.linkedin.com/jobs/view/senior-data...,2024-01-19 23:25:28.107523+00,Finished NER,t,t,f,Data Engineer,CompSource Mutual Insurance Company,"Oklahoma City, OK",2024-01-16,Arcadia,United States,Protection Engineer,Mid senior,Onsite,"Data Engineering, Data Quality, SQL, Python, T...",Are you an experienced data engineer in Oklaho...


In [15]:
ml_df = ml_df[['job_title', 'job_description_text', 'seniority_level']]
df = df[['job_title', 'job_level', 'job_summary']]

ml_df = ml_df.rename(columns={"job_description_text": "job_text",
                              "seniority_level": "job_level_final"})

df = df.rename(columns={"job_summary": "job_text",
                        "job_level": "job_level_final"})

final_df = pd.concat([ml_df, df], axis=0, ignore_index=True)
# keep only the final 3
final_df = pd.concat([ml_df, df], axis=0, ignore_index=True)


In [16]:
def drop_rows(df):
    df = df[df['job_title'] != 'Other']
    return df

In [17]:
final_df = drop_rows(final_df)

In [18]:
final_df['job_level_final'].value_counts()

job_level_final
Mid senior          7349
Associate            939
Mid-Senior level     368
Entry level          300
Not Applicable       208
Internship            70
Director               5
Executive              1
Name: count, dtype: int64

In [19]:
final_df['job_title'].value_counts()

job_title
Data Engineer                3445
Data Analyst                 2557
Machine Learning Engineer    1854
Data Scientist               1118
Software Engineer             275
Name: count, dtype: int64

In [20]:
final_df

Unnamed: 0,job_title,job_text,job_level_final
0,Machine Learning Engineer,OverviewUpper Hand is embarking on an exciting...,Internship
1,Machine Learning Engineer,Company DescriptionThe Ikigai platform unlocks...,Mid-Senior level
2,Machine Learning Engineer,Our CompanyChanging the world through digital ...,Entry level
3,Machine Learning Engineer,Waymo is an autonomous driving technology comp...,Entry level
4,Machine Learning Engineer,Job Title: Machine Learning EngineerLocation: ...,Mid-Senior level
...,...,...,...
13204,Software Engineer,Description\nAs a lead team member of the Data...,Associate
13206,Data Engineer,Job Description:\nType of Requisition:\nRegula...,Mid senior
13207,Data Analyst,"Benefits\n: (Medical, Dental, and Vision cover...",Associate
13211,Data Scientist,About Highnote\nFounded in 2020 by a team of l...,Mid senior


In [21]:
final_df.to_csv('jobs_listing.csv')