In [1]:
import pandas as pd
import numpy as np
import re

df = pd.read_csv("../Processed/step1_thin.csv")
df.shape


(123849, 3)

In [2]:
keep_cols = ["job_id", "company_name", "title", "description", "location", "formatted_experience_level",
             "min_salary", "max_salary", "pay_period", "normalized_salary", "currency"]

keep_cols = [c for c in keep_cols if c in df.columns]
df = df[keep_cols].copy()

df = df.rename(columns={
    "title": "job_title",
    "description": "job_description"
})

df.head(2)


Unnamed: 0,job_title,job_description,location
0,Marketing Coordinator,Job descriptionA leading real estate firm in N...,"Princeton, NJ"
1,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...","Fort Collins, CO"


In [3]:
def clean_text(x):
    if pd.isna(x):
        return ""
    x = str(x).lower()
    x = re.sub(r"[\r\n\t]+", " ", x)
    x = re.sub(r"\s+", " ", x).strip()
    return x

df["job_title"] = df["job_title"].apply(clean_text)
df["job_description"] = df["job_description"].apply(clean_text)

if "location" in df.columns:
    df["location"] = df["location"].apply(clean_text)

df = df.drop_duplicates()
df = df[(df["job_title"] != "") & (df["job_description"] != "")]
df.shape


(118851, 3)

In [4]:
entry_keywords = [
    "entry", "entry level", "junior", "jr", "new grad", "new graduate",
    "graduate", "intern", "internship", "associate", "co op", "coop", "fresher"
]

def is_entry_level(title):
    return any(k in title for k in entry_keywords)

df["is_entry_level"] = df["job_title"].apply(is_entry_level)

df["is_entry_level"].value_counts()


is_entry_level
False    109041
True       9810
Name: count, dtype: int64

In [5]:
def extract_min_years(text: str):
    if not text:
        return np.nan

    # Common patterns
    patterns = [
        r"(\d+)\s*\+\s*years?",                     # 3+ years
        r"minimum\s*(\d+)\s*years?",                # minimum 3 years
        r"at least\s*(\d+)\s*years?",               # at least 2 years
        r"(\d+)\s*-\s*(\d+)\s*years?",              # 2-4 years
        r"(\d+)\s*to\s*(\d+)\s*years?"              # 2 to 4 years
    ]

    # Ranges first (take the minimum)
    m = re.search(patterns[3], text) or re.search(patterns[4], text)
    if m:
        return float(m.group(1))

    # Single number patterns
    for p in patterns[:3]:
        m = re.search(p, text)
        if m:
            return float(m.group(1))

    # Special cases
    if "no experience" in text or "zero experience" in text or "0 years" in text:
        return 0.0
    if "freshers welcome" in text or "fresher" in text:
        return 0.0

    return np.nan

df["min_experience_years"] = df["job_description"].apply(extract_min_years)

df["min_experience_years"].describe()


count     50562.000000
mean        172.977889
std        3854.608062
min           0.000000
25%           2.000000
50%           3.000000
75%           5.000000
max      300338.000000
Name: min_experience_years, dtype: float64

In [6]:
def bucket_exp(x):
    if pd.isna(x):
        return "not specified"
    if x <= 1:
        return "0–1 years"
    if x <= 3:
        return "2–3 years"
    if x <= 5:
        return "4–5 years"
    return "5+ years"

df["experience_bucket"] = df["min_experience_years"].apply(bucket_exp)
df["experience_bucket"].value_counts()


experience_bucket
not specified    68289
2–3 years        14879
5+ years         12610
0–1 years        12390
4–5 years        10683
Name: count, dtype: int64

In [7]:
entry_df = df[df["is_entry_level"] == True].copy()

pct_entry_3plus = (entry_df["min_experience_years"] >= 3).mean() * 100
pct_missing_exp = entry_df["min_experience_years"].isna().mean() * 100

pct_entry_3plus, pct_missing_exp


(np.float64(13.445463812436289), np.float64(66.69724770642202))

In [8]:
import os
os.makedirs("../Processed", exist_ok=True)

df.to_csv("../Processed/jobs_enriched.csv", index=False)
df.head(2)


Unnamed: 0,job_title,job_description,location,is_entry_level,min_experience_years,experience_bucket
0,marketing coordinator,job descriptiona leading real estate firm in n...,"princeton, nj",False,,not specified
1,mental health therapist/counselor,"at aspen therapy and wellness , we are committ...","fort collins, co",False,,not specified


In [9]:
explicit_exp = entry_df[entry_df["min_experience_years"].notna()].copy()

explicit_exp.shape


(3267, 6)

In [10]:
explicit_exp["experience_bucket"].value_counts(normalize=True) * 100


experience_bucket
0–1 years    41.567187
2–3 years    31.068258
5+ years     17.875727
4–5 years     9.488828
Name: proportion, dtype: float64

In [12]:
explicit_exp.columns.tolist()


['job_title',
 'job_description',
 'location',
 'is_entry_level',
 'min_experience_years',
 'experience_bucket']