### Imports

In [12]:
import os
import re
from pathlib import Path
import pandas as pd
import numpy as np
import altair as alt
from typing import Dict

### Load dataset

In [13]:
# Load the raw job-postings CSV into the notebook-level `df`.
# The path is relative, so it resolves against the kernel's working directory.
df = pd.read_csv("../data/data_science_job_posts_2025.csv")

### Schema overview & missing values profile

## Dataset Cleaning 

This module provides a structured data-cleaning pipeline for the
*Data Science Job Postings 2025* dataset.

**Functionality:**

- **State Extraction:** Uses regular expressions to extract U.S. state abbreviations from the `headquarter` column and maps them to FIPS codes (`state`, `fips`, `fips_int` columns).
- **Salary Parsing:** Converts salary text ranges (e.g., "$80K–$120K") into numeric midpoints via normalization and numeric extraction (`salary_mid`).
- **Seniority Normalization:** Writes a lowercased, whitespace-trimmed copy of the `seniority_level` column to `seniority_level_norm`, replacing empty and missing (`nan`) entries with `NA`.
- **Skills Parsing:** Transforms stringified skill lists into clean, lowercase Python lists (`skills_list`) by stripping brackets, quotes, and whitespace, and stores a CSV-friendly `|`-joined version in `skills_clean`.
- **Deduplication:** Removes duplicate job postings using key columns (`job_title`, `company`, `location`, `post_date`, `salary`, `skills`).

In [14]:
from __future__ import annotations
import re
import pandas as pd

# --- Constants ---
# Regex alternation of the 50 two-letter state abbreviations plus DC, wrapped in
# word boundaries so an abbreviation only matches as a standalone token.
# The single capture group holds the matched abbreviation.
# NOTE(review): several abbreviations coincide with ordinary words or country
# codes (IN, OR, ME, DE, CA, ...). extract_state() upper-cases its input before
# searching, so such tokens will match — confirm against real `headquarter` values.
STATE_ABBR = r"\b(AL|AK|AZ|AR|CA|CO|CT|DE|FL|GA|HI|ID|IL|IN|IA|KS|KY|LA|ME|MD|MA|MI|MN|MS|MO|MT|NE|NV|NH|NJ|NM|NY|NC|ND|OH|OK|OR|PA|RI|SC|SD|TN|TX|UT|VT|VA|WA|WV|WI|WY|DC)\b"
# State abbreviation -> two-digit FIPS code, kept as zero-padded strings.
# Covers the same 51 abbreviations as STATE_ABBR.
_ABBR_TO_FIPS = {
    'AL':'01','AK':'02','AZ':'04','AR':'05','CA':'06','CO':'08','CT':'09','DE':'10','DC':'11','FL':'12','GA':'13',
    'HI':'15','ID':'16','IL':'17','IN':'18','IA':'19','KS':'20','KY':'21','LA':'22','ME':'23','MD':'24','MA':'25',
    'MI':'26','MN':'27','MS':'28','MO':'29','MT':'30','NE':'31','NV':'32','NH':'33','NJ':'34','NM':'35','NY':'36',
    'NC':'37','ND':'38','OH':'39','OK':'40','OR':'41','PA':'42','RI':'44','SC':'45','SD':'46','TN':'47','TX':'48',
    'UT':'49','VT':'50','VA':'51','WA':'53','WV':'54','WI':'55','WY':'56'
}

# Compiled once at definition time; the pattern is case-sensitive (no flags).
_state_pat = re.compile(STATE_ABBR)

def clean_skills(s):
    """Flatten a stringified skill list into plain, space-separated text.

    The value is first coerced with ``str()``; square brackets and single
    quotes are then removed and every comma is replaced by a space.

    Args:
        s: Any value; typically a string such as ``"['python', 'sql']"``.

    Returns:
        str: The cleaned text. Whitespace that followed a comma is kept, so
        ``"a, b"`` becomes ``"a  b"`` (two spaces).
    """
    # One pass over the text: drop "[", "]" and "'", turn "," into " ".
    char_map = {ord("["): None, ord("]"): None, ord("'"): None, ord(","): " "}
    return str(s).translate(char_map)

# NOTE(review): this runs when the cell executes and adds a space-separated
# `skills_clean` column to the notebook-level `df`. clean_jobs_df() further down
# works on a copy of whatever frame it is given and writes a "|"-joined
# `skills_clean` there, so the two columns share a name but not a format.
df['skills_clean'] = df['skills'].apply(clean_skills)

# Parse salary: handle "€70,000-€90,000" or "€85,000"
def parse_salary_to_number(s):
    """Convert a euro-formatted salary string into a single float.

    The value is coerced with ``str()``, then euro signs, spaces and thousands
    separators are removed. A range ``"low-high"`` yields the midpoint of its
    first two parts; a single figure yields that figure.

    En and em dashes are accepted as range separators in addition to the plain
    hyphen, so ranges typed as ``"€70,000 – €90,000"`` are parsed instead of
    being returned as NaN (parse_salary already normalises the en dash).

    Args:
        s: Any value; typically a salary string.

    Returns:
        float: The parsed number or range midpoint, or ``np.nan`` when the
        text cannot be converted.
    """
    text = str(s).replace("€", "").replace(" ", "").replace(",", "")
    # Normalise typographic dashes so they split like an ordinary hyphen.
    text = text.replace("–", "-").replace("—", "-")
    try:
        if "-" in text:
            low, high = text.split("-")[:2]
            return (float(low) + float(high)) / 2.0
        return float(text)
    except ValueError:
        # Only conversion failures mean "unparseable"; anything else
        # (e.g. KeyboardInterrupt) should propagate.
        return np.nan

def extract_state(headquarter: pd.Series) -> pd.Series:
    """Pull the first U.S. state abbreviation out of each headquarter value.

    Every element is converted to text and upper-cased before being searched
    with the module-level ``_state_pat`` regex.

    Args:
        headquarter: Series of headquarter descriptions.

    Returns:
        pd.Series: Same index as the input; each element is the first matching
        abbreviation, or ``None`` when nothing matches.

    NOTE(review): because the text is upper-cased first, lowercase words that
    spell an abbreviation (e.g. "in", "or") also match — confirm this is
    acceptable for the actual data.
    """
    def _first_abbr(value):
        hit = _state_pat.search(str(value).upper())
        if hit is None:
            return None
        return hit.group(1)

    as_text = headquarter.astype(str)
    return as_text.map(_first_abbr)


def parse_salary(s):
    """Return a numeric midpoint if salary is a range or single number, else None.

    The value is coerced with ``str()``. Numbers (with optional thousands
    separators and decimals) are extracted from the text; when two or more are
    present the midpoint of the first two is used, otherwise the single number.

    The result is multiplied by 1000 only when a ``k``/``K`` suffix directly
    follows a digit (``"80K"``, ``"80 k"``). A letter "k" elsewhere in the text
    (e.g. "market rate", "per week") no longer triggers the scaling.

    Args:
        s: Any value; typically a salary string such as ``"$80K–$120K"``.

    Returns:
        float | None: The midpoint or single value, or ``None`` when the input
        is empty, the text ``"nan"``/``"none"``, or contains no number.
    """
    s = str(s)
    if not s or s.lower() in {"nan", "none"}:
        return None
    s_norm = s.replace("–", "-").lower()
    nums = [float(x.replace(",", "")) for x in re.findall(r"\d[\d,]*\.?\d*", s_norm)]
    if not nums:
        return None
    mid = (nums[0] + nums[1]) / 2 if len(nums) >= 2 else nums[0]
    # Require the "k" to be a numeric suffix; \b stops it matching the start
    # of a longer word such as "kr".
    if re.search(r"\d\s*k\b", s_norm):
        mid *= 1000.0
    return mid

def company_size_group(size):
    """Bucket a raw company-size value into "small", "medium", "large" or "unknown".

    Rules, applied in order to ``str(size)``:

    1. Values containing a euro sign, or a number immediately followed by a
       ``B``/``M`` magnitude suffix (any case, e.g. ``"€352.44B"``, ``"$5M"``),
       are treated as "large".
    2. Otherwise the text, with commas removed, is parsed as an integer:
       below 1000 is "small", below 10000 is "medium", anything else "large".
    3. Text that is not an integer is "unknown".

    The magnitude suffix must follow a digit, so labels that merely contain
    the letters b or m (e.g. "Public", "Self-employed") fall through to
    "unknown" instead of being classified as "large".

    Args:
        size: Any value; typically a string from the ``company_size`` column.

    Returns:
        str: One of "small", "medium", "large", "unknown".
    """
    size_str = str(size)

    # Very big ones like "€352.44B" → treat as large
    if "€" in size_str or re.search(r"\d\s*[bm]\b", size_str, flags=re.IGNORECASE):
        return "large"

    try:
        n = int(size_str.replace(",", ""))
    except ValueError:
        return "unknown"

    if n < 1000:
        return "small"
    if n < 10000:
        return "medium"
    return "large"

def parse_skills(s):
    """Split a skills string into a list of lowercase skill names.

    Accepts list-like text such as ``"['python', 'sql']"`` or
    ``'["python","sql"]'`` as well as a plain ``"python, sql"``.

    Args:
        s: The raw skills value.

    Returns:
        list[str]: Lowercased skills with surrounding spaces and quotes
        removed; empty entries are skipped. Non-string or blank input gives
        an empty list.
    """
    if not isinstance(s, str):
        return []
    if not s.strip():
        return []

    padding = " '\""
    skills = []
    # Outer brackets are removed first, then each comma-separated piece is trimmed.
    for piece in s.strip("[]").split(","):
        token = piece.strip(padding)
        if token:
            skills.append(token.lower())
    return skills

def parse_company_size(size):
    """Convert a raw company-size value to a float, honouring K/M/B suffixes.

    Handling, in order:

    * Missing values (``pd.isna``) and purely alphabetic labels
      ("Private", "Public", "Unknown") give ``np.nan``.
    * Currency symbols (``€``, ``$``) and thousands separators are removed.
    * A trailing upper-case ``K``, ``M`` or ``B`` scales the number by
      1e3, 1e6 or 1e9, so ``"€352.44B"`` becomes 352.44e9 rather than 352.44.
      This keeps the numeric value in line with company_size_group, which
      treats such figures as "large".
    * Anything that still contains letters, is empty, or is not a valid
      number gives ``np.nan``.

    Args:
        size: Any value; typically a string from the ``company_size`` column.

    Returns:
        float: The parsed (and scaled) number, or ``np.nan``.
    """
    if pd.isna(size):
        return np.nan

    s = str(size).strip()

    # Case: "Private", "Public", "Unknown", "Self-employed"
    if s.isalpha():
        return np.nan

    # Remove currency symbols and thousands separators
    s = s.replace("€", "").replace("$", "").replace(",", "").strip()

    # Apply (rather than discard) a trailing magnitude suffix
    scale = {"K": 1e3, "M": 1e6, "B": 1e9}
    multiplier = 1.0
    if s and s[-1] in scale:
        multiplier = scale[s[-1]]
        s = s[:-1].strip()

    # Empty, or still contains letters after cleaning → invalid
    if not s or any(c.isalpha() for c in s):
        return np.nan

    try:
        return float(s) * multiplier
    except ValueError:
        return np.nan


def clean_jobs_df(df_raw: pd.DataFrame) -> pd.DataFrame:
    """Run the full cleaning pipeline on a raw job-postings frame.

    The input is copied, never modified. Steps, in order:

    1. ``state`` / ``fips`` / ``fips_int`` from ``headquarter``.
    2. ``salary_mid`` from ``salary`` via parse_salary.
    3. ``seniority_level_norm``: lowercased, trimmed ``seniority_level`` with
       empty and missing-value placeholders replaced by ``pd.NA``.
    4. ``skills_list`` (Python lists) and ``skills_clean`` ("|"-joined text)
       from ``skills``.
    5. De-duplication on whichever of the key columns are present.
    6. ``company_size_num``, ``salary_usd`` and ``company_size_group``; rows
       with no parseable ``salary_usd`` or an "unknown" size group are dropped.

    ``headquarter``, ``seniority_level`` and ``skills`` are optional: when one
    is absent its derived columns are filled from an all-missing column.
    ``salary`` and ``company_size`` are required.

    Args:
        df_raw: Raw postings frame.

    Returns:
        pd.DataFrame: An independent, cleaned copy with the derived columns.

    Raises:
        KeyError: If ``salary`` or ``company_size`` is missing.
    """
    df = df_raw.copy()

    # Fail before doing any work; the row filters below index these directly.
    missing = [c for c in ("company_size", "salary") if c not in df.columns]
    if missing:
        raise KeyError(f"clean_jobs_df requires column(s): {missing}")

    def _optional(col: str) -> pd.Series:
        """Return df[col], or an all-None column aligned to df's index."""
        if col in df.columns:
            return df[col]
        # Sharing df.index keeps derived values row-aligned even when the
        # frame does not carry a default RangeIndex.
        return pd.Series([None] * len(df), index=df.index, dtype="object")

    # --- state + fips ---
    df["state"] = extract_state(_optional("headquarter"))
    df["fips"] = df["state"].map(_ABBR_TO_FIPS)
    # Nullable Int64 so rows without a state stay <NA> instead of forcing floats
    df["fips_int"] = pd.to_numeric(df["fips"], errors="coerce").astype("Int64")

    # --- salary ---
    df["salary_mid"] = df["salary"].apply(parse_salary)

    # --- seniority ---
    # astype(str) turns NaN/None/pd.NA into "nan"/"None"/"<NA>"; map those
    # placeholders (lowercased) and empty strings back to a real missing value.
    df["seniority_level_norm"] = (
        _optional("seniority_level")
          .astype(str).str.lower().str.strip()
          .replace({"nan": pd.NA, "none": pd.NA, "<na>": pd.NA, "": pd.NA})
    )

    # --- skills ---
    skills_parsed = _optional("skills").apply(parse_skills)
    df["skills_list"] = skills_parsed
    # CSV-friendly version for Altair notebook
    df["skills_clean"] = skills_parsed.apply(lambda lst: "|".join(lst))

    # --- de-dup ---
    dedup_keys = ["job_title", "company", "location", "post_date", "salary", "skills"]
    existing = [c for c in dedup_keys if c in df.columns]
    if existing:
        # .copy() detaches the result so the column assignments below do not
        # trigger pandas' SettingWithCopyWarning.
        df = df.drop_duplicates(subset=existing).copy()

    # --- company size + salary filters ---
    df["company_size_num"] = df["company_size"].apply(parse_company_size)
    df["salary_usd"] = df["salary"].apply(parse_salary_to_number)
    df = df.dropna(subset=["salary_usd"]).copy()
    df["company_size_group"] = df["company_size"].apply(company_size_group)
    df = df[df["company_size_group"] != "unknown"].copy()

    return df

# Script-style entry point: read the raw CSV, clean it, write the cleaned CSV.
# NOTE(review): the guard is true when this cell is executed in the notebook
# (the recorded output below shows the message was printed), so the names
# assigned here (`in_path`, `out_path`, `raw`, `clean`) become notebook-level
# variables.
if __name__ == "__main__":
    in_path = "../data/data_science_job_posts_2025.csv"
    out_path = "../data/data_science_job_posts_2025_clean.csv"
    # Re-reads the raw file from disk instead of reusing the `df` loaded
    # earlier in the notebook.
    raw = pd.read_csv(in_path, low_memory=False)
    clean = clean_jobs_df(raw)
    # index=False keeps the DataFrame index out of the written file.
    clean.to_csv(out_path, index=False)
    print(f"Completed cleaning. new cleaned Dataset created at → {out_path}\n")

Completed cleaning. new cleaned Dataset created at → ../data/data_science_job_posts_2025_clean.csv

