## 1) Imports & paths
##### Before manipulating data, we must set up the environment — define where the raw data lives and where cleaned outputs, reports, and figures will go. Think of it as laying out your lab tools before starting an experiment. Without consistent paths, reproducibility collapses.

In [1]:
import pandas as pd, numpy as np, re, json
from pathlib import Path
# Fixed constant, presumably a random seed; it is not used in this section.
RNG = 42

# Project root: when the working directory is the "notebooks" folder, step up
# one level; otherwise the working directory itself is treated as the root.
_cwd = Path.cwd()
ROOT = _cwd.parent if _cwd.name == "notebooks" else _cwd

# Input location (read only; it is not created here).
DATA_RAW = ROOT / "data" / "raw"

# Output locations.
DATA_PROC = ROOT / "data" / "processed"
REPORTS = ROOT / "reports"
FIGS = ROOT / "outputs" / "figures"

# Create every output directory up front; existing directories are left as-is.
for _out_dir in (DATA_PROC, REPORTS, FIGS):
    _out_dir.mkdir(parents=True, exist_ok=True)

## 2) Load raw datasets

##### At its core, “loading” isn’t just reading CSVs — it’s about validating the data contract.
Each dataset has a schema (columns, types, units). Before merging, you must check whether those schemas align or conflict.

In [None]:
# Read the three raw CSV files from the raw-data directory.
df1 = pd.read_csv(DATA_RAW / "dataset1_data_science_job.csv")
df2 = pd.read_csv(DATA_RAW / "dataset2_all_job_post.csv")
df3 = pd.read_csv(DATA_RAW / "dataset3_ai_job_dataset.csv")

# Quick visual check of each frame: shape, first two rows, then the column
# dtypes and non-null counts.
for name, df in {"df1": df1, "df2": df2, "df3": df3}.items():
    print(name, df.shape)
    display(df.head(2))
    # DataFrame.info() prints its summary itself and returns None, so it is
    # called directly; wrapping it in display() would also render a stray
    # "None" after each summary.
    df.info()

## 3) Profiling snapshot (lightweight)

##### Profiling is the diagnostic stage of cleaning — like running blood tests before prescribing medicine.
It tells you what’s wrong: missing values, strange datatypes, duplicates, etc.
Without this, cleaning becomes random guessing.

In [None]:
def profile(df: pd.DataFrame, name: str) -> dict:
    """Summarise the basic health of a DataFrame.

    Args:
        df: Frame to inspect.
        name: Label for the frame. It is accepted but not used in the result.

    Returns:
        A dict with these keys, in this order:
        ``rows`` (row count), ``cols`` (column count), ``na_counts``
        (missing-value count per column), ``dup_rows`` (number of rows
        flagged by ``DataFrame.duplicated()``), ``numeric_cols`` (names of
        numeric-dtype columns) and ``object_cols`` (names of object-dtype
        columns).
    """
    n_rows = len(df)
    n_cols = df.shape[1]

    # Per-column count of missing cells.
    missing_per_col = df.isna().sum().to_dict()

    # Cast to a plain int so the value serialises cleanly (e.g. with json).
    n_duplicates = int(df.duplicated().sum())

    numeric_names = df.select_dtypes(include="number").columns.tolist()
    object_names = df.select_dtypes(include="object").columns.tolist()

    summary = {
        "rows": n_rows,
        "cols": n_cols,
        "na_counts": missing_per_col,
        "dup_rows": n_duplicates,
        "numeric_cols": numeric_names,
        "object_cols": object_names,
    }
    return summary

# Profile each raw frame, keyed by its variable name.
profiles = {}
for _label, _frame in (("df1", df1), ("df2", df2), ("df3", df3)):
    profiles[_label] = profile(_frame, _label)

# Show only the first 2000 characters of the JSON dump to keep the cell
# output short; the trailing "..." is printed regardless of length.
_preview = json.dumps(profiles, indent=2)[:2000]
print(_preview, "...")
