In [3]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# ============================================================
# Bias measurement based on TEXT QUERIES using search_jobs()
# ============================================================
# Load the pre-computed artifacts saved as a 2-tuple. `df` is the jobs table;
# measure_exposure_bias_text_queries() reads its "company_size_group" column.
# `X_hybrid` is presumably the hybrid embedding matrix aligned with `df` rows
# (TODO confirm) -- it is not referenced in the code shown here.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code;
# only load this file if it comes from a trusted source.
df, X_hybrid = joblib.load("../data/hybrid_embeddings.pkl")

def _plot_overall_exposure(summary, top_k):
    """Draw grouped bars of dataset share vs. exposure share per size group."""
    x = np.arange(len(summary))
    width = 0.35

    plt.figure(figsize=(6, 4))
    plt.bar(x - width / 2, summary["dataset_share"], width, label="Dataset share")
    plt.bar(x + width / 2, summary["exposure_share"], width, label="Exposure share (recs)")

    plt.xticks(x, summary["company_size_group"])
    plt.ylabel("Proportion")
    plt.title(f"Dataset vs Recommendation Exposure (top-{top_k}, text queries)")
    plt.legend()
    plt.tight_layout()
    plt.show()


def _plot_per_query_exposure(per_query_df, groups, top_k):
    """Draw one stacked bar per query showing the size-group mix of its results."""
    # Bars are placed at numeric positions and labelled afterwards. Passing the
    # (truncated) label strings as x values would make matplotlib treat them as
    # categories, so two queries with the same label would share one bar.
    positions = np.arange(len(per_query_df))
    bottom = np.zeros(len(per_query_df))
    colors = {"small": "tab:blue", "medium": "tab:orange", "large": "tab:green"}

    plt.figure(figsize=(8, 5))
    for g in groups:
        shares = per_query_df[g + "_share"].to_numpy(dtype=float)
        plt.bar(positions, shares, bottom=bottom, label=g, color=colors[g])
        # Rebind instead of `+=` so the array handed to plt.bar is never mutated.
        bottom = bottom + shares

    plt.xticks(positions, per_query_df["query_label"], rotation=45, ha="right")
    plt.ylabel("Proportion in top-k")
    plt.title(f"Per-Query Exposure by Company Size Group (top-{top_k})")
    plt.legend(title="Company Size Group")
    plt.tight_layout()
    plt.show()


def measure_exposure_bias_text_queries(queries, top_k=20):
    """Measure company-size exposure bias of text-based recommendations.

    For each query, ``search_jobs(query, top_k=top_k)`` is called and the
    returned rows are counted per ``company_size_group`` (rows labelled
    ``"unknown"`` are ignored). The aggregated exposure distribution is then
    compared with the distribution of the same column in the module-level
    ``df``. A summary table is printed and two figures are shown: an overall
    dataset-vs-exposure bar chart and a per-query stacked bar chart.

    Requires ``df`` and ``search_jobs`` to be defined at module level before
    this function is called; ``search_jobs`` must return a DataFrame with a
    ``company_size_group`` column.

    Args:
        queries: Iterable of query strings. May be empty, in which case all
            exposure shares are 0.0 and the per-query table has no rows.
        top_k: Number of recommendations requested per query.

    Returns:
        A ``(summary, per_query_df)`` tuple. ``summary`` has one row per
        group in small/medium/large with columns ``company_size_group``,
        ``dataset_share``, ``exposure_share`` and ``exposure_ratio``
        (``exposure_share / dataset_share``, NaN when the dataset share is
        not positive). ``per_query_df`` has one row per query with the raw
        counts, their ``total``, the ``*_share`` columns and a
        ``query_label`` truncated to 25 characters for plotting.
    """
    groups = ["small", "medium", "large"]

    # ---- 1. Dataset-level distribution (exclude 'unknown') ----
    df_valid = df[df["company_size_group"] != "unknown"]
    dataset_counts = df_valid["company_size_group"].value_counts()
    dataset_share = dataset_counts / dataset_counts.sum()

    # ---- 2. Aggregate exposure across all queries ----
    exposure_counts = {}
    per_query_exposure = []

    for q in queries:
        recs = search_jobs(q, top_k=top_k)

        # search_jobs is expected to drop 'unknown' already; guard anyway.
        recs_valid = recs[recs["company_size_group"] != "unknown"]
        counts = recs_valid["company_size_group"].value_counts()

        row = {"query": q}
        for g in groups:
            row[g] = int(counts.get(g, 0))
        per_query_exposure.append(row)

        # Every non-'unknown' group counts towards the exposure denominator,
        # mirroring how the dataset shares are normalised above.
        for group, c in counts.items():
            exposure_counts[group] = exposure_counts.get(group, 0) + int(c)

    total_recs = sum(exposure_counts.values())

    # ---- 3. Build summary table (overall) ----
    rows = []
    for g in groups:
        ds = float(dataset_share.get(g, 0.0))
        # No recommendations at all (e.g. empty `queries`) means zero exposure.
        es = exposure_counts.get(g, 0) / total_recs if total_recs > 0 else 0.0
        ratio = es / ds if ds > 0 else np.nan
        rows.append({
            "company_size_group": g,
            "dataset_share": ds,
            "exposure_share": es,
            "exposure_ratio": ratio,
        })

    summary = pd.DataFrame(
        rows,
        columns=["company_size_group", "dataset_share", "exposure_share", "exposure_ratio"],
    )

    print("\n=== Text-Query Based Exposure Bias Summary ===")
    print(summary.to_string(index=False))

    # ---- 4. Overall bar plot: dataset vs exposure ----
    _plot_overall_exposure(summary, top_k)

    # ---- 5. Per-query table and stacked bar plot ----
    # Naming the columns explicitly keeps the frame well-formed when there are
    # no queries (a bare DataFrame([]) would have no columns to select from).
    per_query_df = pd.DataFrame(per_query_exposure, columns=["query"] + groups)
    per_query_df = per_query_df.astype({g: "int64" for g in groups})

    per_query_df["total"] = per_query_df[groups].sum(axis=1)
    # Mask zero totals before dividing so queries without any valid
    # recommendation get a share of 0.0 instead of a division-by-zero result.
    safe_total = per_query_df["total"].where(per_query_df["total"] > 0)
    for g in groups:
        per_query_df[g + "_share"] = (
            (per_query_df[g] / safe_total).fillna(0.0).astype(float)
        )

    # Shorten query labels for plotting
    per_query_df["query_label"] = [
        q if len(q) <= 25 else q[:25] + "..." for q in per_query_df["query"]
    ]

    _plot_per_query_exposure(per_query_df, groups, top_k)

    return summary, per_query_df


# ============================================================
# Example: run bias measurement on text queries
# ============================================================

# Free-text queries used to probe the recommender. Entries that are commented
# out are excluded from this run; uncomment them to widen the probe set.
text_queries = [
    "Senior machine learning engineer with python",
    "Entry level data analyst with SQL and Excel",
    "Remote data scientist role with python",
    "Senior data engineer with spark and aws",

    # "Business intelligence analyst with tableau",
    # "NLP research scientist with transformers",
    "MLOps engineer with kubernetes and docker",
    # "Computer vision engineer with pytorch",
    "Analytics manager with leadership experience",
    "Intern data science role with basic python",
    "machine learning engineer with SQL",
]

# Prints the summary table, shows both figures, and keeps the two returned
# DataFrames for further inspection.
# NOTE: search_jobs() must already be defined in the session (run the cell that
# defines it first); otherwise this call fails with
# "NameError: name 'search_jobs' is not defined".
bias_summary_text, per_query_detail = measure_exposure_bias_text_queries(
    queries=text_queries,
    top_k=20,
)


NameError: name 'search_jobs' is not defined