In [None]:
from src.data import load_adult_from_openml, clean_data, split_data, save_splits

# Fetch the raw dataset via load_adult_from_openml and report its dimensions.
df_raw = load_adult_from_openml()
print("Raw:", df_raw.shape)

# Run clean_data on the raw frame and report the dimensions afterwards.
df = clean_data(df_raw)
print("Clean:", df.shape)

# split_data yields six objects, in this order: train / val / test features
# followed by train / val / test targets.
splits = tuple(split_data(df))
X_train, X_val, X_test, y_train, y_val, y_test = splits
print("Splits →", "train:", X_train.shape, "val:", X_val.shape, "test:", X_test.shape)

# Forward the same six objects, in the same positional order, to save_splits.
save_splits(*splits)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from modeling import build_model_pipeline, evaluate_model

# Single seed shared by every estimator that accepts `random_state`, so that
# a fresh-kernel "Restart & Run All" fits the same models every time.
RANDOM_STATE = 42

# Candidate classifiers, keyed by the label used in later tables and plots.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    # Seeded too: scikit-learn permutes the features at every split even with
    # splitter="best", so equally good splits can otherwise be chosen
    # differently from run to run.
    "DecisionTree": DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    "HistGB": HistGradientBoostingClassifier(random_state=RANDOM_STATE),
}

# Wrap each classifier with build_model_pipeline, keeping the same labels.
pipelines = {name: build_model_pipeline(clf) for name, clf in models.items()}

# Train models (each pipeline is fitted in place on the training split).
for name, pipe in pipelines.items():
    print(f"Training {name}...")
    pipe.fit(X_train, y_train)


In [None]:
import pandas as pd

# Score every fitted pipeline on the validation split. Each record is the
# metrics mapping returned by evaluate_model plus the model's label.
results = [
    {**evaluate_model(pipe, X_val, y_val), "Model": name}
    for name, pipe in pipelines.items()
]

# One row per model, indexed by its label; the bare expression renders the table.
df_results = pd.DataFrame(results).set_index("Model")
df_results


In [None]:
import matplotlib.pyplot as plt

# Compare validation accuracy and precision of every model on a single axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(df_results.index, df_results["accuracy"], marker='o', label='Accuracy')
ax.plot(df_results.index, df_results["precision"], marker='o', label='Precision')

ax.set_title("Accuracy vs Precision Across Models")
ax.set_ylabel("Score")
ax.set_xlabel("Model")
ax.legend()
ax.grid(True)
# Tilt the model names so that longer labels do not overlap.
ax.tick_params(axis="x", labelrotation=20)

fig.tight_layout()
plt.show()


In [None]:
from modeling import evaluate_by_group

# Per-model results of evaluate_by_group on the validation split, one
# container per grouping column.
race_results = {}
sex_results = {}
group_results = {"race": race_results, "sex": sex_results}

for name, pipe in pipelines.items():
    # Same call order as before: "race" first, then "sex", for each model.
    for group_col, per_model in group_results.items():
        per_model[name] = evaluate_by_group(pipe, X_val, y_val, group_col=group_col)

# Display the by-race result for one model as a spot check.
race_results["HistGB"]


In [None]:
def plot_group_comparison(results_dict, metric, group_name):
    """
    Draw a grouped bar chart of one metric for every model.

    Parameters
    ----------
    results_dict : dict
        Maps a model label to a DataFrame containing a ``metric`` column.
        The frames' index values become the x-axis categories.
    metric : str
        Column to plot; also used (title-cased) in the title and y label.
    group_name : str
        Only used (title-cased) in the title and the x label.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    # Pull the requested column out of each model's frame ...
    per_model = {}
    for model_label, frame in results_dict.items():
        per_model[model_label] = frame[metric]
    # ... and line them up side by side: one column per model.
    metric_by_model = pd.concat(per_model, axis=1)
    metric_by_model.plot(kind="bar", ax=ax)

    metric_label = metric.title()
    group_label = group_name.title()
    ax.set_title(f"{metric_label} by {group_label} and Model")
    ax.set_ylabel(metric_label)
    ax.set_xlabel(group_label)
    ax.legend(title="Model")
    ax.grid(True)

    plt.tight_layout()
    plt.show()

# Example: Precision by race
plot_group_comparison(results_dict=race_results, metric="precision", group_name="race")

# Example: Recall by sex
plot_group_comparison(results_dict=sex_results, metric="recall", group_name="sex")


In [None]:
import pandas as pd

# Combine group metrics for each model into a single DataFrame
def collect_group_metrics(results_dict, group_name):
    """
    Flatten per-model, per-group metric frames into one long table.

    Parameters
    ----------
    results_dict : dict
        Maps a model label to a DataFrame with one row per group value
        (the index) and one column per metric.
    group_name : str
        Name of the grouping attribute; its title-cased form becomes the
        column that holds the group value.

    Returns
    -------
    pandas.DataFrame
        One row per (model, group value): the metric columns, then a
        ``"Model"`` column and the title-cased ``group_name`` column.
    """
    group_label = group_name.title()
    return pd.DataFrame([
        {**metrics.to_dict(), "Model": model_name, group_label: group_value}
        for model_name, frame in results_dict.items()
        for group_value, metrics in frame.iterrows()
    ])

# Long-format tables (one row per model and group value) for each attribute.
race_report_df = collect_group_metrics(results_dict=race_results, group_name="race")
sex_report_df = collect_group_metrics(results_dict=sex_results, group_name="sex")


In [None]:
def plot_disparity(df, group_col, metric):
    """
    Bar-plot, per model, the gap between the largest and smallest value of a metric.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``"Model"`` column and a ``metric`` column; all rows
        that share a model are compared with each other.
    group_col : str
        Only used (title-cased) in the plot title.
    metric : str
        Column whose per-model spread (max - min) is plotted.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    disparity_name = f"{metric}_disparity"

    def _spread(values):
        # Range of the metric within one model's rows.
        return values.max() - values.min()

    per_model_metric = df.groupby("Model")[metric]
    disparity_df = per_model_metric.agg(_spread).reset_index(name=disparity_name)

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.bar(disparity_df["Model"], disparity_df[disparity_name])
    ax.set_title(f"{metric.title()} Disparity Across {group_col.title()} Groups")
    ax.set_ylabel("Disparity (Max - Min)")
    ax.set_xlabel("Model")
    ax.grid(True)

    plt.tight_layout()
    plt.show()


# Spread of precision across race groups, then of recall across sex groups.
plot_disparity(df=race_report_df, group_col="race", metric="precision")
plot_disparity(df=sex_report_df, group_col="sex", metric="recall")
