In [9]:
from utils import load_and_preprocess
from models import get_models
from sklearn.metrics import *
import pandas as pd
import joblib
import os

# Load the four train/test splits produced by the project's preprocessing
# helper from the adult.data and adult.test files.
dataset_splits = load_and_preprocess("../data/adult.data", "../data/adult.test")
X_train, X_test, y_train, y_test = dataset_splits

# Candidate estimators keyed by display name (iterated with .items() below).
models = get_models()

# Per-model metric dictionaries, filled in as each model is evaluated.
results_train, results_test = {}, {}

# Make sure the output directory exists before anything is written to it.
os.makedirs("../saved_models", exist_ok=True)

def _classification_metrics(model, X, y):
    """Score an already-fitted classifier on one data split.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X : feature matrix of the split to evaluate.
    y : true labels for ``X``.

    Returns
    -------
    dict
        Maps "Accuracy", "Precision", "Recall", "F1", "MCC" and "AUC" to the
        corresponding score on this split.
    """
    y_pred = model.predict(X)
    # Second predict_proba column is used as the ranking score for AUC
    # (presumably the positive class; verify labels are encoded as 0/1).
    y_prob = model.predict_proba(X)[:, 1]

    return {
        "Accuracy": accuracy_score(y, y_pred),
        "Precision": precision_score(y, y_pred),
        "Recall": recall_score(y, y_pred),
        "F1": f1_score(y, y_pred),
        "MCC": matthews_corrcoef(y, y_pred),
        "AUC": roc_auc_score(y, y_prob),
    }


# Fit every candidate model, score it on both splits with the same helper
# (so train and test metrics can never drift apart), then persist it.
for name, model in models.items():
    print("Training:", name)
    model.fit(X_train, y_train)

    # Predictions stay local to the helper instead of leaking the last
    # model's arrays into the notebook namespace.
    results_train[name] = _classification_metrics(model, X_train, y_train)
    results_test[name] = _classification_metrics(model, X_test, y_test)

    # Save model; spaces in the display name become underscores in the filename.
    joblib.dump(model, f"../saved_models/{name.replace(' ', '_')}.pkl")

# Write one CSV per split: rows are models, columns are metric names
# (the result dicts are keyed by model, hence the transpose).
metric_exports = (
    (results_train, "../saved_models/train_metrics.csv"),
    (results_test, "../saved_models/test_metrics.csv"),
)
for split_results, csv_path in metric_exports:
    metrics_table = pd.DataFrame(split_results).transpose()
    metrics_table.to_csv(csv_path)

print("All models and metrics saved.")

  train_df["income"] = train_df["income"].replace(replace_map)
  test_df["income"] = test_df["income"].replace(replace_map)


Training: Logistic Regression
Training: Decision Tree
Training: KNN
Training: Naive Bayes
Training: Random Forest
Training: XGBoost
All models and metrics saved.


In [13]:
# Export the test features together with their true labels.
# assign() returns a new DataFrame, so X_test is never modified even though
# pd.DataFrame(X_test) does not copy DataFrame/ndarray input by default;
# re-running this cell therefore cannot leak an "income" column into X_test.
# .values attaches the labels positionally, ignoring y_test's index.
new_test_df = pd.DataFrame(X_test).assign(income=y_test.values)
new_test_df.to_csv("official_test_with_labels.csv", index=False)