# Training of a Logistic Regression model with best hyperparameters

## Init config

In [None]:
import os
from datetime import datetime
import time
import pandas as pd
import numpy as np
import pickle
import json

from sklearn.model_selection import cross_val_score, StratifiedKFold

from src.config import (
    DATA_SPLIT_DIR,
    TRAIN_RAW_FILENAME,
    VALIDATION_RAW_FILENAME,
    MODELS_DIR,
    STUDY_DIR,
    MODEL_ALIASES
)
from src.model_evaluation import evaluate_classifier

## Pipeline

In [None]:
# Category order per ordinal feature. The list position of a label is the
# order handed to the OrdinalEncoder for that column.
ordinal_orders = dict(
    GenHlth=["excellent", "very good", "good", "fair", "poor"],
    # Five-year brackets from 25-29 up to 75-79, framed by the two open ends.
    Age=[
        "18-24",
        *(f"{lower}-{lower + 4}" for lower in range(25, 80, 5)),
        "80+",
    ],
    Education=[
        "no school",
        "elementary",
        "some high school",
        "high school graduate",
        "college",
        "college graduate",
    ],
    # Upper income bounds in thousand dollars, plus the open top bracket.
    Income=[
        *(f"<${bound}k" for bound in (10, 15, 20, 25, 35, 50, 75)),
        ">$75k",
    ],
)

In [None]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from src.transformers import MissingFlagTransformer, CategoryFromThresholdTransformer
from sklearn.linear_model import LogisticRegression

# Unordered categorical columns; each one is imputed and one-hot encoded.
nominal_cols = [
    "HighBP", "HighChol", "CholCheck", "Smoker", "Stroke",
    "HeartDiseaseorAttack", "PhysActivity", "Fruits", "Veggies",
    "HvyAlcoholConsump", "AnyHealthcare", "NoDocbcCost", "DiffWalk", "Sex",
]

# Fill gaps with the most frequent category, then one-hot encode while
# dropping the first level of every column.
_nominal_imputer = SimpleImputer(strategy="most_frequent")
_nominal_encoder = OneHotEncoder(
    drop="first",
    handle_unknown="ignore",
    sparse_output=False,
)
nominal_pipe = Pipeline(
    steps=[
        ("impute", _nominal_imputer),
        ("ohe", _nominal_encoder),
    ]
)

# Ordered categorical columns and, aligned with them, their category orders.
ordinal_cols = ["GenHlth", "Age", "Education", "Income"]
ordinal_categories = [ordinal_orders[name] for name in ordinal_cols]

# Most-frequent imputation followed by integer encoding that follows the
# explicit category order of each column.
_ordinal_steps = [
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("encode", OrdinalEncoder(categories=ordinal_categories)),
]
ordinal_pipe = Pipeline(steps=_ordinal_steps)

# The missing-value flag step is applied to all ordinal and nominal columns.
# MissingFlagTransformer comes from src.transformers; its exact output is
# not visible in this notebook.
missing_val_cols = [*ordinal_cols, *nominal_cols]
missing_val_pipe = Pipeline(steps=[("flags", MissingFlagTransformer())])


# Threshold lists per numeric column; they are handed to
# CategoryFromThresholdTransformer (src.transformers) in column order.
cat_gens = {
    "BMI": [20, 30, 40, 50, 60],
    "MentHlth": [0, 5],
    "PhysHlth": [0, 5],
}
cat_gen_cols = ["BMI", "MentHlth", "PhysHlth"]

# Median imputation -> threshold-based categories -> ordinal encoding.
_thresholds_per_col = [cat_gens[name] for name in cat_gen_cols]
cat_gen_pipe = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="median")),
        ("cat_gen", CategoryFromThresholdTransformer(_thresholds_per_col)),
        ("enc", OrdinalEncoder()),
    ]
)

# Route each column group through its own sub-pipeline; columns that are not
# listed in any group are dropped.
_column_routes = [
    ("ord", ordinal_pipe, ordinal_cols),
    ("nom", nominal_pipe, nominal_cols),
    ("miss", missing_val_pipe, missing_val_cols),
    ("cg", cat_gen_pipe, cat_gen_cols),
]
preprocessor = ColumnTransformer(transformers=_column_routes, remainder="drop")


# L2-regularised logistic regression with the hyperparameters fixed for this
# notebook; random_state is pinned for reproducibility.
_logreg_params = {
    "penalty": "l2",
    "C": 6.191485524607749,
    "fit_intercept": True,
    "random_state": 0,
    "max_iter": 100000,
    "n_jobs": -1,
}
classifier = LogisticRegression(**_logreg_params)

# Full training pipeline: preprocessing, random oversampling, classifier.
# The Pipeline class is imported from imblearn so that a resampling step can
# sit between the preprocessor and the estimator.
_oversampler = RandomOverSampler(random_state=5)
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("sample", _oversampler),
        ("clf", classifier),
    ]
)

# Make the preprocessor emit pandas DataFrames instead of plain arrays.
preprocessor.set_output(transform="pandas")

## Load and transform train and validation data

In [None]:
from sklearn.preprocessing import LabelEncoder


def _read_split(split_filename, target_col="Diabetes_012"):
    """Read one data split and return (frame, features, target).

    The target is the `target_col` column with the label "pre" replaced by
    "dia"; the features are all remaining columns.
    """
    frame = pd.read_csv(os.path.join(DATA_SPLIT_DIR, split_filename))
    features = frame.drop(target_col, axis=1)
    target = frame[target_col].replace({"pre": "dia"})
    return frame, features, target


df_train_raw, features_train_raw, target_train_raw = _read_split(TRAIN_RAW_FILENAME)
df_val_raw, features_val_raw, target_val_raw = _read_split(VALIDATION_RAW_FILENAME)

# The label encoder is fitted on the training target only and then reused
# unchanged for the validation target.
labelencoder = LabelEncoder()
target_train_enc = labelencoder.fit_transform(target_train_raw)
target_val_enc = labelencoder.transform(target_val_raw)

## Train model

In [None]:
# Descriptive metadata for this run.
special_features = "rndover,missingflags,catgen"
clf_alias = MODEL_ALIASES[type(classifier).__name__]
model_purpose = ",".join(labelencoder.classes_)

# Fit the whole pipeline on the raw training features and take wall-clock
# timestamps directly before and after the fit.
start_timestamp = datetime.now()
pipeline.fit(features_train_raw, target_train_enc)
train_end_timestamp = datetime.now()

training_duration = train_end_timestamp - start_timestamp

## Evaluation on validation data

In [None]:
# Predict the encoded labels for the validation split.
target_val_pred = pipeline.predict(features_val_raw)

# Class probabilities are only available for classifiers that expose
# predict_proba; otherwise None is passed on to the evaluation helper.
target_val_pred_proba = None

if hasattr(classifier, "predict_proba"):
    target_val_pred_proba = pipeline.predict_proba(features_val_raw)

    # With at most two target classes keep only the second probability column.
    if target_train_raw.nunique() <= 2:
        target_val_pred_proba = target_val_pred_proba[:, 1]

# evaluate_classifier is a project helper (src.model_evaluation); its result
# is used as a dict below. Predictions are mapped back to the original
# string labels so they are comparable with the raw validation target.
results = evaluate_classifier(
    classifier=classifier,
    labels=list(labelencoder.classes_),
    target_truth=target_val_raw,
    target_pred=labelencoder.inverse_transform(target_val_pred),
    target_pred_proba=target_val_pred_proba,
    timestamp=train_end_timestamp,
    model_purpose=model_purpose,
    special_features=special_features,
)

# timedelta.seconds is only the seconds *component* (0..86399) and silently
# drops whole days; total_seconds() covers the complete duration. The value
# is truncated to int so the stored type stays the same as before.
results["training_duration"] = int(training_duration.total_seconds())

# For the human-readable output the day and seconds components are split up.
_hours, _remainder = divmod(training_duration.seconds, 3600)
_minutes, _seconds = divmod(_remainder, 60)
print(
    f"training duration {training_duration.days} d {_hours} h"
    f" {_minutes} m {_seconds} s"
)

In [None]:
from src.model_evaluation import (
    get_confusion_matrix_from_results_as_df,
    get_classification_report_from_results_as_df,
)
# Headline metrics taken from the results dict.
for caption, key in (
    ("precision:", "precision"),
    ("F1:", "f1"),
    ("bal acc:", "bal_accuracy"),
    ("roc auc:", "roc_auc_score"),
):
    print(caption, results[key])

# Tabular views, rendered with the notebook's `display`.
for heading, as_frame in (
    ("confusion_matrix", get_confusion_matrix_from_results_as_df),
    ("classification_report", get_classification_report_from_results_as_df),
):
    print()
    print(heading)
    display(as_frame(results))

The model achieved an overall macro F1 score of 0.64 and a macro recall of 0.75. The F1 score is pulled down by the low precision of 0.32 for the positive class: while the model is good at identifying true positives, it also produces a high rate of false positives. The macro recall of 0.75 suggests that the model is able to capture a significant portion of the actual positive cases.

## Save model and results

In [None]:
model_name = results["model_name"]

# All artefacts of one model live in their own folder and share the model
# name as filename stem.
folder = os.path.join(MODELS_DIR, model_name)
filename = os.path.join(folder, model_name)
os.makedirs(folder, exist_ok=True)

# Pickled objects.
# NOTE(review): "<name>.pipeline.pkl" contains only the preprocessor, not the
# full `pipeline` object (sampler + classifier) -- confirm this is intended.
for _suffix, _obj in (
    ("model", classifier),
    ("pipeline", preprocessor),
    ("label_encoder", labelencoder),
):
    with open(f"{filename}.{_suffix}.pkl", "wb") as f:
        pickle.dump(_obj, f)

# Human-readable dumps. The encoding is given explicitly so the files do not
# depend on the platform's default locale encoding.
with open(f"{filename}.model.txt", "w", encoding="utf-8") as f:
    f.write(str(classifier))

with open(f"{filename}.pipeline_params.txt", "w", encoding="utf-8") as f:
    f.write(str(preprocessor.get_params()))

# JSON dumps of the evaluation results and the classifier hyperparameters.
with open(f"{filename}.results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

with open(f"{filename}.model_params.json", "w", encoding="utf-8") as f:
    json.dump(classifier.get_params(), f, indent=2)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


# Output column names of the fitted preprocessor, cleaned up for display:
# transformer prefixes and the "_y" marker are removed, "_isna" becomes
# " missing".
feature_names = pipeline.named_steps["preprocessor"].get_feature_names_out()
for r, rw in [
    ("num__", ""),
    ("cg__", ""),
    ("ord__", ""),
    ("nom__", ""),
    ("miss__", ""),
    ("_y", ""),
    ("_isna", " missing"),
]:
    feature_names = [x.replace(r, rw) for x in feature_names]

# Importance = absolute value of the first coefficient row, largest first.
# The inputs are not standardised in this pipeline, so magnitudes of
# features on different numeric ranges are not directly comparable.
feature_importances = pd.DataFrame(
    {"importance": np.abs(classifier.coef_[0])},
    index=feature_names,
)
feature_importances = feature_importances.sort_values(
    by="importance", ascending=False
)

# One horizontal bar per feature; figure height grows with the feature count.
fig, ax = plt.subplots(figsize=(16, 0.3 * len(feature_importances)))
sns.barplot(x=feature_importances["importance"], y=feature_importances.index, ax=ax)
ax.set_title(f"feature importances for model '{model_name}'")

# Write the numeric value next to each bar (bars sit at y = 0, 1, 2, ...).
for i, val in enumerate(feature_importances["importance"]):
    ax.text(val, i, f" {val:.4f}", va="center", ha="left", fontsize=8)

fig.tight_layout()
# plt.show() instead of fig.show(): Figure.show() warns on non-GUI backends
# (e.g. the notebook inline backend) and does not run a GUI event loop.
plt.show()

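The feature with the highest importance in the Logistic Regression model is `CholCheck` (cholesterol check), which indicates whether the patient underwent a cholesterol check within the last five years. The next most relevant features, though with a lower contribution, are `GenHlth_isna` (missing information on general health) and `HighBP_y` (presence of high blood pressure). `BMI_cat` ranks further down in importance, which is notable given that BMI is commonly regarded as a major risk factor for diabetes. Similarly, `Sex` and `Age_cat` show relatively low importance in this model, despite being widely recognized as key variables in diabetes risk assessment. Note that importance is measured here as the absolute value of the model coefficients on unscaled inputs, so features encoded on a wider numeric range (such as the ordinal-encoded age groups) tend to receive smaller coefficients than binary indicators; the ranking should therefore be interpreted with caution.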