# Creation of a baseline model with SMOTE
In this notebook, the baseline model from `base_line_model.ipynb` is extended with SMOTE oversampling of the training data (using the SMOTENC variant).

### Runtimes

| Cell | Runtime |
| --- | --- |
| Training | ~ 5 min |
| Learning Curves | ~ 6 min |
| Feature Importances | ~ 3 min |
| Everything else | < 1 min |

### Init config

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import f1_score
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

from src.config import (
    DATA_SPLIT_DIR,
    TRAIN_RAW_FILENAME,
    VALIDATION_RAW_FILENAME,
    MODELS_DIR,
)

In [None]:
# Category orderings for the ordinal features. The position of a label inside
# its list is the rank handed to the OrdinalEncoder in the pipeline cell.
ordinal_orders = dict(
    GenHlth=["excellent", "very good", "good", "fair", "poor"],
    # "18-24", then five-year brackets "25-29" ... "75-79", then open-ended "80+".
    Age=["18-24", *(f"{start}-{start + 4}" for start in range(25, 80, 5)), "80+"],
    Education=[
        "no school",
        "elementary",
        "some high school",
        "high school graduate",
        "college",
        "college graduate",
    ],
    # Upper-bound brackets followed by the open-ended top bracket.
    Income=[*(f"<${limit}k" for limit in (10, 15, 20, 25, 35, 50, 75)), ">$75k"],
)

## Pipeline


In [None]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTENC
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Unordered categorical features; they are routed through `nominal_pipe`.
nominal_cols = [
    "HighBP", "HighChol", "CholCheck", "Smoker", "Stroke",
    "HeartDiseaseorAttack", "PhysActivity", "Fruits", "Veggies",
    "HvyAlcoholConsump", "AnyHealthcare", "NoDocbcCost", "DiffWalk",
    "Sex",
]

# Fill gaps with the most frequent value, then one-hot encode. The first
# category of each feature is dropped, unknown categories are ignored at
# transform time and the encoder returns a dense array.
nominal_pipe = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        (
            "ohe",
            OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False),
        ),
    ]
)

ordinal_cols = ["GenHlth", "Age", "Education", "Income"]

# One category ordering per ordinal column, in the same order as `ordinal_cols`.
ordinal_categories = [ordinal_orders[column_name] for column_name in ordinal_cols]

# Most-frequent imputation -> integer rank per category -> min-max scaling.
ordinal_pipe = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("encode", OrdinalEncoder(categories=ordinal_categories)),
        ("scale", MinMaxScaler()),
    ]
)


numeric_cols = ["BMI", "MentHlth", "PhysHlth"]

# Median imputation followed by min-max scaling for the numeric features.
num_pipe = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="median")),
        ("scale", MinMaxScaler()),
    ]
)

# Route every column group through its own sub-pipeline; columns that are not
# listed in any group are dropped.
preprocessor = ColumnTransformer(
    [
        ("num", num_pipe, numeric_cols),
        ("ord", ordinal_pipe, ordinal_cols),
        ("nom", nominal_pipe, nominal_cols),
    ],
    remainder="drop",
)

# Column names expected from the "nom" branch of `preprocessor`: one
# "nom__<feature>_y" column per nominal feature, except Sex, whose retained
# column is "nom__Sex_m". SMOTENC is told to treat exactly these columns as
# categorical.
encoded_nominal_column_names = [
    f"nom__{column}_{'m' if column == 'Sex' else 'y'}" for column in nominal_cols
]

# Preprocessing followed by SMOTENC oversampling (seeded for reproducibility).
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "smote",
            SMOTENC(
                categorical_features=encoded_nominal_column_names,
                random_state=5,
            ),
        ),
    ]
)

# Emit pandas output from the transform steps — presumably needed so SMOTENC
# can resolve `categorical_features` by column name. The trailing ";" only
# suppresses the cell's repr output.
pipeline.set_output(transform="pandas");

## Load and transform train and validation data

In [None]:
from sklearn.preprocessing import LabelEncoder


# Read the raw train and validation splits.
df_train_raw = pd.read_csv(os.path.join(DATA_SPLIT_DIR, TRAIN_RAW_FILENAME))
df_val_raw = pd.read_csv(os.path.join(DATA_SPLIT_DIR, VALIDATION_RAW_FILENAME))

# Separate features and target; the "pre" class is merged into "dia".
features_train_raw = df_train_raw.drop(columns="Diabetes_012")
target_train_raw = df_train_raw["Diabetes_012"].replace({"pre": "dia"})

features_val_raw = df_val_raw.drop(columns="Diabetes_012")
target_val_raw = df_val_raw["Diabetes_012"].replace({"pre": "dia"})

# Fit the full pipeline on the training split and oversample it.
features_train_sampled, target_train_sampled = pipeline.fit_resample(
    features_train_raw, target_train_raw
)

# Validation data is only preprocessed, never resampled.
# NOTE(review): this relies on `preprocessor` having been fitted as a step of
# `pipeline` in the call above — it is not fitted anywhere else in this cell.
features_val_proc = preprocessor.transform(features_val_raw)

# Encode the string class labels as integers; the encoder is fitted on the
# original (non-resampled) training target.
labelencoder = LabelEncoder()
target_train_enc = labelencoder.fit_transform(target_train_raw)
target_train_sampled_enc = labelencoder.transform(target_train_sampled)
target_val_enc = labelencoder.transform(target_val_raw)

## Training

In [None]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,
    ExtraTreesClassifier,
    AdaBoostClassifier,
    BaggingClassifier,
)
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

# Candidate models, trained one after another in the cell below. All seedable
# estimators use random_state=0; those that support it also get
# class_weight="balanced".
classifiers = [
    GaussianNB(),
    *(
        estimator_cls(random_state=0, class_weight="balanced")
        for estimator_cls in (
            DecisionTreeClassifier,
            LogisticRegression,
            RidgeClassifier,
            RandomForestClassifier,
            ExtraTreesClassifier,
        )
    ),
    *(
        estimator_cls(random_state=0)
        for estimator_cls in (
            GradientBoostingClassifier,
            HistGradientBoostingClassifier,
            AdaBoostClassifier,
            BaggingClassifier,
        )
    ),
    KNeighborsClassifier(),
    MLPClassifier(random_state=0),
    # Currently disabled:
    #     SVC(random_state=0, class_weight="balanced"),
    #     LinearSVC(random_state=0, class_weight="balanced"),
]

In [None]:
import pickle
import json
import os
from datetime import datetime
from src.model_evaluation import evaluate_classifier



# Train every candidate classifier on the SMOTE-resampled training data,
# evaluate it on the preprocessed validation data and store the model together
# with its metadata under MODELS_DIR/<model_name>/.

# Values that do not depend on the classifier are computed once up front.
model_purpose = ""
special_features = "smote"
is_binary_target = target_train_raw.nunique() <= 2

for classifier in classifiers:
    print()
    print("#####################################")
    print(classifier.__class__.__name__)
    print()
    start_timestamp = datetime.now()

    classifier.fit(features_train_sampled, target_train_sampled_enc)

    train_end_timestamp = datetime.now()
    training_duration = train_end_timestamp - start_timestamp

    ## Prediction
    target_val_pred = classifier.predict(features_val_proc)

    ## Metrics
    # Probabilities are optional: not every estimator offers predict_proba.
    target_val_pred_proba = None

    if hasattr(classifier, "predict_proba"):
        target_val_pred_proba = classifier.predict_proba(features_val_proc)

        # With at most two classes only the column at index 1 is kept.
        if is_binary_target:
            target_val_pred_proba = target_val_pred_proba[:, 1]

    results = evaluate_classifier(
        classifier=classifier,
        labels=list(labelencoder.classes_),
        target_truth=target_val_raw,
        target_pred=labelencoder.inverse_transform(target_val_pred),
        target_pred_proba=target_val_pred_proba,
        timestamp=train_end_timestamp,
        model_purpose=model_purpose,
        special_features=special_features,
    )
    # total_seconds() covers the whole timedelta; `.seconds` alone would drop
    # full days from the stored duration.
    results["training_duration"] = int(training_duration.total_seconds())

    # NOTE(review): `labels` is not used in this cell; kept in case later
    # cells read it — confirm and remove if not.
    labels = results["predicts"]
    model_name = results["model_name"]

    ### Save the model and results
    folder = os.path.join(MODELS_DIR, model_name)
    filename = os.path.join(folder, model_name)
    os.makedirs(folder, exist_ok=True)

    # The .pkl artifacts are pickles: only load them from trusted locations.
    with open(f"{filename}.model.pkl", "wb") as f:
        pickle.dump(classifier, f)

    # Only the preprocessor is stored, not the SMOTE step.
    with open(f"{filename}.pipeline.pkl", "wb") as f:
        pickle.dump(preprocessor, f)

    with open(f"{filename}.label_encoder.pkl", "wb") as f:
        pickle.dump(labelencoder, f)

    with open(f"{filename}.model.txt", "w") as f:
        f.write(str(classifier))

    with open(f"{filename}.results.json", "w") as f:
        json.dump(results, f, indent=2)

    with open(f"{filename}.pipeline_params.txt", "w") as f:
        f.write(str(preprocessor.get_params()))

    # default=str keeps the dump from raising on parameter values that are
    # not JSON-native (e.g. a nested estimator object); such values are
    # written as their string representation instead.
    with open(f"{filename}.model_params.json", "w") as f:
        json.dump(classifier.get_params(), f, indent=2, default=str)

    # The printed time spans the whole iteration (fit, evaluation, saving).
    end_timestamp = datetime.now()
    td = end_timestamp - start_timestamp
    print(f"training duration {td.days} d {(td.seconds // 3600)} h"
          f" {(td.seconds % 3600) // 60} m {td.seconds % 60} s")


