Please run the following two cells before running the rest of the notebook!

As these plotting settings are standard throughout the book, we do not show them in the book every time we plot something.

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = "retina"

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# FIX: Use the official public API path from pandas.errors
from pandas.errors import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# feel free to modify, for example, change the context to "notebook"
sns.set_theme(context="talk", style="whitegrid", 
              palette="colorblind", color_codes=True, 
              rc={"figure.figsize": [12, 8]})

# Chapter 13 - Applied Machine Learning: Identifying Credit Default

## 13.0 Getting and preparing the data

This is a part not covered in the book. We download the considered dataset from the website of the [UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients). The dataset originally does not contain missing values and the categorical variables are already encoded as numbers. To show the entire pipeline of working and preparing potentially messy data, we apply some transformations:

* we encode the gender, education and marital status related variables as strings
* we introduce missing values to some observations (0.5% of the entire sample, selected randomly per column - the total percentage of rows with at least one missing value will be higher)
* some observed values for features such as level of education, payment status, etc. are outside of the range of possible categories defined by the authors. As this problem affects many observations, we encode new, undescribed categories as either 'Others' (when there was already such a category) or 'Unknown' (in the case of payment status).

The reason for selecting only a small fraction of values to be missing is that we do not want to significantly change the underlying structure/patterns in the data.

In [None]:
import pandas as pd
import numpy as np
import random

In [None]:
# FIX: Use conda to force a clean reinstall of numpy and install TA-Lib
!conda install --yes -c conda-forge numpy=1.26.4 ta-lib

In [None]:
!conda install --yes -c conda-forge numpy xlrd

In [None]:

# FIX: Use pandas to read the Excel file directly from the URL
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls'

# The actual data in this file starts on the second row, so we use header=1
df = pd.read_excel(url, header=1)

# You can add a .head() call to see the first few rows
df.head()

In [None]:
# load the data from Excel: use the local copy when it is present, otherwise
# read the same workbook straight from the UCI repository so that the cell
# also works on a fresh checkout (Restart & Run All)
from pathlib import Path

DATA_FILE = Path("default of credit card clients.xls")
DATA_URL = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "00350/default%20of%20credit%20card%20clients.xls"
)

# the real header is on the second row of the sheet, hence skiprows=1;
# the first column (ID) becomes the index
df = pd.read_excel(DATA_FILE if DATA_FILE.exists() else DATA_URL,
                   skiprows=1, index_col=0)

# rename columns
df.columns = df.columns.str.lower().str.replace(" ", "_")

months = ["sep", "aug", "jul", "jun", "may", "apr"]
variables = ["payment_status", "bill_statement", "previous_payment"]
new_column_names = [x + "_" + y for x in variables for y in months]
rename_dict = {x: y for x, y in zip(df.loc[:, "pay_0":"pay_amt6"].columns, new_column_names)}
df.rename(columns=rename_dict, inplace=True)


# create dicts to map number to strings
gender_dict = {1: "Male", 
               2: "Female"}
education_dict = {0: "Others",
                  1: "Graduate school", 
                  2: "University", 
                  3: "High school", 
                  4: "Others",
                  5: "Others",
                  6: "Others"}
marital_status_dict = {0: "Others", 
                       1: "Married", 
                       2: "Single", 
                       3: "Others"}
payment_status = {-2: "Unknown",
                  -1: "Payed duly",
                  0: "Unknown",
                  1: "Payment delayed 1 month",
                  2: "Payment delayed 2 months",
                  3: "Payment delayed 3 months",
                  4: "Payment delayed 4 months",
                  5: "Payment delayed 5 months",
                  6: "Payment delayed 6 months",
                  7: "Payment delayed 7 months",
                  8: "Payment delayed 8 months",
                  9: "Payment delayed >= 9 months"}

# map numbers to strings
df["sex"] = df["sex"].map(gender_dict)
df["education"] = df["education"].map(education_dict)
df["marriage"] = df["marriage"].map(marital_status_dict)

for column in [x for x in df.columns if ("status" in x)]:
    df[column] = df[column].map(payment_status)

# define the ratio of missing values
RATIO_MISSING = 0.005

# input missing values to selected columns
random_state = np.random.RandomState(42)
for column in ["sex", "education", "marriage", "age"]:
    df.loc[df.sample(frac=RATIO_MISSING, random_state=random_state).index, column] = ""

# reset index
df.reset_index(drop=True, inplace=True)

# save to csv
df.to_csv("../Datasets/credit_card_default.csv", index=False)

## 13.1 Loading data and managing data types

### How to do it...

1. Import the libraries:

In [None]:
import pandas as pd

2. Load the data from the CSV file:

In [None]:
df = pd.read_csv("../Datasets/credit_card_default.csv", 
                 na_values="")
df

3. View the summary of the DataFrame: 

In [None]:
df.info()

4. Define a function for inspecting the exact memory usage of a DataFrame:

In [None]:
def get_df_memory_usage(df, top_columns=5):
    """
    Print a short memory-usage report for a pandas DataFrame.

    The report lists the `top_columns` largest entries of
    `df.memory_usage(deep=True)` (in MB, largest first) followed by
    the total size of the DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to be inspected
    top_columns : int
        Number of top columns (in terms of memory used) to display

    Returns
    -------
    None
        The report is printed, nothing is returned.
    """
    bytes_per_mb = 1024 ** 2
    # deep=True also accounts for the Python objects held in object columns
    usage_mb = df.memory_usage(deep=True) / bytes_per_mb
    largest = usage_mb.sort_values(ascending=False).head(top_columns)
    total_mb = usage_mb.sum()

    print("Memory usage ----")
    print(f"Top {top_columns} columns by memory (MB):")
    print(largest)
    print(f"Total size: {total_mb:.2f} MB")

In [None]:
get_df_memory_usage(df, 5)

5. Convert the columns with `object` data type into `category` type:

In [None]:
object_columns = df.select_dtypes(include="object").columns
df[object_columns] = df[object_columns].astype("category")

get_df_memory_usage(df)

6. Downcast the numeric columns to integers:

In [None]:
numeric_columns = df.select_dtypes(include="number").columns
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], downcast="integer")

get_df_memory_usage(df)

In [None]:
df.info()

7. Downcast the `age` column using the `float` data type:

In [None]:
df["age"] = pd.to_numeric(df["age"], downcast="float")

get_df_memory_usage(df)

### There's more

Assign the data types to columns while loading the data using the `pd.read_csv` function.

In [None]:
column_dtypes = {
    "education": "category", 
    "marriage": "category", 
    "sex": "category"
}
df_cat = pd.read_csv("../Datasets/credit_card_default.csv", 
                     na_values="", dtype=column_dtypes)

get_df_memory_usage(df_cat)

## 13.2 Exploratory data analysis

### Getting ready

In [None]:
import pandas as pd
df = pd.read_csv("../Datasets/credit_card_default.csv", na_values="")

### How to do it...

1. Import the libraries:

In [None]:
!pip install plotly

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import plotly.io as pio

2. Get summary statistics of the numeric variables: 

In [None]:
df.describe().transpose().round(2)

3. Get summary statistics of the categorical variables: 

In [None]:
df.describe(include="object").transpose()

Alternatively, we can get the summary statistics of all columns in one table using the following snippet:

In [None]:
df.describe(include="all").transpose()

4. Plot the distribution of age and, additionally, split it by gender: 

In [None]:
ax = sns.kdeplot(data=df, x="age", 
                 hue="sex", common_norm=False, 
                 fill=True)
ax.set_title("Distribution of age")

sns.despine()
plt.tight_layout()
# plt.savefig("images/figure_13_1", dpi=200)

As mentioned in the text, we can create a histogram (together with the KDE), by running:

In [None]:
ax = sns.histplot(data=df.dropna(), x="age", hue="sex", kde=True)
ax.set_title("Distribution of age");

sns.despine()
plt.tight_layout()

We noticed some gaps in the plot and the reason for this is the binning. Below, we created the same histogram using `sns.countplot` and `plotly.express`. By doing so, each value of age has a separate bin and we can inspect the plot in detail. There are no such gaps in the following plots:

In [None]:
ax = sns.countplot(data=df.dropna(), x="age", color="b")

# one bar per distinct age makes the axis crowded, so only the tick labels
# of ages that are multiples of 10 stay visible
for tick_label in ax.get_xticklabels():
    is_multiple_of_ten = int(float(tick_label.get_text())) % 10 == 0
    tick_label.set_visible(is_multiple_of_ten)

ax.set_title("Histogram of age")

sns.despine()
plt.tight_layout()

In [None]:
ax = sns.countplot(data=df.dropna(), x="age", hue="sex")

# one group of bars per distinct age makes the axis crowded, so only the
# tick labels of ages that are multiples of 10 stay visible
for tick_label in ax.get_xticklabels():
    is_multiple_of_ten = int(float(tick_label.get_text())) % 10 == 0
    tick_label.set_visible(is_multiple_of_ten)

ax.set_title("Histogram of age by gender")

sns.despine()
plt.tight_layout()

In [None]:
px.histogram(df.dropna(), x="age", color="sex", title = "Distribution of age")

In [None]:
df["age"].plot(kind="hist", title="Distribution of age")

sns.despine()
plt.tight_layout()

5. Create a `pairplot` of selected variables:

In [None]:
COLS_TO_PLOT = ["age", "limit_bal", "previous_payment_sep"]

pair_plot = sns.pairplot(df[COLS_TO_PLOT], kind="reg", 
                         diag_kind="kde", height=4,
                         plot_kws={"line_kws":{"color":"red"}})
pair_plot.fig.suptitle("Pairplot of selected variables")

sns.despine()
plt.tight_layout()
# plt.savefig("images/figure_13_2", dpi=200)

Additionally, we can separate the genders by specifying the `hue` argument:

In [None]:
pair_plot = sns.pairplot(data=df, 
                         x_vars=COLS_TO_PLOT, 
                         y_vars=COLS_TO_PLOT, 
                         hue="sex", 
                         height=4)
pair_plot.fig.suptitle("Pairplot of selected variables")
plt.subplots_adjust(top=0.95)

# plt.savefig("images/figure_13_3", dpi=200)

6. Analyze the relationship between age and limit balance:

In [None]:
ax = sns.jointplot(data=df, x="age", y="limit_bal", 
                   hue="sex", height=10)
ax.fig.suptitle("Age vs. limit balance")

sns.despine()
plt.tight_layout()
# plt.savefig("images/figure_13_4", dpi=200)

7. Define and run a function for plotting the correlation heatmap:

In [None]:
def plot_correlation_matrix(corr_mat, annotate=False):
    """
    Function for plotting the correlation heatmap. It masks the irrelevant
    fields (the upper triangle, including the diagonal).

    The "white" axes style is applied only while the figure is created, so
    the notebook-wide seaborn theme (context, style, palette, figure size)
    is left untouched after the call.

    Parameters
    ----------
    corr_mat : pd.DataFrame
        Correlation matrix of the features.
    annotate : bool
        Passed to `sns.heatmap` as `annot`; if True, the correlation values
        are written into the cells.
    """

    # mask the upper triangle
    mask = np.zeros_like(corr_mat, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    # set up custom diverging colormap
    cmap = sns.diverging_palette(240, 10, n=9, as_cmap=True)

    # temporarily change the axes style; axes created inside the block use
    # it, and the previous style is restored on exit
    with sns.axes_style("white"):
        # set up the matplotlib figure
        fig, ax = plt.subplots()
        # plot the heatmap
        sns.heatmap(corr_mat, mask=mask, cmap=cmap,
                    annot=annotate, vmax=.3,
                    center=0, square=True, linewidths=.5,
                    cbar_kws={"shrink": .5}, ax=ax)
    ax.set_title("Correlation Matrix", fontsize=16)

In [None]:
corr_mat = df.select_dtypes(include="number").corr()    
plot_correlation_matrix(corr_mat)

sns.despine()
plt.tight_layout()
# plt.savefig("images/figure_13_5", dpi=200)

We can also directly inspect the correlation between the features (numerical) and the target:

In [None]:
df.select_dtypes(include="number").corr()[["default_payment_next_month"]]

8. Analyze the distribution of age in groups using boxplots:

In [None]:
ax = sns.boxplot(data=df, y="age", x="marriage", hue="sex");
ax.set_title("Distribution of age")

sns.despine()
plt.tight_layout()
# plt.savefig("images/figure_13_6", dpi=200)

9. Plot the distribution of limit balance for each gender and education level:

In [None]:
ax = sns.violinplot(x="education", y="limit_bal", 
                    hue="sex", split=True, data=df)
ax.set_title(
    "Distribution of limit balance per education level", 
    fontsize=16
)

sns.despine()
plt.tight_layout()
# plt.savefig("images/figure_13_7", dpi=200)

The following code plots the same information, without splitting the violin plots.

In [None]:
ax = sns.violinplot(x="education", y="limit_bal", 
                    hue="sex", data=df)
ax.set_title("Distribution of limit balance per education level", 
             fontsize=16)

sns.despine()
plt.tight_layout()

10. Investigate the distribution of the target variable per gender:

In [None]:
# the column must be passed as the `x` keyword: in seaborn >= 0.12 the first
# positional parameter of `countplot` is `data`, so a positional column name
# clashes with `data=df`. `orient="h"` is dropped because it has no effect
# when only `x` is specified (the bars are vertical either way).
ax = sns.countplot(data=df, x="default_payment_next_month", hue="sex")
ax.set_title("Distribution of the target variable", fontsize=16)

sns.despine()
plt.tight_layout()
# plt.savefig("images/figure_13_8", dpi=200)

11. Investigate the percentage of defaults per education level:

In [None]:
ax = df.groupby("education")["default_payment_next_month"] \
       .value_counts(normalize=True) \
       .unstack() \
       .plot(kind="barh", stacked=True)
ax.set_title("Percentage of defaults per education level", 
             fontsize=16)
ax.legend(title="Default", bbox_to_anchor=(1,1)) 

sns.despine()
plt.tight_layout()
# plt.savefig("images/figure_13_9", dpi=200)

### There's more

In [None]:
from pandas_profiling import ProfileReport
profile = ProfileReport(df, title="Loan Default Dataset EDA")
profile

In [None]:
profile.to_file("loan_default_eda.html")

## 13.3 Splitting the data into training and test sets

### Getting ready

In [None]:
import pandas as pd
df = pd.read_csv("../Datasets/credit_card_default.csv", na_values="")

### How to do it...

1. Import the libraries:

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

2. Separate the target from the features:

In [None]:
X = df.copy()
y = X.pop("default_payment_next_month")

3. Split the data into training and test sets:

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

4. Split the data into training and test sets without shuffling:

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

5. Split the data into training and test sets with stratification:

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

6. Verify that the ratio of the target is preserved:

In [None]:
print("Target distribution - train")
print(y_train.value_counts(normalize=True).values)
print("Target distribution - test")
print(y_test.value_counts(normalize=True).values)

### There's more

In [None]:
import numpy as np

# define the size of the validation and test sets
VALID_SIZE = 0.1
TEST_SIZE = 0.2

# create the initial split - training and temp
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, 
    test_size=(VALID_SIZE + TEST_SIZE), 
    stratify=y, 
    random_state=42
)

# calculate the new test size
new_test_size = np.around(TEST_SIZE / (VALID_SIZE + TEST_SIZE), 2)

# create the valid and test sets
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, 
    test_size=new_test_size, 
    stratify=y_temp, 
    random_state=42
)

In [None]:
print("Percentage of data in each set ----")
print(f"Train: {100 * len(X_train) / len(X):.2f}%")
print(f"Valid: {100 * len(X_valid) / len(X):.2f}%")
print(f"Test: {100 * len(X_test) / len(X):.2f}%")
print("")
print("Class distribution in each set ----")
print(f"Train: {y_train.value_counts(normalize=True).values}")
print(f"Valid: {y_valid.value_counts(normalize=True).values}")
print(f"Test: {y_test.value_counts(normalize=True).values}")

## 13.4 Identifying and dealing with missing values

### How to do it...

1. Import the libraries:

In [None]:
!pip install missingno

In [None]:
import pandas as pd 
import missingno as msno
from sklearn.impute import SimpleImputer

2. Inspect the information about the DataFrame:

In [None]:
X.info()

In [None]:
X.isnull().sum()

3. Visualize the nullity of the DataFrame:

In [None]:
msno.matrix(X)

sns.despine()
# plt.savefig("images/figure_13_12", dpi=200)

4. Define columns with missing values per data type:

In [None]:
NUM_FEATURES = ["age"]
CAT_FEATURES = ["sex", "education", "marriage"]

5. Impute the numerical feature:

In [None]:
for col in NUM_FEATURES:
    num_imputer = SimpleImputer(strategy="median")
    num_imputer.fit(X_train[[col]])
    X_train.loc[:, col] = num_imputer.transform(X_train[[col]])
    X_test.loc[:, col] = num_imputer.transform(X_test[[col]])

In [None]:
# alternative method using pandas

# for feature in NUM_FEATURES:
#     median_value = X_train[feature].median()
#     X_train.loc[:, feature].fillna(median_value, inplace=True)
#     X_test.loc[:, feature].fillna(median_value, inplace=True)

6. Impute the categorical features:

In [None]:
for col in CAT_FEATURES:
    cat_imputer = SimpleImputer(strategy="most_frequent")
    cat_imputer.fit(X_train[[col]])
    X_train.loc[:, col] = cat_imputer.transform(X_train[[col]])
    X_test.loc[:, col] = cat_imputer.transform(X_test[[col]])

In [None]:
# alternative method using pandas

# for feature in CAT_FEATURES:
#     mode_value = X_train[feature].mode().values[0]
#     X_train.loc[:, feature].fillna(mode_value, inplace=True)
#     X_test.loc[:, feature].fillna(mode_value, inplace=True)

7. Verify that there are no missing values:

In [None]:
X_train.info()

### There's more

We also look into the other types of visualizations offered by `missingno`.

In [None]:
msno.bar(X)

sns.despine()
plt.show()

In [None]:
msno.heatmap(X)

sns.despine()
plt.show()

In [None]:
msno.dendrogram(X)

sns.despine()
# plt.savefig("images/figure_13_13", dpi=200)

## 13.5 Encoding categorical variables

### How to do it...

1. Import the libraries:

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

2. Use Label Encoder to encode a selected column:

In [None]:
COL = "education"

X_train_copy = X_train.copy()
X_test_copy = X_test.copy()

label_enc = LabelEncoder()
label_enc.fit(X_train_copy[COL])
X_train_copy.loc[:, COL] = label_enc.transform(X_train_copy[COL])
X_test_copy.loc[:, COL] = label_enc.transform(X_test_copy[COL])

X_test_copy[COL].head()

In [None]:
label_enc.classes_

3. Select categorical features for one-hot encoding:

In [None]:
cat_features = X_train.select_dtypes(include="object") \
                      .columns \
                      .to_list()

cat_features

4. Instantiate the `OneHotEncoder` object: 

In [None]:
# `sparse_output` replaces the `sparse` argument, which was deprecated in
# scikit-learn 1.2 and removed in 1.4; a dense array is needed to build
# the DataFrames below
one_hot_encoder = OneHotEncoder(sparse_output=False, 
                                handle_unknown="error", 
                                drop="first")

5. Create the column transformer using the one-hot encoder: 

In [None]:
one_hot_transformer = ColumnTransformer(
    [("one_hot", one_hot_encoder, cat_features)],
    remainder="passthrough",
    verbose_feature_names_out=False
)

6. Fit the transformer:

In [None]:
one_hot_transformer.fit(X_train)

7. Apply the transformations to both training and test sets:

In [None]:
col_names = one_hot_transformer.get_feature_names_out()

X_train_ohe = pd.DataFrame(one_hot_transformer.transform(X_train), 
                           columns=col_names, 
                           index=X_train.index)

X_test_ohe = pd.DataFrame(one_hot_transformer.transform(X_test), 
                          columns=col_names, 
                          index=X_test.index)

Below we can see how one-hot encoding increased the shape of our DataFrame:

In [None]:
X_train.shape

In [None]:
X_train_ohe.shape

In [None]:
X_train_ohe.to_csv("X_train_ohe.csv", index=True)
X_test_ohe.to_csv("X_test_ohe.csv", index=True)
y_train.to_csv("y_train.csv", index=True)
y_test.to_csv("y_test.csv", index=True)

### There's more

#### Using `pandas` for one-hot encoding

In [None]:
pd.get_dummies(X_train, prefix_sep="_", drop_first=True)

#### Specifying possible categories for OneHotEncoder

In [None]:
# explicitly list the admissible categories of the encoded column;
# `sparse_output` replaces the `sparse` argument, which was deprecated in
# scikit-learn 1.2 and removed in 1.4
one_hot_encoder = OneHotEncoder(
    categories=[["Male", "Female", "Unknown"]], 
    sparse_output=False, 
    handle_unknown="error", 
    drop="first"
)

one_hot_transformer = ColumnTransformer(
    [("one_hot", one_hot_encoder, ["sex"])]
)

one_hot_transformer.fit(X_train)
one_hot_transformer.get_feature_names_out()

#### Category Encoders library

In [None]:
import category_encoders as ce

In [None]:
one_hot_encoder_ce = ce.OneHotEncoder(use_cat_names=True)

In [None]:
one_hot_encoder_ce.fit(X_train)
X_train_ce = one_hot_encoder_ce.transform(X_train)
X_train_ce.head()

#### Accessing the `category` encoding

In [None]:
column_dtypes = {
    "education": "category", 
    "marriage": "category", 
    "sex": "category"
}
df_cat = pd.read_csv("../Datasets/credit_card_default.csv", 
                     na_values="", dtype=column_dtypes)

In [None]:
df_cat["education"].cat.codes

In [None]:
dict(zip(df_cat["education"].cat.codes, df_cat["education"]))

## 13.6 Fitting a decision tree classifier

### Getting ready

In [None]:
import pandas as pd
X_train_ohe = pd.read_csv("X_train_ohe.csv", index_col=0)
X_test_ohe = pd.read_csv("X_test_ohe.csv", index_col=0)
y_train = pd.read_csv("y_train.csv", index_col=0)["default_payment_next_month"]
y_test = pd.read_csv("y_test.csv", index_col=0)["default_payment_next_month"]

### How to do it...

1. Import the libraries:

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import metrics

from chapter_13_utils import performance_evaluation_report

2. Create the instance of the model, fit it to the training data and create prediction:

In [None]:
tree_classifier = DecisionTreeClassifier(random_state=42)
tree_classifier.fit(X_train_ohe, y_train)
y_pred = tree_classifier.predict(X_test_ohe)

3. Evaluate the results:

In [None]:
LABELS = ["No Default", "Default"]
tree_perf = performance_evaluation_report(tree_classifier, 
                                          X_test_ohe, 
                                          y_test, labels=LABELS, 
                                          show_plot=True)

sns.despine()
plt.tight_layout()
# plt.savefig("images/figure_13_15", dpi=200)

In [None]:
tree_perf

4. Plot the first few levels of the fitted decision tree:

In [None]:
plot_tree(tree_classifier, max_depth=3, fontsize=10)

sns.despine()
plt.tight_layout()
# plt.savefig("images/figure_13_16", dpi=200)

In [None]:
tree_classifier.get_depth()

In [None]:
plot_tree(
    tree_classifier, 
    max_depth=2,
    feature_names = X_train_ohe.columns, 
    class_names=["No default", "Default"],
    rounded=True, 
    filled = True, 
    fontsize=10
)

sns.despine()
plt.tight_layout()
# plt.savefig("images/figure_13_17", dpi=200)

### There's more

**Precision-recall curve**

In [None]:
y_pred_prob = tree_classifier.predict_proba(X_test_ohe)[:, 1]

precision, recall, _ = metrics.precision_recall_curve(y_test, 
                                                      y_pred_prob)

In [None]:
ax = plt.subplot()
ax.plot(recall, precision, 
        label=f"PR-AUC = {metrics.auc(recall, precision):.2f}")
ax.set(title="Precision-Recall Curve", 
       xlabel="Recall", 
       ylabel="Precision")
ax.legend()

sns.despine()
plt.tight_layout()
# plt.savefig("images/figure_13_19", dpi=200)

In [None]:
ax = metrics.PrecisionRecallDisplay.from_estimator(
    tree_classifier, X_test_ohe, y_test
)
ax.ax_.set_title("Precision-Recall Curve")

sns.despine()
plt.tight_layout()
# plt.savefig("images/figure_13_20", dpi=200)

**Visualizing decision trees using `dtreeviz`**

1. Import the libraries:

In [None]:
# import only the two names used below instead of a star import, so it is
# clear where `dtreeviz` and `explain_prediction_path` come from
from dtreeviz.trees import dtreeviz, explain_prediction_path

2. Fit a small decision tree with max depth of 3:

In [None]:
small_tree = DecisionTreeClassifier(max_depth=3, 
                                    random_state=42)
small_tree.fit(X_train_ohe, y_train)

In [None]:
plot_tree(small_tree, max_depth=3, fontsize=10)

sns.despine()
plt.tight_layout()

3. Plot the decision tree using `dtreeviz`:

In [None]:
viz = dtreeviz(small_tree, 
               x_data=X_train_ohe,
               y_data=y_train,
               feature_names=X_train_ohe.columns, 
               target_name="Default",
               class_names=["No", "Yes"], 
               title="Decision Tree - Loan default data set")
viz

4. Plot the simplified tree representation:

In [None]:
viz = dtreeviz(small_tree, 
               x_data=X_train_ohe,
               y_data=y_train,
               feature_names=X_train_ohe.columns, 
               target_name="Default",
               class_names=["No", "Yes"], 
               title="Decision Tree - Loan default data set",
               fancy=False)
viz

5. Plot the simplified tree representation and the decision path of the first observation from the test set:

In [None]:
viz = dtreeviz(small_tree, 
               x_data=X_train_ohe,
               y_data=y_train,
               feature_names=X_train_ohe.columns, 
               target_name="Default",
               class_names=["No", "Yes"], 
               title="Decision Tree - Loan default data set",
               fancy=False,
               X=X_test_ohe.iloc[0])
viz

6. Print the prediction path using words:

In [None]:
print(explain_prediction_path(small_tree, X_test_ohe.iloc[0], 
                              feature_names=X_test_ohe.columns, 
                              explanation_type="plain_english"))

## 13.7 Organizing the project with pipelines

### How to do it...

1. Import the libraries:

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from chapter_13_utils import performance_evaluation_report

2. Load the data, separate the target and create the stratified train-test split:

In [None]:
df = pd.read_csv("../Datasets/credit_card_default.csv", na_values="")

X = df.copy()
y = X.pop("default_payment_next_month")

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    stratify=y, 
                                                    random_state=42)

3. Prepare lists of numerical/categorical features:

In [None]:
num_features = X_train.select_dtypes(include="number") \
                      .columns \
                      .to_list()
cat_features = X_train.select_dtypes(include="object") \
                      .columns \
                      .to_list()

In [None]:
# sanity check that all columns are included in the lists
len(X_train.columns) == (len(num_features) + len(cat_features))

4. Define the numerical pipeline:

In [None]:
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

5. Define the categorical pipeline:

In [None]:
cat_list = [
    list(X_train[col].dropna().unique()) for col in cat_features
]

# categorical pipeline: fill missing values with the most frequent category,
# then one-hot encode using the categories collected in `cat_list`.
# `sparse_output` replaces the `sparse` argument, which was deprecated in
# scikit-learn 1.2 and removed in 1.4.
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(categories=cat_list, 
                             sparse_output=False, 
                             handle_unknown="error", 
                             drop="first"))
])

6. Define the `ColumnTransformer` object: 

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical", num_pipeline, num_features),
        ("categorical", cat_pipeline, cat_features)
    ], 
    remainder="drop"
)

7. Define the full pipeline including the decision tree model: 

In [None]:
dec_tree = DecisionTreeClassifier(random_state=42)

tree_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", dec_tree)
])

8. Fit the pipeline to the data:

In [None]:
tree_pipeline.fit(X_train, y_train)

9. Evaluate the performance of the entire pipeline:

In [None]:
LABELS = ["No Default", "Default"]
tree_perf = performance_evaluation_report(tree_pipeline, X_test, 
                                          y_test, labels=LABELS, 
                                          show_plot=True)

sns.despine()
plt.tight_layout()
# plt.savefig("images/figure_13_23", dpi=200)

In [None]:
tree_perf

### There's more

#### Adding custom transformers to a pipeline

1. Import the base estimator and transformer classes from `sklearn`: 

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

2. Define the `OutlierRemover` class:

In [None]:
class OutlierRemover(BaseEstimator, TransformerMixin):
    """
    Transformer that caps outliers at `mean +/- n_std * std`, per feature.

    The bands are computed column-wise in `fit`; `transform` replaces every
    value above the upper band with the upper band and every value below
    the lower band with the lower band.

    Parameters
    ----------
    n_std : int or float
        Number of standard deviations defining the width of the band.

    Attributes
    ----------
    upper_band_ : pd.DataFrame
        One-row DataFrame with the upper threshold of each feature.
    lower_band_ : pd.DataFrame
        One-row DataFrame with the lower threshold of each feature.
    n_features_ : int
        Number of features seen during `fit`.
    """

    def __init__(self, n_std=3):
        self.n_std = n_std
    
    def fit(self, X, y = None):
        """
        Compute the per-feature lower/upper bands from `X`.

        Raises
        ------
        ValueError
            If `X` contains missing values.
        """
        if np.isnan(X).any(axis=None):
            raise ValueError("""There are missing values in the array! 
                                Please remove them.""")

        mean_vec = np.mean(X, axis=0)
        std_vec = np.std(X, axis=0)
        band_width = self.n_std * std_vec
        
        # stored as one-row DataFrames so the thresholds are easy to inspect
        self.upper_band_ = pd.Series(mean_vec + band_width).to_frame().transpose()
        self.lower_band_ = pd.Series(mean_vec - band_width).to_frame().transpose()
        self.n_features_ = len(self.upper_band_.columns)
        
        return self 
    
    def transform(self, X, y = None):
        """
        Cap the values of `X` at the fitted bands and return a new array.

        `X` itself is not modified.
        """
        values = np.asarray(X)
        # the (1, n_features) bands broadcast against the (n_samples,
        # n_features) input, so there is no need to materialize one copy of
        # the bands per row; working on plain arrays also makes the result
        # independent of any index/column labels carried by `X`
        upper_band = self.upper_band_.to_numpy()
        lower_band = self.lower_band_.to_numpy()
        
        # cap from above first, then from below (same order as the
        # element-wise replacement this supersedes)
        capped = np.minimum(values, upper_band)
        return np.maximum(capped, lower_band)

3. Add the `OutlierRemover` to the numerical pipeline:

In [None]:
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("outliers", OutlierRemover())
])

4. Execute the rest of the pipeline to compare the results: 

In [None]:
preprocessor = ColumnTransformer(transformers=[
    ("numerical", num_pipeline, num_features),
    ("categorical", cat_pipeline, cat_features)],
    remainder="drop")

dec_tree = DecisionTreeClassifier(random_state=42)

tree_pipeline = Pipeline(steps=[("preprocessor", preprocessor),
                                ("classifier", dec_tree)])

tree_pipeline.fit(X_train, y_train)

tree_perf = performance_evaluation_report(tree_pipeline, X_test, 
                                          y_test, labels=LABELS, 
                                          show_plot=True)

sns.despine()
plt.tight_layout()
# plt.savefig("images/figure_13_24", dpi=200)

In [None]:
tree_perf

#### Accessing the elements of the pipeline

1. Display the structure of the pipeline:

In [None]:
tree_pipeline.named_steps

2. Access the estimator at the end of the pipeline:

In [None]:
tree_pipeline.named_steps["classifier"]

3. Access the upper thresholds of the fitted `OutlierRemover` transformer:

In [None]:
(
    tree_pipeline
    .named_steps["preprocessor"]
    .named_transformers_["numerical"]["outliers"]
    .upper_band_
)

In [None]:
( 
    tree_pipeline
    .named_steps["preprocessor"]
    .transformers_[0][1]["outliers"]
    .upper_band_
)

## 13.8 Tuning hyperparameters using grid search and cross-validation

### Getting ready

Please execute the code from the previous recipe before running this one!

### How to do it...

1. Import the libraries:

In [None]:
from sklearn.model_selection import (
    GridSearchCV, cross_val_score, 
    RandomizedSearchCV, cross_validate, 
    StratifiedKFold
)
from sklearn import metrics

2. Define the cross-validation scheme:

In [None]:
# Five stratified folds, shuffled with a fixed seed so the splits are
# identical on every run.
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

3. Evaluate the pipeline using cross-validation:

In [None]:
# No `scoring` argument is passed, so the pipeline's own default scorer is
# used (presumably accuracy for a classifier — confirm in the scikit-learn
# docs). One score is returned per fold of k_fold.
cross_val_score(tree_pipeline, X_train, y_train, cv=k_fold)

4. Add extra metrics to cross-validation:

In [None]:
# Collect several metrics per fold in a single cross-validation run and
# show them as a table.
cv_scores = cross_validate(
    tree_pipeline,
    X_train,
    y_train,
    cv=k_fold,
    scoring=["accuracy", "precision", "recall", "roc_auc"],
)
pd.DataFrame(cv_scores)

5. Define the parameter grid:

In [None]:
# Keys follow the `<step>__<parameter>` convention, so nested steps are
# addressed by chaining the step names used when the pipelines were built
# (e.g. preprocessor -> numerical -> outliers -> n_std).
# The grid spans 2 * 8 * 9 * 2 = 288 parameter combinations.
param_grid = {
    "classifier__criterion": ["entropy", "gini"],
    "classifier__max_depth": range(3, 11),
    "classifier__min_samples_leaf": range(2, 11), 
    "preprocessor__numerical__outliers__n_std": [3, 4]
}

6. Run the exhaustive grid search: 

In [None]:
# Exhaustive search over every combination in param_grid; candidates are
# ranked by recall measured with the k_fold cross-validation scheme.
classifier_gs = GridSearchCV(
    estimator=tree_pipeline,
    param_grid=param_grid,
    scoring="recall",
    cv=k_fold,
    n_jobs=-1,
    verbose=1,
)

classifier_gs.fit(X_train, y_train)

In [None]:
# best_score_ is the mean cross-validated recall of the best candidate,
# computed on held-out folds of the training data. It is not an in-sample
# training score, so the label states that explicitly.
print(f"Best parameters: {classifier_gs.best_params_}")
print(f"Recall (Training set, cross-validated): {classifier_gs.best_score_:.4f}")
# Recall of the refitted best pipeline on the untouched test split.
print(f"Recall (Test set): {metrics.recall_score(y_test, classifier_gs.predict(X_test)):.4f}")

7. Evaluate the performance of the tuned pipeline:

In [None]:
# Class names used to label the evaluation report.
LABELS = ["No Default", "Default"]

# Evaluate the grid-search winner on the test split.
tree_gs_perf = performance_evaluation_report(
    classifier_gs,
    X_test,
    y_test,
    labels=LABELS,
    show_plot=True,
)

sns.despine()
plt.tight_layout()
# plt.savefig("images/figure_13_26", dpi=200)

In [None]:
# Display the test-split metrics of the pipeline tuned with grid search.
tree_gs_perf

8. Run the randomized grid search: 

In [None]:
# Randomized search: evaluate only 100 sampled combinations from param_grid
# instead of all of them; the seed makes the sample reproducible.
classifier_rs = RandomizedSearchCV(
    estimator=tree_pipeline,
    param_distributions=param_grid,
    scoring="recall",
    cv=k_fold,
    n_jobs=-1,
    verbose=1,
    n_iter=100,
    random_state=42,
)
classifier_rs.fit(X_train, y_train)

In [None]:
# best_score_ is the mean cross-validated recall of the best candidate,
# computed on held-out folds of the training data. It is not an in-sample
# training score, so the label states that explicitly.
print(f"Best parameters: {classifier_rs.best_params_}")
print(f"Recall (Training set, cross-validated): {classifier_rs.best_score_:.4f}")
# Recall of the refitted best pipeline on the untouched test split.
print(f"Recall (Test set): {metrics.recall_score(y_test, classifier_rs.predict(X_test)):.4f}")

9. Evaluate the performance of the randomized grid search:

In [None]:
# Evaluate the randomized-search winner on the test split.
tree_rs_perf = performance_evaluation_report(
    classifier_rs,
    X_test,
    y_test,
    labels=LABELS,
    show_plot=True,
)

sns.despine()
plt.tight_layout()

In [None]:
# Display the test-split metrics of the pipeline tuned with randomized search.
tree_rs_perf

### There's more

#### Faster search with successive halving

In [None]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

# min_resources and factor are spelled out here, but they equal the
# library defaults ("exhaust" and 3).
# Successive halving scores candidates on subsamples of the training rows,
# so random_state is fixed to make the subsampling (and therefore the
# selected parameters) reproducible across runs.
classifier_sh = HalvingGridSearchCV(tree_pipeline, param_grid,
                                    scoring="recall", cv=k_fold,
                                    n_jobs=-1, verbose=1,
                                    min_resources="exhaust", factor=3,
                                    random_state=42)

classifier_sh.fit(X_train, y_train)


In [None]:
# best_score_ is the mean cross-validated recall of the best candidate,
# computed on held-out folds of the training data. It is not an in-sample
# training score, so the label states that explicitly.
print(f"Best parameters: {classifier_sh.best_params_}")
print(f"Recall (Training set, cross-validated): {classifier_sh.best_score_:.4f}")
# Recall of the refitted best pipeline on the untouched test split.
print(f"Recall (Test set): {metrics.recall_score(y_test, classifier_sh.predict(X_test)):.4f}")

In [None]:
# Evaluate the halving-search winner on the test split.
tree_sh_perf = performance_evaluation_report(
    classifier_sh,
    X_test,
    y_test,
    labels=LABELS,
    show_plot=True,
)

sns.despine()
plt.tight_layout()

In [None]:
# One row per search strategy, one column per tuned hyperparameter.
pd.DataFrame(
    data=[
        search.best_params_
        for search in (classifier_gs, classifier_rs, classifier_sh)
    ],
    index=["grid_search", "randomized_search", "halving_search"],
)

In [None]:
# Side-by-side test-split metrics of the three tuned pipelines,
# rounded to three decimals for readability.
pd.DataFrame(
    data=[tree_gs_perf, tree_rs_perf, tree_sh_perf],
    index=["grid_search", "randomized_search", "halving_search"],
).round(decimals=3)

#### Grid search with multiple classifiers

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# A list of dicts defines independent sub-grids. In each one the
# "classifier" key replaces the pipeline's final step with the listed
# estimator, and the remaining keys tune that estimator and the outlier step.
# Sizes: random forest 10 * 8 * 2 = 160 combinations,
#        decision tree 2 * 8 * 9 * 2 = 288 combinations.
# NOTE: this rebinds `param_grid`, which previously held the single-dict grid.
param_grid = [
    {"classifier": [RandomForestClassifier(random_state=42)],
     "classifier__n_estimators": np.linspace(100, 500, 10, dtype=int),
     "classifier__max_depth": range(3, 11),
     "preprocessor__numerical__outliers__n_std": [3, 4]},
    {"classifier": [DecisionTreeClassifier(random_state=42)],
     "classifier__criterion": ["entropy", "gini"],
     "classifier__max_depth": range(3, 11),
     "classifier__min_samples_leaf": range(2, 11),
     "preprocessor__numerical__outliers__n_std": [3, 4]}
]

In [None]:
# Grid search over both sub-grids of param_grid (random forest and decision
# tree); candidates are ranked by recall using the k_fold scheme.
classifier_gs_2 = GridSearchCV(tree_pipeline, param_grid,
                               scoring="recall", cv=k_fold,
                               n_jobs=-1, verbose=1)

classifier_gs_2.fit(X_train, y_train)

# best_score_ is the mean cross-validated recall of the best candidate,
# computed on held-out folds of the training data. It is not an in-sample
# training score, so the label states that explicitly.
print(f"Best parameters: {classifier_gs_2.best_params_}")
print(f"Recall (Training set, cross-validated): {classifier_gs_2.best_score_:.4f}")
# Recall of the refitted best pipeline on the untouched test split.
print(f"Recall (Test set): {metrics.recall_score(y_test, classifier_gs_2.predict(X_test)):.4f}")

In [None]:
# Tabulate the per-candidate search results, ordered by their
# rank_test_score column.
pd.DataFrame(classifier_gs_2.cv_results_).sort_values("rank_test_score")