In [6]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# from dash import Dash, html, dcc, callback, Output, Input

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification, load_iris, load_wine, load_diabetes

import lightgbm
import xgboost as xgb
        
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

/kaggle/input/drug-classification/drug200.csv


# 💡🤔🌳 Understanding Trees - In Depth

In this notebook we are going to investigate trained machine learning trees and their predictions in depth. For that matter we will use [Kaggle's Drug Classification](https://www.kaggle.com/datasets/prathamtripathi/drug-classification/data) dataset.

The dataset contains the following columns: 

| Column | Description | 
|:-------|:------------|
| `Age` | Age of the patient |
| `Sex` | Gender of the patient |
| `BP` | Blood pressure levels |
| `Cholesterol` | Cholesterol levels |
| `Na_to_K` | Sodium to potassium ratio in blood |
| `Drug` | Drug type |

In [3]:
drugs = pd.read_csv('/kaggle/input/drug-classification/drug200.csv')
print(f"Rows: {drugs.shape[0]:,}")
print(f"Cols: {drugs.shape[1]:,}")
drugs.head(3)

Rows: 200
Cols: 6


Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC


# Global Settings

In [13]:
RANDOM_STATE = 42

# Pre-Processing

In [14]:
features = ['Age', 'Na_to_K']
target = None

In [15]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
categorical_columns = drugs.select_dtypes(include=['object']).columns

for col in categorical_columns:
    encoded_col_name = col + '_Encoded'
    drugs[encoded_col_name] = label_encoder.fit_transform(drugs[col])
    
    if 'Drug' not in encoded_col_name:
        features.append(encoded_col_name)
    else:
        target = encoded_col_name

print(f"Modelling features: {features}")
print(f"Modelling target  : {target}")
drugs.head(3)

Modelling features: ['Age', 'Na_to_K', 'Sex_Encoded', 'BP_Encoded', 'Cholesterol_Encoded']
Modelling target  : Drug_Encoded


Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug,Sex_Encoded,BP_Encoded,Cholesterol_Encoded,Drug_Encoded
0,23,F,HIGH,HIGH,25.355,DrugY,0,0,0,0
1,47,M,LOW,HIGH,13.093,drugC,1,1,0,3
2,47,M,LOW,HIGH,10.114,drugC,1,1,0,3


In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(drugs[features], 
                                                    drugs[target], 
                                                    test_size=0.2, 
                                                    random_state=RANDOM_STATE)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (160, 5)
X_test shape: (40, 5)
y_train shape: (160,)
y_test shape: (40,)


In [23]:
xgb_params = {
    'max_depth': 3,                # maximum depth of each tree
    'eta': 0.3,                    # training step for each iteration
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',     # error evaluation for multiclass training
    'num_class': 5
}
num_round = 20
xgb_train = xgb.DMatrix(X_train, label = y_train, enable_categorical = False)
xgb_test = xgb.DMatrix(X_test, label = y_test, enable_categorical = False)

xgb_evals_result = {}
xgb_clf = xgb.train(xgb_params,
                    xgb_train, 
                    num_round, 
                    evals=[
                        (xgb_train, 'training') , 
                        (xgb_test, 'valid')
                    ], 
                    evals_result = xgb_evals_result)

[0]	training-mlogloss:1.04337	valid-mlogloss:1.10859
[1]	training-mlogloss:0.75492	valid-mlogloss:0.82417
[2]	training-mlogloss:0.56781	valid-mlogloss:0.64859
[3]	training-mlogloss:0.43791	valid-mlogloss:0.52975
[4]	training-mlogloss:0.33362	valid-mlogloss:0.42209
[5]	training-mlogloss:0.26289	valid-mlogloss:0.34346
[6]	training-mlogloss:0.20470	valid-mlogloss:0.28226
[7]	training-mlogloss:0.15942	valid-mlogloss:0.22560
[8]	training-mlogloss:0.12674	valid-mlogloss:0.19071
[9]	training-mlogloss:0.10159	valid-mlogloss:0.15636
[10]	training-mlogloss:0.08279	valid-mlogloss:0.13064
[11]	training-mlogloss:0.06861	valid-mlogloss:0.11463
[12]	training-mlogloss:0.05786	valid-mlogloss:0.09906
[13]	training-mlogloss:0.05015	valid-mlogloss:0.09140
[14]	training-mlogloss:0.04396	valid-mlogloss:0.08328
[15]	training-mlogloss:0.03921	valid-mlogloss:0.07778
[16]	training-mlogloss:0.03528	valid-mlogloss:0.07250
[17]	training-mlogloss:0.03206	valid-mlogloss:0.06781
[18]	training-mlogloss:0.02952	valid-m

# Model Complexity Tool

In [None]:
tca = boosting.BoostingComplexityAnalyzer(model = xgb_shopping_model, classification_mode = 'binary', n_classes = None, metric_key = 'logloss', evals_result = xgb_evals_result)
tca.fit()
tree_complexity_dict = tca.transform()

## Plot the Max Depth by Boosting Round

In [None]:
plt = plots.Plotter(plotting_data = tree_complexity_dict)
plt.plot_data.keys()
plt.plot_tree_depth_by_estimator()

## Plot the Rolling Avg. Max Depth by Boosting Round

In [None]:
plt.plot_expanding_tree_depth_by_estimator()

## Plot the Cumulative Number of Features by Boosting Round

In [None]:
plt.plot_cumulative_features_used_by_estimator()

## Feature Gain Analysis

In [None]:
plt.plot_relative_gain_attribution_by_feature()

## Plot Cumulative Fraction of Total Gain by Tree Index

In [None]:
plt.plot_cumulative_model_gain_by_estimator()

## Show an ECDFPlot style cumulative that shows the number of features (backwards recursively eliminated) that sum to a percentage of a total model gain

In [None]:
plt.plot_ecdf_gain_attribution_by_feature()

In [None]:
plt.plot_cover_by_feature()

In [None]:
plt.plot_feature_cover_distribution()

## Split Threshold Analysis

In [None]:
plt.plot_rolling_threshold_sigma_by_estimator()

In [None]:
plt.plot_threshold_min_max_range_by_estimator()

## Plot Number of `new_unique_splits` per boosting round per feature

In [None]:
plt.plot_new_unique_splits_by_feature()

---

💚 Thank you for reading 💚

If you have any questions or feedback, feel free to leave a comment 🤔

This notebook is __still in progress__.

Please __UPVOTE__ if you enjoyed this notebook 🙏