In [1]:
import pandas as pd
import plotly.express as px
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, ElasticNet, HuberRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import root_mean_squared_error, r2_score

## Carregar os Dados já tratados

In [2]:
df_costs = pd.read_csv('../bagging/aula/healthcosts_cleaned.csv')

In [3]:
df_costs.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,medical charges
0,19,female,27.9,0,1,southwest,16884.924
1,18,male,33.77,1,0,southeast,1725.5523
2,28,male,33.0,3,0,southeast,4449.462
3,33,male,22.705,0,0,northwest,21984.47061
4,32,male,28.88,0,0,northwest,3866.8552


In [4]:
df_costs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              1338 non-null   int64  
 1   sex              1338 non-null   object 
 2   bmi              1338 non-null   float64
 3   children         1338 non-null   int64  
 4   smoker           1338 non-null   int64  
 5   region           1338 non-null   object 
 6   medical charges  1338 non-null   float64
dtypes: float64(2), int64(3), object(2)
memory usage: 73.3+ KB


## Preparação dos dados

In [5]:
# Preparar os dados para o modelo
X = df_costs.drop(columns=['medical charges'], axis=1)
y = df_costs['medical charges']

In [6]:
# importar preprocessor
import joblib

preprocessor = joblib.load('../bagging/aula/preprocessor_dataset_healthcosts.pkl')

In [9]:
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=51)

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)



In [10]:
print(f'Treinamento: {X_train.shape}')
print(f'Teste: {X_test.shape}')

Treinamento: (1070, 10)
Teste: (268, 10)


## Treinamento do Modelo Stacking

In [11]:
# Criar o modelo de Stacking Regressor

# Algoritmos Base
lr_model = LinearRegression()
elastic_model = ElasticNet(random_state=51)
tree_model = DecisionTreeRegressor(random_state=51)

# Meta-modelo ou Meta-learner
huber_model = HuberRegressor()

# Stacking Model
stacking_model = StackingRegressor(
    estimators= [
        ('linear regression', lr_model),
        ('elastic', elastic_model),
        ('decision_tree', tree_model)
    ],
    final_estimator=huber_model,
    # = False Usa apenas as predições dos estimadores
    # = True usa as predições dos estimadores + conjunto de treinamento (dataset)
    passthrough=False
)

In [12]:
# Treinar o modelo
stacking_model.fit(X_train,y_train)

0,1,2
,"estimators  estimators: list of (str, estimator) Base estimators which will be stacked together. Each element of the list is defined as a tuple of string (i.e. name) and an estimator instance. An estimator can be set to 'drop' using `set_params`.","[('linear regression', ...), ('elastic', ...), ...]"
,"final_estimator  final_estimator: estimator, default=None A regressor which will be used to combine the base estimators. The default regressor is a :class:`~sklearn.linear_model.RidgeCV`.",HuberRegressor()
,"cv  cv: int, cross-validation generator, iterable, or ""prefit"", default=None Determines the cross-validation splitting strategy used in `cross_val_predict` to train `final_estimator`. Possible inputs for cv are: * None, to use the default 5-fold cross validation, * integer, to specify the number of folds in a (Stratified) KFold, * An object to be used as a cross-validation generator, * An iterable yielding train, test splits, * `""prefit""`, to assume the `estimators` are prefit. In this case, the  estimators will not be refitted. For integer/None inputs, if the estimator is a classifier and y is either binary or multiclass, :class:`~sklearn.model_selection.StratifiedKFold` is used. In all other cases, :class:`~sklearn.model_selection.KFold` is used. These splitters are instantiated with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. If ""prefit"" is passed, it is assumed that all `estimators` have been fitted already. The `final_estimator_` is trained on the `estimators` predictions on the full training set and are **not** cross validated predictions. Please note that if the models have been trained on the same data to train the stacking model, there is a very high risk of overfitting. .. versionadded:: 1.1  The 'prefit' option was added in 1.1 .. note::  A larger number of split will provide no benefits if the number  of training samples is large enough. Indeed, the training time  will increase. ``cv`` is not used for model evaluation but for  prediction.",
,"n_jobs  n_jobs: int, default=None The number of jobs to run in parallel for `fit` of all `estimators`. `None` means 1 unless in a `joblib.parallel_backend` context. -1 means using all processors. See :term:`Glossary ` for more details.",
,"passthrough  passthrough: bool, default=False When False, only the predictions of estimators will be used as training data for `final_estimator`. When True, the `final_estimator` is trained on the predictions as well as the original training data.",False
,"verbose  verbose: int, default=0 Verbosity level.",0

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False

0,1,2
,"alpha  alpha: float, default=1.0 Constant that multiplies the penalty terms. Defaults to 1.0. See the notes for the exact mathematical meaning of this parameter. ``alpha = 0`` is equivalent to an ordinary least square, solved by the :class:`LinearRegression` object. For numerical reasons, using ``alpha = 0`` with the ``Lasso`` object is not advised. Given this, you should use the :class:`LinearRegression` object.",1.0
,"l1_ratio  l1_ratio: float, default=0.5 The ElasticNet mixing parameter, with ``0 <= l1_ratio <= 1``. For ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2.",0.5
,"fit_intercept  fit_intercept: bool, default=True Whether the intercept should be estimated or not. If ``False``, the data is assumed to be already centered.",True
,"precompute  precompute: bool or array-like of shape (n_features, n_features), default=False Whether to use a precomputed Gram matrix to speed up calculations. The Gram matrix can also be passed as argument. For sparse input this option is always ``False`` to preserve sparsity. Check :ref:`an example on how to use a precomputed Gram Matrix in ElasticNet ` for details.",False
,"max_iter  max_iter: int, default=1000 The maximum number of iterations.",1000
,"copy_X  copy_X: bool, default=True If ``True``, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-4 The tolerance for the optimization: if the updates are smaller or equal to ``tol``, the optimization code checks the dual gap for optimality and continues until it is smaller or equal to ``tol``, see Notes below.",0.0001
,"warm_start  warm_start: bool, default=False When set to ``True``, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. See :term:`the Glossary `.",False
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive.",False
,"random_state  random_state: int, RandomState instance, default=None The seed of the pseudo random number generator that selects a random feature to update. Used when ``selection`` == 'random'. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `.",51

0,1,2
,"criterion  criterion: {""squared_error"", ""friedman_mse"", ""absolute_error"", ""poisson""}, default=""squared_error"" The function to measure the quality of a split. Supported criteria are ""squared_error"" for the mean squared error, which is equal to variance reduction as feature selection criterion and minimizes the L2 loss using the mean of each terminal node, ""friedman_mse"", which uses mean squared error with Friedman's improvement score for potential splits, ""absolute_error"" for the mean absolute error, which minimizes the L1 loss using the median of each terminal node, and ""poisson"" which uses reduction in the half mean Poisson deviance to find splits. .. versionadded:: 0.18  Mean Absolute Error (MAE) criterion. .. versionadded:: 0.24  Poisson deviance criterion.",'squared_error'
,"splitter  splitter: {""best"", ""random""}, default=""best"" The strategy used to choose the split at each node. Supported strategies are ""best"" to choose the best split and ""random"" to choose the best random split.",'best'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. For an example of how ``max_depth`` influences the model, see :ref:`sphx_glr_auto_examples_tree_plot_tree_regression.py`.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: int, float or {""sqrt"", ""log2""}, default=None The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",
,"random_state  random_state: int, RandomState instance or None, default=None Controls the randomness of the estimator. The features are always randomly permuted at each split, even if ``splitter`` is set to ``""best""``. When ``max_features < n_features``, the algorithm will select ``max_features`` at random at each split before finding the best split among them. But the best found split may vary across different runs, even if ``max_features=n_features``. That is the case, if the improvement of the criterion is identical for several splits and one split has to be selected at random. To obtain a deterministic behaviour during fitting, ``random_state`` has to be fixed to an integer. See :term:`Glossary ` for details.",51
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0

0,1,2
,"epsilon  epsilon: float, default=1.35 The parameter epsilon controls the number of samples that should be classified as outliers. The smaller the epsilon, the more robust it is to outliers. Epsilon must be in the range `[1, inf)`.",1.35
,"max_iter  max_iter: int, default=100 Maximum number of iterations that ``scipy.optimize.minimize(method=""L-BFGS-B"")`` should run for.",100
,"alpha  alpha: float, default=0.0001 Strength of the squared L2 regularization. Note that the penalty is equal to ``alpha * ||w||^2``. Must be in the range `[0, inf)`.",0.0001
,"warm_start  warm_start: bool, default=False This is useful if the stored attributes of a previously used model has to be reused. If set to False, then the coefficients will be rewritten for every call to fit. See :term:`the Glossary `.",False
,"fit_intercept  fit_intercept: bool, default=True Whether or not to fit the intercept. This can be set to False if the data is already centered around the origin.",True
,"tol  tol: float, default=1e-05 The iteration will stop when ``max{|proj g_i | i = 1, ..., n}`` <= ``tol`` where pg_i is the i-th component of the projected gradient.",1e-05


## Análise de Resultados

In [13]:
# Fazer as predições com base no modelo treinado
y_pred = stacking_model.predict(X_test)

In [14]:
# Mostrar as predições
y_pred

array([ 7695.79959724, 43370.93405492, 20538.53371989, 23306.79628201,
       38542.87992825,  9712.19066584,  7864.68467422, 12191.71869977,
        5530.5934329 ,  9405.74504414,  8707.54848159, 11508.65098615,
        7327.30156712,  2463.04849912,  4553.46990533, 12495.95535168,
        3100.39761183,  6935.23743426, 18344.04697192, 20101.55089448,
        4888.24901415,  6829.38484691, 53567.50549203, 10875.11501712,
        5634.50271863, 15015.90063325, 11475.01694231,  1573.00910166,
       30832.39007449, 19316.03404105,  1437.97164139, 23004.37525715,
        2486.96709226,  2806.40056811,  7060.57244398, 24988.87749613,
        7348.17486445,  1418.96078897, 11657.23837065,  7247.37658504,
       11211.77968941,  1234.95208951,  3676.24582513,  1443.33461361,
       12248.44036693, 12752.260629  , 11556.60219094, 40223.96058083,
        8302.20504108, 12560.89421783,  4744.64392108, 37488.43146823,
        8832.32557457, 46791.81649497, 18839.48431266, 33244.72604585,
      

In [15]:
# Avaliar métricas do modelo
rmse = root_mean_squared_error(y_test,y_pred)
r2 = r2_score(y_test,y_pred)

In [16]:
# Mostrar o Erro e R2 do modelo
print(f'Root Mean Squered Error: {rmse}')
print(f'R2: {r2}')

Root Mean Squered Error: 6641.236668309879
R2: 0.7463459096735836


In [17]:
# Calcular a importância considerando os modelos do stacking regressor

importances = []

for estimator in stacking_model.estimators_:
    # Uso em modelos lineares
    if hasattr(estimator, 'coef_'):
        importances.append(np.abs(estimator.coef_))
    
    # Uso em modelos de árvore
    elif hasattr(estimator, 'feature_importances_'):
        importances.append(estimator.feature_importances_)
    
    else:
        print(f'Não foi possivel carregar a importância das variáveis do modelo {type(estimator).__name__}')



In [20]:
# Calcular a média das importâncias
importances_media = np.mean(importances, axis=0)

# Normalizar
feature_importance = importances_media/np.sum(importances_media)

In [19]:
# Obter os nomes das features
feature_names = preprocessor.get_feature_names_out()

In [21]:
df_importance = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})

In [22]:
df_importance = df_importance.sort_values(by='Importance', ascending=True)

df_importance.head()

Unnamed: 0,Feature,Importance
5,cat__sex_male,0.006653
4,cat__sex_female,0.006654
7,cat__region_northwest,0.012387
9,cat__region_southwest,0.017066
8,cat__region_southeast,0.022745


In [23]:
# Plotar

fig = px.bar(df_importance, x='Importance', y='Feature', title='Importância das Features - Stacking regressor', orientation='h')

fig.show()

## Propriedades do modelo

In [24]:
# Mostrar evidências do modelo
# Fazer uma predição num exemplo específico

X_sample = X_test[7].reshape(1, -1)

# Predições individuais dos estimadores
lr_pred = stacking_model.named_estimators_['linear regression'].predict(X_sample)
elatic_pred = stacking_model.named_estimators_['elastic'].predict(X_sample)
tree_pred = stacking_model.named_estimators_['decision_tree'].predict(X_sample)

# Predição final com o Stacking
stacking_pred = stacking_model.predict(X_sample)

In [27]:
# Exibir os resultados
print(f'Predição do Linear Regression: {lr_pred[0]}')
print(f'Predição do ElasticNet: {elatic_pred[0]}')
print(f'Predição da Árvore de Decisão: {tree_pred[0]}')
print(f'Predição Final do Stacking (HuberRegressor): {stacking_pred[0]}')

Predição do Linear Regression: 14793.194288532943
Predição do ElasticNet: 13782.69060634142
Predição da Árvore de Decisão: 11856.4115
Predição Final do Stacking (HuberRegressor): 12191.71869976939
