In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import r2_score,root_mean_squared_error

## Carregar os dados

In [2]:
# Carregar os dados (já transformados e limpos)

df_costs = pd.read_csv('../bagging/aula/healthcosts_cleaned.csv')

In [3]:
df_costs.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,medical charges
0,19,female,27.9,0,1,southwest,16884.924
1,18,male,33.77,1,0,southeast,1725.5523
2,28,male,33.0,3,0,southeast,4449.462
3,33,male,22.705,0,0,northwest,21984.47061
4,32,male,28.88,0,0,northwest,3866.8552


## Preparação dos Dados

In [4]:
# Preparar os dados para o modelo
X = df_costs.drop(columns=['medical charges'], axis=1)
y = df_costs['medical charges']

In [5]:
import joblib 

preprocessor = joblib.load('../bagging/aula/preprocessor_dataset_healthcosts.pkl')


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [7]:
# Aplicar preprocessor nos dados de treinamento e teste
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [8]:
# Mostrar as dimensões dos conjuntos
print(f'Dados de Treinamento: {X_train.shape}')
print(f'Dados de Teste: {X_test.shape}')

Dados de Treinamento: (1070, 10)
Dados de Teste: (268, 10)


## Treinamento do Modelo

In [9]:
# Criar o modelo de AdaBoost Regressor

boosting_model = AdaBoostRegressor(
    estimator=LinearRegression(),
    n_estimators=50,
    learning_rate=1.0,
    random_state=51
)

In [10]:
boosting_model.fit(X=X_train,y=y_train)

0,1,2
,"estimator  estimator: object, default=None The base estimator from which the boosted ensemble is built. If ``None``, then the base estimator is :class:`~sklearn.tree.DecisionTreeRegressor` initialized with `max_depth=3`. .. versionadded:: 1.2  `base_estimator` was renamed to `estimator`.",LinearRegression()
,"n_estimators  n_estimators: int, default=50 The maximum number of estimators at which boosting is terminated. In case of perfect fit, the learning procedure is stopped early. Values must be in the range `[1, inf)`.",50
,"learning_rate  learning_rate: float, default=1.0 Weight applied to each regressor at each boosting iteration. A higher learning rate increases the contribution of each regressor. There is a trade-off between the `learning_rate` and `n_estimators` parameters. Values must be in the range `(0.0, inf)`.",1.0
,"loss  loss: {'linear', 'square', 'exponential'}, default='linear' The loss function to use when updating the weights after each boosting iteration.",'linear'
,"random_state  random_state: int, RandomState instance or None, default=None Controls the random seed given at each `estimator` at each boosting iteration. Thus, it is only used when `estimator` exposes a `random_state`. In addition, it controls the bootstrap of the weights used to train the `estimator` at each boosting iteration. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `.",51

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


## Análise dos Resultados

In [11]:
y_pred = boosting_model.predict(X_test)

In [12]:
y_pred

array([ 9.32914117e+03,  8.49857488e+03,  3.75280263e+04,  1.11497780e+04,
        2.74545728e+04,  1.18940600e+04,  1.15241066e+03,  1.96428198e+04,
        1.42555812e+03,  1.30158769e+04,  2.91844308e+04,  1.12128882e+04,
        5.53566997e+03,  3.91738247e+04,  4.08438445e+04,  3.77352796e+04,
        1.59315287e+04,  3.65804651e+04,  9.70206856e+03,  3.17125273e+04,
        5.24041097e+03,  1.10209611e+04,  3.09025817e+03,  7.46653334e+03,
        1.24741441e+04,  1.38234556e+04,  1.54783324e+04,  6.90632108e+03,
        1.02715351e+04,  3.04558187e+03,  1.02162565e+04,  1.49486208e+04,
        4.92091870e+03,  4.06341390e+03,  5.14364270e+03,  1.44541674e+04,
        2.77608979e+03,  9.30982439e+03,  3.35489302e+04,  3.32299653e+04,
        5.42285821e+03,  5.96291924e+03,  1.48722077e+04,  1.38831431e+04,
        9.68271204e+03,  1.34034101e+04,  6.12475028e+03,  4.32667443e+03,
        3.64247264e+04,  1.00569458e+04,  1.70546325e+04,  3.03039341e+03,
        1.38694585e+04,  

In [13]:
rmse = root_mean_squared_error(y_test,y_pred)
r2 = r2_score(y_test,y_pred)

In [14]:
print(f'RMSE: {rmse}')
print(f'R2: {r2}')

RMSE: 5955.772534437056
R2: 0.7715200237637087


In [15]:
# Calcular a importância das features usando os coeficientes
# Obter os coeficientes de casa estimador

coefs = np.mean([np.abs(estimator.coef_) for estimator in boosting_model.estimators_], axis=0)

importance = coefs/np.sum(coefs)

In [16]:
importance

array([0.19671054, 0.11403229, 0.01349726, 0.47829512, 0.01520078,
       0.01520078, 0.02530113, 0.03804662, 0.03855024, 0.06516523])

In [17]:
feature_names = preprocessor.get_feature_names_out()
feature_names

array(['num__age', 'num__bmi', 'num__children', 'num__smoker',
       'cat__sex_female', 'cat__sex_male', 'cat__region_northeast',
       'cat__region_northwest', 'cat__region_southeast',
       'cat__region_southwest'], dtype=object)

In [18]:
importance_df = pd.DataFrame({'feature': feature_names, 'importance': importance})
importance_df = importance_df.sort_values(by='importance', ascending=True)

importance_df.head()

Unnamed: 0,feature,importance
2,num__children,0.013497
4,cat__sex_female,0.015201
5,cat__sex_male,0.015201
6,cat__region_northeast,0.025301
7,cat__region_northwest,0.038047


In [19]:
fig = px.bar(importance_df, x='importance',y='feature',title='Importancia das Features', orientation='h')

fig.show()

## Propriedades do Modelo

In [20]:
boosting_model.estimator_errors_

array([0.14255291, 0.20448829, 0.29989819, 0.33224754, 0.39499923,
       0.45648858, 0.47821362, 0.48372335, 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ])

In [21]:
boosting_model.estimator_weights_

array([1.79424627, 1.35847485, 0.84778273, 0.69803724, 0.42634592,
       0.17448704, 0.08720075, 0.06512962, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])