In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, r2_score

## Carregar Dados

In [2]:
# Carregar os dados
df_costs = pd.read_csv('./healthcosts.csv')

In [3]:
df_costs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              1338 non-null   int64  
 1   sex              1338 non-null   object 
 2   bmi              1338 non-null   float64
 3   children         1338 non-null   int64  
 4   smoker           1338 non-null   object 
 5   region           1338 non-null   object 
 6   medical charges  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [4]:
df_costs.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,medical charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Feature Engineering

In [5]:
# Mostrar e remover as colunas categóricas que possuem somente um valor possível
for column in df_costs.select_dtypes(include=['object']).columns:
    if df_costs[column].nunique() == 1:
        print(f'Coluna {column} possui somente um valor possível : {df_costs[column].unique()}')

In [6]:
# Mostrar os valores possíveis para todas as colunas categóricas
for column in df_costs.select_dtypes(include=['object']).columns:
    print(f'Coluna {column} possui estes valores : {df_costs[column].unique()}')

Coluna sex possui estes valores : ['female' 'male']
Coluna smoker possui estes valores : ['yes' 'no']
Coluna region possui estes valores : ['southwest' 'southeast' 'northwest' 'northeast']


In [7]:
for column in df_costs.select_dtypes(include=['object']).columns:
    contagem_nulas = df_costs[column].isnull().sum()
    print(f'{column} : {contagem_nulas/len(df_costs) * 100:.2f}%')

sex : 0.00%
smoker : 0.00%
region : 0.00%


In [8]:
# Apresentar algumas estatísticas descritivas
df_costs.describe()

Unnamed: 0,age,bmi,children,medical charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [9]:
# Mostrar e remover as colunas numéricas que possuem somente um valor possível
for column in df_costs.select_dtypes(include=['number']).columns:
    if df_costs[column].nunique() == 1:
        print(f'Coluna {column} possui somente um valor possível : {df_costs[column].unique()}')

In [10]:
for column in df_costs.select_dtypes(include=['number']).columns:
    contagem_nulas = df_costs[column].isnull().sum()
    print(f'{column}: {contagem_nulas/len(df_costs) * 100:.2f}')

age: 0.00
bmi: 0.00
children: 0.00
medical charges: 0.00


In [11]:
df_costs['smoker'] = df_costs['smoker'].apply(lambda x: 1 if x == 'yes' else 0)

In [12]:
df_costs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              1338 non-null   int64  
 1   sex              1338 non-null   object 
 2   bmi              1338 non-null   float64
 3   children         1338 non-null   int64  
 4   smoker           1338 non-null   int64  
 5   region           1338 non-null   object 
 6   medical charges  1338 non-null   float64
dtypes: float64(2), int64(3), object(2)
memory usage: 73.3+ KB


## EDA

In [13]:
# Mostrar distribuição de custos médicos
fig = px.histogram(df_costs, x ='medical charges', nbins=30, title='Distribuição de Custos Médicos')
fig.show()

In [14]:
# Mostrar distribuição de idade
fig = px.histogram(df_costs,x='age',nbins=30,title='Distribuição de idade')
fig.show()

In [15]:
# mostrar a quantidade de filhos
fig = px.histogram(df_costs, x='children',title='Distribuição de Quantidade de Filhos')
fig.show()

In [16]:
# Mostrar distribuição de BMI
fig = px.histogram(df_costs,x='bmi',nbins=30,title='Distribuição de BMI')
fig.show()

In [17]:
# Mostrar a distribuição do gênero
fig = px.bar(df_costs['sex'].value_counts(),title='Distribuição por Gênero')
fig.show()

In [18]:
# mostrar a distribuição da variável Smoker
fig = px.bar(df_costs['smoker'].value_counts(), title='Distribuição fumante')
fig.show()

In [19]:
# Mostrar a distribuição por região
fig = px.bar(df_costs['region'].value_counts(),title='Distribuição por Região')
fig.show()

In [20]:
# Boxplot de custos médicos por idade
fig = px.box(df_costs, x='age', y='medical charges', title='Boxplot de Custos médicos por idade')
fig.show()

In [21]:
# Boxplot de custos médicos por gênero
fig = px.box(df_costs, x='sex',y='medical charges', title='Boxplot de custos médicos por gênero')
fig.show()

In [22]:
# Boxplot de custos médicos por Smoker
fig = px.box(df_costs,x='smoker',y='medical charges', title='Boxplot de custos médicos por status de fumante')
fig.show()

In [23]:
# Boxplot de custos médicos por região
fig = px.box(df_costs,x='region',y='medical charges',title='Boxplot de Custos médicos por região')
fig.show()

In [24]:
# Plot de correlação das variáveis numéricas
corr_matrix = df_costs.select_dtypes(include=['number']).corr()

In [25]:
corr_matrix

Unnamed: 0,age,bmi,children,smoker,medical charges
age,1.0,0.109272,0.042469,-0.025019,0.299008
bmi,0.109272,1.0,0.012759,0.00375,0.198341
children,0.042469,0.012759,1.0,0.007673,0.067998
smoker,-0.025019,0.00375,0.007673,1.0,0.787251
medical charges,0.299008,0.198341,0.067998,0.787251,1.0


In [26]:
fig = go.Figure()

fig.add_trace(
    go.Heatmap(
        x=corr_matrix.columns,
        y=corr_matrix.index,
        z=np.array(corr_matrix),
        text=corr_matrix.values,
        texttemplate='%{text:.3f}',
        colorscale=px.colors.diverging.RdBu,
        zmin=-1,
        zmax=1
    )
)

fig.show()

## Preparação dos dados

In [27]:
# Preparar dados para o modelo
X = df_costs.drop(columns=['medical charges'],axis=1)
y = df_costs['medical charges']

In [28]:
# Column Transformer para normalizar variáveis numéricas e OneHotEncode para categóricas
numeric_features = X.select_dtypes(include=['number']).columns
categorical_features = X.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat',OneHotEncoder(handle_unknown='ignore'),categorical_features)
    ]
)

In [29]:
# Dividr os dados em treinamento e em teste
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=51)

In [30]:
# Aplicar o Column Transformer nos dados de treinamento e teste
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [31]:
# Mostrar as dimensões dos conjuntos
print(f'Dados de treinamento: {X_train.shape}')
print(f'Dados de teste: {X_test.shape}')

Dados de treinamento: (1070, 10)
Dados de teste: (268, 10)


## Treinamento do modelo

In [48]:
# Criar modelo de bagging Regressor
modelo_bagging_regressor = BaggingRegressor(
    estimator=LinearRegression(),
    n_estimators=10,
    random_state=51,
    max_features=0.6
)

In [49]:
# Treinar o modelo
modelo_bagging_regressor.fit(X_train, y_train)

0,1,2
,"estimator  estimator: object, default=None The base estimator to fit on random subsets of the dataset. If None, then the base estimator is a :class:`~sklearn.tree.DecisionTreeRegressor`. .. versionadded:: 1.2  `base_estimator` was renamed to `estimator`.",LinearRegression()
,"n_estimators  n_estimators: int, default=10 The number of base estimators in the ensemble.",10
,"max_samples  max_samples: int or float, default=None The number of samples to draw from X to train each base estimator (with replacement by default, see `bootstrap` for more details). - If None, then draw `X.shape[0]` samples irrespective of `sample_weight`. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` unweighted samples or  `max_samples * sample_weight.sum()` weighted samples.",
,"max_features  max_features: int or float, default=1.0 The number of features to draw from X to train each base estimator ( without replacement by default, see `bootstrap_features` for more details). - If int, then draw `max_features` features. - If float, then draw `max(1, int(max_features * n_features_in_))` features.",0.6
,"bootstrap  bootstrap: bool, default=True Whether samples are drawn with replacement. If False, sampling without replacement is performed. If fitting with `sample_weight`, it is strongly recommended to choose True, as only drawing with replacement will ensure the expected frequency semantics of `sample_weight`.",True
,"bootstrap_features  bootstrap_features: bool, default=False Whether features are drawn with replacement.",False
,"oob_score  oob_score: bool, default=False Whether to use out-of-bag samples to estimate the generalization error. Only available if bootstrap=True.",False
,"warm_start  warm_start: bool, default=False When set to True, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new ensemble. See :term:`the Glossary `.",False
,"n_jobs  n_jobs: int, default=None The number of jobs to run in parallel for both :meth:`fit` and :meth:`predict`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"random_state  random_state: int, RandomState instance or None, default=None Controls the random resampling of the original dataset (sample wise and feature wise). If the base estimator accepts a `random_state` attribute, a different seed is generated for each instance in the ensemble. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `.",51

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


## Análise dos Resultados

In [50]:
# Fazer predições com base no modelo treinado
y_pred = modelo_bagging_regressor.predict(X_test)

In [51]:
# mostrar y_pred
y_pred

array([11798.46165145, 27827.00052983,  7114.61930139, 12373.25724014,
       25857.75699771, 12096.08848318, 12068.3921653 , 13960.18279827,
        9629.01595749, 12930.59727738, 10610.82472249, 12744.64232513,
       10916.64637069,  6798.91616545,  8713.85981253, 13700.98616525,
        7723.61103624,  8003.68111973, 19852.63879896, 22217.3947528 ,
       10840.12801648,  9806.01074798, 24046.22297595, 13087.3631198 ,
        8596.78676123, 14465.59874983, 11036.01182152,  6085.76321128,
       19137.73891062, 11543.63728392,  7291.45587524, 22848.77863935,
        7652.79761533,  8185.89646534,  9269.75322612, 12018.45996821,
       13481.37768292,  6217.17770559, 12465.68424103, 10383.44307783,
       11311.43050368,  5821.16239365,  8878.5146454 ,  7013.4576691 ,
        6988.80903254, 13646.89766839, 14401.80102512, 26488.1388319 ,
       10605.199093  , 13188.34915451,  8242.80538014, 23299.7750049 ,
        9460.33306199, 30264.01467873,  7497.61708341, 21369.74519693,
      

In [52]:
# Avaliar métricas do modelo
rmse = root_mean_squared_error(y_test,y_pred)
r2 = r2_score(y_test,y_pred)

In [53]:
# mostrar o Erro e R2 do modelo
print(f'Root Mean Squared Error: {rmse}')
print(f'R2: {r2}')

Root Mean Squared Error: 8133.359552823967
R2: 0.6195619919575382


In [38]:
# Calcular a importância das features usando os coeficientes

# Obter os coeficientes de cada estimador
coefs = np.array([estimator.coef_ for estimator in modelo_bagging_regressor.estimators_])

# Calcular a média dos coeficietes absolutos
feature_importance = np.mean(np.abs(coefs), axis=0)

# Normalizar as importâncias
feature_importance = feature_importance/np.sum(feature_importance)

In [39]:
feature_importance

array([0.17976121, 0.1179395 , 0.03055724, 0.51753238, 0.01200919,
       0.01200919, 0.03965395, 0.02790159, 0.02846395, 0.03417181])

In [40]:
# Obter os nomes das features
features_names = preprocessor.get_feature_names_out()


In [41]:
features_names

array(['num__age', 'num__bmi', 'num__children', 'num__smoker',
       'cat__sex_female', 'cat__sex_male', 'cat__region_northeast',
       'cat__region_northwest', 'cat__region_southeast',
       'cat__region_southwest'], dtype=object)

In [42]:
# Criar um dataframe com as importâncias e os nomes
importance_df = pd.DataFrame({'feature':features_names, 'importance':feature_importance})

# Ordenar o dataframe pela importância
importance_df = importance_df.sort_values('importance',ascending=True)

In [43]:
importance_df

Unnamed: 0,feature,importance
5,cat__sex_male,0.012009
4,cat__sex_female,0.012009
7,cat__region_northwest,0.027902
8,cat__region_southeast,0.028464
2,num__children,0.030557
9,cat__region_southwest,0.034172
6,cat__region_northeast,0.039654
1,num__bmi,0.11794
0,num__age,0.179761
3,num__smoker,0.517532


In [44]:
# Criar o gráfico de barras para mostrar a importância das features
fig = px.bar(importance_df, x='importance',y='feature',title='Importância das Features',
             orientation='h')

fig.update_xaxes(tickangle=False)
fig.show()

## Verificar propriedades do modelo

In [54]:
modelo_bagging_regressor.estimators_samples_

[array([735, 214, 951, ..., 841, 514,  65], shape=(1070,)),
 array([   5,  864, 1023, ...,   69,  391,  171], shape=(1070,)),
 array([117, 568, 879, ..., 185, 292, 886], shape=(1070,)),
 array([535, 159, 620, ..., 139, 529, 786], shape=(1070,)),
 array([478, 549, 212, ..., 750, 962, 793], shape=(1070,)),
 array([944,  92, 234, ...,  17, 325, 724], shape=(1070,)),
 array([341, 138, 615, ...,  45, 288, 525], shape=(1070,)),
 array([214, 790, 317, ..., 622, 278, 319], shape=(1070,)),
 array([508, 327, 928, ..., 655, 899, 816], shape=(1070,)),
 array([558, 698, 770, ..., 313, 751, 562], shape=(1070,))]

In [56]:
modelo_bagging_regressor.estimators_samples_[2].shape

(1070,)

In [55]:
modelo_bagging_regressor.estimators_features_

[array([9, 6, 5, 3, 2, 8]),
 array([0, 5, 1, 8, 3, 9]),
 array([5, 2, 4, 9, 6, 8]),
 array([9, 3, 6, 8, 7, 0]),
 array([5, 9, 7, 6, 3, 4]),
 array([1, 0, 9, 5, 3, 6]),
 array([6, 8, 0, 2, 1, 7]),
 array([0, 7, 4, 6, 2, 1]),
 array([1, 5, 9, 4, 0, 7]),
 array([0, 6, 7, 3, 1, 9])]

## Salvar dados e pre-processador do modelo

In [57]:
# Salvar dataframe como CSV
df_costs.to_csv('./healthcosts_cleaned.csv', index=False)

In [58]:
import joblib

joblib.dump(preprocessor,'./preprocessor_dataset_healthcosts.pkl')

['./preprocessor_dataset_healthcosts.pkl']