In [1]:
# Importar bibliotecas 

# EDA
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import numpy as np
from scipy.stats import chi2_contingency

# Machine Learning

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import root_mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

### Carregar os Dados

In [2]:
# Carregar os dados (ja trasnformados e limpos)
df_costs = pd.read_csv('./datasets/healthcosts_cleaned.csv')

In [3]:
# Mostrar as primeiras linhas
df_costs.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,medical charges
0,19,female,27.9,0,1,southwest,16884.924
1,18,male,33.77,1,0,southeast,1725.5523
2,28,male,33.0,3,0,southeast,4449.462
3,33,male,22.705,0,0,northwest,21984.47061
4,32,male,28.88,0,0,northwest,3866.8552
5,31,female,25.74,0,0,southeast,3756.6216
6,46,female,33.44,1,0,southeast,8240.5896
7,37,female,27.74,3,0,northwest,7281.5056
8,37,male,29.83,2,0,northeast,6406.4107
9,60,female,25.84,0,0,northwest,28923.13692


In [4]:
# Mostrar as ultimas linhas
df_costs.tail(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,medical charges
1328,23,female,24.225,2,0,northeast,22395.74424
1329,52,male,38.6,2,0,southwest,10325.206
1330,57,female,25.74,2,0,southeast,12629.1656
1331,23,female,33.4,0,0,southwest,10795.93733
1332,52,female,44.7,3,0,southwest,11411.685
1333,50,male,30.97,3,0,northwest,10600.5483
1334,18,female,31.92,0,0,northeast,2205.9808
1335,18,female,36.85,0,0,southeast,1629.8335
1336,21,female,25.8,0,0,southwest,2007.945
1337,61,female,29.07,0,1,northwest,29141.3603


In [5]:
df_costs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              1338 non-null   int64  
 1   sex              1338 non-null   object 
 2   bmi              1338 non-null   float64
 3   children         1338 non-null   int64  
 4   smoker           1338 non-null   int64  
 5   region           1338 non-null   object 
 6   medical charges  1338 non-null   float64
dtypes: float64(2), int64(3), object(2)
memory usage: 73.3+ KB


### Preparaçao dos dados

In [6]:
# Preparar dados para o modelo
X = df_costs.drop(columns=['medical charges'])
y = df_costs['medical charges']

In [7]:
# Importar preprocessor ja salvo anteriormente
import joblib
preprocessor = joblib.load('./preprocessor_dataset_healthcosts.pkl')

In [8]:
# Dividir os dados em treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=51)

In [9]:
# Aplicar o preprocessor nos dados de treinamento e teste
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [10]:
# Mostrar as dimensões dos conjuntos
print(f'Dados de treinamento: {X_train.shape}')
print(f'Dados de teste: {X_test.shape}')

Dados de treinamento: (1070, 10)
Dados de teste: (268, 10)


### Treinamento do Modelo

In [11]:
# Criar o modelo de Boosting Regressor
boosting_model = AdaBoostRegressor(
    estimator=LinearRegression(),
    n_estimators=50,
    learning_rate=1.0,
    random_state=51,
)

In [12]:
# Treinar o modelo 
boosting_model.fit(X_train, y_train)

### Analise dos Resultados

In [13]:
# Fazer predições com base no modelo treinado
y_pred = boosting_model.predict(X_test)

In [14]:
y_pred

array([10774.04557571, 37774.        ,  4901.62996501, 12515.77498084,
       34646.        , 13378.04764218, 13630.3409287 , 16871.11602794,
        7158.66042009, 12561.25281837, 11572.73802017, 13972.77994414,
       11852.7121236 ,  6334.65990676,  6776.5211331 , 14085.83213492,
        7804.50884461,  7502.06200229, 25832.84029631, 29200.35816502,
       13225.92018378, 10081.29512958, 33230.        , 14421.1869303 ,
        7609.01561891, 17246.9953249 , 11436.12286396,  5293.94888491,
       24150.        ,  9373.38126627,  6837.98108944, 31288.        ,
        8016.75381931,  6663.74075501,  9120.73263597, 12786.09711692,
       15444.52333991,  4105.15043784, 13716.00000053, 10109.9541843 ,
       11985.21945631,  2517.6040554 ,  7439.8185571 ,  4219.99449624,
        5631.70000421, 16483.23953707, 16993.05520135, 35442.        ,
        8636.94880467, 14427.18924098,  7848.89785755, 30878.        ,
        8215.30896603, 41258.        ,  5261.61220316, 27913.73373556,
      

In [15]:
# Avaliar métricas do modelo
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [16]:
# Mostrar o Erro e R2 do Modelo
print(f'Root Mean Squared Error: {rmse}')
print(f'R2: {r2}')

Root Mean Squared Error: 6938.596010147248
R2: 0.7231228220148695


In [17]:
# Calcular a importãncia das features usando os coeficientes

# Obter os coeficientes de cada estimador
coefs = np.array([estimator.coef_ for estimator in boosting_model.estimators_])


In [18]:
# Calcular media dos coeficientes absolutos
importances = np.mean(np.abs(coefs), axis=0)

In [19]:
# Normalizar as importâncias
importances = importances / np.sum(importances)

In [20]:
# Obter os nomes das features
feature_names = preprocessor.get_feature_names_out()

In [21]:
feature_names

array(['num__age', 'num__bmi', 'num__children', 'num__smoker',
       'cat__sex_female', 'cat__sex_male', 'cat__region_northeast',
       'cat__region_northwest', 'cat__region_southeast',
       'cat__region_southwest'], dtype=object)

In [22]:
# Criar um dataframe com as importâncias e os nomes
importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})

In [23]:
# Ordenar o Dataframe pela importãncia
importance_df = importance_df.sort_values('importance', ascending=True)

In [24]:
# Criar o gráfico de barras para mostrar a importância das features
fig = px.bar(importance_df, x='importance', y='feature', title='Importância das Features',
             orientation='h')

fig.update_xaxes(tickangle=45)
fig.show()

### Propriedades do Modelo

In [25]:
# Erros dos estimadores
boosting_model.estimator_errors_

array([0.13261321, 0.20075834, 0.26207646, 0.35501388, 0.42428527,
       0.42281211, 0.45429803, 0.48795179, 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ])

In [26]:
# Pesos dos estimadores
boosting_model.estimator_weights_

array([1.87804829, 1.38156143, 1.03520395, 0.59707191, 0.30520624,
       0.31123994, 0.18331954, 0.04820217, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])