In [88]:
# Importar bibliotecas 

# EDA
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
from scipy.stats import chi2_contingency

# Machine Learning
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import root_mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

### Carregar Dados

In [89]:
# Load the health-cost dataset from the local datasets folder
df_costs = pd.read_csv('./datasets/healthcosts.csv')

In [90]:
# Show the first 10 rows of the dataframe
df_costs.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,medical charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [91]:
# Show the last 10 rows of the dataframe
df_costs.tail(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,medical charges
1328,23,female,24.225,2,no,northeast,22395.74424
1329,52,male,38.6,2,no,southwest,10325.206
1330,57,female,25.74,2,no,southeast,12629.1656
1331,23,female,33.4,0,no,southwest,10795.93733
1332,52,female,44.7,3,no,southwest,11411.685
1333,50,male,30.97,3,no,northwest,10600.5483
1334,18,female,31.92,0,no,northeast,2205.9808
1335,18,female,36.85,0,no,southeast,1629.8335
1336,21,female,25.8,0,no,southwest,2007.945
1337,61,female,29.07,0,yes,northwest,29141.3603


In [92]:
# Show the dataframe structure (columns, non-null counts, dtypes)
df_costs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              1338 non-null   int64  
 1   sex              1338 non-null   object 
 2   bmi              1338 non-null   float64
 3   children         1338 non-null   int64  
 4   smoker           1338 non-null   object 
 5   region           1338 non-null   object 
 6   medical charges  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [93]:
# Report categorical (object-typed) columns that hold a single distinct value.
# This cell only lists such constant columns; it does not drop anything.
for column in df_costs.select_dtypes(include=['object']).columns:
    if df_costs[column].nunique() == 1:
        # The message must use the loop variable `column`; any other name is
        # undefined here and would raise NameError when a constant column exists.
        print(f"Coluna {column} possui estes valores: {df_costs[column].unique()}")

In [94]:
# List the distinct values found in each categorical (object-typed) column
for column in df_costs.select_dtypes(include='object'):
    print(f'Coluna {column} possui estes valores: {df_costs[column].unique()}')

Coluna sex possui estes valores: ['female' 'male']
Coluna smoker possui estes valores: ['yes' 'no']
Coluna region possui estes valores: ['southwest' 'southeast' 'northwest' 'northeast']


In [95]:
# Report the percentage of missing values in each categorical column
for column in df_costs.select_dtypes(include='object'):
    contagem_nulas = df_costs[column].isna().sum()
    print(f'{column}: {contagem_nulas / len(df_costs) * 100:.2f}%')

sex: 0.00%
smoker: 0.00%
region: 0.00%


In [96]:
# Show descriptive statistics for the numeric columns
df_costs.describe()

Unnamed: 0,age,bmi,children,medical charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [97]:
# Report numeric columns that hold a single distinct value.
# This cell only lists such constant columns; it does not drop anything.
for column in df_costs.select_dtypes(include=['number']).columns:
    if df_costs[column].nunique() == 1:
        # The message must use the loop variable `column`; any other name is
        # undefined here and would raise NameError when a constant column exists.
        print(f"Coluna {column} possui somente um valor possível: {df_costs[column].unique()}")

In [98]:
# Report the percentage of missing values in each numeric column
for column in df_costs.select_dtypes(include='number'):
    contagem_nulas = df_costs[column].isna().sum()
    print(f'{column}: {contagem_nulas / len(df_costs) * 100:.2f}%')

age: 0.00%
bmi: 0.00%
children: 0.00%
medical charges: 0.00%


In [99]:
# Encode categorical columns whose only values are 'yes' / 'no' as 1 / 0
for column in df_costs.select_dtypes(include=['object']).columns:
    # Only touch columns whose distinct values are all within {'yes', 'no'}
    if set(df_costs[column].unique()) <= {'yes', 'no'}:
        df_costs[column] = df_costs[column].map({'yes': 1, 'no': 0})

In [100]:
# Show the first 10 rows again to check the yes/no columns are now 1/0
df_costs.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,medical charges
0,19,female,27.9,0,1,southwest,16884.924
1,18,male,33.77,1,0,southeast,1725.5523
2,28,male,33.0,3,0,southeast,4449.462
3,33,male,22.705,0,0,northwest,21984.47061
4,32,male,28.88,0,0,northwest,3866.8552
5,31,female,25.74,0,0,southeast,3756.6216
6,46,female,33.44,1,0,southeast,8240.5896
7,37,female,27.74,3,0,northwest,7281.5056
8,37,male,29.83,2,0,northeast,6406.4107
9,60,female,25.84,0,0,northwest,28923.13692


### EDA

In [101]:
# Histogram of the medical charges distribution
fig = px.histogram(df_costs, x='medical charges', nbins=30, title='Distribuição de Custos Médicos')
fig.show()

In [102]:
# Histogram of the age distribution
fig = px.histogram(df_costs, x='age', nbins=30, title='Distribuição da Idade')
fig.show()

In [103]:
# Histogram of the number of children
fig = px.histogram(df_costs, x='children', title='Distribuição de Quantidade de filhos')
fig.show()

In [104]:
# Histogram of the BMI distribution
fig = px.histogram(df_costs, x='bmi', nbins=30,  title='Distribuição de BMI')
fig.show()

In [105]:
# Bar chart of the row count per sex
fig = px.bar(df_costs['sex'].value_counts(), title='Distribuição por Gênero')
fig.show()

In [106]:
# Bar chart of the row count per smoker value (already encoded as 1 / 0 at this point)

fig = px.bar(df_costs['smoker'].value_counts(), title='Distribuição de Fumante')
fig.show()

In [107]:
# Bar chart of the row count per region
fig = px.bar(df_costs['region'].value_counts(), title='Distribuição por Região')
fig.show()

In [108]:
# Boxplot of medical charges for each age
fig = px.box(df_costs, x='age', y='medical charges', title='Boxplot de Custos Médicos por idade')
fig.show()

In [109]:
# Boxplot of medical charges by sex
fig = px.box(df_costs, x='sex', y='medical charges', title='Boxplot de Custos Médicos por Gênero')
fig.show()

In [110]:
# Boxplot of medical charges by smoker status
fig = px.box(df_costs, x='smoker', y='medical charges', title='Boxplot de Custos Médicos por Status de Fumante')
fig.show()

In [111]:
# Boxplot of medical charges by region
fig = px.box(df_costs, x='region', y='medical charges', title='Boxplot de Custos Médicos por Região')
fig.show()

In [112]:
# Compute the pairwise correlation matrix of the numeric columns
corr_matrix = df_costs.select_dtypes(include=['number']).corr()

In [113]:
# Display the correlation matrix
corr_matrix

Unnamed: 0,age,bmi,children,smoker,medical charges
age,1.0,0.109272,0.042469,-0.025019,0.299008
bmi,0.109272,1.0,0.012759,0.00375,0.198341
children,0.042469,0.012759,1.0,0.007673,0.067998
smoker,-0.025019,0.00375,0.007673,1.0,0.787251
medical charges,0.299008,0.198341,0.067998,0.787251,1.0


In [114]:
# Render the correlation matrix as an annotated heatmap
fig = go.Figure()

fig.add_trace(
    go.Heatmap(
        x = corr_matrix.columns, 
        y = corr_matrix.index,
        z = np.array(corr_matrix),
        text = corr_matrix.values,
        texttemplate='%{text:.3f}',  # print each coefficient inside its cell
        colorscale=px.colors.diverging.RdBu,
        # Pin the colour range to [-1, 1] so that 0 sits at the centre of the diverging scale
        zmin=-1,
        zmax=1
    )
)

# Give the figure a title and show it explicitly, like every other figure cell
fig.update_layout(title='Matriz de Correlação')
fig.show()

### Preparação dos dados

In [115]:
# Separate the target (medical charges) from the predictors (every other column)
y = df_costs['medical charges']
X = df_costs.drop('medical charges', axis=1)

In [116]:
# Build the preprocessing step: standardize the numeric predictors and
# one-hot encode the categorical (object-typed) ones
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(include='object').columns

scaler_step = ('num', StandardScaler(), numeric_features)
# handle_unknown='ignore': categories unseen during fit do not raise at transform time
encoder_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

preprocessor = ColumnTransformer(transformers=[scaler_step, encoder_step])

In [117]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=51,
)

In [118]:
# Fit the preprocessor on the training split only, then apply the same fitted
# transformation to the test split.
# NOTE(review): X_train / X_test are overwritten with the transformed output, so
# re-running this cell without re-running the split feeds transformed data back in.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [119]:
# Print the shapes of the transformed training and test sets
print(f'Dados de treinamento: {X_train.shape}')
print(f'Dados de teste: {X_test.shape}')

Dados de treinamento: (1070, 10)
Dados de teste: (268, 10)


### Treinamento do modelo

In [120]:
# Create the Bagging Regressor: an ensemble of 10 linear regressions with a fixed seed
bagging_model = BaggingRegressor(
    estimator=LinearRegression(),
    n_estimators=10,
    random_state=51,
    # Optional sub-sampling settings, currently disabled:
    #max_features=0.6
    #max_samples=0.4
)

In [121]:
# Train the model on the preprocessed training data
bagging_model.fit(X_train, y_train)

### Análise dos Resultados

In [122]:
# Predict the charges for the test set with the trained model
y_pred = bagging_model.predict(X_test)

In [123]:
# Display the predictions
# NOTE(review): this shows the whole array; consider a slice such as y_pred[:10]
y_pred

array([ 8825.87431553, 36759.53952543,  2789.6710757 , 11152.97794808,
       33991.76541249, 11632.81051037, 11580.72300833, 15000.42958011,
        5309.80564133, 10604.00092744,  9540.94339103, 12207.42863888,
        9988.72914469,  4215.08280389,  5465.27160893, 12641.61452269,
        5680.28850756,  4906.12740531, 25743.16713538, 28695.40329585,
       10281.66859612,  8516.60391307, 32424.87235965, 13117.60193313,
        6181.80084945, 16090.3763407 ,  9906.8251023 ,  2585.28455171,
       23317.45194613,  8200.85970101,  3859.51643311, 30227.16414288,
        5761.33971724,  4695.9211873 ,  7797.10958003, 11101.24558397,
       13234.27319817,  2100.22198306, 12176.21641027,  7749.64051638,
        9850.14453159,   807.53282484,  5919.96738582,  2100.91428416,
        4279.7255138 , 15127.1188436 , 15326.5736606 , 35015.30629209,
        8177.03026793, 12760.60251777,  5655.00748756, 30674.10549268,
        6997.07179132, 39919.47912168,  4399.72683258, 27533.70124054,
      

In [124]:
# Evaluate the model on the test set: root mean squared error and R² score
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [125]:
# Print the model's error (RMSE) and R²
print(f'Root Mean Squared Error: {rmse}')
print(f'R2: {r2}')

Root Mean Squared Error: 6615.036060116528
R2: 0.7483433635462875


In [126]:
# Estimate feature importance from the coefficients of the ensemble's estimators

# Stack the coefficient vector of every fitted base estimator (one row per estimator)
coefs = np.array([fitted.coef_ for fitted in bagging_model.estimators_])

# Average the coefficient magnitudes across estimators, per feature
mean_abs_coef = np.abs(coefs).mean(axis=0)

# Rescale so the importances add up to 1
feature_importance = mean_abs_coef / mean_abs_coef.sum()

In [127]:
# Get the output feature names from the fitted preprocessor
feature_names = preprocessor.get_feature_names_out()

In [128]:
# Pair each feature name with its importance, then sort ascending by importance
importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
    .sort_values(by='importance', ascending=True)
)

In [129]:
# Horizontal bar chart of the feature importances
fig = px.bar(importance_df, x='importance', y='feature', title='Importância das Features',
             orientation='h')

# Tilt the x-axis tick labels by 45 degrees
fig.update_xaxes(tickangle=45)
fig.show()

### Verificar propriedades do modelo

In [130]:
# Inspect the training-row indices drawn for each base estimator
bagging_model.estimators_samples_

[array([503, 347, 592, ..., 894, 379, 343]),
 array([914,  70, 844, ..., 436, 320, 822]),
 array([151, 735, 546, ..., 669, 536, 198]),
 array([405, 317,  32, ..., 592, 790, 440]),
 array([776, 345, 478, ..., 102, 934, 750]),
 array([ 514, 1037,  824, ...,  827,  842,  876]),
 array([ 350,  900, 1045, ...,  167,  341,  985]),
 array([649, 979, 314, ..., 376, 597, 985]),
 array([875, 670, 998, ..., 114, 230, 555]),
 array([ 962, 1069,  677, ..., 1051,  745,  898])]

In [131]:
# Check how many rows were drawn for one of the estimators (the third)
bagging_model.estimators_samples_[2].shape

(1070,)

In [132]:
# Inspect the feature indices used by each base estimator
bagging_model.estimators_features_

[array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])]

### Salvar dados e pré-processador do modelo

In [133]:
# Save the dataframe (yes/no columns already encoded as 1/0) as CSV, without the index
df_costs.to_csv('./datasets/healthcosts_cleaned.csv', index=False)

In [134]:
# Persist the fitted preprocessor to disk
# NOTE(review): consider moving this import into the imports cell at the top of the notebook
import joblib

joblib.dump(preprocessor, './preprocessor_dataset_healthcosts.pkl')

['./preprocessor_dataset_healthcosts.pkl']