In [1]:
from IPython.display import display, Markdown
import joblib
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer


from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor


from sklearn.model_selection import ShuffleSplit, GridSearchCV, KFold, cross_validate

import warnings

# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and convergence
# warnings raised by pandas / scikit-learn — consider narrowing it.
warnings.filterwarnings('ignore')

# 1. Obtenção dos dados

In [2]:
# Read the raw tips dataset and preview its first five rows.
tips = pd.read_csv(filepath_or_buffer='../data/raw/tips.csv')
display(tips.head(n=5))

# Read the data dictionary (variable name, description, type and subtype)
# and show it in full.
dictionary = pd.read_csv(filepath_or_buffer='../data/external/data-dict.csv')
display(dictionary)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


Unnamed: 0,variavel,descricao,tipo,subtipo
0,total_bill,Valor total da conta (em dólares),quantitativa,continua
1,tip,Valor da gorjeta (em dólares),quantitativa,continua
2,sex,Sexo da pessoa que pagou a conta,qualitativa,nominal
3,smoker,Indica se havia fumantes no grupo,qualitativa,nominal
4,day,Dia da semana em que a refeição foi consumida,qualitativa,ordinal
5,time,Momento do dia em que a refeição foi consumida,qualitativa,nominal
6,size,Número de pessoas no grupo,quantitativa,discreta


In [3]:
# Print a concise summary of the dataset: column dtypes, non-null counts and memory usage.
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB


# 2. Preparação dos dados

In [4]:
# Column the models will predict.
target_variable = 'tip'

# Feature names grouped by measurement level, taken from the data dictionary.
# Quantitative features exclude the target itself.
quantitative_variables = (
    dictionary
    .loc[
        lambda d: (d['tipo'] == 'quantitativa') & (d['variavel'] != target_variable),
        'variavel',
    ]
    .tolist()
)
# Qualitative features without an inherent order.
nominal_variables = (
    dictionary
    .loc[lambda d: d['subtipo'] == 'nominal', 'variavel']
    .tolist()
)
# Qualitative features declared as ordered in the dictionary.
ordinal_variables = (
    dictionary
    .loc[lambda d: d['subtipo'] == 'ordinal', 'variavel']
    .tolist()
)

In [5]:
# Outlier handling
def remove_outliers_iqr(df, factor=1.5, columns=None):
    """Drop rows whose values fall outside the IQR fences of the given columns.

    For each column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR`` (both inclusive). Columns are processed one after
    another, and the quartiles of each column are computed on the rows that
    survived the previous columns, so the result can depend on column order.
    Rows holding NaN in a processed column are dropped as well, because NaN
    never satisfies the range comparison.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified; a filtered frame is returned.
    factor : float, default 1.5
        Multiplier applied to the IQR to position the fences.
    columns : iterable of column labels, optional
        Columns to filter on. Defaults to every numeric column of ``df``.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to the rows inside the fences of all processed
        columns. The original index labels are kept.
    """
    if columns is None:
        # Any numeric dtype qualifies (e.g. float32 / int32), not only 64-bit ones.
        columns = df.select_dtypes(include='number').columns

    for column in columns:
        q1 = df[column].quantile(0.25)
        q3 = df[column].quantile(0.75)
        margin = factor * (q3 - q1)
        # `between` is inclusive on both ends and evaluates to False for NaN.
        df = df[df[column].between(q1 - margin, q3 + margin)]
    return df


# Filter outliers on the numeric columns of `tips` (this includes the target `tip`)
# and rebind `tips` to the filtered frame.
# NOTE(review): because `tips` is overwritten, re-running this cell filters the
# already-filtered data again; the raw frame is only recoverable by reloading the CSV.
tips = remove_outliers_iqr(tips)

In [17]:
# Feature matrix: every column except the target.
X = tips.drop(columns=target_variable)
# Target values as a flat one-dimensional NumPy array.
y = tips[target_variable].to_numpy()

In [18]:
# Quantitative features: standardize with StandardScaler.
quantitative_preprocess = Pipeline(steps=[
    ('normalization', StandardScaler()),
])

# Nominal features: dense one-hot encoding, dropping the first level of each variable.
nominal_preprocess = Pipeline(steps=[
    ('encoding', OneHotEncoder(drop='first', sparse_output=False)),
])

# Ordinal features: encode categories as integer codes.
# NOTE(review): no explicit `categories` order is passed, so the codes follow the
# encoder's default ordering rather than a domain order — confirm this is intended.
ordinal_preprocess = Pipeline(steps=[
    ('encoding', OrdinalEncoder()),
])

# Route each group of columns to its own preprocessing pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('quantitative', quantitative_preprocess, quantitative_variables),
    ('nominal', nominal_preprocess, nominal_variables),
    ('ordinal', ordinal_preprocess, ordinal_variables),
])


# 3. Seleção dos modelos

In [20]:
# Experiment configuration.
n_splits_comparative_analysis = 10  # outer ShuffleSplit repetitions used to compare models
n_folds_grid_search = 5             # inner K-fold splits used by the grid search
test_size = 0.2                     # share of rows held out in each outer split
random_state = 42                   # seed shared by the splitters and the stochastic estimators
scoring = 'r2'                      # metric optimized by the grid search
metrics = ['neg_mean_absolute_error', 'neg_root_mean_squared_error', 'r2']  # metrics reported per outer split

# Candidate models as (display name, estimator, hyper-parameter grid).
# Estimators whose fit involves randomness receive `random_state` so that a
# Restart & Run All reproduces the same scores.
# NOTE(review): `n_jobs` in the LinearRegression grid is not a hyper-parameter;
# it only doubles the number of candidates evaluated.
models = [
    ('LinearRegression', LinearRegression(), {"fit_intercept": [True, False], 'n_jobs': [None, -1]}),
    ('GradientBoostingRegressor', GradientBoostingRegressor(random_state=random_state), {'n_estimators': range(80, 120, 10), 'max_depth': [3, 6, 8]}),
    ('Support Vector Regressor', SVR(), {'C': np.logspace(-4, 4, 20), 'kernel': ['linear', 'poly', 'sigmoid']}),
    ('DecisionTreeRegressor', DecisionTreeRegressor(random_state=random_state), {'criterion': ['squared_error', 'absolute_error'], 'max_depth': [3, 6, 8]}),
    ('Ridge', Ridge(), {'alpha': np.logspace(-4, 4, 10),'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg']}),
    # The seed matters for Lasso because the grid includes selection='random'.
    ('Lasso', Lasso(random_state=random_state), {'alpha': [0.1, 1, 10, 100], 'selection': ['cyclic', 'random']}),
    ('ElasticNet', ElasticNet(), {'alpha': [0.1, 1, 10], 'l1_ratio': [0.1, 0.5, 0.9]}),
    ('K-Neighbors Regressor', KNeighborsRegressor(), {'n_neighbors': [3, 5, 7, 10], 'weights': ['uniform', 'distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']})
]


In [21]:
# Nested evaluation: the outer ShuffleSplit measures each approach, while the
# inner K-fold grid search tunes hyper-parameters on each outer training split.
cross_validate_grid_search = KFold(n_splits=n_folds_grid_search)
cross_validate_comparative_analysis = ShuffleSplit(n_splits=n_splits_comparative_analysis, test_size=test_size, random_state=random_state)

# Per-model score tables are collected here and concatenated once after the
# loop, instead of growing `results` with pd.concat on every iteration
# (which also started from an empty DataFrame).
score_frames = []
for model_name, model_object, model_parameters in models:
    print(f"running {model_name}...")
    model_grid_search = GridSearchCV(
        estimator=model_object,
        param_grid=model_parameters,
        scoring=scoring,
        n_jobs=-1,
        cv=cross_validate_grid_search
    )

    # The preprocessor is part of the pipeline, so it is fitted only on the
    # training portion of each outer split.
    approach = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model_grid_search)
    ])

    scores = cross_validate(
        estimator=approach,
        X=X,
        y=y,
        cv=cross_validate_comparative_analysis,
        n_jobs=-1,
        scoring=metrics
    )

    # One row per outer split, tagged with the model name.
    scores['model_name'] = [model_name] * n_splits_comparative_analysis
    model_scores = pd.DataFrame(scores)
    display(model_scores.select_dtypes(include=[float, int]).agg(['mean', 'std']))
    score_frames.append(model_scores)

# All models stacked: one row per (model, outer split).
results = pd.concat(score_frames, ignore_index=True)

running LinearRegression...


Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,test_neg_root_mean_squared_error,test_r2
mean,0.315389,0.040665,-0.6559,-0.835822,0.338065
std,0.091773,0.020801,0.063916,0.08595,0.11334


running GradientBoostingRegressor...


Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,test_neg_root_mean_squared_error,test_r2
mean,11.74073,0.017042,-0.648723,-0.888659,0.240036
std,4.166595,0.006505,0.106948,0.178642,0.280327


running Support Vector Regressor...


Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,test_neg_root_mean_squared_error,test_r2
mean,167.426227,0.011275,-0.660148,-0.861518,0.299264
std,47.602528,0.001666,0.080975,0.134405,0.150218


running DecisionTreeRegressor...


Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,test_neg_root_mean_squared_error,test_r2
mean,0.233988,0.013398,-0.658543,-0.919147,0.191777
std,0.085573,0.003832,0.097407,0.184591,0.287219


running Ridge...


Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,test_neg_root_mean_squared_error,test_r2
mean,1.786338,0.016513,-0.661757,-0.832452,0.345406
std,0.411283,0.005442,0.063594,0.083944,0.093507


running Lasso...


Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,test_neg_root_mean_squared_error,test_r2
mean,0.390204,0.025367,-0.669593,-0.836149,0.339958
std,0.098381,0.007086,0.06501,0.080662,0.086682


running ElasticNet...


Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,test_neg_root_mean_squared_error,test_r2
mean,0.397097,0.040849,-0.664187,-0.833136,0.34416
std,0.144137,0.052551,0.064088,0.084116,0.095626


running K-Neighbors Regressor...


Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,test_neg_root_mean_squared_error,test_r2
mean,1.102317,0.022313,-0.670032,-0.865368,0.29308
std,0.212783,0.012436,0.076516,0.0943,0.105784


In [26]:
def highlight_best(s, props=''):
    """Return per-cell CSS that highlights the best value in a row of the summary table.

    Parameters
    ----------
    s : pandas.Series
        One row of the table. ``s.name`` must be a ``(metric, statistic)``
        pair such as ``('test_r2', 'mean')``.
    props : str, default ''
        CSS declarations applied to the best cell(s).

    Returns
    -------
    numpy.ndarray
        One string per element of ``s``: ``props`` where the value is the best
        of the row and ``''`` elsewhere. For metrics whose name ends with
        ``'time'`` the best value is the minimum; for every other metric it is
        the maximum. Rows holding the ``'std'`` statistic are never highlighted.
    """
    metric, statistic = s.name[0], s.name[1]
    if statistic == 'std':
        # Return explicit empty styles instead of an implicit None.
        return np.full(len(s), '')
    pick_best = np.nanmin if metric.endswith('time') else np.nanmax
    return np.where(s == pick_best(s.values), props, '')

# Mean and standard deviation of every metric per model (metrics as rows,
# models as columns), with the best mean of each row highlighted.
# The opening parenthesis must follow `display` on the same line; otherwise
# `display` is a bare name that is never called.
display(
    results
    .groupby('model_name')
    .agg(['mean', 'std'])
    .T
    .style
    .apply(highlight_best, props='color:white;background-color:gray;font-weight: bold;', axis=1)
    .set_table_styles([{'selector': 'td', 'props': 'text-align: center;'}])
)

Unnamed: 0,model_name,DecisionTreeRegressor,ElasticNet,GradientBoostingRegressor,K-Neighbors Regressor,Lasso,LinearRegression,Ridge,Support Vector Regressor
fit_time,mean,0.233988,0.397097,11.74073,1.102317,0.390204,0.315389,1.786338,167.426227
fit_time,std,0.085573,0.144137,4.166595,0.212783,0.098381,0.091773,0.411283,47.602528
score_time,mean,0.013398,0.040849,0.017042,0.022313,0.025367,0.040665,0.016513,0.011275
score_time,std,0.003832,0.052551,0.006505,0.012436,0.007086,0.020801,0.005442,0.001666
test_neg_mean_absolute_error,mean,-0.658543,-0.664187,-0.648723,-0.670032,-0.669593,-0.6559,-0.661757,-0.660148
test_neg_mean_absolute_error,std,0.097407,0.064088,0.106948,0.076516,0.06501,0.063916,0.063594,0.080975
test_neg_root_mean_squared_error,mean,-0.919147,-0.833136,-0.888659,-0.865368,-0.836149,-0.835822,-0.832452,-0.861518
test_neg_root_mean_squared_error,std,0.184591,0.084116,0.178642,0.0943,0.080662,0.08595,0.083944,0.134405
test_r2,mean,0.191777,0.34416,0.240036,0.29308,0.339958,0.338065,0.345406,0.299264
test_r2,std,0.287219,0.095626,0.280327,0.105784,0.086682,0.11334,0.093507,0.150218


# 3.1 Persistência do modelo

In [24]:
# Pick the Ridge entry (name, estimator, grid) from the list of candidates.
model_name, model_object, model_parameters = [spec for spec in models if spec[0] == "Ridge"][0]

# Same grid-search configuration used in the comparative analysis.
model_grid_search = GridSearchCV(
    cv=cross_validate_grid_search,
    n_jobs=-1,
    scoring=scoring,
    param_grid=model_parameters,
    estimator=model_object,
)

# Full approach: preprocessing followed by the grid-searched model.
approach = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model_grid_search),
])

# Fit on all available rows; this runs the grid search on the full dataset.
approach.fit(X, y)

print(f"Hiper parâmetros do modelo: {approach.named_steps['model'].best_params_}")

Hiper parâmetros do modelo: {'alpha': 21.54434690031882, 'solver': 'auto'}


In [25]:
# Persist the fitted `approach` object so it can be reloaded later with joblib.load.
joblib.dump(approach, '../models/model.joblib') # Save the model to disk

['../models/model.joblib']

# 4. Resultados e discussões

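O modelo Ridge apresentou o melhor desempenho dentre os modelos testados (maior R² médio, ≈ 0,35, e menor RMSE médio, ≈ 0,83), porém os resultados ainda são insatisfatórios: o modelo explica apenas cerca de um terço da variância das gorjetas.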

A principal limitação identificada foi a quantidade de dados disponíveis, o que pode ter impactado negativamente o desempenho dos modelos. 

Além disso, a preparação dos dados também pode ter contribuído para o mau desempenho dos modelos.