In [17]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [3]:
# Load the raw tips dataset together with the data dictionary describing its columns.
tips = pd.read_csv('../data/raw/tips.csv')
dictionary = pd.read_csv('../data/external/data-dict.csv')

# Preview the first rows of the data, followed by the whole dictionary.
display(tips.head(), dictionary)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


Unnamed: 0,variavel,descricao,tipo,subtipo
0,total_bill,Valor total da conta (em dólares),quantitativa,contínua
1,tip,Valor da gorjeta (em dólares),quantitativa,contínua
2,sex,Sexo da pessoa que pagou a conta,qualitativa,nominal
3,smoker,Indica se havia fumantes no grupo,qualitativa,nominal
4,day,Dia da semana em que a refeição foi consumida,qualitativa,nominal
5,time,Momento do dia em que a refeição foi consumida,qualitativa,nominal
6,size,Número de pessoas no grupo,quantitativa,discreta


In [3]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB


# Tratamentos

In [5]:
# Column the model will predict; it is kept out of both feature lists below.
target_variable = 'tip'

# Feature names grouped by the `tipo` recorded in the data dictionary.
quantitative_variables = dictionary.query(
    "tipo == 'quantitativa' and variavel != @target_variable"
)['variavel'].tolist()
qualitative_variables = dictionary.query(
    "tipo == 'qualitativa' and variavel != @target_variable"
)['variavel'].tolist()

In [9]:
# Features: every column except the target.
X = tips.drop([target_variable], axis='columns')
# Target, kept as a one-column DataFrame rather than a Series.
y = tips.loc[:, [target_variable]]

In [23]:
# outlier treatment
def remove_outliers_iqr(df, factor=1.5):
    """Drop the rows that hold an IQR outlier in any float64/int64 column.

    For every numeric column the fences ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR`` are computed once, on the frame exactly as it was
    passed in, and a row is kept only when all of its numeric values lie
    inside (or on) their column's fences. Because no column is evaluated on
    data already filtered by another column, the result does not depend on
    column order.

    Missing values are not treated as outliers: a NaN in a numeric column
    does not cause its row to be dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    factor : float, default 1.5
        Multiplier applied to the IQR to position the fences.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` without outliers, with the original index labels
        and columns. ``df`` itself is returned when it has no float64/int64
        column.
    """
    numeric = df.select_dtypes(include=['float64', 'int64'])
    if numeric.shape[1] == 0:
        # Nothing to evaluate, so nothing can be an outlier.
        return df

    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Compare each column against its own fences (bounds align on column labels).
    within_fences = (
        numeric.ge(lower_bound, axis='columns')
        & numeric.le(upper_bound, axis='columns')
    )
    # Comparisons with NaN are False, so missing values are re-admitted explicitly.
    keep = (within_fences | numeric.isna()).all(axis='columns')
    return df[keep]


# NOTE: this rebinds `tips`, so re-running the cell filters the already
# filtered frame again; reload the data before re-executing.
tips = remove_outliers_iqr(tips)

# X and y were derived from the unfiltered frame in an earlier cell; rebuild
# them so the outlier removal actually reaches the modelling data.
X = tips.drop(columns=[target_variable])
y = tips[[target_variable]]

In [24]:
# Numeric features: fill gaps with the column median, then standardise
# (StandardScaler centres each column and scales it to unit variance).
quantitative_preprocess = Pipeline(
    steps=[
        ('missing', SimpleImputer(strategy='median')),
        ('normalization', StandardScaler()),
    ]
)

# Categorical features: fill gaps with the most frequent value, then one-hot encode.
qualitative_preprocess = Pipeline(
    steps=[
        ('missing', SimpleImputer(strategy='most_frequent')),
        ('encoding', OneHotEncoder()),
    ]
)

# Route each group of columns through its own pipeline.
preprocess = ColumnTransformer(
    transformers=[
        ('quantitative', quantitative_preprocess, quantitative_variables),
        ('qualitative', qualitative_preprocess, qualitative_variables),
    ]
)