## Setup

In [1]:
import math
import numpy as np
import pandas as pd
import inflection

import seaborn as sns
import matplotlib.pyplot as plt
import sweetviz

import warnings
warnings.filterwarnings("ignore")

from scipy import stats

from IPython.core.display import HTML

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier

from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

### Configurações

In [2]:
def jupyter_settings():
    %matplotlib inline
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    sns.set()

In [3]:
jupyter_settings()

## Coleta dos Dados

In [4]:
df_raw = pd.read_csv("../data/raw/train.csv")
df_test = pd.read_csv("../data/raw/test.csv")

In [5]:
df_raw.shape

(103904, 24)

In [6]:
df_test.shape

(25976, 23)

In [7]:
df_raw.sample(10)

Unnamed: 0,id,Gender,Customer Type,Age,Type of Purchase,Store size,Store distance,InStore wifi,Open/Close time convenient,Easy of online shopping,Store location,Toilet cleaning,Dressing room,Waiting room,Kids entertainment,Seller service,Showroom,Self-Store,Purchase service,Store Service,Cleanliness,Carrier delay in minutes,Delivery delay in minutes,Satisfaction
4211,121459,Female,Loyal Customer,26,Personal,Medium,290,4,4,3,4,5,3,5,5,5,5,5,4,4,5,0,0.0,Satisfied
2123,8606,Male,Loyal Customer,37,Personal,Small,109,3,1,3,2,4,3,4,4,2,1,3,1,2,4,0,0.0,Neutral or Dissatisfaction
13444,127267,Female,Loyal Customer,55,Gift,Large,2212,3,5,5,5,1,4,3,3,3,3,3,2,3,4,0,0.0,Neutral or Dissatisfaction
78973,24967,Male,Loyal Customer,44,Gift,Small,949,4,2,2,2,4,4,4,4,2,4,5,2,3,4,0,0.0,Satisfied
3464,109219,Male,Loyal Customer,36,Gift,Large,255,1,1,1,1,2,4,3,1,1,1,1,1,1,2,0,0.0,Neutral or Dissatisfaction
42276,96817,Female,Loyal Customer,44,Gift,Large,849,4,4,4,4,4,4,4,5,5,4,5,3,5,5,9,2.0,Satisfied
13255,36983,Male,Loyal Customer,46,Gift,Medium,759,4,3,3,3,4,4,4,4,1,1,3,2,1,4,0,0.0,Satisfied
15347,121888,Male,Loyal Customer,46,Personal,Medium,366,1,4,1,2,3,1,1,3,1,1,2,3,5,3,0,0.0,Neutral or Dissatisfaction
93047,111224,Female,Loyal Customer,56,Personal,Medium,1447,1,4,1,4,2,1,1,5,5,1,5,5,5,1,19,20.0,Neutral or Dissatisfaction
87032,116550,Female,Loyal Customer,32,Personal,Medium,480,1,3,1,4,1,1,1,1,2,3,3,1,3,1,1,0.0,Neutral or Dissatisfaction


In [8]:
df_raw.tail()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Purchase,Store size,Store distance,InStore wifi,Open/Close time convenient,Easy of online shopping,Store location,Toilet cleaning,Dressing room,Waiting room,Kids entertainment,Seller service,Showroom,Self-Store,Purchase service,Store Service,Cleanliness,Carrier delay in minutes,Delivery delay in minutes,Satisfaction
103899,94171,Female,disloyal Customer,23,Gift,Medium,192,2,1,2,3,2,2,2,2,3,1,4,2,3,2,3,0.0,Neutral or Dissatisfaction
103900,73097,Male,Loyal Customer,49,Gift,Large,2347,4,4,4,4,2,4,5,5,5,5,5,5,5,4,0,0.0,Satisfied
103901,68825,Male,disloyal Customer,30,Gift,Large,1995,1,1,1,3,4,1,5,4,3,2,4,5,5,4,7,14.0,Neutral or Dissatisfaction
103902,54173,Female,disloyal Customer,22,Gift,Medium,1000,1,1,1,5,1,1,1,1,4,5,1,5,4,1,0,0.0,Neutral or Dissatisfaction
103903,62567,Male,Loyal Customer,27,Gift,Large,1723,1,3,3,3,1,1,1,1,1,1,4,4,3,1,0,0.0,Neutral or Dissatisfaction


In [9]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 24 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   id                          103904 non-null  int64  
 1   Gender                      103904 non-null  object 
 2   Customer Type               103904 non-null  object 
 3   Age                         103904 non-null  int64  
 4   Type of Purchase            103904 non-null  object 
 5   Store size                  103904 non-null  object 
 6   Store distance              103904 non-null  int64  
 7   InStore wifi                103904 non-null  int64  
 8   Open/Close time convenient  103904 non-null  int64  
 9   Easy of online shopping     103904 non-null  int64  
 10  Store location              103904 non-null  int64  
 11  Toilet cleaning             103904 non-null  int64  
 12  Dressing room               103904 non-null  int64  
 13  Waiting room  

In [10]:
df_test.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Purchase,Store size,Store distance,InStore wifi,Open/Close time convenient,Easy of online shopping,Store location,Toilet cleaning,Dressing room,Waiting room,Kids entertainment,Seller service,Showroom,Self-Store,Purchase service,Store Service,Cleanliness,Carrier delay in minutes,Delivery delay in minutes
0,19556,Female,Loyal Customer,52,Gift,Medium,160,5,4,3,4,3,4,3,5,5,5,5,2,5,5,50,44.0
1,90035,Female,Loyal Customer,36,Gift,Large,2863,1,1,3,1,5,4,5,4,4,4,4,3,4,5,0,0.0
2,12360,Male,disloyal Customer,20,Gift,Medium,192,2,0,2,4,2,2,2,2,4,1,3,2,2,2,0,0.0
3,77959,Male,Loyal Customer,44,Gift,Large,3377,0,0,0,2,3,4,4,1,1,1,1,3,1,4,0,6.0
4,36875,Female,Loyal Customer,49,Gift,Medium,1182,2,3,4,3,4,1,2,2,2,2,2,4,2,4,0,20.0


In [11]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25976 entries, 0 to 25975
Data columns (total 23 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          25976 non-null  int64  
 1   Gender                      25976 non-null  object 
 2   Customer Type               25976 non-null  object 
 3   Age                         25976 non-null  int64  
 4   Type of Purchase            25976 non-null  object 
 5   Store size                  25976 non-null  object 
 6   Store distance              25976 non-null  int64  
 7   InStore wifi                25976 non-null  int64  
 8   Open/Close time convenient  25976 non-null  int64  
 9   Easy of online shopping     25976 non-null  int64  
 10  Store location              25976 non-null  int64  
 11  Toilet cleaning             25976 non-null  int64  
 12  Dressing room               25976 non-null  int64  
 13  Waiting room                259

### Renomear Colunas

In [12]:
df1 = df_raw.copy()
df1.columns

Index(['id', 'Gender', 'Customer Type', 'Age', 'Type of Purchase',
       'Store size', 'Store distance', 'InStore wifi',
       'Open/Close time convenient', 'Easy of online shopping',
       'Store location', 'Toilet cleaning', 'Dressing room', 'Waiting room',
       'Kids entertainment', 'Seller service', 'Showroom ', 'Self-Store',
       'Purchase service', 'Store Service', 'Cleanliness',
       'Carrier delay in minutes', 'Delivery delay in minutes',
       'Satisfaction'],
      dtype='object')

In [13]:
old_columns = [ 'id', 'Gender', 'Customer Type', 'Age', 'Type of Purchase',
       'Store size', 'Store distance', 'InStore wifi',
       'Open/Close time convenient', 'Easy of online shopping',
       'Store location', 'Toilet cleaning', 'Dressing room', 'Waiting room',
       'Kids entertainment', 'Seller service', 'Showroom ', 'Self-Store',
       'Purchase service', 'Store Service', 'Cleanliness',
       'Carrier delay in minutes', 'Delivery delay in minutes',
       'Satisfaction' ]


old_columns = pd.Series(old_columns).apply(lambda x: x.strip().replace(' ', '_').lower())

# Rename
df1.columns = old_columns
df1 = df1.rename(columns={'self-store':'self_store'})

df1.columns

Index(['id', 'gender', 'customer_type', 'age', 'type_of_purchase',
       'store_size', 'store_distance', 'instore_wifi',
       'open/close_time_convenient', 'easy_of_online_shopping',
       'store_location', 'toilet_cleaning', 'dressing_room', 'waiting_room',
       'kids_entertainment', 'seller_service', 'showroom', 'self_store',
       'purchase_service', 'store_service', 'cleanliness',
       'carrier_delay_in_minutes', 'delivery_delay_in_minutes',
       'satisfaction'],
      dtype='object')

In [14]:
old_columns_test = [ 'id', 'Gender', 'Customer Type', 'Age', 'Type of Purchase',
       'Store size', 'Store distance', 'InStore wifi',
       'Open/Close time convenient', 'Easy of online shopping',
       'Store location', 'Toilet cleaning', 'Dressing room', 'Waiting room',
       'Kids entertainment', 'Seller service', 'Showroom ', 'Self-Store',
       'Purchase service', 'Store Service', 'Cleanliness',
       'Carrier delay in minutes', 'Delivery delay in minutes']


old_columns_test = pd.Series(old_columns_test).apply(lambda x: x.strip().replace(' ', '_').lower())

# Rename
df_test.columns = old_columns_test
df_test = df_test.rename(columns={'self-store':'self_store'})

df_test.columns

Index(['id', 'gender', 'customer_type', 'age', 'type_of_purchase',
       'store_size', 'store_distance', 'instore_wifi',
       'open/close_time_convenient', 'easy_of_online_shopping',
       'store_location', 'toilet_cleaning', 'dressing_room', 'waiting_room',
       'kids_entertainment', 'seller_service', 'showroom', 'self_store',
       'purchase_service', 'store_service', 'cleanliness',
       'carrier_delay_in_minutes', 'delivery_delay_in_minutes'],
      dtype='object')

### Tratando NAs

In [15]:
df1.isna().sum()

id                              0
gender                          0
customer_type                   0
age                             0
type_of_purchase                0
store_size                      0
store_distance                  0
instore_wifi                    0
open/close_time_convenient      0
easy_of_online_shopping         0
store_location                  0
toilet_cleaning                 0
dressing_room                   0
waiting_room                    0
kids_entertainment              0
seller_service                  0
showroom                        0
self_store                      0
purchase_service                0
store_service                   0
cleanliness                     0
carrier_delay_in_minutes        0
delivery_delay_in_minutes     310
satisfaction                    0
dtype: int64

In [16]:
# Remover as linhas que contêm valores 'None' na coluna 'Satisfaction'
df1.dropna(subset=['delivery_delay_in_minutes'], inplace=True)

In [17]:
df1.isna().sum()

id                            0
gender                        0
customer_type                 0
age                           0
type_of_purchase              0
store_size                    0
store_distance                0
instore_wifi                  0
open/close_time_convenient    0
easy_of_online_shopping       0
store_location                0
toilet_cleaning               0
dressing_room                 0
waiting_room                  0
kids_entertainment            0
seller_service                0
showroom                      0
self_store                    0
purchase_service              0
store_service                 0
cleanliness                   0
carrier_delay_in_minutes      0
delivery_delay_in_minutes     0
satisfaction                  0
dtype: int64

In [18]:
df1.shape

(103594, 24)

## Estatística Descritiva

In [19]:
df1.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,103594.0,64942.428625,37460.816597,1.0,32562.25,64890.0,97370.5,129880.0
age,103594.0,39.380466,15.113125,7.0,27.0,40.0,51.0,85.0
store_distance,103594.0,1189.325202,997.297235,31.0,414.0,842.0,1743.0,4983.0
instore_wifi,103594.0,2.729753,1.327866,0.0,2.0,3.0,4.0,5.0
open/close_time_convenient,103594.0,3.060081,1.525233,0.0,2.0,3.0,4.0,5.0
easy_of_online_shopping,103594.0,2.756984,1.398934,0.0,2.0,3.0,4.0,5.0
store_location,103594.0,2.977026,1.277723,0.0,2.0,3.0,4.0,5.0
toilet_cleaning,103594.0,3.202126,1.329401,0.0,2.0,3.0,4.0,5.0
dressing_room,103594.0,3.250497,1.349433,0.0,2.0,3.0,4.0,5.0
waiting_room,103594.0,3.439765,1.318896,0.0,2.0,4.0,5.0,5.0


## EDA Automática - Sweetviz

In [23]:
eda_raw = sweetviz.analyze(df1)
eda_raw.show_html()

                                             |                                             | [  0%]   00:00 ->…

Report SWEETVIZ_REPORT.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [25]:
eda_target = sweetviz.compare_intra(df1, df1['satisfaction'] == 'Neutral or Dissatisfaction', ['Insatisfeito', 'Satisfeito'])
eda_target.show_html()

                                             |                                             | [  0%]   00:00 ->…

Report SWEETVIZ_REPORT.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


## Pré Processamento para ML

### Excluindo variáveis não importantes

Escolha das variáveis conforme análise / notebook da Talitha

In [20]:
df1.columns

Index(['id', 'gender', 'customer_type', 'age', 'type_of_purchase',
       'store_size', 'store_distance', 'instore_wifi',
       'open/close_time_convenient', 'easy_of_online_shopping',
       'store_location', 'toilet_cleaning', 'dressing_room', 'waiting_room',
       'kids_entertainment', 'seller_service', 'showroom', 'self_store',
       'purchase_service', 'store_service', 'cleanliness',
       'carrier_delay_in_minutes', 'delivery_delay_in_minutes',
       'satisfaction'],
      dtype='object')

In [21]:
treino = df1.copy()
teste = df_test.copy()

In [22]:
treino.drop(['id', 'gender', 'carrier_delay_in_minutes', 'delivery_delay_in_minutes'], axis=1, inplace=True)
teste.drop(['id', 'gender', 'carrier_delay_in_minutes', 'delivery_delay_in_minutes'], axis=1, inplace=True)

In [23]:
treino.head()

Unnamed: 0,customer_type,age,type_of_purchase,store_size,store_distance,instore_wifi,open/close_time_convenient,easy_of_online_shopping,store_location,toilet_cleaning,dressing_room,waiting_room,kids_entertainment,seller_service,showroom,self_store,purchase_service,store_service,cleanliness,satisfaction
0,Loyal Customer,13,Personal,Small,460,3,4,3,1,5,3,5,5,4,3,4,4,5,5,Neutral or Dissatisfaction
1,disloyal Customer,25,Gift,Large,235,3,2,3,3,1,3,1,1,1,5,3,1,4,1,Neutral or Dissatisfaction
2,Loyal Customer,26,Gift,Large,1142,2,2,2,2,5,5,5,5,4,3,4,4,4,5,Satisfied
3,Loyal Customer,25,Gift,Large,562,2,5,5,5,2,2,2,2,2,5,3,1,4,2,Neutral or Dissatisfaction
4,Loyal Customer,61,Gift,Large,214,3,3,3,3,4,5,5,3,3,4,4,3,3,3,Satisfied


### Tratar Categóricas

In [24]:
#variável target
treino['satisfaction'] = treino['satisfaction'].map({'Neutral or Dissatisfaction': 0, 'Satisfied': 1})

In [25]:
treino = pd.get_dummies(treino, columns=['customer_type', 'type_of_purchase', 'store_size'], drop_first=True)
teste = pd.get_dummies(teste, columns=['customer_type', 'type_of_purchase', 'store_size'], drop_first=True)

### Tratar Escala

In [26]:
sc = StandardScaler()

In [27]:
treino.head()

Unnamed: 0,age,store_distance,instore_wifi,open/close_time_convenient,easy_of_online_shopping,store_location,toilet_cleaning,dressing_room,waiting_room,kids_entertainment,seller_service,showroom,self_store,purchase_service,store_service,cleanliness,satisfaction,customer_type_disloyal Customer,type_of_purchase_Personal,store_size_Medium,store_size_Small
0,13,460,3,4,3,1,5,3,5,5,4,3,4,4,5,5,0,0,1,0,1
1,25,235,3,2,3,3,1,3,1,1,1,5,3,1,4,1,0,1,0,0,0
2,26,1142,2,2,2,2,5,5,5,5,4,3,4,4,4,5,1,0,0,0,0
3,25,562,2,5,5,5,2,2,2,2,2,5,3,1,4,2,0,0,0,0,0
4,61,214,3,3,3,3,4,5,5,3,3,4,4,3,3,3,1,0,0,0,0


In [28]:
treino.isna().sum()

age                                0
store_distance                     0
instore_wifi                       0
open/close_time_convenient         0
easy_of_online_shopping            0
store_location                     0
toilet_cleaning                    0
dressing_room                      0
waiting_room                       0
kids_entertainment                 0
seller_service                     0
showroom                           0
self_store                         0
purchase_service                   0
store_service                      0
cleanliness                        0
satisfaction                       0
customer_type_disloyal Customer    0
type_of_purchase_Personal          0
store_size_Medium                  0
store_size_Small                   0
dtype: int64

In [29]:
teste.isna().sum()

age                                0
store_distance                     0
instore_wifi                       0
open/close_time_convenient         0
easy_of_online_shopping            0
store_location                     0
toilet_cleaning                    0
dressing_room                      0
waiting_room                       0
kids_entertainment                 0
seller_service                     0
showroom                           0
self_store                         0
purchase_service                   0
store_service                      0
cleanliness                        0
customer_type_disloyal Customer    0
type_of_purchase_Personal          0
store_size_Medium                  0
store_size_Small                   0
dtype: int64

In [30]:
treino[['age', 'store_distance']] = sc.fit_transform(treino[['age', 'store_distance']]) # ajusta as colunas para melhor escala

Aplicando para os dados de teste, porém a escala não é ajustada, mas sim apenas transformada. O ajuste é feito somente nos dados de treino.

In [31]:
teste[['age', 'store_distance']] = sc.transform(teste[['age', 'store_distance']]) # ajusta as colunas para melhor escala

In [32]:
# Separando as variáveis explicativas da variável resposta.
X = treino.drop('satisfaction', axis = 1)
Y = treino['satisfaction']

In [33]:
X.shape

(103594, 20)

In [34]:
Y.shape

(103594,)

In [35]:
X.head()

Unnamed: 0,age,store_distance,instore_wifi,open/close_time_convenient,easy_of_online_shopping,store_location,toilet_cleaning,dressing_room,waiting_room,kids_entertainment,seller_service,showroom,self_store,purchase_service,store_service,cleanliness,customer_type_disloyal Customer,type_of_purchase_Personal,store_size_Medium,store_size_Small
0,-1.745542,-0.731305,3,4,3,1,5,3,5,5,4,3,4,4,5,5,0,1,0,1
1,-0.951526,-0.956916,3,2,3,3,1,3,1,1,1,5,3,1,4,1,1,0,0,0
2,-0.885358,-0.047454,2,2,2,2,5,5,5,5,4,3,4,4,4,5,0,0,0,0
3,-0.951526,-0.629028,2,5,5,5,2,2,2,2,2,5,3,1,4,2,0,0,0,0
4,1.430521,-0.977973,3,3,3,3,4,5,5,3,3,4,4,3,3,3,0,0,0,0


In [36]:
# Validação cruzada Leave One Out
kf = StratifiedKFold(n_splits = 10)

In [37]:
# Verificando a performance da Regressão Logística

lista_de_medidas = ['precision']

modelo = LogisticRegression()
accs_vc = cross_validate(modelo, X, Y, cv = kf, scoring = lista_de_medidas)

In [38]:
print(accs_vc['test_precision'].mean())

0.8695809464363176


In [39]:
# verificando a performance da Regressão Logística
def MachineLearning(dataset):

    X = treino.drop('satisfaction', axis = 1)
    y = treino['satisfaction']

    lista_de_medidas = ['accuracy', 'recall', 'precision', 'balanced_accuracy', 'f1']

    nome_das_medidas = ['acurácia', 'sensibilidade', 'vpp', 'eficiência', 'f1-score']

    lista_de_modelos = [LogisticRegression(),
                        DecisionTreeClassifier(max_depth = 3),
                        DecisionTreeClassifier(max_depth = 5),
                        DecisionTreeClassifier(max_depth = 7),
                        KNeighborsClassifier(n_neighbors = 5),
                        KNeighborsClassifier(n_neighbors = 15),
                        KNeighborsClassifier(n_neighbors = 25),
                        BaggingClassifier(),
                        RandomForestClassifier(n_estimators=300, max_depth = 3, max_features = 3),
                        RandomForestClassifier(n_estimators=300, max_depth = 3, max_features = 4),
                        RandomForestClassifier(n_estimators=300, max_depth = 6, max_features = 3),
                        RandomForestClassifier(n_estimators=300, max_depth = 6, max_features = 4)]

    nome_dos_modelos = ['Regressão Logística',
                        'Árvore (prof = 3)',
                        'Árvore (prof = 5)',
                        'Árvore (prof = 7)',
                        '5-NN',
                        '15-NN',
                        '25-NN',
                        'Bagging',
                        'Random Forest (prof = 3, mf = 3)',
                        'Random Forest (prof = 3, mf = 4)',
                        'Random Forest (prof = 6, mf = 3)',
                        'Random Forest (prof = 6, mf = 4)']

    resultados0 = {}

    for i in range(len(lista_de_modelos)):
        print('Rodando modelo: ' + nome_dos_modelos[i])
        accs_vc = cross_validate(lista_de_modelos[i], X, y, cv = kf, scoring = lista_de_medidas)

        acc = accs_vc['test_accuracy'].mean()
        sen = accs_vc['test_recall'].mean()
        vpp = accs_vc['test_precision'].mean()
        bac = accs_vc['test_balanced_accuracy'].mean()
        f1s = accs_vc['test_f1'].mean()

        resultados0[nome_dos_modelos[i]] = [acc, sen, vpp, f1s, bac]
    
    resultados = pd.DataFrame(resultados0, index = nome_das_medidas).T

    return resultados

In [40]:
MachineLearning(treino)

Rodando modelo: Regressão Logística
Rodando modelo: Árvore (prof = 3)
Rodando modelo: Árvore (prof = 5)
Rodando modelo: Árvore (prof = 7)
Rodando modelo: 5-NN
Rodando modelo: 15-NN
Rodando modelo: 25-NN
Rodando modelo: Bagging
Rodando modelo: Random Forest (prof = 3, mf = 3)
Rodando modelo: Random Forest (prof = 3, mf = 4)
Rodando modelo: Random Forest (prof = 6, mf = 3)
Rodando modelo: Random Forest (prof = 6, mf = 4)


Unnamed: 0,acurácia,sensibilidade,vpp,eficiência,f1-score
Regressão Logística,0.874433,0.835624,0.869581,0.852245,0.869871
Árvore (prof = 3),0.884627,0.906586,0.839926,0.871981,0.887208
Árvore (prof = 5),0.905081,0.875471,0.902592,0.888822,0.901601
Árvore (prof = 7),0.924745,0.891775,0.931703,0.911276,0.920869
5-NN,,,,,
15-NN,,,,,
25-NN,,,,,
Bagging,0.958019,0.933002,0.968984,0.950648,0.955078
"Random Forest (prof = 3, mf = 3)",0.894598,0.834755,0.91463,0.872845,0.887564
"Random Forest (prof = 3, mf = 4)",0.894434,0.839254,0.910177,0.873261,0.887948


## Submetendo uma predição no Kaggle

In [41]:
# modelo Bagging
def Submissao(treino, teste, modelo, nome = 'submissaoKaggle'):

    X = treino.drop('satisfaction', axis = 1)
    y = treino['satisfaction']
 
    modelo.fit(X, y)
    pred = modelo.predict(teste)

    url = '../data/raw/test.csv' 
    subm = pd.read_csv(url, usecols = ['id'])  
    subm['satisfaction'] = pred
    
    subm.to_csv(nome + '.csv', index = False)

In [43]:
bg = BaggingClassifier()

Submissao(treino, teste, modelo = bg, nome = 'SubmissaoBG-setup1')