## 0.0 Imports

In [None]:
import sweetviz
import inflection
import optuna
import warnings

import numpy  as np
import pandas as pd

from sklearn.tree              import DecisionTreeClassifier
from sklearn.svm               import SVC
from sklearn.naive_bayes       import GaussianNB
from sklearn.linear_model      import LogisticRegression
from sklearn.metrics           import precision_score, accuracy_score
from boruta                    import BorutaPy
from sklearn                   import metrics       as mt
from xgboost                   import XGBClassifier
from lightgbm                  import LGBMClassifier
from catboost                  import CatBoostClassifier
from sklearn.ensemble          import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing     import LabelEncoder
from sklearn.model_selection   import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.feature_selection import RFECV


warnings.filterwarnings('ignore')

### 0.1 Loading Data

In [None]:
# Load the raw training data; the path is relative to the working directory.
df_raw = pd.read_csv('data/train.csv')

In [None]:
# Preview the first rows of the raw training data.
df_raw.head()

In [None]:
# List the original (not yet renamed) column labels.
df_raw.columns

In [None]:
# (rows, columns) of the raw training data.
df_raw.shape

### 0.2 Helper Functions

In [None]:
def classification_metrics(model, y_val, y_hat):
    """Summarise precision, accuracy, recall and F1 for a set of predictions.

    Parameters
    ----------
    model : type or estimator instance
        Used only to label the report. An estimator class contributes its
        ``__name__``; an instance contributes the name of its class.
    y_val : array-like
        Ground-truth labels.
    y_hat : array-like
        Predicted labels, aligned with ``y_val``.

    Returns
    -------
    pandas.DataFrame
        A single-column frame indexed by 'Model', 'Precision', 'Accuracy',
        'Recall' and 'F1'.
    """
    # Accept either the estimator class or an instance of it; instances have
    # no __name__ attribute of their own.
    model_name = getattr(model, '__name__', type(model).__name__)

    precision_micro = mt.precision_score(y_val, y_hat, average='micro')
    accuracy = mt.accuracy_score(y_val, y_hat)
    # No `average` argument is passed here, so recall and F1 use
    # scikit-learn's default (binary) averaging, not micro-averaging.
    recall = mt.recall_score(y_val, y_hat)
    f1 = mt.f1_score(y_val, y_hat)

    return pd.DataFrame([model_name, precision_micro, accuracy, recall, f1],
                        index=['Model', 'Precision', 'Accuracy', 'Recall', 'F1'])

def data_cleaning(df1):
    """Return a cleaned copy of ``df1`` with normalised column names.

    Every column label is passed through ``inflection.parameterize`` with
    ``'_'`` as separator, and every missing value is replaced by 0.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame to clean. It is left untouched.

    Returns
    -------
    pandas.DataFrame
        New frame with renamed columns and no missing values.
    """
    # rename() builds a new frame, so the caller's column labels are not
    # rewritten as a side effect.
    renamed = df1.rename(columns=lambda name: inflection.parameterize(name, separator='_'))

    return renamed.fillna(0)

def encoding_data(df4, columns=None):
    """Label-encode categorical columns and return the encoded copy.

    Parameters
    ----------
    df4 : pandas.DataFrame
        Frame to encode. It is left untouched.
    columns : iterable of str, optional
        Columns to encode. When omitted, every column of ``df4`` whose dtype
        is neither ``float64`` nor ``int64`` is encoded. Deriving the list
        from ``df4`` itself (instead of a notebook-level variable) lets the
        function work on frames that lack some training columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df4`` with the selected columns replaced by integer codes.
    """
    if columns is None:
        columns = df4.select_dtypes(exclude=['float64', 'int64']).columns

    encoded = df4.copy()

    for column in columns:
        # Each column gets its own encoder; the codes of one column never
        # depend on the values of another.
        encoded[column] = LabelEncoder().fit_transform(encoded[column])

    return encoded


def feature_eng(df21):
    """Add per-row aggregate features computed over the survey rating columns.

    New columns: ``average_rating``, ``max_rating``, ``min_rating``,
    ``var_rating`` (row-wise mean / max / min / variance of the survey
    columns), ``count_positive`` (number of ratings above 3) and
    ``count_negative`` (number of ratings of 3 or below).

    Parameters
    ----------
    df21 : pandas.DataFrame
        Frame containing every survey column listed below. It is left
        untouched.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df21`` with the six aggregate columns appended.

    Raises
    ------
    KeyError
        If any survey column is missing from ``df21``.
    """
    survey_features = ['instore_wifi', 'open_close_time_convenient', 'easy_of_online_shopping', 'store_location',
                       'toilet_cleaning', 'dressing_room', 'waiting_room', 'kids_entertainment', 'seller_service',
                       'showroom', 'self-store', 'purchase_service', 'store_service', 'cleanliness']

    # Select the rating columns once and reuse the selection for every aggregate.
    ratings = df21[survey_features]
    enriched = df21.copy()

    enriched['average_rating'] = ratings.mean(axis=1)
    enriched['max_rating'] = ratings.max(axis=1)
    enriched['min_rating'] = ratings.min(axis=1)
    enriched['var_rating'] = ratings.var(axis=1)

    # Vectorised comparisons replace the per-row Python lambda; missing values
    # compare as False on both sides, exactly as before.
    enriched['count_positive'] = (ratings > 3).sum(axis=1)
    enriched['count_negative'] = (ratings <= 3).sum(axis=1)

    return enriched


# LGBM Classifier
# Define the objective function for Optuna optimization LGBM
def objective_lgbm(trial):
    """Optuna objective: micro-averaged validation precision of an LGBMClassifier.

    Draws one hyperparameter configuration from ``trial``, fits the model on
    the notebook-level ``X_train`` / ``y_train`` and scores its predictions
    on ``X_val`` / ``y_val``.

    Parameters
    ----------
    trial : optuna trial
        Object providing ``suggest_float`` / ``suggest_int``.

    Returns
    -------
    float
        Micro-averaged precision on the validation split (to be maximised).
    """
    # Hyperparameter search space.
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'num_leaves': trial.suggest_int('num_leaves', 15, 50),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 20),
        'subsample': trial.suggest_float('subsample', 0.8, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.8, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1.0),
    }

    # LightGBM only subsamples rows when the bagging frequency is positive;
    # with the default subsample_freq=0 the tuned `subsample` value is ignored.
    # A final model built from the best parameters must also pass
    # subsample_freq=1 to reproduce the scores found here.
    lgb_model = LGBMClassifier(**params, subsample_freq=1)
    lgb_model.fit(X_train, y_train)

    # Score on the validation split.
    y_pred = lgb_model.predict(X_val)

    return precision_score(y_val, y_pred, average='micro')

# XGBoost Classifier
# Define the objective function for Optuna optimization for XGBoost Classifier
def objective_xgb(trial):
    """Optuna objective: micro-averaged validation precision of an XGBClassifier.

    Draws one configuration from ``trial``, fits on the notebook-level
    ``X_train`` / ``y_train`` and returns the micro-averaged precision of the
    predictions for ``X_val`` against ``y_val``.
    """
    # Search space; the draw order is kept so the sampler sees the same sequence.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1),
        n_estimators=trial.suggest_int('n_estimators', 100, 500),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 5),
        subsample=trial.suggest_float('subsample', 0.8, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.8, 1.0),
        gamma=trial.suggest_float('gamma', 0, 0.2),
        reg_alpha=trial.suggest_float('reg_alpha', 0, 1.0),
        reg_lambda=trial.suggest_float('reg_lambda', 0, 1.0),
    )

    # Fit a candidate model with the sampled configuration.
    candidate = XGBClassifier(**search_space)
    candidate.fit(X_train, y_train)

    # Score the candidate on the validation split.
    val_predictions = candidate.predict(X_val)
    return precision_score(y_val, val_predictions, average='micro')

# CatBoostClassifier
def objective_cbc(trial):
    """Optuna objective: micro-averaged validation precision of a CatBoostClassifier.

    Draws one configuration from ``trial``, fits silently on the
    notebook-level ``X_train`` / ``y_train`` and returns the micro-averaged
    precision of the predictions for ``X_val`` against ``y_val``.
    """
    # Search space; the draw order is kept so the sampler sees the same sequence.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1),
        iterations=trial.suggest_int('iterations', 100, 500),
        depth=trial.suggest_int('depth', 3, 10),
        l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 1, 10),
        subsample=trial.suggest_float('subsample', 0.8, 1.0),
        colsample_bylevel=trial.suggest_float('colsample_bylevel', 0.8, 1.0),
        border_count=trial.suggest_int('border_count', 32, 255),
    )

    # Fit a candidate model with the sampled configuration.
    candidate = CatBoostClassifier(**search_space, verbose=False)
    candidate.fit(X_train, y_train)

    # The returned score is precision (micro-averaged), not accuracy.
    val_predictions = candidate.predict(X_val)
    return precision_score(y_val, val_predictions, average='micro')



# Random Forest Classifier Optuna
def objective_rfc(trial):
    """Optuna objective: micro-averaged validation precision of a RandomForestClassifier.

    Draws one configuration from ``trial``, fits a forest seeded with
    ``random_state=42`` on the notebook-level ``X_train`` / ``y_train`` and
    returns the micro-averaged precision of the predictions for ``X_val``
    against ``y_val``.
    """
    # Search space; the draw order is kept so the sampler sees the same sequence.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 500),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 5),
        max_features=trial.suggest_float('max_features', 0.1, 1.0),
    )

    # Fit a candidate forest with the sampled configuration.
    candidate = RandomForestClassifier(**search_space, random_state=42)
    candidate.fit(X_train, y_train)

    # The returned score is precision (micro-averaged), not accuracy.
    val_predictions = candidate.predict(X_val)
    return precision_score(y_val, val_predictions, average='micro')

## 1.0 Data Description

In [None]:
# Checkpoint copy: later steps modify df1 while df_raw stays as loaded.
df1 = df_raw.copy()

In [None]:
# Preview the first rows of the working copy.
df1.head()

### 1.1 Data Shape

In [None]:
# (rows, columns) of the working copy.
df1.shape

### 1.2 Rename Columns

In [None]:
# Normalise every column label with inflection.parameterize, using '_' as the
# separator, and assign the new labels back to df1.
cols_old = df1.columns
cols_new = cols_old.map(lambda x: inflection.parameterize(x, separator='_'))
df1.columns = cols_new

### 1.3 Check NaN

In [None]:
# Number of missing values per column.
df1.isna().sum()

In [None]:
# Replace every missing value, in every column, with 0.
df1 = df1.fillna(0)

### 1.4 Change Types

In [None]:
# Inspect the dtype of each column.
df1.dtypes

### 1.5 Descriptive Statistics

In [None]:
# Split the frame by dtype: float64/int64 columns versus everything else.
numerical_attributes = df1.select_dtypes(include=['float64','int64'])
categorical_attributes = df1.select_dtypes(exclude=['float64','int64'])

In [None]:
# Descriptive statistics per numeric column. The lambda computes the range
# (max - min); the result is transposed so each row is one column of the data,
# and the lambda's column is renamed to 'range'.
disp = numerical_attributes.agg(['min', 'max', lambda x: max(x) - min(x), 'mean', 'median','std','skew','kurtosis']).T.rename(columns={'<lambda>':'range'})
disp

In [None]:
# Display the non-numeric columns.
categorical_attributes

In [None]:
# #sweetviz
# report = sweetviz.analyze(df1)
# report.show_html()

## 2.0 Feature Engineering

In [None]:
# Checkpoint copy for the feature-engineering stage.
df2 = df1.copy()

In [None]:
# df2 = feature_eng(df2)

## 3.0 EDA

In [None]:
# Checkpoint copy for the EDA stage.
df3 = df2.copy()

In [None]:
# Preview the first rows of the EDA frame.
df3.head()

## 4.0 Pre-Processing

In [None]:
# Checkpoint copy for the pre-processing stage.
df4 = df3.copy()

In [None]:
# Start again from a fresh copy so df3 keeps the un-encoded categorical values.
df4 = df3.copy()

# One encoder instance is re-fitted for each column in turn, so after the loop
# it only holds the classes of the last column.
le = LabelEncoder()

# Categorical columns to convert into integer codes.
colunas = ['gender', 'customer_type', 'type_of_purchase', 'store_size']

for coluna in colunas:
    df4[coluna] = le.fit_transform(df4[coluna])

In [None]:
# Binary target: 1 when the label is exactly 'Satisfied', 0 for anything else.
df4['satisfaction'] = (df4['satisfaction'] == 'Satisfied').astype('int64')

In [None]:
# Automated column selector (random-forest feature importances)
# rf = RandomForestClassifier(n_jobs=-1)

# X_imp = df4.drop('satisfaction', axis=1).copy()
# y_imp = df4['satisfaction'].copy()

# rf.fit(X_imp, y_imp)

# importances = rf.feature_importances_
# ranking_columns = X_imp.columns

In [None]:
# pd.DataFrame({'Feature':ranking_columns, 'Ranking':importances}).sort_values(by='Ranking', ascending=False)

In [None]:
# #RFE
# rfe = RFECV(estimator=rf, cv=5, n_jobs=-1)

# #train RFE
# X_rfe = rfe.fit_transform(X_imp, y_imp)

# selected_columns = X_imp.columns[rfe.support_]

In [None]:
# # Columns selected by RFECV

# selected_columns = ['id', 'customer_type', 'age', 'type_of_purchase', 'store_size',
#                     'store_distance', 'instore_wifi', 'open_close_time_convenient',
#                     'easy_of_online_shopping', 'store_location', 'dressing_room',
#                     'waiting_room', 'kids_entertainment', 'seller_service', 'showroom',
#                     'self-store', 'purchase_service', 'store_service', 'cleanliness',
#                     'delivery_delay_in_minutes', 'average_rating', 'max_rating',
#                     'min_rating', 'var_rating', 'count_positive', 'count_negative']

In [None]:
# Bernardo's column selection

# selected_columns = ['min_rating','count_positive','average_rating', 'dressing_room','waiting_room',
#                     'max_rating','seller_service','showroom','cleanliness','instore_wifi','self-store',
#                     'toilet_cleaning', 'easy_of_online_shopping','age']

In [None]:
# Default dataset columns: every column except the target is used as a feature.
# NOTE(review): 'id' is included as a model feature — confirm this is intended.

selected_columns = ['id', 'gender', 'customer_type', 'age', 'type_of_purchase',
       'store_size', 'store_distance', 'instore_wifi',
       'open_close_time_convenient', 'easy_of_online_shopping',
       'store_location', 'toilet_cleaning', 'dressing_room', 'waiting_room',
       'kids_entertainment', 'seller_service', 'showroom', 'self-store',
       'purchase_service', 'store_service', 'cleanliness',
       'carrier_delay_in_minutes', 'delivery_delay_in_minutes'
       ]

In [None]:
# Apply Boruta for feature selection

# rf = RandomForestClassifier(n_jobs=-1, n_estimators=500, oob_score=True, max_depth=6)

# X_boruta = df4.drop('satisfaction', axis=1).values.copy()
# y_boruta = df4['satisfaction'].values.copy()

# feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=42)
# feat_selector.fit(X_boruta, y_boruta)

In [None]:
# cols_selected = feat_selector.support_.tolist()
# cols_selected

## 5.0 Machine Learning

In [None]:
# Checkpoint copy for the modelling stage.
df5 = df4.copy()

### 5.1 Split Train-Validation

In [None]:
# Feature matrix restricted to the selected columns.
X = df5.loc[:,selected_columns].copy()

# Binary target.
y = df5['satisfaction'].copy()

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical between runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

### 5.2 Test Models

In [None]:
# model_list = [RandomForestClassifier,
#              XGBClassifier,
#              CatBoostClassifier,
#              LGBMClassifier]

# for model in model_list:
#     if model.__name__ == 'CatBoostClassifier':
#         mds = model(verbose=False)
#     else:
#         mds = model()
#     kfold = KFold(n_splits=10)
#     s = cross_val_score(mds, X, y, scoring='precision_micro', cv=kfold)
#     print(f"{model.__name__:22} Precision_micro: {s.mean():.5f} +/- {s.std():.3f} ")

### 5.3 XGBoost

In [None]:
# Baseline XGBoost with default hyperparameters.
#define model
xgb_model = XGBClassifier()

#train model
xgb_model.fit(X_train, y_train)

#predict on the validation split and summarise the metrics
yhat_xgb = xgb_model.predict(X_val)
xgb_metrics = classification_metrics(XGBClassifier, y_val, yhat_xgb)
xgb_metrics

#### 5.3.1 XGBoost Fine Tune

In [None]:
# # Perform Optuna optimization to find the best hyperparameters
# study = optuna.create_study(direction='maximize')
# study.optimize(objective_xgb, n_trials=100)

# # Get the best hyperparameters
# best_params = study.best_params
# print("Best hyperparameters:", best_params)

In [None]:
# Best XGBoost hyperparameters recorded from an earlier Optuna run
# (trial 1 finished with value 0.9658341754487272).

best_params = {'learning_rate': 0.03675463577871452, 'n_estimators': 358, 'max_depth': 9, 'min_child_weight': 3, 
             'subsample': 0.9070981291095677, 'colsample_bytree': 0.8654054658354458, 'gamma': 0.1559437479399407, 
             'reg_alpha': 0.3193815108502869, 'reg_lambda': 0.816066093937444}


# Best is trial 1 with value: 0.9658341754487272.

In [None]:
# # Train the XGBClassifier with the best hyperparameters on the entire training dataset
# best_xgb_model = XGBClassifier(**best_params)
# best_xgb_model.fit(X_train, y_train)

In [None]:
# # Evaluate the model on the testing set
# y_pred = best_xgb_model.predict(X_val)
# precision_micro = precision_score(y_val, y_pred, average='micro')
# print("Micro-averaged Precision on the testing set:", precision_micro)

In [None]:
# # Define the hyperparameter grid for Grid Search
# param_grid = {
#     'learning_rate': [0.01, 0.05, 0.1],
#     'n_estimators': [100, 150, 200],
#     'max_depth': [3, 4, 5],
#     'min_child_weight': [1, 3, 5],
#     'subsample': [0.8, 0.9, 1.0],
#     'colsample_bytree': [0.8, 0.9, 1.0],
#     'gamma': [0, 0.1, 0.2],
#     'reg_alpha': [0, 0.1, 0.5],
#     'reg_lambda': [0, 0.1, 1.0],
# }

# # Perform Grid Search to find the best hyperparameters
# grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, n_jobs=-1)
# grid_search.fit(X_train, y_train)

# # Get the best hyperparameters
# best_params = grid_search.best_params_
# print("Best hyperparameters:", best_params)

### 5.4 Random Forest

In [None]:
# Baseline random forest with default hyperparameters.
# NOTE(review): no random_state is set, so the metrics can vary between runs.
#define model
rf_model = RandomForestClassifier()

#train model
rf_model.fit(X_train, y_train)

#predict on the validation split and summarise the metrics
yhat_rf = rf_model.predict(X_val)
rf_metrics = classification_metrics(RandomForestClassifier, y_val, yhat_rf)
rf_metrics

#### 5.4.1 Random Forest Fine Tune

In [None]:
# # Perform Optuna optimization to find the best hyperparameters
# study = optuna.create_study(direction='maximize')
# study.optimize(objective_rfc, n_trials=100)

# # Get the best hyperparameters
# best_params = study.best_params
# print("Best hyperparameters:", best_params)

In [None]:
# Best random-forest hyperparameters recorded from an earlier Optuna run
# (trial 11 finished with value 0.9550069775275492).

best_params = {'n_estimators': 217, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 0.43036001596791873}

#Best is trial 11 with value: 0.9550069775275492

In [None]:
# # Train the RandomForestClassifier with the best hyperparameters on the entire training dataset
# best_rf_model = RandomForestClassifier(**best_params, random_state=42)
# best_rf_model.fit(X_train, y_train)

# # Evaluate the model on the testing set
# y_pred = best_rf_model.predict(X_val)
# accuracy_micro = accuracy_score(y_val, y_pred)
# print("Micro-averaged Accuracy on the testing set:", accuracy_micro)

In [None]:
# # Define the hyperparameter grid for Grid Search
# param_grid = {
#     'n_estimators': [100, 200, 300],     # Number of trees in the forest
#     'max_depth': [None, 10, 20, 30],     # Maximum depth of each tree
#     'min_samples_split': [2, 5, 10],     # Minimum number of samples required to split an internal node
#     'min_samples_leaf': [1, 2, 4],       # Minimum number of samples required to be at a leaf node
#     'criterion': ['gini', 'entropy'],    # Function to measure the quality of a split
#     'bootstrap': [True, False]           # Whether to use bootstrapping for training trees
# }

# # Perform Grid Search to find the best hyperparameters
# grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1)
# grid_search.fit(X_train, y_train)

# # Get the best hyperparameters
# best_params = grid_search.best_params_
# print("Best hyperparameters:", best_params)

# Best hyperparameters: {'bootstrap': False, 'criterion': 'entropy', 
#                        'max_depth': None, 'max_features': 'auto', 
#                        'min_samples_leaf': 1, 'min_samples_split': 10, 
#                        'n_estimators': 100}

### 5.5 CatBoost

In [None]:
# Baseline CatBoost with default hyperparameters and training output silenced.
#define model
cbc_model = CatBoostClassifier(verbose=False)

#train model
cbc_model.fit(X_train, y_train)

#predict on the validation split and summarise the metrics
yhat_cat = cbc_model.predict(X_val)
cat_metrics = classification_metrics(CatBoostClassifier, y_val, yhat_cat)
cat_metrics

In [None]:
# cbc_model = CatBoostClassifier(learning_rate=0.07387672224251493, iterations=359, depth=8, l2_leaf_reg=5.734460480597243, 
#                                 subsample=0.9402392714951302, colsample_bylevel=0.8203118238462109, border_count=180, verbose=False)

# #train model
# cbc_model.fit(X_train, y_train)

# #predict
# yhat_cat = cbc_model.predict(X_val)
# cat_metrics = classification_metrics(CatBoostClassifier, y_val, yhat_cat)
# cat_metrics

#### 5.5.1 CatBoost Fine Tune

In [None]:
# Perform Optuna optimization to find the best hyperparameters.
# The sampler is seeded so the sequence of suggested configurations is the
# same on every run of the notebook.
study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective_cbc, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params
print("Best hyperparameters:", best_params)

In [None]:
# Best CatBoost hyperparameters recorded from an Optuna run
# (trial 47 finished with value 0.9669890765603195).
# This assignment replaces whatever value best_params held before this cell.


best_params = {'learning_rate': 0.07387672224251493, 'iterations': 359, 'depth': 8, 'l2_leaf_reg': 5.734460480597243, 
               'subsample': 0.9402392714951302, 'colsample_bylevel': 0.8203118238462109, 'border_count': 180}


#Best is trial 47 with value: 0.9669890765603195.

In [None]:
# # Train the CatBoostClassifier with the best hyperparameters on the entire training dataset
# best_cat_model = CatBoostClassifier(**best_params, verbose=False)
# best_cat_model.fit(X_train, y_train)
# # Evaluate the model on the testing set
# y_pred = best_cat_model.predict(X_val)
# accuracy_micro = accuracy_score(y_val, y_pred)
# print("Micro-averaged Accuracy on the testing set:", accuracy_micro)

In [None]:
# catboost_model = CatBoostClassifier()

# param_grid = {
#     'iterations': [100, 150, 200],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'depth': [4, 6, 8],
#     'l2_leaf_reg': [1, 3, 5]
# }

# grid_search = GridSearchCV(estimator=catboost_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=False)
# grid_search.fit(X_train, y_train)

In [None]:
# # Get the best hyperparameters and train the CatBoostClassifier with them
# best_params = grid_search.best_params_
# best_params = {'depth': 8, 'iterations': 200, 'l2_leaf_reg': 5, 'learning_rate': 0.1}

In [None]:
# # Create a new CatBoostClassifier with the best parameters
# best_catboost_model = CatBoostClassifier(
#     iterations=best_params['iterations'],
#     learning_rate=best_params['learning_rate'],
#     depth=best_params['depth'],
#     l2_leaf_reg=best_params['l2_leaf_reg'],
#     verbose=False
# )

# # Train the model on the entire training dataset
# best_catboost_model.fit(X_train, y_train)
# y_pred = best_catboost_model.predict(X_val)
# accuracy = mt.accuracy_score(y_pred, y_val)
# accuracy

### 5.6 LGBMClassifier

In [None]:
# Baseline LightGBM with default hyperparameters.
#define model
lgbm_model = LGBMClassifier()

#train model
lgbm_model.fit(X_train, y_train)

#predict on the validation split and summarise the metrics
yhat_lgb = lgbm_model.predict(X_val)
lgb_metrics = classification_metrics(LGBMClassifier, y_val, yhat_lgb)
lgb_metrics

#### 5.6.1 LGBMClassifier Fine Tune

In [None]:
# # Perform Optuna optimization to find the best hyperparameters
# study = optuna.create_study(direction='maximize')
# study.optimize(objective_lgbm, n_trials=100)

# # Get the best hyperparameters
# best_params = study.best_params
# print("Best hyperparameters:", best_params)

In [None]:
# 11 finished with value: 0.9657379336894278 and parameters: 
# {'learning_rate': 0.0987538633591418, 'n_estimators': 378, 'max_depth': 8, 'num_leaves': 31, 'min_child_samples': 1, 
# 'subsample': 0.912601414559834, 'colsample_bytree': 0.875643265547526, 
# 'reg_alpha': 0.9765911589066228, 'reg_lambda': 0.3294979414734899}. 
# Best is trial 11 with value: 0.9657379336894278.

In [None]:
# Best LightGBM hyperparameters, copied from the recorded Optuna result
# (trial 11, value 0.9657379336894278).
best_params = {'learning_rate': 0.0987538633591418, 'n_estimators': 378, 'max_depth': 8, 'num_leaves': 31, 'min_child_samples': 1, 
'subsample': 0.912601414559834, 'colsample_bytree': 0.875643265547526, 
'reg_alpha': 0.9765911589066228, 'reg_lambda': 0.3294979414734899}

In [None]:
# # Train the LGBMClassifier with the best hyperparameters on the entire training dataset
# best_lgb_model = LGBMClassifier(**best_params)
# best_lgb_model.fit(X_train, y_train)

# # Evaluate the model on the testing set
# y_pred = best_lgb_model.predict(X_val)
# precision_micro = precision_score(y_val, y_pred, average='micro')
# print("Micro-averaged Precision on the testing set:", precision_micro)

In [None]:
# # Define the hyperparameter grid for Grid Search
# param_grid = {
#     'learning_rate': [0.01, 0.05, 0.1],
#     'n_estimators': [100, 150, 200],
#     'max_depth': [3, 4, 5],
#     'num_leaves': [15, 31, 63],
#     'min_child_samples': [1, 5, 10],
#     'subsample': [0.8, 0.9, 1.0],
#     'colsample_bytree': [0.8, 0.9, 1.0],
#     'reg_alpha': [0, 0.1, 0.5],
#     'reg_lambda': [0, 0.1, 1.0],
# }

# # Perform Grid Search to find the best hyperparameters
# grid_search = GridSearchCV(estimator=lgb_model, param_grid=param_grid, cv=3, n_jobs=-1)
# grid_search.fit(X_train, y_train)

# # Get the best hyperparameters
# best_params = grid_search.best_params_
# print("Best hyperparameters:", best_params)

## 6.0 Ensemble Learning

In [None]:
# Checkpoint copy for the ensemble stage.
df6 = df5.copy()

In [None]:
# Tuned base learners; the hyperparameter values are hard-coded here.
cbc_model = CatBoostClassifier(learning_rate=0.07387672224251493, iterations=359, depth=8, l2_leaf_reg=5.734460480597243, 
                                subsample=0.9402392714951302, colsample_bylevel=0.8203118238462109, border_count=180, verbose=False)

rf_model = RandomForestClassifier(bootstrap=False, criterion='entropy', n_estimators=217, max_depth=10, min_samples_split=8,
                                  min_samples_leaf=1, max_features=0.43036001596791873, n_jobs=-1, random_state=42)

xgb_model = XGBClassifier(learning_rate=0.03675463577871452,  n_estimators=358, max_depth=9, min_child_weight=3, 
                          subsample=0.9070981291095677, colsample_bytree=0.8654054658354458, gamma=0.1559437479399407, 
                          reg_alpha=0.3193815108502869, reg_lambda=0.816066093937444, random_state=42, n_jobs=-1)

lgbm_model = LGBMClassifier(learning_rate=0.0987538633591418, n_estimators=378, max_depth=8, num_leaves=31, min_child_samples=1, 
                            subsample=0.912601414559834, colsample_bytree=0.875643265547526, reg_alpha=0.9765911589066228, 
                            reg_lambda=0.3294979414734899, random_state=42, n_jobs=-1)


# Untuned base learners.
logreg_model = LogisticRegression(max_iter=1000)

nb_model = GaussianNB()

# Seeded like the other tree-based members so the ensemble's validation score
# is repeatable between runs.
dt_model = DecisionTreeClassifier(random_state=42)


# Voting Classifier Model
voting_clf = VotingClassifier(estimators=[('cbc', cbc_model), ('random_forest', rf_model), ('xgboost', xgb_model),
                                            ('lgbm', lgbm_model), ('logreg', logreg_model),
                                            ('gaussian', nb_model), ('decisiontree', dt_model)],
                            voting='soft', # 'soft' votes with predicted probabilities, 'hard' votes with predicted labels
                            weights=[3, 1, 1, 1, 1, 1, 1], # per-estimator weights, in the order of `estimators`; CatBoost counts three times
                            n_jobs=-1)

voting_clf.fit(X_train, y_train)

y_pred_vot = voting_clf.predict(X_val)

# Micro-averaged precision of the ensemble on the validation split.
result = mt.precision_score(y_val, y_pred_vot, average='micro')
result

## 7.0 Submission

In [None]:
# Load the test data; the path is relative to the working directory.
X_test = pd.read_csv('data/test.csv')

In [None]:
# Normalise column names and fill missing values with 0 via data_cleaning.
X_test = data_cleaning(X_test)
# X_test = feature_eng(X_test)

In [None]:
colunas_test = ['gender', 'customer_type', 'type_of_purchase', 'store_size']

for coluna in colunas_test:
    # Fit on the un-encoded training column (df3) and only *transform* the
    # test column, so each category maps to the same integer code the models
    # were trained on. Re-fitting on the test data could assign different
    # codes whenever the two sets of categories differ. A category that was
    # never seen in training now raises ValueError instead of being mis-coded.
    le = LabelEncoder().fit(df3[coluna])
    X_test[coluna] = le.transform(X_test[coluna])

### 7.1 Predict on test

In [None]:
# Restrict both frames to the model's feature columns, in the same order.
# X was already built from selected_columns, so the first line leaves its
# columns unchanged.
X = X.loc[:, selected_columns]
X_test = X_test.loc[:, selected_columns]

In [None]:
# Refit the voting ensemble on the full labelled data (X, y) rather than only
# the training split.
voting_clf.fit(X, y)

In [None]:
# yhat_test = voting_clf.predict(X_test)
# submission_df = X_test.copy()
# submission_df['Satisfaction'] = yhat_test
# submission_df = submission_df.loc[:,['id','Satisfaction']]
# submission_df.to_csv('submission/submission_hypertune_05.csv', index=False)