# Case Ifood
*Desenvolvido por Mário de Deus*

# Installs

In [None]:
#No Google Colab será necessário executar as instalações abaixo a cada nova sessão.
#No Jupyter Notebook, Jupyter Lab ou VSCode (localhost) basta instalar uma única vez.
!pip install numba==0.60.0
!pip install pycaret==3.3.2
!pip install shap==0.47.1

# Imports

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import shap

from pycaret.classification import *

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 40)
pd.set_option('display.max_colwidth', 1000)

import warnings
warnings.filterwarnings('ignore')

# Descrição / Objetivo do problema

* O objetivo

O objetivo da equipe é construir um modelo preditivo que produza o maior lucro para a próxima campanha de marketing direto, programada para o próximo mês. A nova campanha, a sexta, visa a venda de um novo gadget para clientes cadastrados no Banco de Dados da empresa. Para construir o modelo, foi realizada uma campanha piloto envolvendo 2.240 clientes. Os clientes foram selecionados aleatoriamente e contatados por telefone para a aquisição do gadget. Durante os meses seguintes, os clientes que compraram a oferta foram devidamente etiquetados. O custo total da campanha da amostra foi de 6.720 MU e a receita gerada pelos clientes que aceitaram a oferta foi de 3.674 MU. Globalmente, a campanha teve um lucro de -3.046 MU. A taxa de sucesso da campanha foi de 15%. O objetivo da equipe é desenvolver um modelo que preveja o comportamento do cliente e aplicá-lo ao restante da base de clientes. Espera-se que o modelo permita à empresa escolher a dedo os clientes com maior probabilidade de comprar a oferta, deixando de fora os não respondentes, tornando a próxima campanha altamente lucrativa. Além de maximizar o lucro da campanha, o CMO está interessado em estudar as características dos clientes que desejam comprar o gadget.
* Os dados

O conjunto de dados contém características sociodemográficas e firmográficas de cerca de 2.240 clientes contatados. Além disso, contém um sinalizador para aqueles clientes que responderam à campanha, comprando o produto.


# Data Loading

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Google Colab (kept for reference; reads the CSV from a mounted Drive folder)
# df = pd.read_csv('/content/drive/MyDrive/Front End e Mobile Development/2024/20240318 - Aula06/data.csv',encoding='utf-8')

# Jupyter (local): reads data.csv from the notebook's working directory
df = pd.read_csv('data.csv',encoding='utf-8')

df.head()

# Data Cleaning

Drop da feature ID por ser um identificador

In [None]:
# ID only identifies the row, so it is discarded; a missing column is ignored.
df = df.drop(columns='ID', errors='ignore')
df.shape

## Features com valores únicos
Verificando a existência de features com um único valor (constantes). Elas devem ser dropadas por não contribuírem para explicar a variação da feature target.

In [None]:
# Number of distinct non-null values per column, ascending; a count of 1 marks a constant column.
df.nunique().sort_values()

In [None]:
# Drop Z_CostContact and Z_Revenue (the single-valued features this section targets);
# columns that are already absent are ignored.
constant_cols = ['Z_CostContact', 'Z_Revenue']
df = df.drop(columns=constant_cols, errors='ignore')

## NaN analysis

In [None]:
# Count of missing values in each column.
df.isna().sum()

Somente a feature Income possui valores nulos.
Analisando as linhas com valores nulos em relação aos valores da feature target

In [None]:
# Distribution of Response among the rows where Income is NaN
df[df.Income.isna()].Response.value_counts()

In [None]:
# Proportion of 0 and 1 in Response over the full DataFrame
df.Response.value_counts(normalize = True)

In [None]:
# Percentage of rows missing Income, and percentage of rows that are both missing
# Income and positive responders; both are relative to the whole DataFrame.
income_missing = df.Income.isna()
pct_missing = np.round(income_missing.sum() / len(df) * 100, 2)
pct_missing_responders = np.round((income_missing & (df.Response == 1)).sum() / len(df) * 100, 2)
print('% amostras com NaN: ', pct_missing)
print('% amostras com NaN e Response = 1: ', pct_missing_responders)

Dado que as 24 linhas com valores NaN representam cerca de 1% do dataset total, e que entre as 24 linhas com Income == NaN somente uma apresentou Response == 1 (0.04% do dataset), as 24 linhas serão dropadas.

In [None]:
# Remove every row that has a missing value and report the row count before and after.
rows_before = len(df)
df.dropna(axis='index', inplace=True)
rows_after = len(df)
print('Shape antes do dropna: ', rows_before)
print('Shape após o dropna: ', rows_after)

## Ajuste dos dtypes

In [None]:
# Let pandas infer the best dtypes, then parse the signup date and make the target boolean.
df = df.convert_dtypes().assign(
    Dt_Customer=lambda frame: pd.to_datetime(frame.Dt_Customer),
    Response=lambda frame: frame.Response.astype('bool'),
)
df.dtypes

# Feature Engineering

## Idade dos clientes

In [None]:
from datetime import datetime

# Age is measured against the year in which the notebook runs, so it shifts from year to year.
ano_atual = datetime.now().year

# Guard on Year_Birth so that re-running this cell after the column was dropped does not raise.
if 'Year_Birth' in df.columns:
    df['Age'] = ano_atual - df.Year_Birth
    df = df.drop(columns='Year_Birth')
df.head()

## Tempo como cliente

In [None]:
def timedelta_to_fractional_days(timedelta):
    """Return ``timedelta`` as a float: whole days plus the seconds part as a fraction of a day.

    Only the ``days`` and ``seconds`` attributes are read, so sub-second parts are ignored.
    Kept for reuse; Time_Customer below is computed with vectorized pandas operations.
    """
    return timedelta.days + timedelta.seconds / (24 * 60 * 60)


# Tenure is measured against the day the notebook runs, so it shifts from run to run.
dt = datetime.now().date()

# Guard on Dt_Customer so that re-running this cell after the column was dropped does not raise.
if 'Dt_Customer' in df.columns:
    # Whole days since signup (time of day discarded), expressed in years of 365.25 days.
    signup_day = pd.to_datetime(df['Dt_Customer']).dt.normalize()
    df['Time_Customer'] = (pd.Timestamp(dt) - signup_day).dt.days / 365.25

    print(df[['Dt_Customer', 'Time_Customer']].head())
    df = df.drop(columns='Dt_Customer')

### Removendo valores incoerentes da variável Marital_Status

In [None]:
# Marital_Status labels treated as invalid; rows carrying them are removed.
invalid_status = ['YOLO', 'Absurd', 'absurd', 'Alone']
df = df[~df['Marital_Status'].isin(invalid_status)].reset_index(drop=True)

print(df.shape)
# Keep this as the cell's last expression: a bare expression anywhere else is not displayed.
df.Marital_Status.value_counts()

In [None]:
# Order the feature columns alphabetically and place the target, Response, last.
# The target position is set explicitly rather than depending on how a temporary name sorts.
cols = sorted(col for col in df.columns if col != 'Response') + ['Response']
df = df[cols]

df.columns

# Preparação do dataset para Modelagem


## Train Test Validation Split

In [None]:
# Keep 95% of the rows for modelling; the remaining 5% is held out as unseen data.
df_train_test = df.sample(frac=0.95, random_state=123)
# Build the hold-out from the original index labels before either index is reset.
df_valid = df.drop(index=df_train_test.index).reset_index(drop=True)
df_train_test = df_train_test.reset_index(drop=True)

# Report the shape of both splits.
print(f'Data for Modeling: {df_train_test.shape}')
print(f'Unseen Data For Predictions: {df_valid.shape}')

# Auto ML - PYCARET

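**Para o problema de negócio em questão, a métrica Precision é a mais relevante.**

Observação: nas células abaixo, `compare_models` ordena os modelos por AUC e a maioria das chamadas a `tune_model` otimiza AUC, e não Precision.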

In [None]:
# Column dtypes of the modelling split that is passed to setup() below.
df_train_test.dtypes

## Setup

In [None]:
# Initialise the PyCaret classification experiment on the modelling split with Response
# as the target. Imbalance fixing and outlier removal are switched on, and Education and
# Marital_Status are declared as categorical features.
# NOTE(review): session_id presumably seeds the experiment for reproducibility — confirm in the PyCaret docs.
s = setup(data = df_train_test,
          target = 'Response',
          fix_imbalance = True,
          remove_outliers = True,
          categorical_features = ['Education', 'Marital_Status'],
          session_id = 123)

In [None]:
# List the models available in the experiment.
# models() can only be called after setup() has been run.
models()

## Comparativo entre Modelos

In [None]:
# Compare the candidate models sorted by AUC; whatever compare_models returns is kept as best_model.
best_model = compare_models(sort = 'auc')

In [None]:
# Print the text representation of the selected model.
print(best_model)

## Análise do Modelo

In [None]:
# Open PyCaret's evaluation view for best_model.
evaluate_model(best_model)

In [None]:
# Threshold plot for best_model.
plot_model(best_model, plot = 'threshold')

In [None]:
#plot model - auc
# plot_model(best_model, plot = 'auc')

In [None]:
# Confusion-matrix plot for best_model.
plot_model(best_model, plot = 'confusion_matrix')

In [None]:
# Feature-importance plot for best_model.
plot_model(best_model, plot = 'feature')

In [None]:
# Predictions from best_model with per-class scores (raw_score=True).
# NOTE(review): no data argument is given; presumably PyCaret scores its own hold-out split — confirm.
predict_model(best_model, raw_score= True)

* Outros tipos de plot:
https://pycaret.readthedocs.io/en/latest/api/classification.html#pycaret.classification.plot_model

## Criando um Modelo

In [None]:
# Create a random forest ('rf') model in the current experiment.
mdl_rf = create_model('rf')

## Tuning dos Hiperparâmetros

### RF

In [None]:
# Tune the random forest with tune_model's default settings.
tuned_rf = tune_model(mdl_rf)

In [None]:
# Predictions from the untuned random forest.
# Author's observation: mdl_rf with and without tuning showed practically the same AUC
# and standard deviation (STD).
predict_model(mdl_rf, raw_score = True)

In [None]:
# Print the text representation of the untuned random forest.
print(mdl_rf)

In [None]:
# Score the untuned random forest on the unseen hold-out rows (df_valid).
# The model is referenced by name directly: no loop variable exists in this notebook,
# and eval() on a string is unnecessary for picking a model object.
predict_model(mdl_rf, data=df_valid, raw_score = True)

In [None]:
# Predictions from the tuned random forest with per-class scores.
predict_model(tuned_rf, raw_score = True)

## AUC Plot

In [None]:
# AUC plot for the tuned random forest.
plot_model(tuned_rf, plot = 'auc')

## Feature Importance

In [None]:
# Feature-importance plot for the tuned random forest.
plot_model(tuned_rf, plot = 'feature')

## Matriz de Confusão

In [None]:
# Confusion-matrix plot for the tuned random forest.
plot_model(tuned_rf, plot = 'confusion_matrix')

# Referências:
* https://towardsdatascience.com/introduction-to-binary-classification-with-pycaret-a37b3e89ad8d
* https://pycaret.gitbook.io/docs/get-started/quickstart#classification
* https://pycaret.readthedocs.io/en/latest/api/classification.html#pycaret.classification.plot_model

## Save Model

In [None]:
# Persist the untuned random forest (mdl_rf) under the name 'pickle_rf_pycaret2'.
# NOTE(review): the plots above analyse tuned_rf; confirm that saving mdl_rf is intended.
save_model(mdl_rf, 'pickle_rf_pycaret2')

In [None]:
# Column names of the X_test frame held in the PyCaret experiment configuration.
cols_x_test = get_config(variable="X_test").columns
cols_x_test

# Testing on unseen data (df_valid)

## 0 - Testing Model

In [150]:
def test_model(model_name, data=None):
    """Load a saved PyCaret model and evaluate its predictions on labelled data.

    Parameters
    ----------
    model_name : str
        Name handed to ``load_model`` (the name the model was saved under).
    data : pandas.DataFrame, optional
        Rows to score. Defaults to the notebook-level ``df_valid``.

    Returns
    -------
    None
        Predictions and metrics are printed. When ``data`` has no ``Response``
        column only the predictions are shown and a message explains that the
        evaluation was skipped.
    """
    # Response is a binary label, so classification metrics are reported;
    # regression errors (MSE, R², MAE) do not describe a classifier's quality.
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

    if data is None:
        data = df_valid

    # Load the trained PyCaret model by name and score the data with it.
    loaded_model = load_model(model_name)
    predictions = predict_model(loaded_model, data=data)

    # Check for the target before indexing it, so unlabelled data does not raise a KeyError.
    has_target = 'Response' in data.columns
    shown_cols = ['Response', 'prediction_label'] if has_target else ['prediction_label']

    print("Previsões:")
    print(predictions[shown_cols].head())

    if not has_target:
        print("\nO novo dataset não contém a coluna alvo, portanto a avaliação não pode ser feita.")
        return

    # Bring both sides to the same 0/1 integer encoding (Response is boolean in this notebook).
    y_true = data['Response'].astype(int)
    y_pred = predictions['prediction_label'].astype(int)

    # zero_division=0 keeps the scores defined when a class is never predicted.
    print("\nMétricas de Avaliação no Novo Dataset:")
    print(f'Accuracy: {accuracy_score(y_true, y_pred)}')
    print(f'Precision: {precision_score(y_true, y_pred, zero_division=0)}')
    print(f'Recall: {recall_score(y_true, y_pred, zero_division=0)}')
    print(f'F1: {f1_score(y_true, y_pred, zero_division=0)}')

## 1 - Random Forest - Baseline (82.70%)

In [153]:
# Create a Random Forest model (mdl_rf) with PyCaret's create_model.
mdl_rf = create_model('rf')

# Tune the hyperparameters of that Random Forest (tuned_rf) with tune_model's default settings.
tuned_rf = tune_model(mdl_rf)

# Predict on the validation dataset (df_valid) with the tuned model, requesting per-class scores (raw_score=True).
predict_model(tuned_rf, data=df_valid, raw_score = True)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.8636,0.827,0.1429,0.4,0.2105,0.1538,0.1786


Unnamed: 0,AcceptedCmp1,AcceptedCmp2,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,Age,Complain,Education,Income,Kidhome,...,NumStorePurchases,NumWebPurchases,NumWebVisitsMonth,Recency,Teenhome,Time_Customer,Response,prediction_label,prediction_score_0,prediction_score_1
0,0,0,0,0,0,41,0,Graduation,26646,1,...,4,2,6,26,0,11.170428,False,0,0.9900,0.0100
1,0,0,0,0,0,79,0,Graduation,37760,0,...,6,4,7,20,0,12.616007,False,0,0.8500,0.1500
2,0,0,0,0,0,40,0,Master,20559,1,...,3,2,8,88,0,12.087605,False,0,0.9700,0.0300
3,0,0,0,0,0,68,0,Graduation,65486,0,...,10,4,2,29,1,10.921285,False,0,0.9700,0.0300
4,0,0,0,0,0,61,0,Master,79143,0,...,13,6,3,2,0,12.670764,False,0,0.6250,0.3750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,0,0,0,0,0,47,0,Basic,26487,1,...,3,2,5,23,0,11.898692,False,0,0.7800,0.2200
106,0,0,0,0,0,42,0,Master,89616,0,...,12,7,2,36,0,12.128669,True,0,0.5700,0.4300
107,0,0,0,0,0,78,0,Graduation,27469,0,...,3,0,6,2,0,12.695405,False,0,0.9567,0.0433
108,0,0,0,0,0,30,0,2n Cycle,80617,0,...,8,4,2,42,0,12.501019,False,0,0.6300,0.3700


## 2 - Linear Discriminant Analysis (81.81%)

In [154]:
# Create a Linear Discriminant Analysis (LDA) model (mdl_lda).
mdl_lda = create_model('lda')

# Tune the LDA hyperparameters (tuned_lda) with 30 cross-validation folds,
# 15 search iterations and AUC as the metric to optimise.
tuned_lda = tune_model(mdl_lda, fold=30, n_iter=15, optimize='auc')

# Predict on the validation dataset (df_valid) with the tuned LDA model,
# requesting per-class scores (raw_score=True).
predict_model(tuned_lda, data=df_valid, raw_score = True)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Linear Discriminant Analysis,0.7455,0.8181,0.7857,0.3056,0.44,0.3143,0.3731


Unnamed: 0,AcceptedCmp1,AcceptedCmp2,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,Age,Complain,Education,Income,Kidhome,...,NumStorePurchases,NumWebPurchases,NumWebVisitsMonth,Recency,Teenhome,Time_Customer,Response,prediction_label,prediction_score_0,prediction_score_1
0,0,0,0,0,0,41,0,Graduation,26646,1,...,4,2,6,26,0,11.170428,False,0,0.9494,0.0506
1,0,0,0,0,0,79,0,Graduation,37760,0,...,6,4,7,20,0,12.616007,False,1,0.4148,0.5852
2,0,0,0,0,0,40,0,Master,20559,1,...,3,2,8,88,0,12.087605,False,0,0.9443,0.0557
3,0,0,0,0,0,68,0,Graduation,65486,0,...,10,4,2,29,1,10.921285,False,0,0.9921,0.0079
4,0,0,0,0,0,61,0,Master,79143,0,...,13,6,3,2,0,12.670764,False,1,0.0513,0.9487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,0,0,0,0,0,47,0,Basic,26487,1,...,3,2,5,23,0,11.898692,False,0,0.8128,0.1872
106,0,0,0,0,0,42,0,Master,89616,0,...,12,7,2,36,0,12.128669,True,1,0.3303,0.6697
107,0,0,0,0,0,78,0,Graduation,27469,0,...,3,0,6,2,0,12.695405,False,1,0.2360,0.7640
108,0,0,0,0,0,30,0,2n Cycle,80617,0,...,8,4,2,42,0,12.501019,False,1,0.2351,0.7649


## 3 - Ridge Classifier (76.26%)

In [155]:
# Create a Ridge classifier (mdl_ridge).
mdl_ridge = create_model('ridge')

# Tune the Ridge hyperparameters (tuned_ridge) with 30 cross-validation folds,
# 15 search iterations and AUC as the metric to optimise.
tuned_ridge = tune_model(mdl_ridge, fold=30, n_iter=15, optimize='auc')

# Predict on the validation dataset (df_valid) with the tuned Ridge model.
# raw_score=True is requested, but the recorded output of this cell contains only
# prediction_label and no prediction_score_* columns.
predict_model(tuned_ridge, data=df_valid, raw_score = True)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ridge Classifier,0.7455,0.7626,0.7857,0.3056,0.44,0.3143,0.3731


Unnamed: 0,AcceptedCmp1,AcceptedCmp2,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,Age,Complain,Education,Income,Kidhome,...,NumCatalogPurchases,NumDealsPurchases,NumStorePurchases,NumWebPurchases,NumWebVisitsMonth,Recency,Teenhome,Time_Customer,Response,prediction_label
0,0,0,0,0,0,41,0,Graduation,26646,1,...,0,2,4,2,6,26,0,11.170428,False,0
1,0,0,0,0,0,79,0,Graduation,37760,0,...,1,2,6,4,7,20,0,12.616007,False,1
2,0,0,0,0,0,40,0,Master,20559,1,...,0,2,3,2,8,88,0,12.087605,False,0
3,0,0,0,0,0,68,0,Graduation,65486,0,...,2,1,10,4,2,29,1,10.921285,False,0
4,0,0,0,0,0,61,0,Master,79143,0,...,9,1,13,6,3,2,0,12.670764,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,0,0,0,0,0,47,0,Basic,26487,1,...,1,3,3,2,5,23,0,11.898692,False,0
106,0,0,0,0,0,42,0,Master,89616,0,...,5,1,12,7,2,36,0,12.128669,True,1
107,0,0,0,0,0,78,0,Graduation,27469,0,...,0,1,3,0,6,2,0,12.695405,False,1
108,0,0,0,0,0,30,0,2n Cycle,80617,0,...,6,1,8,4,2,42,0,12.501019,False,1


## 4 - LightGBM (81.44%)

In [156]:
# Create a LightGBM model (mdl_gbm).
mdl_gbm = create_model('lightgbm')

# Tune the LightGBM hyperparameters (tuned_gbm) with 30 cross-validation folds,
# 15 search iterations and AUC as the metric to optimise.
tuned_gbm = tune_model(mdl_gbm, fold=30, n_iter=15, optimize='auc')

# Predict on the validation dataset (df_valid) with the tuned LightGBM model,
# requesting per-class scores (raw_score=True).
predict_model(tuned_gbm, data=df_valid, raw_score = True)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.8364,0.8144,0.3571,0.3571,0.3571,0.2634,0.2634




Unnamed: 0,AcceptedCmp1,AcceptedCmp2,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,Age,Complain,Education,Income,Kidhome,...,NumStorePurchases,NumWebPurchases,NumWebVisitsMonth,Recency,Teenhome,Time_Customer,Response,prediction_label,prediction_score_0,prediction_score_1
0,0,0,0,0,0,41,0,Graduation,26646,1,...,4,2,6,26,0,11.170428,False,0,0.9953,0.0047
1,0,0,0,0,0,79,0,Graduation,37760,0,...,6,4,7,20,0,12.616007,False,0,0.8434,0.1566
2,0,0,0,0,0,40,0,Master,20559,1,...,3,2,8,88,0,12.087605,False,0,0.9908,0.0092
3,0,0,0,0,0,68,0,Graduation,65486,0,...,10,4,2,29,1,10.921285,False,0,0.9965,0.0035
4,0,0,0,0,0,61,0,Master,79143,0,...,13,6,3,2,0,12.670764,False,0,0.7439,0.2561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,0,0,0,0,0,47,0,Basic,26487,1,...,3,2,5,23,0,11.898692,False,0,0.7560,0.2440
106,0,0,0,0,0,42,0,Master,89616,0,...,12,7,2,36,0,12.128669,True,1,0.3376,0.6624
107,0,0,0,0,0,78,0,Graduation,27469,0,...,3,0,6,2,0,12.695405,False,0,0.9377,0.0623
108,0,0,0,0,0,30,0,2n Cycle,80617,0,...,8,4,2,42,0,12.501019,False,1,0.4288,0.5712


## 5 - XGBoost (82.63%)

In [157]:
# Create an XGBoost model (mdl_xgb).
mdl_xgb = create_model('xgboost')

# Tune the XGBoost hyperparameters (tuned_xgb) with 30 cross-validation folds,
# 15 search iterations and AUC as the metric to optimise.
tuned_xgb = tune_model(mdl_xgb, fold=30, n_iter=15, optimize='auc')

# Predict on the validation dataset (df_valid) with the tuned XGBoost model,
# requesting per-class scores (raw_score=True).
predict_model(tuned_xgb, data=df_valid, raw_score = True)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.8,0.8263,0.8571,0.375,0.5217,0.4188,0.4761


Unnamed: 0,AcceptedCmp1,AcceptedCmp2,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,Age,Complain,Education,Income,Kidhome,...,NumStorePurchases,NumWebPurchases,NumWebVisitsMonth,Recency,Teenhome,Time_Customer,Response,prediction_label,prediction_score_0,prediction_score_1
0,0,0,0,0,0,41,0,Graduation,26646,1,...,4,2,6,26,0,11.170428,False,0,0.9981,0.0019
1,0,0,0,0,0,79,0,Graduation,37760,0,...,6,4,7,20,0,12.616007,False,0,0.7179,0.2821
2,0,0,0,0,0,40,0,Master,20559,1,...,3,2,8,88,0,12.087605,False,0,0.9895,0.0105
3,0,0,0,0,0,68,0,Graduation,65486,0,...,10,4,2,29,1,10.921285,False,0,0.9980,0.0020
4,0,0,0,0,0,61,0,Master,79143,0,...,13,6,3,2,0,12.670764,False,1,0.1783,0.8217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,0,0,0,0,0,47,0,Basic,26487,1,...,3,2,5,23,0,11.898692,False,1,0.1385,0.8615
106,0,0,0,0,0,42,0,Master,89616,0,...,12,7,2,36,0,12.128669,True,1,0.0999,0.9001
107,0,0,0,0,0,78,0,Graduation,27469,0,...,3,0,6,2,0,12.695405,False,0,0.9518,0.0482
108,0,0,0,0,0,30,0,2n Cycle,80617,0,...,8,4,2,42,0,12.501019,False,1,0.1162,0.8838
