In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from dotenv import load_dotenv, find_dotenv
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score

## CARREGANDO CONSTANTES DO AMBIENTE

In [2]:
load_dotenv(find_dotenv())
CSV = os.getenv("CSV_PATH")

## Carregando dados

In [3]:
df_vendas = pd.read_csv(f'{CSV}vendas.csv', sep=',', encoding='utf-8')
df_clientes = pd.read_csv(f'{CSV}cliente.csv', sep=',', encoding='utf-8')

In [4]:
df_vendas

Unnamed: 0,nk_ota_localizer_id,fk_contact,date_purchase,time_purchase,place_origin_departure,place_destination_departure,place_origin_return,place_destination_return,fk_departure_ota_bus_company,fk_return_ota_bus_company,gmv_success,total_tickets_quantity_success,ano
0,aa34ed7fd0a6b405df2df1bf9f8d68e6df9b9a868a6181...,Cliente 5,2021-02-23,20:08:25,Rodoviária 3,Rodoviária 5,0,0,48449a14a4ff7d79bb7a1b6f3d488eba397c36ef25634c...,1,45.31,1,2021
1,948356b25b90c0c87c147cead27483c481edda1dacc4c8...,Cliente 6,2021-02-11,22:25:15,Rodoviária 5,Rodoviária 6,0,0,1dfacb2ea5a03e0a915999e03b5a56196f1b1664d2f768...,1,154.20,1,2021
2,2ee9d0978acb5e113d0b3f846ab3f88c5a426321da8f87...,Cliente 7,2021-02-19,19:11:40,Rodoviária 6,Rodoviária 7,0,0,1d0ebea552eb43d0b1e1561f6de8ae92e3de7f1abec523...,1,188.99,1,2021
3,929cd361c225ec5d3510e14e8582fdcc61a24383cdb7a7...,Cliente 10,2021-07-02,11:41:19,Rodoviária 8,Rodoviária 10,0,0,c6f3ac57944a531490cd39902d0f777715fd005efac9a3...,1,61.55,1,2021
4,f08c3f551a19f1ce13525825dbf0d0ce9c3492da92bbb2...,Cliente 11,2022-07-14,10:16:52,Rodoviária 9,Rodoviária 11,0,0,96061e92f58e4bdcdee73df36183fe3ac64747c81c26f6...,1,55.46,1,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1133305,52a39652c7b3db7eedbba20c16b4e68df6f514bec1ad79...,Cliente 133072,2023-04-21,13:24:41,Rodoviária 1,Rodoviária 80,0,0,35135aaa6cc23891b40cb3f378c53a17a1127210ce60e1...,1,135.23,1,2023
1133306,655038f142aea0104f73671352b3304617b8d19801495a...,Cliente 131770,2023-04-08,10:49:02,Rodoviária 6,Rodoviária 18,0,0,f32828acecb4282c87eaa554d2e1db74e418cd68458430...,1,183.92,1,2023
1133307,4e4f5fcded3e8b6915e05c6b9b504ec6830f723a831f68...,Cliente 133024,2023-12-29,19:47:12,Rodoviária 6,Rodoviária 18,0,0,3068430da9e4b7a674184035643d9e19af3dc7483e31cc...,1,156.00,1,2023
1133308,18b75368642c6c347fd00f56cd98322bb6d920b6a07545...,Cliente 133024,2023-01-08,23:57:55,Rodoviária 6,Rodoviária 18,0,0,35135aaa6cc23891b40cb3f378c53a17a1127210ce60e1...,1,294.99,1,2023


In [5]:
df_clientes

Unnamed: 0,fk_contact,total_gasto,qtd_compras,tp_pessoa,ultima_compra,grupo,ticket_medio
0,Cliente 5,85.92,2,PF,2021-02-27,Pessoa - Dormindo,42.96
1,Cliente 6,576.64,3,PF,2021-07-02,Pessoa - Dormindo,192.21
2,Cliente 7,750.89,5,PF,2021-09-03,Pessoa - Dormindo,150.18
3,Cliente 10,1052.88,8,PF,2022-02-15,Pessoa - Quase dormindo,131.61
4,Cliente 11,787.83,16,PF,2022-07-14,Pessoa - Quase dormindo,49.24
...,...,...,...,...,...,...,...
375585,Cliente 581808,569.57,1,PF,2022-02-26,Pessoa - Quase dormindo,569.57
375586,Cliente 581809,76.78,1,PF,2023-10-12,Pessoa - Potencial,76.78
375587,Cliente 581811,82.48,1,PF,2022-12-18,Pessoa - Quase dormindo,82.48
375588,Cliente 581812,119.82,1,PF,2022-02-10,Pessoa - Dormindo,119.82


## Criando um df para treinamento e fazer o predict

In [6]:
df_vendas

Unnamed: 0,nk_ota_localizer_id,fk_contact,date_purchase,time_purchase,place_origin_departure,place_destination_departure,place_origin_return,place_destination_return,fk_departure_ota_bus_company,fk_return_ota_bus_company,gmv_success,total_tickets_quantity_success,ano
0,aa34ed7fd0a6b405df2df1bf9f8d68e6df9b9a868a6181...,Cliente 5,2021-02-23,20:08:25,Rodoviária 3,Rodoviária 5,0,0,48449a14a4ff7d79bb7a1b6f3d488eba397c36ef25634c...,1,45.31,1,2021
1,948356b25b90c0c87c147cead27483c481edda1dacc4c8...,Cliente 6,2021-02-11,22:25:15,Rodoviária 5,Rodoviária 6,0,0,1dfacb2ea5a03e0a915999e03b5a56196f1b1664d2f768...,1,154.20,1,2021
2,2ee9d0978acb5e113d0b3f846ab3f88c5a426321da8f87...,Cliente 7,2021-02-19,19:11:40,Rodoviária 6,Rodoviária 7,0,0,1d0ebea552eb43d0b1e1561f6de8ae92e3de7f1abec523...,1,188.99,1,2021
3,929cd361c225ec5d3510e14e8582fdcc61a24383cdb7a7...,Cliente 10,2021-07-02,11:41:19,Rodoviária 8,Rodoviária 10,0,0,c6f3ac57944a531490cd39902d0f777715fd005efac9a3...,1,61.55,1,2021
4,f08c3f551a19f1ce13525825dbf0d0ce9c3492da92bbb2...,Cliente 11,2022-07-14,10:16:52,Rodoviária 9,Rodoviária 11,0,0,96061e92f58e4bdcdee73df36183fe3ac64747c81c26f6...,1,55.46,1,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1133305,52a39652c7b3db7eedbba20c16b4e68df6f514bec1ad79...,Cliente 133072,2023-04-21,13:24:41,Rodoviária 1,Rodoviária 80,0,0,35135aaa6cc23891b40cb3f378c53a17a1127210ce60e1...,1,135.23,1,2023
1133306,655038f142aea0104f73671352b3304617b8d19801495a...,Cliente 131770,2023-04-08,10:49:02,Rodoviária 6,Rodoviária 18,0,0,f32828acecb4282c87eaa554d2e1db74e418cd68458430...,1,183.92,1,2023
1133307,4e4f5fcded3e8b6915e05c6b9b504ec6830f723a831f68...,Cliente 133024,2023-12-29,19:47:12,Rodoviária 6,Rodoviária 18,0,0,3068430da9e4b7a674184035643d9e19af3dc7483e31cc...,1,156.00,1,2023
1133308,18b75368642c6c347fd00f56cd98322bb6d920b6a07545...,Cliente 133024,2023-01-08,23:57:55,Rodoviária 6,Rodoviária 18,0,0,35135aaa6cc23891b40cb3f378c53a17a1127210ce60e1...,1,294.99,1,2023


In [7]:
df_predict = df_vendas.drop(columns=['nk_ota_localizer_id','fk_departure_ota_bus_company', 'fk_return_ota_bus_company', 'ano'])
df_predict

Unnamed: 0,fk_contact,date_purchase,time_purchase,place_origin_departure,place_destination_departure,place_origin_return,place_destination_return,gmv_success,total_tickets_quantity_success
0,Cliente 5,2021-02-23,20:08:25,Rodoviária 3,Rodoviária 5,0,0,45.31,1
1,Cliente 6,2021-02-11,22:25:15,Rodoviária 5,Rodoviária 6,0,0,154.20,1
2,Cliente 7,2021-02-19,19:11:40,Rodoviária 6,Rodoviária 7,0,0,188.99,1
3,Cliente 10,2021-07-02,11:41:19,Rodoviária 8,Rodoviária 10,0,0,61.55,1
4,Cliente 11,2022-07-14,10:16:52,Rodoviária 9,Rodoviária 11,0,0,55.46,1
...,...,...,...,...,...,...,...,...,...
1133305,Cliente 133072,2023-04-21,13:24:41,Rodoviária 1,Rodoviária 80,0,0,135.23,1
1133306,Cliente 131770,2023-04-08,10:49:02,Rodoviária 6,Rodoviária 18,0,0,183.92,1
1133307,Cliente 133024,2023-12-29,19:47:12,Rodoviária 6,Rodoviária 18,0,0,156.00,1
1133308,Cliente 133024,2023-01-08,23:57:55,Rodoviária 6,Rodoviária 18,0,0,294.99,1


In [8]:
df_predict['datetime'] = pd.to_datetime(df_predict['date_purchase'] + ' ' + df_predict['time_purchase'])
df_predict = df_predict.sort_values(by=['fk_contact', 'datetime']).reset_index(drop=True)
df_predict = df_predict.drop(columns=['date_purchase', 'time_purchase'])
df_predict

Unnamed: 0,fk_contact,place_origin_departure,place_destination_departure,place_origin_return,place_destination_return,gmv_success,total_tickets_quantity_success,datetime
0,Cliente 1,Rodoviária 18,Rodoviária 160,0,0,678.70,1,2023-07-06 06:16:58
1,Cliente 1,Rodoviária 18,Rodoviária 160,0,0,509.90,1,2023-07-29 11:53:08
2,Cliente 1,Rodoviária 57,Rodoviária 159,Rodoviária 216,Rodoviária 22,823.16,2,2023-09-19 13:41:40
3,Cliente 1,Rodoviária 913,Rodoviária 18,0,0,181.69,1,2024-02-04 18:09:33
4,Cliente 10,Rodoviária 8,Rodoviária 10,Rodoviária 860,Rodoviária 200,105.22,2,2021-02-13 00:02:57
...,...,...,...,...,...,...,...,...
1133305,Cliente 99995,Rodoviária 3,Rodoviária 130,0,0,69.50,1,2024-03-21 10:18:31
1133306,Cliente 99996,Rodoviária 53,Rodoviária 80,0,0,424.82,2,2022-09-17 11:47:33
1133307,Cliente 99996,Rodoviária 22,Rodoviária 6,0,0,42.18,1,2023-09-30 20:07:50
1133308,Cliente 99997,Rodoviária 16,Rodoviária 9,0,0,92.28,1,2021-10-12 15:20:32


## Reogarnizando as colunas

In [9]:
print(df_predict.columns.tolist())

['fk_contact', 'place_origin_departure', 'place_destination_departure', 'place_origin_return', 'place_destination_return', 'gmv_success', 'total_tickets_quantity_success', 'datetime']


In [10]:
nova_ordem = ['fk_contact', 'datetime', 'place_origin_departure', 'place_destination_departure', 'place_origin_return', 'place_destination_return', 'gmv_success', 'total_tickets_quantity_success']
df_predict = df_predict[nova_ordem]
df_predict

Unnamed: 0,fk_contact,datetime,place_origin_departure,place_destination_departure,place_origin_return,place_destination_return,gmv_success,total_tickets_quantity_success
0,Cliente 1,2023-07-06 06:16:58,Rodoviária 18,Rodoviária 160,0,0,678.70,1
1,Cliente 1,2023-07-29 11:53:08,Rodoviária 18,Rodoviária 160,0,0,509.90,1
2,Cliente 1,2023-09-19 13:41:40,Rodoviária 57,Rodoviária 159,Rodoviária 216,Rodoviária 22,823.16,2
3,Cliente 1,2024-02-04 18:09:33,Rodoviária 913,Rodoviária 18,0,0,181.69,1
4,Cliente 10,2021-02-13 00:02:57,Rodoviária 8,Rodoviária 10,Rodoviária 860,Rodoviária 200,105.22,2
...,...,...,...,...,...,...,...,...
1133305,Cliente 99995,2024-03-21 10:18:31,Rodoviária 3,Rodoviária 130,0,0,69.50,1
1133306,Cliente 99996,2022-09-17 11:47:33,Rodoviária 53,Rodoviária 80,0,0,424.82,2
1133307,Cliente 99996,2023-09-30 20:07:50,Rodoviária 22,Rodoviária 6,0,0,42.18,1
1133308,Cliente 99997,2021-10-12 15:20:32,Rodoviária 16,Rodoviária 9,0,0,92.28,1


## Iniciando as análises

### Criando as novas colunas

In [11]:
df_predict = df_predict.sort_values(by=['fk_contact', 'datetime']).reset_index(drop=True)


df_predict['comprou_prox_7_dias']   = 0
df_predict['comprou_prox_30_dias']  = 0
df_predict['dias_ate_prox_compra']  = None 

### Traçando perfil do cliente

In [12]:
labels = []

for cliente, group in df_predict.groupby('fk_contact'):
    compras = group['datetime'].tolist()
    indices = group.index.tolist()
    
    for i in range(len(compras) - 1):
        atual = compras[i]
        proxima = compras[i + 1]
        dias_dif = (proxima - atual).days
        
        labels.append({
            'index': indices[i],
            'dias_ate_prox_compra': dias_dif,
            'comprou_prox_7_dias': int(dias_dif <= 7),
            'comprou_prox_30_dias': int(dias_dif <= 30)
        })

# Transforma em DataFrame auxiliar
df_labels = pd.DataFrame(labels).set_index('index')

Fazendo o Merge dos dois df

In [13]:
df_predict.update(df_labels)
df_predict

Unnamed: 0,fk_contact,datetime,place_origin_departure,place_destination_departure,place_origin_return,place_destination_return,gmv_success,total_tickets_quantity_success,comprou_prox_7_dias,comprou_prox_30_dias,dias_ate_prox_compra
0,Cliente 1,2023-07-06 06:16:58,Rodoviária 18,Rodoviária 160,0,0,678.70,1,0.0,1.0,23.0
1,Cliente 1,2023-07-29 11:53:08,Rodoviária 18,Rodoviária 160,0,0,509.90,1,0.0,0.0,52.0
2,Cliente 1,2023-09-19 13:41:40,Rodoviária 57,Rodoviária 159,Rodoviária 216,Rodoviária 22,823.16,2,0.0,0.0,138.0
3,Cliente 1,2024-02-04 18:09:33,Rodoviária 913,Rodoviária 18,0,0,181.69,1,0.0,0.0,
4,Cliente 10,2021-02-13 00:02:57,Rodoviária 8,Rodoviária 10,Rodoviária 860,Rodoviária 200,105.22,2,0.0,0.0,139.0
...,...,...,...,...,...,...,...,...,...,...,...
1133305,Cliente 99995,2024-03-21 10:18:31,Rodoviária 3,Rodoviária 130,0,0,69.50,1,0.0,0.0,
1133306,Cliente 99996,2022-09-17 11:47:33,Rodoviária 53,Rodoviária 80,0,0,424.82,2,0.0,0.0,378.0
1133307,Cliente 99996,2023-09-30 20:07:50,Rodoviária 22,Rodoviária 6,0,0,42.18,1,0.0,0.0,
1133308,Cliente 99997,2021-10-12 15:20:32,Rodoviária 16,Rodoviária 9,0,0,92.28,1,0.0,0.0,


## Realizando tratamento antes de iniciar o treinamento do modelo

In [14]:
df_train = df_predict.dropna(subset=['dias_ate_prox_compra'])
df_train.shape


(757720, 11)

In [15]:
# Define qual coluna queremos prever
TARGET = 'comprou_prox_30_dias'

# Define as colunas que serão usadas como features (atributos) para o modelo
# Removemos colunas de ID, data, e as outras colunas alvo
features = [
    'gmv_success',
    'total_tickets_quantity_success',
    'dia_da_semana',
    'dia_do_mes',
    'mes',
    'ano',
    'semana_do_ano',
    'gasto_medio_acumulado',
    'qtd_compras_acumulada',
    'gasto_max_acumulado'
]

# Separa os dados em X (features) e y (alvo)
X = df_train[features]
y = df_train[TARGET]

print(f"Temos {X.shape[0]} amostras para treinamento.")
print(f"Número de features: {X.shape[1]}")
print("Features utilizadas:", features)
print("\nDistribuição do nosso alvo (y):")
print(y.value_counts(normalize=True))

KeyError: "['dia_da_semana', 'dia_do_mes', 'mes', 'ano', 'semana_do_ano', 'gasto_medio_acumulado', 'qtd_compras_acumulada', 'gasto_max_acumulado'] not in index"

## Modelo de treinamento