### Importação das bibliotecas

In [1]:
from pathlib import Path

import pandas as pd
from feature_engine import encoding
from sklearn import model_selection, ensemble, pipeline, metrics

### Lendo a base de dados e mostrando as 5 primeiras linhas

In [2]:
# Build the path with pathlib so the separator is correct on every OS
# (a hard-coded backslash separator only resolves on Windows).
df = pd.read_csv(Path('data') / 'clientes_churn_loja_auto.csv')
df.head()

Unnamed: 0,id_cliente,nome,sexo,idade,estado,tempo_como_cliente,frequencia_compras,ticket_medio,total_gasto,forma_pagamento,ultima_compra,comprou_servico_instal,reclamacoes,avaliacao_media,churn
0,1,Paulo Viana,Masculino,46,SP,15,3.0,618.73,22096.4,Pix,152,0,0,4.4,0
1,2,André Porto,Masculino,47,RS,38,0.2,992.6,8075.51,Boleto,314,1,0,3.7,1
2,3,Enzo Gabriel Cunha,Masculino,44,SC,59,1.7,94.33,12150.7,Cartão de crédito,51,1,0,4.0,0
3,4,Manuella Abreu,Feminino,20,SP,37,2.5,212.0,14492.4,Cartão de crédito,89,1,0,4.1,0
4,5,Dr. Levi Silveira,Masculino,43,RS,53,1.1,278.97,18052.3,Boleto,35,1,0,5.0,0


### Verificando os tipos de dados e se possuem registros vazios

In [3]:
# Print each column's dtype and non-null count to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id_cliente              10000 non-null  int64  
 1   nome                    10000 non-null  object 
 2   sexo                    10000 non-null  object 
 3   idade                   10000 non-null  int64  
 4   estado                  10000 non-null  object 
 5   tempo_como_cliente      10000 non-null  int64  
 6   frequencia_compras      10000 non-null  float64
 7   ticket_medio            10000 non-null  float64
 8   total_gasto             10000 non-null  float64
 9   forma_pagamento         10000 non-null  object 
 10  ultima_compra           10000 non-null  int64  
 11  comprou_servico_instal  10000 non-null  int64  
 12  reclamacoes             10000 non-null  int64  
 13  avaliacao_media         10000 non-null  float64
 14  churn                   10000 non-null 

### Separando variável alvo, base de treino e base de teste

In [4]:
target = 'churn'

# Select predictors by name instead of by position: `id_cliente` and `nome`
# are identifiers and the target must never end up in the feature matrix.
# Index.drop raises KeyError when one of these columns is missing, so a
# change in the CSV layout fails loudly instead of silently shifting columns.
non_features = ['id_cliente', 'nome', target]
features = df.columns.drop(non_features).to_list()

# Stratified 80/20 split with a fixed seed, keeping the churn rate
# approximately equal between the training and test sets.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    df[features],
    df[target],
    random_state=42,
    train_size=0.8,
    stratify=df[target]
)

print('Taxa de resposta na base de treino:', y_train.mean())
print('Taxa de resposta na base de teste:', y_test.mean())

Taxa de resposta na base de treino: 0.30325
Taxa de resposta na base de teste: 0.3035


### Separando variáveis categóricas e numéricas

In [5]:
# Columns stored with the `object` dtype are treated as categorical.
cat_features = X_train.dtypes[X_train.dtypes == 'object'].index.to_list()

# Every remaining feature is numeric. A comprehension keeps the original
# column order; a set difference would return the names in an arbitrary
# order that can change from one interpreter run to the next.
num_features = [col for col in features if col not in cat_features]

### Criando o Pipeline do modelo e treinando

In [6]:
# Hyper-parameter grid for the random forest: 5 * 4 * 2 * 5 = 200 candidates.
params = dict(
    min_samples_leaf=[10, 25, 50, 75, 100],
    n_estimators=[100, 200, 500, 1000],
    criterion=['gini', 'entropy'],
    max_depth=[5, 8, 10, 12, 15],
)

# One-hot encode the categorical columns, dropping the last dummy of each variable.
onehot = encoding.OneHotEncoder(variables=cat_features, drop_last=True)

# Base estimator; the seed is fixed so results are repeatable.
model = ensemble.RandomForestClassifier(random_state=42)

# Exhaustive search over `params`, 3-fold cross-validation, ranked by ROC AUC,
# using every available core.
grid = model_selection.GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=3,
)

# Encoding followed by the grid search, fitted as a single pipeline.
model_pipeline = pipeline.Pipeline(
    steps=[
        ('One Hot Encode', onehot),
        ('Modelo', grid),
    ]
)

model_pipeline.fit(X_train, y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


In [7]:
# Predicted class probabilities for the training and test splits, in that order.
y_train_proba, y_test_proba = (
    model_pipeline.predict_proba(split) for split in (X_train, X_test)
)

### Métricas

In [8]:
def report_metrics(y_true, y_proba, cohort=0.5):
    """Summarise binary-classification quality from predicted probabilities.

    Args:
        y_true: Ground-truth binary labels.
        y_proba: 2-D array of class probabilities; column 1 is used as the
            positive-class score.
        cohort: Decision threshold. A row is labelled positive (1) when its
            score is strictly greater than this value.

    Returns:
        dict with the keys 'Acurácia' (accuracy), 'Curva Roc' (ROC AUC,
        computed from the raw scores), 'Precisão' (precision) and 'Recall'.
    """
    positive_score = y_proba[:, 1]
    predicted_label = (positive_score > cohort).astype(int)

    return {
        'Acurácia': metrics.accuracy_score(y_true, predicted_label),
        'Curva Roc': metrics.roc_auc_score(y_true, positive_score),
        'Precisão': metrics.precision_score(y_true, predicted_label),
        'Recall': metrics.recall_score(y_true, predicted_label),
    }

In [9]:
# Compute the metrics for each split and tag every record with the split name.
report_train = {**report_metrics(y_train, y_train_proba), 'Base': 'Train'}
report_test = {**report_metrics(y_test, y_test_proba), 'Base': 'Test'}

# One row per split, so train and test can be compared side by side.
df_metrics = pd.DataFrame(data=[report_train, report_test])

df_metrics

Unnamed: 0,Acurácia,Curva Roc,Precisão,Recall,Base
0,0.8675,0.929898,0.825238,0.714345,Train
1,0.827,0.889651,0.755382,0.635914,Test


### Exportando o modelo

In [10]:
# Bundle the fitted pipeline with the metadata needed to reuse it later:
# the feature list it was trained on and the evaluation metrics.
model_series = pd.Series({
    'model': model_pipeline,
    'features': features,
    'metrics': df_metrics
})

# Build the path with pathlib so it resolves on every OS (a hard-coded
# backslash separator only works on Windows), and create the output folder
# if needed so the export does not fail on a fresh checkout.
model_path = Path('models') / 'churn_model.pkl'
model_path.parent.mkdir(parents=True, exist_ok=True)

model_series.to_pickle(model_path)

### Fazendo novas previsões em clientes ativos

In [11]:
# Reload the exported bundle. NOTE: unpickling executes arbitrary code, so
# only load pickle files that come from a trusted source.
# Paths are built with pathlib so they resolve on every OS (a hard-coded
# backslash separator only works on Windows).
model_path = Path('models') / 'churn_model.pkl'
model_series = pd.read_pickle(model_path)

model = model_series['model']
features = model_series['features']

df_novos_clientes = pd.read_csv(Path('data') / 'novos_clientes.csv')

# Select the model inputs once and reuse them for both calls.
X_novos = df_novos_clientes[features]
predictions = model.predict(X_novos)
probs = model.predict_proba(X_novos)[:, 1]  # churn probability (positive class)

prediction_cols = ['churn_predito', 'probabilidade_churn']
df_novos_clientes['churn_predito'] = predictions
df_novos_clientes['probabilidade_churn'] = probs

# Move the prediction columns to the front, keeping the others in their original order.
cols = prediction_cols + [col for col in df_novos_clientes.columns if col not in prediction_cols]
df_novos_clientes = df_novos_clientes[cols]

# Show the customers most likely to churn first.
df_novos_clientes.sort_values(by='probabilidade_churn', ascending=False)

Unnamed: 0,churn_predito,probabilidade_churn,id_cliente,nome,sexo,idade,estado,tempo_como_cliente,frequencia_compras,ticket_medio,total_gasto,forma_pagamento,ultima_compra,comprou_servico_instal,reclamacoes,avaliacao_media
3,1,0.76842,4,Maria Júlia da Mota,Feminino,33,SP,15,1.9,795.92,18596.21,Cartão de crédito,55,1,0,1.9
8,1,0.752168,9,Maria Liz Castro,Feminino,58,BA,29,3.2,120.82,14487.51,Pix,63,0,0,2.5
4,1,0.701675,5,Pedro Henrique da Cunha,Masculino,20,RS,37,2.5,212.0,14492.4,Boleto,53,1,0,2.6
1,1,0.638099,2,Sr. Antônio Azevedo,Masculino,41,RS,3,0.3,971.41,1048.66,Boleto,294,1,0,2.8
0,1,0.617364,1,Murilo Fernandes,Masculino,56,SP,52,3.8,745.39,156007.85,Pix,331,0,1,2.8
7,1,0.566243,8,Cecília Aparecida,Feminino,57,SP,21,1.7,307.78,13155.0,Boleto,217,1,0,3.1
9,0,0.479893,10,Lucas Peixoto,Masculino,25,RS,35,3.2,625.66,87997.46,Pix,41,1,1,3.4
6,0,0.435752,7,Alice Montenegro,Feminino,59,RS,4,0.9,971.11,4073.11,Transferência,215,1,0,3.5
2,0,0.250851,3,Guilherme Lima,Masculino,38,SC,33,1.4,548.52,24306.85,Cartão de crédito,170,1,0,3.6
5,0,0.232416,6,Dr. Luiz Henrique Novais,Masculino,24,SC,44,2.1,82.67,9514.72,Boleto,50,1,1,3.7
