In [81]:

# EDA
import pandas as pd
import plotly.express as px
import numpy as np

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

### Carregar os dados

In [82]:
# Carregar os dados já tratado
df_leads = pd.read_csv('./datasets/leads_cleaned.csv')

In [83]:
# Mostrar as primeiras linhas
df_leads.head(10)

Unnamed: 0,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Search,Newspaper Article,X Education Forums,Newspaper,Digital Advertisement,Through Recommendations,A free copy of Mastering The Interview,Last Notable Activity
0,API,Olark Chat,0,0,0,0.0,0,0.0,Page Visited on Website,0,0,0,0,0,0,0,Modified
1,API,Organic Search,0,0,0,5.0,674,2.5,Email Opened,0,0,0,0,0,0,0,Email Opened
2,Landing Page Submission,Direct Traffic,0,0,1,2.0,1532,2.0,Email Opened,0,0,0,0,0,0,1,Email Opened
3,Landing Page Submission,Direct Traffic,0,0,0,1.0,305,1.0,Unreachable,0,0,0,0,0,0,0,Modified
4,Landing Page Submission,Google,0,0,1,2.0,1428,1.0,Converted to Lead,0,0,0,0,0,0,0,Modified
5,API,Olark Chat,0,0,0,0.0,0,0.0,Olark Chat Conversation,0,0,0,0,0,0,0,Modified
6,Landing Page Submission,Google,0,0,1,2.0,1640,2.0,Email Opened,0,0,0,0,0,0,0,Modified
7,API,Olark Chat,0,0,0,0.0,0,0.0,Olark Chat Conversation,0,0,0,0,0,0,0,Modified
8,Landing Page Submission,Direct Traffic,0,0,0,2.0,71,2.0,Email Opened,0,0,0,0,0,0,1,Email Opened
9,API,Google,0,0,0,4.0,58,4.0,Email Opened,0,0,0,0,0,0,0,Email Opened


In [84]:
# Mostrar as ultimas linhas
df_leads.tail(10)

Unnamed: 0,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Search,Newspaper Article,X Education Forums,Newspaper,Digital Advertisement,Through Recommendations,A free copy of Mastering The Interview,Last Notable Activity
9064,Landing Page Submission,Google,0,0,0,2.0,870,2.0,Email Opened,0,0,0,0,0,0,0,Email Opened
9065,Landing Page Submission,Google,0,0,1,8.0,1016,4.0,Email Opened,0,0,0,0,0,0,0,Email Opened
9066,Landing Page Submission,Direct Traffic,0,0,0,2.0,1770,2.0,SMS Sent,0,0,0,0,0,0,1,SMS Sent
9067,API,Direct Traffic,0,0,1,13.0,1409,2.6,SMS Sent,0,0,0,0,0,0,0,SMS Sent
9068,Landing Page Submission,Direct Traffic,0,0,1,5.0,210,2.5,SMS Sent,0,0,0,0,0,0,0,Modified
9069,Landing Page Submission,Direct Traffic,1,0,1,8.0,1845,2.67,Email Marked Spam,0,0,0,0,0,0,0,Email Marked Spam
9070,Landing Page Submission,Direct Traffic,0,0,0,2.0,238,2.0,SMS Sent,0,0,0,0,0,0,1,SMS Sent
9071,Landing Page Submission,Direct Traffic,1,0,0,2.0,199,2.0,SMS Sent,0,0,0,0,0,0,1,SMS Sent
9072,Landing Page Submission,Google,0,0,1,3.0,499,3.0,SMS Sent,0,0,0,0,0,0,0,SMS Sent
9073,Landing Page Submission,Direct Traffic,0,0,1,6.0,1279,3.0,SMS Sent,0,0,0,0,0,0,1,Modified


In [85]:
# Mostrar a estrutura do DF
df_leads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9074 entries, 0 to 9073
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Lead Origin                             9074 non-null   object 
 1   Lead Source                             9074 non-null   object 
 2   Do Not Email                            9074 non-null   int64  
 3   Do Not Call                             9074 non-null   int64  
 4   Converted                               9074 non-null   int64  
 5   TotalVisits                             9074 non-null   float64
 6   Total Time Spent on Website             9074 non-null   int64  
 7   Page Views Per Visit                    9074 non-null   float64
 8   Last Activity                           9074 non-null   object 
 9   Search                                  9074 non-null   int64  
 10  Newspaper Article                       9074 non-null   int6

### Preparação dos Dados


In [86]:
# Preparar os dados para o modelo
X = df_leads.drop(columns=['Converted'])
y = df_leads['Converted']

In [87]:
# Usar preprocessor ja salvo anteriormente
import joblib
preprocessor = joblib.load('./preprocessor_dataset_leads.pkl')

In [88]:
# Dividir os dados em treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=51)

In [89]:
# Aplicar preprocessor
X_train = preprocessor.fit_transform(X_train).toarray()
X_test = preprocessor.transform(X_test).toarray()

In [90]:
# Mostrar a estrutura destes conjuntos
print(X_train.shape)
print(X_test.shape)

(7259, 68)
(1815, 68)


### Treinamento do Modelo de Voting Classifier

In [91]:
# Criar o modelo de VotingClassifier


lr_model = LogisticRegression(random_state=51)
# Para executar o modo de Soft Voting, é necessário colocar o hiperparâmetro probability = True no modelo SVC
svc_model = SVC(probability=True, kernel='linear')
tree_model = DecisionTreeClassifier(random_state=51)

# Criar o objeto do VotingClassifier
voting_model = VotingClassifier(
    estimators = [
        ('logistic regression', lr_model),
        ('svc', svc_model),
        ('decision tree', tree_model)
    ],
    
    # Hard faz a votação pela maioria das predições dos estimadores
    #Soft faz a votação pela média das probabilidades de cada classe vindas de cada estimador
    voting='soft'   
)

In [92]:
# Treinar o modelo
voting_model.fit(X_train, y_train)

### Análise dos Resultados

In [93]:
#Realizar prediçoes no conjunto de teste
y_pred = voting_model.predict(X_test)

In [94]:
y_pred

array([1, 0, 0, ..., 0, 0, 1])

In [95]:
# Calcular Métricas
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [96]:
# Apresentar as metricas
print(f'Acuracia: {accuracy}')
print(f'Precisão: {precision}')
print(f'Recall: {recall}')
print(f'F1_Score: {f1}')

Acuracia: 0.7862258953168044
Precisão: 0.7203125
Recall: 0.6880597014925374
F1_Score: 0.7038167938931298


In [97]:
# Mostrar uma matriz de confusão 
conf_matrix = confusion_matrix(y_test, y_pred)

fig = px.imshow(conf_matrix,
                labels=dict(x='Predição', y='Real', color='Contagem'),
                x=['Not Converted', 'Converted'],
                y=['Not Converted', 'Converted'],
                color_continuous_scale='Viridis')
fig.update_traces(text=conf_matrix, texttemplate="%{z}")
fig.update_layout(coloraxis_showscale=False)
fig.show()

In [98]:
# Calcular a importância das variáveis considerando o Stacking Classifier

importances = []

for estimator in voting_model.estimators_:
    # Modelos lineares possuem coeficiente
    if hasattr(estimator, 'coef_'):
        importances.append(np.abs(estimator.coef_[0]))
        print(f'Coeficientes do modelo para {type(estimator).__name__}')  
    # Modelos baseados em arvores 
    elif hasattr(estimator, 'feature_importances_'):
        importances.append(np.abs(estimator.feature_importances_))
        print(f'Feature Importances do modelo para {type(estimator).__name__}')  
    # Caso não encontre coef e feature importances
    else:
        print(f'Não foi possível calcular a importância para {type(estimator).__name__}')    

Coeficientes do modelo para LogisticRegression
Coeficientes do modelo para SVC
Feature Importances do modelo para DecisionTreeClassifier


In [99]:
# Calcular a média das importâncias
importancia_media = np.mean(importances, axis=0)

In [100]:
importancia_media

array([0.22523852, 0.03587513, 0.11334446, 0.85206433, 0.08244887,
       0.0072784 , 0.03554339, 0.05729069, 0.04262539, 0.00572446,
       0.03675622, 0.01354635, 0.59017145, 0.62217065, 1.33160526,
       0.1270707 , 0.15406345, 0.332447  , 0.43163126, 0.12453701,
       0.01089549, 0.31246918, 0.32214603, 0.19972264, 0.04277403,
       0.27364251, 0.25843587, 0.35100332, 0.19591516, 0.88615977,
       0.1092695 , 0.33488185, 0.05459321, 0.14172053, 0.08172463,
       0.68320062, 0.60979761, 0.55129466, 0.02167269, 0.1209071 ,
       0.10067165, 0.29920279, 0.25233692, 0.6822895 , 0.48054617,
       0.31871189, 0.3549633 , 0.16075804, 0.22973273, 0.10262642,
       0.05419765, 0.02177073, 0.03318338, 0.18525745, 0.48984408,
       0.1209071 , 0.45366622, 0.1378337 , 0.05822045, 0.60502148,
       0.52612309, 0.57091876, 0.24872689, 0.35476814, 0.35041113,
       0.90905856, 0.02855717, 0.25441671])

In [101]:
# Nomes das features
feature_names = preprocessor.get_feature_names_out()

In [102]:
# Criar um Dataframe combinado nomes das variaveis com as importãncias (coeficientes)
df_feature_importances = pd.DataFrame({'Feature': feature_names, 'Importance': importancia_media})

In [103]:
# Ordenar o Dataframe pela importãnncia
df_feature_importances = df_feature_importances.sort_values(by='Importance', ascending=True)

In [104]:
# Plotar a importância das features
fig = px.bar(df_feature_importances,
             x='Importance',
             y='Feature',
             orientation='h',
             title='Importãncia das Features dos algoritmos base')

fig.update_layout(height=1280, width=1000)
fig.show()

### Propriedades do Modelo

In [105]:
# Mostrar evidências Hard Voting

# Selecionar um registro da base para fazer a predição
X_sample = X_test[7].reshape(1, -1)

# Predições individuais dos estimadores
log_pred = voting_model.named_estimators_['logistic regression'].predict(X_sample)
svc_pred = voting_model.named_estimators_['svc'].predict(X_sample)
tree_pred = voting_model.named_estimators_['decision tree'].predict(X_sample)

# Predição final com o Voting Classifier
voting_pred = voting_model.predict(X_sample)

In [106]:
# Exibir os resultados
print(f'Regressão Logística: {log_pred[0]}')
print(f'SVC. {svc_pred[0]}')
print(f'Predição do Árvore de Decisão: {tree_pred[0]}')
print(f'Predição final do Hard Voting: {voting_pred[0]}')

Regressão Logística: 0
SVC. 0
Predição do Árvore de Decisão: 1
Predição final do Hard Voting: 1


In [107]:
# Mostrar evidências Soft Voting

# Selecionar um registro da base para fazer a predição
X_sample = X_test[7].reshape(1, -1)

# Predições individuais das probabilidades dos estimadores
log_proba = voting_model.named_estimators_['logistic regression'].predict_proba(X_sample)
svc_proba = voting_model.named_estimators_['svc'].predict_proba(X_sample)
tree_proba = voting_model.named_estimators_['decision tree'].predict_proba(X_sample)

# Predição final com o Voting Classifier
voting_pred = voting_model.predict(X_sample)
voting_proba = voting_model.predict(X_sample)

In [108]:
# Exibir os resultados
print(f'Regressão Logística: {log_proba}')
print(f'SVC. {svc_proba}')
print(f'Predição do Árvore de Decisão: {tree_pred[0]}')
print(f'Probabilidade final do Soft Voting (Votação Ponderada): {voting_proba}')
print(f'Predição final do Soft Voting (Votação Ponderada): {voting_pred[0]}')

Regressão Logística: [[0.67050795 0.32949205]]
SVC. [[0.6925824 0.3074176]]
Predição do Árvore de Decisão: 1
Probabilidade final do Soft Voting (Votação Ponderada): [1]
Predição final do Soft Voting (Votação Ponderada): 1
