In [1]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from scipy.stats import chi2_contingency

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix

In [2]:
# Load the raw leads dataset from a CSV file located next to the notebook.
df_leads = pd.read_csv('./leads.csv')

In [3]:
# Summarise the raw frame: column names, non-null counts and dtypes.
df_leads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 37 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Prospect ID                                    9240 non-null   object 
 1   Lead Number                                    9240 non-null   int64  
 2   Lead Origin                                    9240 non-null   object 
 3   Lead Source                                    9204 non-null   object 
 4   Do Not Email                                   9240 non-null   object 
 5   Do Not Call                                    9240 non-null   object 
 6   Converted                                      9240 non-null   int64  
 7   TotalVisits                                    9103 non-null   float64
 8   Total Time Spent on Website                    9240 non-null   int64  
 9   Page Views Per Visit                           9103 

In [4]:
# Preview the first rows of the raw data.
df_leads.head()

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,...,Get updates on DM Content,Lead Profile,City,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,I agree to pay the amount through cheque,A free copy of Mastering The Interview,Last Notable Activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,API,Olark Chat,No,No,0,0.0,0,0.0,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,API,Organic Search,No,No,0,5.0,674,2.5,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Email Opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,Landing Page Submission,Direct Traffic,No,No,1,2.0,1532,2.0,...,No,Potential Lead,Mumbai,02.Medium,01.High,14.0,20.0,No,Yes,Email Opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,Landing Page Submission,Direct Traffic,No,No,0,1.0,305,1.0,...,No,Select,Mumbai,02.Medium,01.High,13.0,17.0,No,No,Modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,Landing Page Submission,Google,No,No,1,2.0,1428,1.0,...,No,Select,Mumbai,02.Medium,01.High,15.0,18.0,No,No,Modified


## Feature Engineering e Data Cleaning

In [5]:
# Drop the two identifier columns so they are not used as model features.
# errors='ignore' keeps the cell re-runnable once the columns are already gone
# (a plain in-place drop raises KeyError on the second execution).
df_leads.drop(columns=['Prospect ID', 'Lead Number'], inplace=True, errors='ignore')

In [6]:
# Report and drop every categorical (object) column that holds a single distinct value.
colunas_constantes = [
    col
    for col in df_leads.select_dtypes(include=['object']).columns
    if df_leads[col].nunique() == 1
]

for col in colunas_constantes:
    print(f'Coluna {col} possui somente um valor possível {df_leads[col].nunique()}, removendo...')
    df_leads.drop(columns=[col], axis=1, inplace=True)



Coluna Magazine possui somente um valor possível 1, removendo...
Coluna Receive More Updates About Our Courses possui somente um valor possível 1, removendo...
Coluna Update me on Supply Chain Content possui somente um valor possível 1, removendo...
Coluna Get updates on DM Content possui somente um valor possível 1, removendo...
Coluna I agree to pay the amount through cheque possui somente um valor possível 1, removendo...


In [7]:
# Print the distinct values of each remaining categorical column.
colunas_texto = df_leads.select_dtypes(include=['object']).columns
for col in colunas_texto:
    valores = df_leads[col].unique()
    print(f'Coluna {col} possui os seguintes valores possíveis: {valores}')

Coluna Lead Origin possui os seguintes valores possíveis: ['API' 'Landing Page Submission' 'Lead Add Form' 'Lead Import'
 'Quick Add Form']
Coluna Lead Source possui os seguintes valores possíveis: ['Olark Chat' 'Organic Search' 'Direct Traffic' 'Google' 'Referral Sites'
 'Welingak Website' 'Reference' 'google' 'Facebook' nan 'blog'
 'Pay per Click Ads' 'bing' 'Social Media' 'WeLearn' 'Click2call'
 'Live Chat' 'welearnblog_Home' 'youtubechannel' 'testone' 'Press_Release'
 'NC_EDM']
Coluna Do Not Email possui os seguintes valores possíveis: ['No' 'Yes']
Coluna Do Not Call possui os seguintes valores possíveis: ['No' 'Yes']
Coluna Last Activity possui os seguintes valores possíveis: ['Page Visited on Website' 'Email Opened' 'Unreachable'
 'Converted to Lead' 'Olark Chat Conversation' 'Email Bounced'
 'Email Link Clicked' 'Form Submitted on Website' 'Unsubscribed'
 'Had a Phone Conversation' 'View in browser link Clicked' nan
 'Approached upfront' 'SMS Sent' 'Visited Booth in Tradeshow'
 

In [8]:
# For each categorical column, print the percentage of rows that are either
# missing (NaN) or hold the placeholder value 'Select'.
total_linhas = len(df_leads)
for col in df_leads.select_dtypes(include=['object']).columns:
    serie = df_leads[col]
    contagem_nulas = (serie == 'Select').sum() + serie.isnull().sum()
    print(f'{col}: {contagem_nulas/total_linhas * 100:.2f}%')

Lead Origin: 0.00%
Lead Source: 0.39%
Do Not Email: 0.00%
Do Not Call: 0.00%
Last Activity: 1.11%
Country: 26.63%
Specialization: 36.58%
How did you hear about X Education: 78.46%
What is your current occupation: 29.11%
What matters most to you in choosing a course: 29.32%
Search: 0.00%
Newspaper Article: 0.00%
X Education Forums: 0.00%
Newspaper: 0.00%
Digital Advertisement: 0.00%
Through Recommendations: 0.00%
Tags: 36.29%
Lead Quality: 51.59%
Lead Profile: 74.19%
City: 39.71%
Asymmetrique Activity Index: 45.65%
Asymmetrique Profile Index: 45.65%
A free copy of Mastering The Interview: 0.00%
Last Notable Activity: 0.00%


In [9]:
# Drop categorical columns in which more than 25% of the rows are missing or 'Select'.
# Only columns are removed here, so len(df_leads) stays constant across iterations.
for col in df_leads.select_dtypes(include=['object']).columns:
    serie = df_leads[col]
    contagem_nulas = (serie == 'Select').sum() + serie.isnull().sum()
    percentual = contagem_nulas/len(df_leads) * 100
    if percentual > 25:
        print(f'{col}')
        df_leads.drop(columns=[col],axis=1,inplace=True)

Country
Specialization
How did you hear about X Education
What is your current occupation
What matters most to you in choosing a course
Tags
Lead Quality
Lead Profile
City
Asymmetrique Activity Index
Asymmetrique Profile Index


In [10]:
# In the 'Lead Source' column, unify the lowercase spelling 'google' with 'Google'.
# Every other value (including NaN) is left untouched.
df_leads['Lead Source'] = df_leads['Lead Source'].replace('google', 'Google')

In [11]:
# Encode as 1/0 every categorical column whose values are exclusively 'Yes'/'No'.
# Columns containing anything else (e.g. NaN) fail the subset check and are skipped.
for col in df_leads.select_dtypes(include=['object']).columns:
    valores_unicos = df_leads[col].unique()
    eh_binaria = set(valores_unicos).issubset({'Yes', 'No'})
    if eh_binaria:
        print(col)
        df_leads[col] = df_leads[col].map(lambda valor: 1 if valor == 'Yes' else 0)

Do Not Email
Do Not Call
Search
Newspaper Article
X Education Forums
Newspaper
Digital Advertisement
Through Recommendations
A free copy of Mastering The Interview


In [12]:
# Remove rows that have a missing value in any of the remaining categorical columns.
colunas_categoricas = df_leads.select_dtypes(include=['object']).columns
df_leads.dropna(subset=colunas_categoricas, inplace=True)

In [13]:
# Show descriptive statistics for the numeric columns.
df_leads.describe()

Unnamed: 0,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Search,Newspaper Article,X Education Forums,Newspaper,Digital Advertisement,Through Recommendations,Asymmetrique Activity Score,Asymmetrique Profile Score,A free copy of Mastering The Interview
count,9103.0,9103.0,9103.0,9074.0,9103.0,9074.0,9103.0,9103.0,9103.0,9103.0,9103.0,9103.0,4944.0,4944.0,9103.0
mean,0.079205,0.00022,0.379216,3.456028,483.773921,2.370151,0.001538,0.00022,0.00011,0.00011,0.000439,0.000769,14.313511,16.34021,0.317258
std,0.270073,0.014822,0.485219,4.858802,545.519186,2.160871,0.039189,0.014822,0.010481,0.010481,0.020959,0.027721,1.394627,1.807428,0.465434
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,11.0,0.0
25%,0.0,0.0,0.0,1.0,12.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,15.0,0.0
50%,0.0,0.0,0.0,3.0,247.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,16.0,0.0
75%,0.0,0.0,1.0,5.0,924.0,3.2,0.0,0.0,0.0,0.0,0.0,0.0,15.0,18.0,1.0
max,1.0,1.0,1.0,251.0,2272.0,55.0,1.0,1.0,1.0,1.0,1.0,1.0,18.0,20.0,1.0


In [14]:
# Report and drop every numeric column that holds a single distinct value.
colunas_numericas_constantes = [
    col
    for col in df_leads.select_dtypes(include=['number']).columns
    if df_leads[col].nunique() == 1
]

for col in colunas_numericas_constantes:
    print(f'Coluna {col} possui somente um valor possível {df_leads[col].nunique()}, removendo...')
    df_leads.drop(columns=[col], axis=1, inplace=True)



In [15]:
# Print the percentage of missing (NaN) values in each numeric column.
# The 'Select' placeholder is a string and cannot occur in a numeric column,
# so (unlike the categorical check) only NaN is counted here.
for col in df_leads.select_dtypes(include=['number']).columns:
    contagem_nulas = df_leads[col].isnull().sum()
    print(f'{col}: {contagem_nulas/len(df_leads) * 100:.2f}%')

Do Not Email: 0.00%
Do Not Call: 0.00%
Converted: 0.00%
TotalVisits: 0.32%
Total Time Spent on Website: 0.00%
Page Views Per Visit: 0.32%
Search: 0.00%
Newspaper Article: 0.00%
X Education Forums: 0.00%
Newspaper: 0.00%
Digital Advertisement: 0.00%
Through Recommendations: 0.00%
Asymmetrique Activity Score: 45.69%
Asymmetrique Profile Score: 45.69%
A free copy of Mastering The Interview: 0.00%


In [16]:
# Drop numeric columns with more than 25% missing (NaN) values.
# The 'Select' placeholder is a string and cannot occur in a numeric column,
# so only NaN is counted. Only columns are removed, so len(df_leads) is stable.
for col in df_leads.select_dtypes(include=['number']).columns:
    contagem_nulas = df_leads[col].isnull().sum()
    if (contagem_nulas/len(df_leads) * 100) > 25:
        print(f'{col}')
        df_leads.drop(columns=[col],axis=1,inplace=True)

Asymmetrique Activity Score
Asymmetrique Profile Score


In [17]:
# Remove rows that still have a missing value in any numeric column.
colunas_numericas = df_leads.select_dtypes(include=['number']).columns
df_leads.dropna(subset=colunas_numericas,inplace=True)

In [18]:
# Check the frame after cleaning: remaining columns, non-null counts and dtypes.
df_leads.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9074 entries, 0 to 9239
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Lead Origin                             9074 non-null   object 
 1   Lead Source                             9074 non-null   object 
 2   Do Not Email                            9074 non-null   int64  
 3   Do Not Call                             9074 non-null   int64  
 4   Converted                               9074 non-null   int64  
 5   TotalVisits                             9074 non-null   float64
 6   Total Time Spent on Website             9074 non-null   int64  
 7   Page Views Per Visit                    9074 non-null   float64
 8   Last Activity                           9074 non-null   object 
 9   Search                                  9074 non-null   int64  
 10  Newspaper Article                       9074 non-null   int64  
 

## EDA

Hit Ratio
- Razão entre leads convertidos em vendas sobre a quantidade de leads (Conceito 1)
    Ex: 100 leads criados e 30 foram convertidos = 30%
- Razão entre leads convertidos em vendas sobre a quantidade de leads encerrados (Conceito 2)
    Ex: 100 leads criados, dos quais 40 foram encerrados: 20 não convertidos em venda e 20 convertidos em venda = 20/40 = 50%

In [19]:
# Distribution of the target variable, expressed as a percentage of all leads.
percentual_por_classe = df_leads['Converted'].value_counts()/len(df_leads) * 100

fig = px.bar(
    percentual_por_classe,
    title='Hit Ratio - Fator de Conversão',
    labels={'index': 'Converted', 'value': 'Percentual'},
    opacity=0.8,
)

# A single series is plotted, so the legend adds no information.
fig.update_layout(showlegend=False)
fig.show()

In [20]:
# Pairwise correlation matrix of all numeric columns (the target included).
df_numericas = df_leads.select_dtypes(include=['number'])
corr_matrix = df_numericas.corr()


In [21]:
# Correlation heatmap: each cell is annotated with its coefficient (2 decimals)
# and the diverging colour scale is pinned to the full [-1, 1] range.
heatmap_corr = go.Heatmap(
    x=corr_matrix.columns,
    y=corr_matrix.index,
    z=np.array(corr_matrix),
    text=corr_matrix.values,
    texttemplate='%{text:.2f}',
    colorscale=px.colors.diverging.RdBu,
    zmin=-1,
    zmax=1,
)

fig = go.Figure(data=[heatmap_corr])
fig.show()

In [22]:
# Box plot of TotalVisits for each value of Converted.
fig = px.box(
    df_leads,
    x='Converted',
    y='TotalVisits',
    color='Converted',
)
fig.show()

In [23]:
# Box plot of Total Time Spent on Website for each value of Converted.
fig = px.box(
    df_leads,
    x='Converted',
    y='Total Time Spent on Website',
    color='Converted',
)
fig.show()

In [24]:
# Box plot of Page Views Per Visit for each value of Converted.
fig = px.box(
    df_leads,
    x='Converted',
    y='Page Views Per Visit',
    color='Converted',
)
fig.show()

In [25]:
# Contingency table: Converted (rows) x Lead Source (columns).
contingency_table_lead_source = pd.crosstab(
    index=df_leads['Converted'],
    columns=df_leads['Lead Source'],
)

In [26]:
# Display the Converted x Lead Source counts.
contingency_table_lead_source

Lead Source,Click2call,Direct Traffic,Facebook,Google,Live Chat,NC_EDM,Olark Chat,Organic Search,Pay per Click Ads,Press_Release,Reference,Referral Sites,Social Media,WeLearn,Welingak Website,bing,blog,testone,welearnblog_Home,youtubechannel
Converted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,1,1725,22,1726,0,0,1305,718,1,2,33,94,1,0,2,5,1,1,1,1
1,3,818,9,1147,2,1,448,436,0,0,410,31,1,1,127,1,0,0,0,0


In [27]:
# Chi-square test of independence between Converted and Lead Source.
# NOTE(review): several Lead Source categories have only 0-3 observations in the
# contingency table, so the chi-square approximation may be unreliable for them —
# consider grouping rare sources before testing.
resultado_teste = chi2_contingency(contingency_table_lead_source)
chi2, p, dof, expected = resultado_teste

# Report the test statistic, the p-value and the degrees of freedom.
print(f'Estatística de qui-quadrado: {chi2}')
print(f'Valor p: {p}')
print(f'Graus de Liberdade: {dof}')

# The association is treated as significant at the 5% level.
print(f'Existe uma relação significativa entre Converted e Lead Source? {p < 0.05}')


Estatística de qui-quadrado: 942.1372507753774
Valor p: 1.1748671316223743e-187
Graus de Liberdade: 19
Existe uma relação significativa entre Converted e Lead Source? True


In [28]:
# Contingency table: Converted (rows) x Lead Origin (columns).
contingency_table_lead_origin = pd.crosstab(
    index=df_leads['Converted'],
    columns=df_leads['Lead Origin'],
)

In [29]:
# Display the Converted x Lead Origin counts.
contingency_table_lead_origin

Lead Origin,API,Landing Page Submission,Lead Add Form,Lead Import
Converted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2463,3118,37,21
1,1115,1767,544,9


In [30]:
# Chi-square test of independence between Converted and Lead Origin.
resultado_teste = chi2_contingency(contingency_table_lead_origin)
chi2, p, dof, expected = resultado_teste

# Report the test statistic, the p-value and the degrees of freedom.
print(f'Estatística de qui-quadrado: {chi2}')
print(f'Valor p: {p}')
print(f'Graus de Liberdade: {dof}')

# The association is treated as significant at the 5% level.
print(f'Existe uma relação significativa entre Converted e Lead Origin? {p < 0.05}')

Estatística de qui-quadrado: 843.1212236836468
Valor p: 1.9228780932726904e-182
Graus de Liberdade: 3
Existe uma relação significativa entre Converted e Lead Origin? True


In [31]:
# Contingency table: Converted (rows) x Last Notable Activity (columns).
contingency_table_lead_lna = pd.crosstab(
    index=df_leads['Converted'],
    columns=df_leads['Last Notable Activity'],
)

In [32]:
# Display the Converted x Last Notable Activity counts.
contingency_table_lead_lna

Last Notable Activity,Approached upfront,Email Bounced,Email Link Clicked,Email Marked Spam,Email Opened,Email Received,Form Submitted on Website,Had a Phone Conversation,Modified,Olark Chat Conversation,Page Visited on Website,Resubscribed to emails,SMS Sent,Unreachable,Unsubscribed,View in browser link Clicked
Converted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,51,128,0,1781,0,1,1,2587,158,225,0,663,10,33,1
1,1,9,45,2,1042,1,0,13,680,25,93,1,1489,22,12,0


In [33]:
# Chi-square test of independence between Converted and Last Notable Activity.
# NOTE(review): several activities have only 0-2 observations in the contingency
# table, so the chi-square approximation may be unreliable for them —
# consider grouping rare activities before testing.
resultado_teste = chi2_contingency(contingency_table_lead_lna)
chi2, p, dof, expected = resultado_teste

# Report the test statistic, the p-value and the degrees of freedom.
print(f'Estatística de qui-quadrado: {chi2}')
print(f'Valor p: {p}')
print(f'Graus de Liberdade: {dof}')

# The association is treated as significant at the 5% level.
print(f'Existe uma relação significativa entre Converted e Last Notable Activity? {p < 0.05}')

Estatística de qui-quadrado: 1424.6171966295433
Valor p: 8.365508263958168e-295
Graus de Liberdade: 15
Existe uma relação significativa entre Converted e Last Notable Activity? True


## Preparação dos Dados

In [34]:
# Separate the target (y) from the predictor columns (X).
y = df_leads['Converted']
X = df_leads.drop(columns=['Converted'])

In [35]:
# Build a ColumnTransformer that standardises the numeric columns and one-hot
# encodes the categorical ones.
numeric_features = X.select_dtypes(include=['number']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# handle_unknown='ignore': categories not seen during fit are encoded as all zeros
# instead of raising at transform time.
etapas_transformacao = [
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
]

preprocessor = ColumnTransformer(transformers=etapas_transformacao)



In [36]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=51,
)

# Fit the preprocessor on the training rows only, then apply the fitted
# transformation to the test rows.
# NOTE: X_train / X_test are overwritten with the transformed matrices, so this
# cell must not be re-run without re-running the split above it.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [37]:
# Print the (rows, columns) shape of the transformed train and test matrices.
for conjunto in (X_train, X_test):
    print(conjunto.shape)

(7259, 68)
(1815, 68)


## Treinamento do Modelo

In [38]:
# Bagging ensemble of 10 logistic regressions; the fixed seed makes the
# resampling reproducible.
regressao_base = LogisticRegression()

bagging_model = BaggingClassifier(
    estimator=regressao_base,
    n_estimators=10,
    random_state=51,
)

In [39]:
# Train the ensemble on the preprocessed training data.
bagging_model.fit(X_train, y_train)

0,1,2
,"estimator  estimator: object, default=None The base estimator to fit on random subsets of the dataset. If None, then the base estimator is a :class:`~sklearn.tree.DecisionTreeClassifier`. .. versionadded:: 1.2  `base_estimator` was renamed to `estimator`.",LogisticRegression()
,"n_estimators  n_estimators: int, default=10 The number of base estimators in the ensemble.",10
,"max_samples  max_samples: int or float, default=None The number of samples to draw from X to train each base estimator (with replacement by default, see `bootstrap` for more details). - If None, then draw `X.shape[0]` samples irrespective of `sample_weight`. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` unweighted samples or  `max_samples * sample_weight.sum()` weighted samples.",
,"max_features  max_features: int or float, default=1.0 The number of features to draw from X to train each base estimator ( without replacement by default, see `bootstrap_features` for more details). - If int, then draw `max_features` features. - If float, then draw `max(1, int(max_features * n_features_in_))` features.",1.0
,"bootstrap  bootstrap: bool, default=True Whether samples are drawn with replacement. If False, sampling without replacement is performed. If fitting with `sample_weight`, it is strongly recommended to choose True, as only drawing with replacement will ensure the expected frequency semantics of `sample_weight`.",True
,"bootstrap_features  bootstrap_features: bool, default=False Whether features are drawn with replacement.",False
,"oob_score  oob_score: bool, default=False Whether to use out-of-bag samples to estimate the generalization error. Only available if bootstrap=True.",False
,"warm_start  warm_start: bool, default=False When set to True, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new ensemble. See :term:`the Glossary `. .. versionadded:: 0.17  *warm_start* constructor parameter.",False
,"n_jobs  n_jobs: int, default=None The number of jobs to run in parallel for both :meth:`fit` and :meth:`predict`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"random_state  random_state: int, RandomState instance or None, default=None Controls the random resampling of the original dataset (sample wise and feature wise). If the base estimator accepts a `random_state` attribute, a different seed is generated for each instance in the ensemble. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `.",51

0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. 
seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


## Avaliação do Modelo

In [40]:
# Predict the class labels for the held-out test set.
y_pred = bagging_model.predict(X_test)

In [41]:
# Compute accuracy, precision, recall and F1 on the test set, in that order.
accuracy, precision, recall, f1 = (
    metrica(y_test, y_pred)
    for metrica in (accuracy_score, precision_score, recall_score, f1_score)
)

In [42]:
# Print the four test-set metrics computed in the previous cell.
print(f'Acurácia: {accuracy}')
print(f'Precisão: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')


Acurácia: 0.7972451790633609
Precisão: 0.7467320261437909
Recall: 0.682089552238806
F1-Score: 0.7129485179407177


In [43]:
# Confusion matrix rendered as a Plotly image: rows are the real classes,
# columns are the predicted classes.
conf_matrix = confusion_matrix(y_test,y_pred)

rotulos_classes = ['Not Converted', 'Converted']
fig = px.imshow(
    conf_matrix,
    labels={'x': 'Predição', 'y': 'Real', 'color': 'Contagem'},
    x=rotulos_classes,
    y=list(rotulos_classes),
    color_continuous_scale='Viridis',
)

# Write the count inside each cell and hide the colour bar.
fig.update_traces(text=conf_matrix, texttemplate="%{z}")
fig.update_layout(coloraxis_showscale=False)

fig.show()

In [44]:
# Feature importance = mean absolute logistic-regression coefficient across the
# ensemble's estimators. Every estimator is trained on all features
# (max_features=1.0), so the coefficient vectors line up column by column.
coeficientes_absolutos = np.abs(
    [estimator.coef_[0] for estimator in bagging_model.estimators_]
)
importances = coeficientes_absolutos.mean(axis=0)

In [45]:
# Display the importance value of each transformed feature.
importances

array([0.40189155, 0.08806415, 0.1724969 , 1.17646949, 0.07704141,
       0.06443448, 0.06484329, 0.0533487 , 0.07394129, 0.09108257,
       0.17577771, 0.03551897, 0.94631037, 1.0516547 , 2.38290094,
       0.17146875, 0.41623408, 0.58200876, 0.52993635, 0.21738235,
       0.02906281, 0.23886942, 0.70509266, 0.35022179, 0.14621416,
       0.7772982 , 0.61212057, 0.44696598, 0.16149782, 1.71081815,
       0.19323702, 0.29102003, 0.16202876, 0.16193619, 0.1589916 ,
       1.13696103, 0.83191452, 0.81171513, 0.20964312, 0.09017097,
       0.19279115, 0.39869674, 0.50111268, 1.20248534, 1.1136462 ,
       0.48552491, 0.40554919, 0.36056697, 0.22458563, 0.44071235,
       0.21679092, 0.07091514, 0.06796285, 0.61639388, 0.87379693,
       0.09017097, 0.65467374, 0.21113675, 0.10538988, 1.05884761,
       0.90565961, 1.05480368, 0.42009677, 0.40554919, 0.6427086 ,
       1.39221694, 0.39744413, 0.10056579])

In [46]:
# Rebuild the feature names in the preprocessor's output order: numeric columns
# first, followed by the one-hot encoded category columns.
encoder_categorico = preprocessor.named_transformers_['cat']
nomes_categoricos = encoder_categorico.get_feature_names_out(categorical_features)

features_names = numeric_features.tolist() + nomes_categoricos.tolist()

In [47]:
# Display the reconstructed feature names.
features_names

['Do Not Email',
 'Do Not Call',
 'TotalVisits',
 'Total Time Spent on Website',
 'Page Views Per Visit',
 'Search',
 'Newspaper Article',
 'X Education Forums',
 'Newspaper',
 'Digital Advertisement',
 'Through Recommendations',
 'A free copy of Mastering The Interview',
 'Lead Origin_API',
 'Lead Origin_Landing Page Submission',
 'Lead Origin_Lead Add Form',
 'Lead Origin_Lead Import',
 'Lead Source_Click2call',
 'Lead Source_Direct Traffic',
 'Lead Source_Facebook',
 'Lead Source_Google',
 'Lead Source_Live Chat',
 'Lead Source_NC_EDM',
 'Lead Source_Olark Chat',
 'Lead Source_Organic Search',
 'Lead Source_Pay per Click Ads',
 'Lead Source_Reference',
 'Lead Source_Referral Sites',
 'Lead Source_Social Media',
 'Lead Source_WeLearn',
 'Lead Source_Welingak Website',
 'Lead Source_bing',
 'Lead Source_blog',
 'Lead Source_testone',
 'Lead Source_welearnblog_Home',
 'Lead Source_youtubechannel',
 'Last Activity_Approached upfront',
 'Last Activity_Converted to Lead',
 'Last Activity_

In [48]:
# Build a DataFrame pairing each feature name with its importance.
df_feature_importances = pd.DataFrame({'Feature': features_names, 'Importance':importances})

In [49]:
# Display the feature/importance table (still in preprocessor order).
df_feature_importances

Unnamed: 0,Feature,Importance
0,Do Not Email,0.401892
1,Do Not Call,0.088064
2,TotalVisits,0.172497
3,Total Time Spent on Website,1.176469
4,Page Views Per Visit,0.077041
...,...,...
63,Last Notable Activity_Resubscribed to emails,0.405549
64,Last Notable Activity_SMS Sent,0.642709
65,Last Notable Activity_Unreachable,1.392217
66,Last Notable Activity_Unsubscribed,0.397444


In [50]:
# Sort the table by importance, smallest first (ascending is the default order).
df_feature_importances = df_feature_importances.sort_values('Importance')

In [51]:
# Display the feature/importance table after sorting by importance.
df_feature_importances

Unnamed: 0,Feature,Importance
20,Lead Source_Live Chat,0.029063
11,A free copy of Mastering The Interview,0.035519
7,X Education Forums,0.053349
5,Search,0.064434
6,Newspaper Article,0.064843
...,...,...
3,Total Time Spent on Website,1.176469
43,Last Activity_Had a Phone Conversation,1.202485
65,Last Notable Activity_Unreachable,1.392217
29,Lead Source_Welingak Website,1.710818


In [52]:
# Horizontal bar chart of the feature importances.
fig = px.bar(
    df_feature_importances,
    x='Importance',
    y='Feature',
    orientation='h',
    title='Importância das Features (com base nos coeficientes absolutos)',
)

# Tall figure so every feature label fits; bars are ordered by their total value.
fig.update_layout(
    height=1280,
    width=1000,
    yaxis={'categoryorder': 'total ascending'},
)
fig.show()


## Some more things

In [53]:
# Inspect the training-row indices drawn for each estimator of the ensemble.
bagging_model.estimators_samples_

[array([6647, 2395, 3964, ..., 1386, 4404, 2518], shape=(7259,)),
 array([ 914, 6214, 4940, ..., 7166,  151, 1272], shape=(7259,)),
 array([6067, 5889, 4247, ..., 5644, 3350,  728], shape=(7259,)),
 array([ 405, 6461, 1538, ..., 6934, 1805, 2162], shape=(7259,)),
 array([3226, 4034, 4872, ..., 3994,  140, 4734], shape=(7259,)),
 array([ 514, 5133, 4920, ..., 6956, 7153, 1234], shape=(7259,)),
 array([1246, 4026, 3914, ..., 6846, 1043, 2326], shape=(7259,)),
 array([2697, 5933, 5075, ...,  543, 3580, 1019], shape=(7259,)),
 array([5528, 2923, 4766, ..., 5378, 5127,  672], shape=(7259,)),
 array([5310, 3010, 5471, ..., 4084, 4783, 4111], shape=(7259,))]

In [54]:
# Inspect the feature indices used by each estimator of the ensemble.
bagging_model.estimators_features_

[array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67]),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67]),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67]),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
      

In [55]:
# Predict conversion probabilities (one column per class).
# NOTE(review): this scores X_train, the same data the model was fitted on;
# presumably unseen/open leads are the intended input — confirm.
y_pred_prob = bagging_model.predict_proba(X_train)

In [56]:
# Display the predicted class probabilities.
y_pred_prob

array([[0.94462405, 0.05537595],
       [0.9280015 , 0.0719985 ],
       [0.91636572, 0.08363428],
       ...,
       [0.63791065, 0.36208935],
       [0.51214849, 0.48785151],
       [0.67731414, 0.32268586]], shape=(7259, 2))

## Cenário de CRM - Utilidade da Probabilidade

- CRM
Leads concluídos - Resultado Positivo ou Negativo
Leads em aberto - Não tenho resultado

Treine um modelo no que está concluído, para que ele generalize bem no que está em aberto

Lead em aberto
- Probabilidade de Converter

Quando muito alta, podemos olhar com mais foco para realmente converter

Quando muito baixa, podemos descartar

- Importância das features

## Salvar dados e pre-processador do modelo

In [57]:
# Save the cleaned DataFrame as CSV, without writing the index column.
df_leads.to_csv('./leads_cleaned.csv', index=False)

In [58]:
# Persist the fitted ColumnTransformer so the same preprocessing can be reapplied later.
# joblib is imported in the notebook's import cell rather than mid-notebook,
# so all dependencies are visible at the top.
joblib.dump(preprocessor,'./preprocessor_dataset_leads.pkl')

['./preprocessor_dataset_leads.pkl']