In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

sns.set_style('whitegrid')

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix

In [2]:
df_leads = pd.read_csv('../bagging/aula/leads_cleaned.csv')

In [3]:
df_leads.head()

Unnamed: 0,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Search,Newspaper Article,X Education Forums,Newspaper,Digital Advertisement,Through Recommendations,A free copy of Mastering The Interview,Last Notable Activity
0,API,Olark Chat,0,0,0,0.0,0,0.0,Page Visited on Website,0,0,0,0,0,0,0,Modified
1,API,Organic Search,0,0,0,5.0,674,2.5,Email Opened,0,0,0,0,0,0,0,Email Opened
2,Landing Page Submission,Direct Traffic,0,0,1,2.0,1532,2.0,Email Opened,0,0,0,0,0,0,1,Email Opened
3,Landing Page Submission,Direct Traffic,0,0,0,1.0,305,1.0,Unreachable,0,0,0,0,0,0,0,Modified
4,Landing Page Submission,Google,0,0,1,2.0,1428,1.0,Converted to Lead,0,0,0,0,0,0,0,Modified


In [4]:
df_leads.tail()

Unnamed: 0,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Search,Newspaper Article,X Education Forums,Newspaper,Digital Advertisement,Through Recommendations,A free copy of Mastering The Interview,Last Notable Activity
9069,Landing Page Submission,Direct Traffic,1,0,1,8.0,1845,2.67,Email Marked Spam,0,0,0,0,0,0,0,Email Marked Spam
9070,Landing Page Submission,Direct Traffic,0,0,0,2.0,238,2.0,SMS Sent,0,0,0,0,0,0,1,SMS Sent
9071,Landing Page Submission,Direct Traffic,1,0,0,2.0,199,2.0,SMS Sent,0,0,0,0,0,0,1,SMS Sent
9072,Landing Page Submission,Google,0,0,1,3.0,499,3.0,SMS Sent,0,0,0,0,0,0,0,SMS Sent
9073,Landing Page Submission,Direct Traffic,0,0,1,6.0,1279,3.0,SMS Sent,0,0,0,0,0,0,1,Modified


In [5]:
df_leads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9074 entries, 0 to 9073
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Lead Origin                             9074 non-null   object 
 1   Lead Source                             9074 non-null   object 
 2   Do Not Email                            9074 non-null   int64  
 3   Do Not Call                             9074 non-null   int64  
 4   Converted                               9074 non-null   int64  
 5   TotalVisits                             9074 non-null   float64
 6   Total Time Spent on Website             9074 non-null   int64  
 7   Page Views Per Visit                    9074 non-null   float64
 8   Last Activity                           9074 non-null   object 
 9   Search                                  9074 non-null   int64  
 10  Newspaper Article                       9074 non-null   int6

## Preparação dos Dados

In [6]:
X = df_leads.drop(columns=['Converted'])
y = df_leads['Converted']

In [7]:
numeric_features = X.select_dtypes(include=['number']).columns
categorical_features = X.select_dtypes(include=['object']).columns

In [8]:
# Usar o preprocessor já salvo anteriormente
import joblib

preprocessor = joblib.load('../bagging/aula/preprocessor_dataset_leads.pkl')

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2,random_state=51)

In [10]:
# Aplicar o preprocessor
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [11]:
# Mostrar a estrutura destes conjuntos
print(X_train.shape)
print(X_test.shape)

(7259, 68)
(1815, 68)


## Treinamento do modelo

In [12]:
# Criar o modelo de BoostingClassifier
boosting_model = AdaBoostClassifier(
    estimator=LogisticRegression(),
    n_estimators=50,
    learning_rate=0.1,
    random_state=51
)

In [13]:
# Treinar o modelo
boosting_model.fit(X_train, y_train)

0,1,2
,"estimator  estimator: object, default=None The base estimator from which the boosted ensemble is built. Support for sample weighting is required, as well as proper ``classes_`` and ``n_classes_`` attributes. If ``None``, then the base estimator is :class:`~sklearn.tree.DecisionTreeClassifier` initialized with `max_depth=1`. .. versionadded:: 1.2  `base_estimator` was renamed to `estimator`.",LogisticRegression()
,"n_estimators  n_estimators: int, default=50 The maximum number of estimators at which boosting is terminated. In case of perfect fit, the learning procedure is stopped early. Values must be in the range `[1, inf)`.",50
,"learning_rate  learning_rate: float, default=1.0 Weight applied to each classifier at each boosting iteration. A higher learning rate increases the contribution of each classifier. There is a trade-off between the `learning_rate` and `n_estimators` parameters. Values must be in the range `(0.0, inf)`.",0.1
,"random_state  random_state: int, RandomState instance or None, default=None Controls the random seed given at each `estimator` at each boosting iteration. Thus, it is only used when `estimator` exposes a `random_state`. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `.",51

0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


## Avaliação do modelo

In [14]:
y_pred = boosting_model.predict(X_test)

In [15]:
# Métricas do modelo
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test,y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test,y_pred)

In [16]:
# Mostrar métricas
print(f'Acurácia: {accuracy}')
print(f'Precisão: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')

Acurácia: 0.7790633608815427
Precisão: 0.6935251798561151
Recall: 0.7194029850746269
F1-Score: 0.7062271062271063


In [17]:
conf_matrix = confusion_matrix(y_test, y_pred)

fig = px.imshow(
    conf_matrix,
    labels=dict(x = 'Predição', y = 'Real', color="Contagem"),
    x=['Not Converted', 'Converted'],
    y=['Not Converted', 'Converted'],
    color_continuous_scale='Viridis'
)

fig.update_traces(text=conf_matrix, texttemplate='%{z}')
fig.update_layout(coloraxis_showscale=False)

fig.show()

In [18]:
# Calcular a importância das variáveis
importance = np.mean([np.abs(estimator.coef_[0]) for estimator in boosting_model.estimators_], axis=0)

In [19]:
importance, importance.shape

(array([2.71674882e-02, 5.03577625e-03, 4.84677554e-03, 5.04975753e-02,
        1.02437391e-02, 1.08999526e-03, 1.30783727e-03, 3.34208813e-03,
        3.37239005e-03, 4.05938689e-04, 2.99896325e-03, 8.22731217e-03,
        1.67209920e-02, 1.55471327e-02, 3.28477960e-02, 5.90137704e-04,
        1.47004714e-04, 1.43405991e-02, 6.90550339e-04, 6.30053313e-03,
        7.11321218e-05, 6.77862830e-05, 7.31914165e-03, 3.37312693e-03,
        4.29940842e-05, 2.41189122e-02, 1.14099451e-03, 1.30353654e-04,
        6.08526009e-05, 8.36190172e-03, 1.55387550e-04, 8.69174431e-05,
        4.24454410e-05, 6.88282782e-05, 4.05260477e-05, 4.62404007e-04,
        8.63908856e-03, 7.07066134e-03, 1.14318081e-03, 1.59853745e-04,
        6.88077024e-03, 1.61706735e-04, 1.34303835e-03, 1.61920501e-03,
        2.11649435e-02, 5.85592838e-03, 1.72390991e-04, 3.46982063e-02,
        1.02416324e-03, 6.64446641e-04, 4.17759238e-05, 4.11933552e-05,
        6.59816995e-05, 5.77150777e-04, 7.05625765e-04, 1.598537

In [20]:
# Obter os nomes reais das features
feature_names = (numeric_features.tolist() + preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features).tolist())

In [21]:
# Criar um DataFrame combinando os nomes das variáveis com as importâncias (coeficientes)
df_feature_importances = pd.DataFrame({'Feature': feature_names, 'Importance': importance})

In [22]:
df_feature_importances = df_feature_importances.sort_values(by='Importance', ascending=True)
df_feature_importances

Unnamed: 0,Feature,Importance
34,Lead Source_youtubechannel,0.000041
51,Last Activity_Visited Booth in Tradeshow,0.000041
67,Last Notable Activity_View in browser link Cli...,0.000042
58,Last Notable Activity_Form Submitted on Website,0.000042
50,Last Activity_View in browser link Clicked,0.000042
...,...,...
60,Last Notable Activity_Modified,0.028695
64,Last Notable Activity_SMS Sent,0.031831
14,Lead Origin_Lead Add Form,0.032848
47,Last Activity_SMS Sent,0.034698


In [23]:
# Plotar o gráfico de importância

fig = px.bar(df_feature_importances,
             x='Importance',
             y='Feature',
             orientation='h',
             title='Importância das Features (Baseada nos coeficientes absolutos)'
)

fig.update_layout(height=1280,width=1000,yaxis={'categoryorder': 'total ascending'})
fig.show()

## Saídas do modelo

In [24]:
# Erros dos estimadores
boosting_model.estimator_errors_

array([0.37994214, 0.38542874, 0.38625994, 0.38232347, 0.37879997,
       0.37036344, 0.36172148, 0.35246908, 0.34558145, 0.31393885,
       0.31692438, 0.32023063, 0.27056929, 0.27849037, 0.29824822,
       0.30901855, 0.32264088, 0.33846908, 0.3531957 , 0.36529172,
       0.36744302, 0.3906565 , 0.38605135, 0.40639296, 0.40262505,
       0.41124235, 0.4270884 , 0.42346159, 0.42987367, 0.43602776,
       0.4479203 , 0.44411563, 0.4504534 , 0.45217655, 0.45929966,
       0.45639688, 0.46011546, 0.46089732, 0.46158429, 0.46758137,
       0.46692241, 0.46879289, 0.47061615, 0.47295472, 0.47441434,
       0.47424984, 0.47633927, 0.47713595, 0.47465613, 0.47570535])

In [25]:
# Pesos dos estimadores
boosting_model.estimator_weights_

array([0.04897938, 0.04665686, 0.04630609, 0.04796979, 0.04946449,
       0.0530658 , 0.05679002, 0.06082036, 0.06385188, 0.07817685,
       0.07679424, 0.07527121, 0.09917361, 0.09519623, 0.08556536,
       0.08047117, 0.07416617, 0.0670124 , 0.06050215, 0.05524692,
       0.0543202 , 0.04445535, 0.04639409, 0.0378897 , 0.03945392,
       0.03588319, 0.02937404, 0.03085791, 0.02823666, 0.02572991,
       0.02090771, 0.02244754, 0.0198839 , 0.01918804, 0.01631624,
       0.01748566, 0.01598778, 0.01567308, 0.01539663, 0.01298567,
       0.01325039, 0.01249909, 0.0117671 , 0.01082868, 0.01024321,
       0.01030919, 0.00947137, 0.009152  , 0.01014625, 0.00972552])

In [26]:
# Predizer a Probabilidade de Conversão
y_pred_prob = boosting_model.predict_proba(X_test)

In [27]:
y_pred_prob

array([[0.23673492, 0.76326508],
       [0.73494926, 0.26505074],
       [0.73494926, 0.26505074],
       ...,
       [0.88079708, 0.11920292],
       [0.88079708, 0.11920292],
       [0.11920292, 0.88079708]], shape=(1815, 2))