In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix
import plotly.express as px


In [2]:
df_airline_sat = pd.read_csv('../bagging/df_airline_sat_cleaned.csv')

In [3]:
df_airline_sat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 23 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Gender                             103904 non-null  object 
 1   Customer Type                      103904 non-null  object 
 2   Age                                103904 non-null  int64  
 3   Type of Travel                     103904 non-null  object 
 4   Class                              103904 non-null  object 
 5   Flight Distance                    103904 non-null  int64  
 6   Inflight wifi service              103904 non-null  int64  
 7   Departure/Arrival time convenient  103904 non-null  int64  
 8   Ease of Online booking             103904 non-null  int64  
 9   Gate location                      103904 non-null  int64  
 10  Food and drink                     103904 non-null  int64  
 11  Online boarding                    1039

## Importar preprocessador

In [4]:
import joblib

preprocessor = joblib.load('../bagging/preprocessor.pkl')

In [5]:
X = df_airline_sat.drop(columns=['satisfaction'], axis=1)
y = df_airline_sat['satisfaction']

In [6]:
# Dividir os dados em treino e teste

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [7]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [8]:
len(X_train), len(X_test)

(83123, 20781)

## Treinar o modelo

In [9]:
adaboost_model = AdaBoostClassifier(
    estimator=LogisticRegression(),
    learning_rate=0.1,
    random_state=51,
    n_estimators=60
)

In [10]:
adaboost_model.fit(X_train,y_train)

0,1,2
,"estimator  estimator: object, default=None The base estimator from which the boosted ensemble is built. Support for sample weighting is required, as well as proper ``classes_`` and ``n_classes_`` attributes. If ``None``, then the base estimator is :class:`~sklearn.tree.DecisionTreeClassifier` initialized with `max_depth=1`. .. versionadded:: 1.2  `base_estimator` was renamed to `estimator`.",LogisticRegression()
,"n_estimators  n_estimators: int, default=50 The maximum number of estimators at which boosting is terminated. In case of perfect fit, the learning procedure is stopped early. Values must be in the range `[1, inf)`.",60
,"learning_rate  learning_rate: float, default=1.0 Weight applied to each classifier at each boosting iteration. A higher learning rate increases the contribution of each classifier. There is a trade-off between the `learning_rate` and `n_estimators` parameters. Values must be in the range `(0.0, inf)`.",0.1
,"random_state  random_state: int, RandomState instance or None, default=None Controls the random seed given at each `estimator` at each boosting iteration. Thus, it is only used when `estimator` exposes a `random_state`. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `.",51

0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


## Avaliar resultados

In [11]:
y_pred = adaboost_model.predict(X_test)

In [12]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.88      0.87      0.88     11713
           1       0.83      0.85      0.84      9068

    accuracy                           0.86     20781
   macro avg       0.86      0.86      0.86     20781
weighted avg       0.86      0.86      0.86     20781



In [13]:
conf_matrix = confusion_matrix(y_test, y_pred)

fig = px.imshow(
    conf_matrix,
    labels=dict(x = 'Predição', y = 'Real', color="Contagem"),
    x=['Neutro/Insatisfeito', 'Satisfeito'],
    y=['Neutro/Insatisfeito', 'Satisfeito'],
    color_continuous_scale='Viridis'
)

fig.update_traces(text=conf_matrix, texttemplate='%{z}')
fig.update_layout(coloraxis_showscale=False)

fig.show()

## Importância das Features

In [14]:
importance = np.mean([np.abs(estimator.coef_[0]) for estimator in adaboost_model.estimators_], axis=0)
importances_norm = importance/np.sum(importance)

importances_norm

array([0.00292842, 0.0598815 , 0.13004563, 0.06417205, 0.05087774,
       0.02032534, 0.03029577, 0.00532783, 0.03030032, 0.10471303,
       0.04732535, 0.05532133, 0.05854875, 0.055528  , 0.04237847,
       0.05706063, 0.03503199, 0.04399532, 0.02893988, 0.05882376,
       0.00615596, 0.0120229 ])

In [15]:
feature_names = preprocessor.get_feature_names_out()
feature_names

array(['nom__Gender_Male', 'nom__Customer Type_disloyal Customer',
       'nom__Type of Travel_Personal Travel', 'ord_classe__Class',
       'notas__Inflight wifi service',
       'notas__Departure/Arrival time convenient',
       'notas__Ease of Online booking', 'notas__Gate location',
       'notas__Food and drink', 'notas__Online boarding',
       'notas__Seat comfort', 'notas__Inflight entertainment',
       'notas__On-board service', 'notas__Leg room service',
       'notas__Baggage handling', 'notas__Checkin service',
       'notas__Inflight service', 'notas__Cleanliness', 'num_cont__Age',
       'num_cont__Flight Distance',
       'num_cont__Departure Delay in Minutes',
       'num_cont__Arrival Delay in Minutes'], dtype=object)

In [16]:
df_importance = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
df_importance = df_importance.sort_values(by='Importance', ascending=True)

df_importance

Unnamed: 0,Feature,Importance
0,nom__Gender_Male,0.00082
7,notas__Gate location,0.001492
20,num_cont__Departure Delay in Minutes,0.001724
21,num_cont__Arrival Delay in Minutes,0.003367
5,notas__Departure/Arrival time convenient,0.005692
18,num_cont__Age,0.008104
6,notas__Ease of Online booking,0.008484
8,notas__Food and drink,0.008485
16,notas__Inflight service,0.00981
14,notas__Baggage handling,0.011868


In [17]:
fig = px.bar(df_importance,x='Importance', y='Feature', orientation='h')
fig.update_layout(height=1280,width=1000,yaxis={'categoryorder': 'total ascending'})
fig.show()