In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import BaggingClassifier
import joblib

## Carregar os Dados

In [2]:
df_airline_sat = pd.read_csv('./train.csv')

In [3]:
df_airline_sat.head()

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [4]:
df_airline_sat.drop(columns=["Unnamed: 0"], axis=1, inplace=True)

In [5]:
df_airline_sat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 24 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   id                                 103904 non-null  int64  
 1   Gender                             103904 non-null  object 
 2   Customer Type                      103904 non-null  object 
 3   Age                                103904 non-null  int64  
 4   Type of Travel                     103904 non-null  object 
 5   Class                              103904 non-null  object 
 6   Flight Distance                    103904 non-null  int64  
 7   Inflight wifi service              103904 non-null  int64  
 8   Departure/Arrival time convenient  103904 non-null  int64  
 9   Ease of Online booking             103904 non-null  int64  
 10  Gate location                      103904 non-null  int64  
 11  Food and drink                     1039

## EDA

### Análise Univariada

In [6]:
# Verificar se só existe apenas um valor nas colunas
for col in df_airline_sat.columns:
    if df_airline_sat[col].nunique() == 1:
        print(f'Essa Coluna: {col} possui apenas 1 valor')
        df_airline_sat.drop(columns=[col], axis=1, inplace=True)

In [7]:
# Verificar valores nulos
for col in df_airline_sat.columns:
    contagem_nulas = df_airline_sat[col].isnull().sum()
    print(f'{col}: {contagem_nulas} - {contagem_nulas/len(df_airline_sat)}%')

id: 0 - 0.0%
Gender: 0 - 0.0%
Customer Type: 0 - 0.0%
Age: 0 - 0.0%
Type of Travel: 0 - 0.0%
Class: 0 - 0.0%
Flight Distance: 0 - 0.0%
Inflight wifi service: 0 - 0.0%
Departure/Arrival time convenient: 0 - 0.0%
Ease of Online booking: 0 - 0.0%
Gate location: 0 - 0.0%
Food and drink: 0 - 0.0%
Online boarding: 0 - 0.0%
Seat comfort: 0 - 0.0%
Inflight entertainment: 0 - 0.0%
On-board service: 0 - 0.0%
Leg room service: 0 - 0.0%
Baggage handling: 0 - 0.0%
Checkin service: 0 - 0.0%
Inflight service: 0 - 0.0%
Cleanliness: 0 - 0.0%
Departure Delay in Minutes: 0 - 0.0%
Arrival Delay in Minutes: 310 - 0.00298352325223283%
satisfaction: 0 - 0.0%


In [8]:
# Imputar a média no Arrival Delay in Minutes
mean_value = df_airline_sat['Arrival Delay in Minutes'].mean()
mean_value

np.float64(15.178678301832152)

In [9]:
df_airline_sat['Arrival Delay in Minutes'] = df_airline_sat['Arrival Delay in Minutes'].fillna(mean_value)
df_airline_sat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 24 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   id                                 103904 non-null  int64  
 1   Gender                             103904 non-null  object 
 2   Customer Type                      103904 non-null  object 
 3   Age                                103904 non-null  int64  
 4   Type of Travel                     103904 non-null  object 
 5   Class                              103904 non-null  object 
 6   Flight Distance                    103904 non-null  int64  
 7   Inflight wifi service              103904 non-null  int64  
 8   Departure/Arrival time convenient  103904 non-null  int64  
 9   Ease of Online booking             103904 non-null  int64  
 10  Gate location                      103904 non-null  int64  
 11  Food and drink                     1039

In [10]:
# Verificar valores negativos

df_numeric = df_airline_sat.select_dtypes(include=['number'])
print((df_numeric < 0).sum())

id                                   0
Age                                  0
Flight Distance                      0
Inflight wifi service                0
Departure/Arrival time convenient    0
Ease of Online booking               0
Gate location                        0
Food and drink                       0
Online boarding                      0
Seat comfort                         0
Inflight entertainment               0
On-board service                     0
Leg room service                     0
Baggage handling                     0
Checkin service                      0
Inflight service                     0
Cleanliness                          0
Departure Delay in Minutes           0
Arrival Delay in Minutes             0
dtype: int64


In [11]:
df_airline_sat['Class'].unique()

array(['Eco Plus', 'Business', 'Eco'], dtype=object)

In [12]:
# Valores possíveis
for col in df_airline_sat.select_dtypes(include=['object']).columns:
    print(f'{col} Possui os seguintes valores únicos - {df_airline_sat[col].unique()}')

Gender Possui os seguintes valores únicos - ['Male' 'Female']
Customer Type Possui os seguintes valores únicos - ['Loyal Customer' 'disloyal Customer']
Type of Travel Possui os seguintes valores únicos - ['Personal Travel' 'Business travel']
Class Possui os seguintes valores únicos - ['Eco Plus' 'Business' 'Eco']
satisfaction Possui os seguintes valores únicos - ['neutral or dissatisfied' 'satisfied']


In [13]:
# Modificando o valor da variável alvo Satisfaction para 1 ou 0

df_airline_sat['satisfaction'] = df_airline_sat['satisfaction'].apply(lambda x : 1 if x == 'satisfied' else 0)

In [14]:
df_airline_sat.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,0
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,0
2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,1
3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,...,2,2,5,3,1,4,2,11,9.0,0
4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,...,3,3,4,4,3,3,3,0,0.0,1


In [15]:
contagem_type_travel = df_airline_sat.value_counts('Type of Travel')
px.bar(contagem_type_travel/len(df_airline_sat) * 100, color=contagem_type_travel.index)

In [16]:
contagem_customer_type = df_airline_sat.value_counts('Customer Type')
px.bar(contagem_customer_type/len(df_airline_sat) * 100, color=contagem_customer_type.index)

In [17]:
contagem_class = df_airline_sat.value_counts('Class')
px.bar(contagem_class/len(df_airline_sat) * 100, color=contagem_class.index)

In [18]:
contagem_gender = df_airline_sat.value_counts('Gender')

px.bar(contagem_gender/len(df_airline_sat) * 100, color=contagem_gender.index)

### Análise Bivariada

In [19]:
px.box(df_airline_sat, x='satisfaction', y='Departure Delay in Minutes')

In [20]:
px.box(df_airline_sat, x='satisfaction', y='Seat comfort')

In [21]:
px.box(df_airline_sat, x='satisfaction', y='Flight Distance')

In [22]:
px.box(df_airline_sat, x='satisfaction', y='Age')

### Matriz de correlação

In [23]:
corr_matrix = df_airline_sat.select_dtypes(include=['number']).corr()

In [24]:
fig = go.Figure()

fig.add_trace(
    go.Heatmap(
        x=corr_matrix.columns,
        y=corr_matrix.index,
        z=np.array(corr_matrix),
        text=corr_matrix.values,
        texttemplate='%{text:.2f}',
        colorscale=px.colors.diverging.RdBu,
        zmin=-1,
        zmax=1
    )
)


fig.show()

### Teste Estatístico

In [25]:
from scipy.stats import mannwhitneyu

dist_satisfeito = df_airline_sat[df_airline_sat['satisfaction'] == 1]['Flight Distance']
dist_insatisfeito = df_airline_sat[df_airline_sat['satisfaction'] == 0]['Flight Distance']

stat, p = mannwhitneyu(dist_satisfeito, dist_insatisfeito)
print(f'Valor p do teste Mann-Whitney: {p}')
print(f'Existe correlação? {p < 0.05}')

Valor p do teste Mann-Whitney: 0.0
Existe correlação? True


## Preparação dos Dados

In [26]:
df_airline_sat.drop(columns=['id'], axis=1, inplace=True)

In [27]:
X = df_airline_sat.drop(columns=['satisfaction'], axis=1)
y = df_airline_sat['satisfaction']

In [28]:
X.shape, y.shape

((103904, 22), (103904,))

In [29]:
X_train,X_test, y_train, y_test = train_test_split(X,y,train_size=0.2,random_state=51)

In [30]:
colunas_nominais = ['Gender', 'Customer Type', 'Type of Travel']

coluna_classe = ['Class']
ordem_classe = [['Eco Plus', 'Business', 'Eco']]

colunas_notas = [
    'Inflight wifi service', 'Departure/Arrival time convenient', 'Ease of Online booking',
    'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
    'Inflight entertainment', 'On-board service', 'Leg room service',
    'Baggage handling', 'Checkin service', 'Inflight service', 'Cleanliness'
]

colunas_numericas = ['Age', 'Flight Distance', 'Departure Delay in Minutes', 'Arrival Delay in Minutes']

preprocessor = ColumnTransformer(
    transformers=[
        ('nom', OneHotEncoder(drop='first',handle_unknown='ignore'), colunas_nominais),
        ('ord_classe', OrdinalEncoder(categories=ordem_classe), coluna_classe),
        ('notas', MinMaxScaler(), colunas_notas),
        ('num_cont', RobustScaler(), colunas_numericas)
    ]
)


In [31]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

## Treinar o modelo

In [32]:
bagging_model = BaggingClassifier(
    estimator=LogisticRegression(),
    n_estimators=30,
    random_state=42
)

In [33]:
bagging_model.fit(X_train,y_train)

0,1,2
,"estimator  estimator: object, default=None The base estimator to fit on random subsets of the dataset. If None, then the base estimator is a :class:`~sklearn.tree.DecisionTreeClassifier`. .. versionadded:: 1.2  `base_estimator` was renamed to `estimator`.",LogisticRegression()
,"n_estimators  n_estimators: int, default=10 The number of base estimators in the ensemble.",30
,"max_samples  max_samples: int or float, default=None The number of samples to draw from X to train each base estimator (with replacement by default, see `bootstrap` for more details). - If None, then draw `X.shape[0]` samples irrespective of `sample_weight`. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` unweighted samples or  `max_samples * sample_weight.sum()` weighted samples.",
,"max_features  max_features: int or float, default=1.0 The number of features to draw from X to train each base estimator ( without replacement by default, see `bootstrap_features` for more details). - If int, then draw `max_features` features. - If float, then draw `max(1, int(max_features * n_features_in_))` features.",1.0
,"bootstrap  bootstrap: bool, default=True Whether samples are drawn with replacement. If False, sampling without replacement is performed. If fitting with `sample_weight`, it is strongly recommended to choose True, as only drawing with replacement will ensure the expected frequency semantics of `sample_weight`.",True
,"bootstrap_features  bootstrap_features: bool, default=False Whether features are drawn with replacement.",False
,"oob_score  oob_score: bool, default=False Whether to use out-of-bag samples to estimate the generalization error. Only available if bootstrap=True.",False
,"warm_start  warm_start: bool, default=False When set to True, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new ensemble. See :term:`the Glossary `. .. versionadded:: 0.17  *warm_start* constructor parameter.",False
,"n_jobs  n_jobs: int, default=None The number of jobs to run in parallel for both :meth:`fit` and :meth:`predict`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"random_state  random_state: int, RandomState instance or None, default=None Controls the random resampling of the original dataset (sample wise and feature wise). If the base estimator accepts a `random_state` attribute, a different seed is generated for each instance in the ensemble. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `.",42

0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


In [34]:
y_pred = bagging_model.predict(X_test)

In [35]:
y_pred

array([1, 0, 1, ..., 1, 0, 1], shape=(83124,))

## Avaliar resultados

In [36]:
accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
recall = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)

In [37]:
print(f'Acurácia: {accuracy}')
print(f'Precisão: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')

Acurácia: 0.8723954573889611
Precisão: 0.8690780306549002
Recall: 0.8308033855973359
F1-Score: 0.8495098108763816


In [38]:
cm = confusion_matrix(y_test, y_pred)

labels = ['Neutro/Insatisfeito', 'Satisfeito']

fig = px.imshow(
    cm, 
    text_auto=True,
    labels=dict(x="Predição", y="Valor Real", color="Contagem"),
    x=labels, 
    y=labels,
    color_continuous_scale='Blues',
    title="Matriz de Confusão - Satisfação de Passageiros"
)

fig.update_layout(
    title_x=0.5,
    width=600, 
    height=500
)

fig.show()

## Avaliar importância das features

In [39]:
importances = np.mean([np.abs(estimator.coef_[0]) for estimator in bagging_model.estimators_], axis=0)
importances

array([0.03867899, 2.11770116, 3.10101683, 0.2096054 , 1.50208826,
       0.66340348, 0.60743419, 0.27623798, 0.16814641, 3.37050026,
       0.39958572, 0.20613152, 1.66229273, 1.30981854, 0.62062055,
       1.43511201, 0.84795999, 0.89367305, 0.16196515, 0.09559778,
       0.05066351, 0.11283875])

In [40]:
importances_norm = importances/ np.sum(importances)
importances_norm

array([0.00194846, 0.10667943, 0.15621407, 0.0105589 , 0.07566787,
       0.03341903, 0.03059957, 0.01391552, 0.00847039, 0.16978933,
       0.02012918, 0.0103839 , 0.08373818, 0.06598226, 0.03126383,
       0.07229393, 0.04271608, 0.04501888, 0.00815901, 0.00481575,
       0.00255218, 0.00568426])

In [41]:
feature_names = preprocessor.get_feature_names_out()

In [42]:
feature_names

array(['nom__Gender_Male', 'nom__Customer Type_disloyal Customer',
       'nom__Type of Travel_Personal Travel', 'ord_classe__Class',
       'notas__Inflight wifi service',
       'notas__Departure/Arrival time convenient',
       'notas__Ease of Online booking', 'notas__Gate location',
       'notas__Food and drink', 'notas__Online boarding',
       'notas__Seat comfort', 'notas__Inflight entertainment',
       'notas__On-board service', 'notas__Leg room service',
       'notas__Baggage handling', 'notas__Checkin service',
       'notas__Inflight service', 'notas__Cleanliness', 'num_cont__Age',
       'num_cont__Flight Distance',
       'num_cont__Departure Delay in Minutes',
       'num_cont__Arrival Delay in Minutes'], dtype=object)

In [43]:
df_importancia = pd.DataFrame({
    'Feature': feature_names,
    'Importancia': importances_norm
})

In [44]:
df_importancia = df_importancia.sort_values('Importancia', ascending=True)

In [45]:
fig = px.bar(df_importancia, 
             x='Importancia',
             y='Feature',
             orientation='h',
             title='Importância das Features (com base nos coeficientes absolutos)'
)

fig.update_layout(height=1280,width=1000,yaxis={'categoryorder': 'total ascending'})
fig.show()


In [46]:
df_airline_sat.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,1,...,5,4,3,4,4,5,5,25,18.0,0
1,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,3,...,1,1,5,3,1,4,1,1,6.0,0
2,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,2,...,5,4,3,4,4,4,5,0,0.0,1
3,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,5,...,2,2,5,3,1,4,2,11,9.0,0
4,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,3,...,3,3,4,4,3,3,3,0,0.0,1


In [47]:
df_airline_sat.to_csv('./df_airline_sat_cleaned.csv',index=False)

In [48]:
joblib.dump(bagging_model, './baggin_model.pkl')
joblib.dump(preprocessor, './preprocessor.pkl')

['./preprocessor.pkl']