## Exemplo Regressão Logística e MLPs

### Classificador para avaliar satisfação de usuários em linhas aereas

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris, load_digits
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import normalize, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import mean_absolute_error, roc_auc_score, f1_score, confusion_matrix

'''
Customer Satisfaction in Airline
Source: https://www.kaggle.com/datasets/yakhyojon/customer-satisfaction-in-airline
'''

data_path = 'Modulo01_Invistico_Airline.csv'

# Carregando dataset em data_df
data_df   = pd.read_csv(data_path)
pd.set_option('display.max_columns',None)

print(data_df.columns)

Index(['satisfaction', 'Customer Type', 'Age', 'Type of Travel', 'Class',
       'Flight Distance', 'Seat comfort', 'Departure/Arrival time convenient',
       'Food and drink', 'Gate location', 'Inflight wifi service',
       'Inflight entertainment', 'Online support', 'Ease of Online booking',
       'On-board service', 'Leg room service', 'Baggage handling',
       'Checkin service', 'Cleanliness', 'Online boarding',
       'Departure Delay in Minutes', 'Arrival Delay in Minutes'],
      dtype='object')


In [3]:
## Exemplo de treino com regressão logistica (Dataset iris)
X,y = load_iris(return_X_y=True)

X, Xtest, y, ytest = train_test_split(X,y,test_size=.2,random_state=0)

clf = LogisticRegression(penalty='l1', solver='liblinear',random_state=0, max_iter=1000).fit(X, y)
clf.score(Xtest, ytest)

0.9666666666666667

In [5]:
X,y = load_iris(return_X_y=True)

X, Xtest, y, ytest = train_test_split(X,y,test_size=.2,random_state=0)

clf = LogisticRegression(penalty='l2', solver='lbfgs',random_state=0, max_iter=1000).fit(X, y)
clf.score(Xtest, ytest)

predictions = clf.predict(Xtest)
cmatrix = confusion_matrix(ytest, predictions)

print(cmatrix)

[[11  0  0]
 [ 0 13  0]
 [ 0  0  6]]


In [6]:
# retorna as informações estatísticas do dataset
data_df.describe()

Unnamed: 0,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
count,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129487.0
mean,39.427957,1981.409055,2.838597,2.990645,2.851994,2.990422,3.24913,3.383477,3.519703,3.472105,3.465075,3.485902,3.695673,3.340807,3.705759,3.352587,14.713713,15.091129
std,15.11936,1027.115606,1.392983,1.527224,1.443729,1.30597,1.318818,1.346059,1.306511,1.30556,1.270836,1.292226,1.156483,1.260582,1.151774,1.298715,38.071126,38.46565
min,7.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,27.0,1359.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,2.0,3.0,2.0,3.0,3.0,3.0,2.0,0.0,0.0
50%,40.0,1925.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,4.0,0.0,0.0
75%,51.0,2544.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,5.0,4.0,5.0,5.0,4.0,5.0,4.0,12.0,13.0
max,85.0,6951.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,1592.0,1584.0


In [7]:
# Exibir os 5 primeiros exemplos do dataset
# Verificar tipos de dados e features disponíveis (colunas)
data_df.head(5)

Unnamed: 0,satisfaction,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,satisfied,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,2,2,4,2,3,3,0,3,5,3,2,0,0.0
1,satisfied,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,3,0,2,2,3,4,4,4,2,3,2,310,305.0
2,satisfied,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,3,2,0,2,2,3,3,4,4,4,2,0,0.0
3,satisfied,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,3,3,4,3,1,1,0,1,4,1,3,0,0.0
4,satisfied,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,3,4,3,4,2,2,0,2,4,2,5,0,0.0


In [8]:
# Separando os dados em subconjunto (treino e teste)
# 20% dos dados atribuidos ao conjunto de teste

train_df_full, test_df_full = train_test_split(data_df, test_size=.2)

# Removendo colunas nao numericas
# obs: poderiamos utilizar essas colunas fazendo a transformacao das labels em numeros

train_df = train_df_full.drop(['Class', 'Type of Travel', 'Customer Type'], axis='columns')
test_df  = test_df_full.drop(['Class', 'Type of Travel', 'Customer Type'], axis='columns')

train_df = train_df.dropna()

# Atribuindo a coluna satisfaction como o rotulo esperado/resposta do modelo (y)
X = train_df.drop('satisfaction', axis='columns')
y = train_df.satisfaction

test_df = test_df.dropna()
Xtest = test_df.drop('satisfaction', axis='columns')
ytest = test_df.satisfaction

print(X.describe())
print(y.describe())


                 Age  Flight Distance   Seat comfort  \
count  103594.000000    103594.000000  103594.000000   
mean       39.426753      1981.002944       2.841719   
std        15.112228      1025.369308       1.393501   
min         7.000000        50.000000       0.000000   
25%        27.000000      1362.000000       2.000000   
50%        40.000000      1925.000000       3.000000   
75%        51.000000      2543.000000       4.000000   
max        85.000000      6950.000000       5.000000   

       Departure/Arrival time convenient  Food and drink  Gate location  \
count                      103594.000000   103594.000000  103594.000000   
mean                            2.992509        2.853042       2.992471   
std                             1.528180        1.445401       1.306259   
min                             0.000000        0.000000       0.000000   
25%                             2.000000        2.000000       2.000000   
50%                             3.000000     

In [9]:
X, y, Xtest, ytest = X.to_numpy(), y.to_numpy(), Xtest.to_numpy(), ytest.to_numpy()

In [10]:
# Treininado um modelo de regressao logistica com nosso dataset
clf = LogisticRegression(penalty='l2', solver='lbfgs',random_state=0, max_iter=10000).fit(X, y)
clf.score(Xtest, ytest)

0.8091376047580427

In [11]:
print(f"Ground truth: {y[-1]}, Predicted Value: {clf.predict([Xtest[-1]])}")

# visualizando scores através da matriz de confunsão
predictions = clf.predict(Xtest)
cmatrix = confusion_matrix(ytest, predictions)

print(cmatrix)


Ground truth: satisfied, Predicted Value: ['satisfied']
[[ 9154  2635]
 [ 2307 11797]]


In [12]:
ynum = [1.0 if x == 'satisfied' else 0.0 for x in ytest]
prednum = [1.0 if x == 'satisfied' else 0.0 for x in predictions]

print(f1_score(ynum, prednum))
print(roc_auc_score(ynum, prednum))

0.8268152509111298
0.8064579684995293


In [13]:
# testando diferentes configurações com MLPs
Xnorm = normalize(X, norm='l1')
mlp1 = MLPClassifier(hidden_layer_sizes=(2,), activation='tanh', solver='sgd', learning_rate='constant',learning_rate_init=0.01, max_iter=1000)
mlp1.fit(Xnorm,y)
Xtest_norm = normalize(Xtest, norm='l1')

print(f'Train score: {mlp1.score(Xnorm, y)}')
print(f'Test score: {mlp1.score(Xtest_norm, ytest)}')

Train score: 0.6700291522675058
Test score: 0.67581971961534


In [14]:

Xnorm = normalize(X, norm='l1')
mlp2 = MLPClassifier(hidden_layer_sizes=(10,), activation='tanh', solver='sgd', learning_rate='constant',learning_rate_init=0.01, max_iter=1000)
mlp2.fit(Xnorm, y)
Xtest_norm = normalize(Xtest, norm='l1')

print(f'Train score: {mlp2.score(Xnorm, y)}')
print(f'Test score: {mlp2.score(Xtest_norm, ytest)}')

Train score: 0.67543487074541
Test score: 0.6796045263198548


In [None]:
Xnorm = normalize(X, norm='l1')
mlp3 = MLPClassifier(hidden_layer_sizes=(100,), activation='tanh', solver='sgd', learning_rate='constant',learning_rate_init=0.01, max_iter=1000)
mlp3.fit(Xnorm, y)
Xtest_norm = normalize(Xtest, norm='l1')

print(f'Train score: {mlp3.score(Xnorm, y)}')
print(f'Test score: {mlp3.score(Xtest_norm, ytest)}')

In [None]:
Xnorm = normalize(X, norm='l1')
mlp4 = MLPClassifier(hidden_layer_sizes=(20,10), activation='relu', solver='adam', learning_rate='constant',learning_rate_init=0.01, max_iter=1000)
mlp4.fit(Xnorm, y)
Xtest_norm = normalize(Xtest, norm='l1')

print(f'Train score: {mlp4.score(Xnorm, y)}')
print(f'Test score: {mlp4.score(Xtest_norm, ytest)}')

In [None]:
Xnorm = normalize(X, norm='l1')
mlp5 = MLPClassifier(hidden_layer_sizes=(5,10,5), activation='relu', solver='adam', learning_rate='constant',learning_rate_init=0.001, max_iter=1000)
mlp5.fit(Xnorm, y)
Xtest_norm = normalize(Xtest, norm='l1')

print(f'Train score: {mlp5.score(Xnorm, y)}')
print(f'Test score: {mlp5.score(Xtest_norm, ytest)}')

In [None]:
cols_mask = (data_df.dtypes == 'object')
categorical_cols = list(cols_mask[cols_mask].index)
print(categorical_cols)

encoder = OrdinalEncoder()

X = train_df_full.copy()
X = X.dropna()
X[categorical_cols] = encoder.fit_transform(X[categorical_cols])
X = X.drop('satisfaction', axis='columns')
y = train_df.satisfaction

Xtest = test_df_full.copy()
Xtest = Xtest.dropna()
Xtest[categorical_cols] = encoder.fit_transform(Xtest[categorical_cols])
ytest = test_df.satisfaction
Xtest = Xtest.drop('satisfaction', axis='columns')

print(X.describe())
print(y.describe())

X, y, Xtest, ytest = X.to_numpy(), y.to_numpy(), Xtest.to_numpy(), ytest.to_numpy()

clf = LogisticRegression(penalty='l2', solver='lbfgs',random_state=0, max_iter=10000).fit(X, y)
print(f"Logistic Train score: {clf.score(X,y)}")
print(f"Logistic Test  score: {clf.score(Xtest, ytest)}")

Xnorm = normalize(X, norm='l1')
mlp = MLPClassifier(hidden_layer_sizes=(20,10), activation='relu', solver='adam', learning_rate='constant',learning_rate_init=0.01, max_iter=1000)
mlp.fit(Xnorm, y)
Xtest_norm = normalize(Xtest, norm='l1')

print(f'MLP Train score: {mlp.score(Xnorm, y)}')
print(f'MLP Test score: {mlp.score(Xtest_norm, ytest)}')

