# Coletando os dados

In [1]:
# abrir a conexao com o banco de dados
import sqlite3
import pandas as pd

In [2]:
# abrindo a conexão com o banco de dados
conn = sqlite3.connect( "database.db" )

In [3]:
# coletando os dados
consulta_atividade = """
SELECT *
FROM
flight_activity fa LEFT JOIN flight_loyalty_history flh
ON (fa.loyalty_number = flh.loyalty_number )
"""

In [4]:
# executando a consulta
df_atividade = pd.read_sql_query( consulta_atividade, conn )

# Inspecionando a planilha de dados

In [5]:
# verificando a quantidade de linhas
numero_linhas = df_atividade.shape[0]
print( 'O numero de linhas eh:', numero_linhas )

O numero de linhas eh: 405624


In [6]:
# verificando a quantidade de colunas
numero_colunas = df_atividade.shape[1]
print( 'O numero de linhas eh:', numero_linhas )

O numero de linhas eh: 405624


In [7]:
# descobrindo as informacoes gerais sobre a planilha de dados
df_atividade.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 405624 entries, 0 to 405623
Data columns (total 26 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   loyalty_number               405624 non-null  int64  
 1   year                         405624 non-null  int64  
 2   month                        405624 non-null  int64  
 3   flights_booked               405624 non-null  int64  
 4   flights_with_companions      405624 non-null  int64  
 5   total_flights                405624 non-null  int64  
 6   distance                     405624 non-null  int64  
 7   points_accumulated           405624 non-null  float64
 8   points_redeemed              405624 non-null  int64  
 9   dollar_cost_points_redeemed  405624 non-null  int64  
 10  loyalty_number               405624 non-null  int64  
 11  country                      405624 non-null  object 
 12  province                     405624 non-null  object 
 13 

In [8]:
# Somar a colunas "total_flights"
df_atividade.loc[:, 'total_flights'].sum()

2087689

In [9]:
# Somar a colunas "distance"
df_atividade.loc[:, 'distance'].mean()

1208.880058872256

In [10]:
# Valor mínimo de salário
df_atividade.loc[:, 'distance'].min()

0

In [11]:
# Valor máximo de salário
df_atividade.loc[:, 'distance'].max()

6293

In [12]:
# checando o número de dados faltante nas colunas
df_atividade.isna().sum()

loyalty_number                      0
year                                0
month                               0
flights_booked                      0
flights_with_companions             0
total_flights                       0
distance                            0
points_accumulated                  0
points_redeemed                     0
dollar_cost_points_redeemed         0
loyalty_number                      0
country                             0
province                            0
city                                0
postal_code                         0
gender                              0
education                           0
salary                         102672
marital_status                      0
loyalty_card                        0
clv                                 0
enrollment_type                     0
enrollment_year                     0
enrollment_month                    0
cancellation_year              355560
cancellation_month             355560
dtype: int64

## Seleção de linhas e colunas

In [13]:
# Comando do Pandas para selecionar linhas e colunas
#df = df1.iloc[linhas, colunas]

In [14]:
# selecionando colunas de uma planilha
colunas = ['year', 'month', 'flights_booked', 'flights_with_companions',
'total_flights', 'distance', 'points_accumulated', 'points_redeemed',
'dollar_cost_points_redeemed', 'salary', 'clv', 'enrollment_year',
'enrollment_month', 'loyalty_card']

In [15]:
df_dados_limpos = df_atividade.loc[:, colunas]

In [16]:
df_dados_limpos

Unnamed: 0,year,month,flights_booked,flights_with_companions,total_flights,distance,points_accumulated,points_redeemed,dollar_cost_points_redeemed,salary,clv,enrollment_year,enrollment_month,loyalty_card
0,2017,1,3,0,3,1521,152.0,0,0,92552.0,7919.20,2016,8,Aurora
1,2017,1,10,4,14,2030,203.0,0,0,,2887.74,2013,3,Nova
2,2017,1,6,0,6,1200,120.0,0,0,,2838.07,2016,7,Nova
3,2017,1,0,0,0,0,0.0,0,0,63253.0,4170.57,2015,8,Star
4,2017,1,0,0,0,0,0.0,0,0,91163.0,6622.05,2014,1,Star
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
405619,2018,12,0,0,0,0,0.0,0,0,,7290.07,2014,5,Aurora
405620,2018,12,0,0,0,0,0.0,0,0,217943.0,8564.77,2012,8,Nova
405621,2018,12,3,0,3,1233,123.0,0,0,47670.0,20266.50,2017,7,Nova
405622,2018,12,0,0,0,0,0.0,0,0,,2631.56,2018,7,Star


# Preparando os dados para treinamento do algoritmo

### Operações a serem realizadas nesta etapa:

* Remoção ou substituição de lnhas que contém dados faltantes
* Remoção das colunas com dados sem variabilidade
* Colunas com valores altos de correlação
* Trnasformação dos dados categ´´oricos em dados  numéricos

### Limpando a base de dados

In [17]:
# Numero de dados faltantes
df_atividade.isna().sum()

loyalty_number                      0
year                                0
month                               0
flights_booked                      0
flights_with_companions             0
total_flights                       0
distance                            0
points_accumulated                  0
points_redeemed                     0
dollar_cost_points_redeemed         0
loyalty_number                      0
country                             0
province                            0
city                                0
postal_code                         0
gender                              0
education                           0
salary                         102672
marital_status                      0
loyalty_card                        0
clv                                 0
enrollment_type                     0
enrollment_year                     0
enrollment_month                    0
cancellation_year              355560
cancellation_month             355560
dtype: int64