Importando as bibliotecas

In [None]:
import numpy as np
import pandas as pd

Carregando base de dados

In [None]:
# Training set, including the target variable
transact_train_database = pd.read_csv('data/transact_train.txt', sep='|')

# Test set, without the target variable
transact_test_database = pd.read_csv('data/transact_class.txt', sep='|')

# Target variable of the test set
transact_original_database = pd.read_csv('data/realclass_t1.txt', sep='|')

Alterando a granularidade

In [None]:
# Keep a single row per session: the last occurrence of each sessionNo.
# The deduplicated training frame gets its own name so that the raw training frame
# is not overwritten and stays available in full, like the raw test frame.
transact_train_last_per_session = transact_train_database.drop_duplicates(subset=['sessionNo'], keep='last')

# Target: 'y' is encoded as 1 and 'n' as 0.
transact_train_database_y = transact_train_last_per_session['order']
transact_train_database_y = transact_train_database_y.replace(to_replace=['y', 'n'], value=[1, 0])

# Features: every column except the target.
transact_train_database_x = transact_train_last_per_session.drop(['order'], axis=1)

transact_test_database_x = transact_test_database.drop_duplicates(subset=['sessionNo'], keep='last')

Tratando os valores ausentes

Lista de variáveis numéricas

In [None]:
# Names of the columns that are handled as numeric variables.
list_numeric_variables = [
  'cMinPrice', 'cMaxPrice', 'cSumPrice',
  'bMinPrice', 'bMaxPrice', 'bSumPrice',
  'bStep',
  'maxVal', 'customerScore', 'accountLifetime',
  'payments', 'age', 'address', 'lastOrder',
]

Lista de variáveis string

In [None]:
# Names of the columns that are handled as string variables.
list_string_variables = ['availability', 'onlineStatus']

In [None]:
# Names of the customer-related columns (all of them also appear in the numeric list).
list_customer_variables = [
  'maxVal', 'customerScore', 'accountLifetime',
  'payments', 'age', 'address', 'lastOrder',
]

In [None]:
def replaceValueMissing(df, columns, value):
  """Replace the '?' placeholder by `value` in the given columns.

  The replacement is computed on the selected columns first and then written
  back column by column, so `df` itself is modified and also returned.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to update.
  columns : list
      Names of the columns in which '?' is replaced.
  value : object
      Replacement for every '?' entry (np.nan in this notebook).

  Returns
  -------
  pandas.DataFrame
      The same `df` object, updated.
  """
  cleaned = df[columns].replace('?', value)
  for name in columns:
    df[name] = cleaned[name]

  return df

In [None]:
def convertFloat(df, columns):
  """Cast each of the given columns to float.

  `df` is modified (each column is reassigned) and also returned.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to update.
  columns : list
      Names of the columns to cast.

  Returns
  -------
  pandas.DataFrame
      The same `df` object, updated.
  """
  for name in columns:
    as_float = df[name].astype(float)
    df[name] = as_float

  return df

In [None]:
def replaceMissingByMean(df, columns):
  """Fill the missing values of each given column with that column's mean.

  `df` is modified (each column is reassigned) and also returned.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to update.
  columns : list
      Names of the columns to impute.

  Returns
  -------
  pandas.DataFrame
      The same `df` object, updated.
  """
  for column in columns:
    mean = df[column].mean()
    # Assign the result back: fillna(inplace=True) on the selected column is a
    # chained in-place call, which pandas deprecates and which does not update
    # the frame when Copy-on-Write is active.
    df[column] = df[column].fillna(mean)

  return df

In [None]:
def replaceMissingByMode(df, columns):
  """Fill the missing values of each given column with that column's mode.

  When several values tie for the mode, the first one returned by
  `Series.mode()` is used. A column without any non-missing value has no
  mode and is left untouched. `df` is modified (each column is reassigned)
  and also returned.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to update.
  columns : list
      Names of the columns to impute.

  Returns
  -------
  pandas.DataFrame
      The same `df` object, updated.
  """
  for column in columns:
    mode = df[column].mode()
    if mode.empty:
      # Entirely missing column: there is no value to impute with.
      continue
    # Assign the result back: fillna(inplace=True) on the selected column is a
    # chained in-place call, which pandas deprecates and which does not update
    # the frame when Copy-on-Write is active.
    df[column] = df[column].fillna(mode.iloc[0])

  return df

In [None]:
def replaceMissingByMedian(df, columns):
  """Fill the missing values of each given column with that column's median.

  `df` is modified (each column is reassigned) and also returned.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to update.
  columns : list
      Names of the columns to impute.

  Returns
  -------
  pandas.DataFrame
      The same `df` object, updated.
  """
  for column in columns:
    median = df[column].median()
    # Assign the result back: fillna(inplace=True) on the selected column is a
    # chained in-place call, which pandas deprecates and which does not update
    # the frame when Copy-on-Write is active.
    df[column] = df[column].fillna(median)

  return df

In [None]:
def replaceMissingByFixedValue(df, columns, value):
  """Fill the missing values of each given column with a fixed value.

  `df` is modified (each column is reassigned) and also returned.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to update.
  columns : list
      Names of the columns to impute.
  value : object
      Value written wherever a column entry is missing.

  Returns
  -------
  pandas.DataFrame
      The same `df` object, updated.
  """
  for column in columns:
    # Assign the result back: fillna(inplace=True) on the selected column is a
    # chained in-place call, which pandas deprecates and which does not update
    # the frame when Copy-on-Write is active.
    df[column] = df[column].fillna(value)

  return df

Base na granularidade do projeto

In [None]:
# Cleaning of the session-level feature frames, done on a copy of each frame:
#   1. replace the '?' string by np.nan in the numeric and in the string variables;
#   2. cast the numeric variables to float;
#   3. fill the missing values of the string variables with the fixed value 'ausente'.
# NOTE: mean imputation of the numeric variables (replaceMissingByMean) is not applied
# here, so their missing values remain np.nan after this cell.

transact_train_database_x = (
    transact_train_database_x.copy()
    .pipe(replaceValueMissing, list_numeric_variables, np.nan)
    .pipe(replaceValueMissing, list_string_variables, np.nan)
    .pipe(convertFloat, list_numeric_variables)
    .pipe(replaceMissingByFixedValue, list_string_variables, 'ausente')
)

transact_test_database_x = (
    transact_test_database_x.copy()
    .pipe(replaceValueMissing, list_numeric_variables, np.nan)
    .pipe(replaceValueMissing, list_string_variables, np.nan)
    .pipe(convertFloat, list_numeric_variables)
    .pipe(replaceMissingByFixedValue, list_string_variables, 'ausente')
)

In [None]:
# Display the first 10 rows of the training feature frame.
transact_train_database_x.head(10)

In [None]:
# Variants of the feature frames whose numeric variables are imputed with the mode.
# They are built from copies, so the frames they derive from are left unchanged.
# NOTE(review): the '?' replacement on the customer variables runs on columns that are
# among the ones just imputed — presumably it no longer changes anything; confirm.
transact_train_X_by_mode = (
    transact_train_database_x.copy()
    .pipe(replaceMissingByMode, list_numeric_variables)
    .pipe(replaceValueMissing, list_customer_variables, np.nan)
)
transact_test_X_by_mode = (
    transact_test_database_x.copy()
    .pipe(replaceMissingByMode, list_numeric_variables)
    .pipe(replaceValueMissing, list_customer_variables, np.nan)
)

# Write both variants to CSV without the index.
transact_train_X_by_mode.to_csv('transact_train_X_by_mode.csv', index=False)
transact_test_X_by_mode.to_csv('transact_test_X_by_mode.csv', index=False)

In [None]:
# Display the first rows of the mode-imputed training frame.
transact_train_X_by_mode.head()

In [None]:
# Assign zero to the customer variables of the rows that have no customerNo value
# (customerNo equal to the '?' string). list_customer_variables holds exactly the
# columns concerned: maxVal, customerScore, accountLifetime, payments, age, address
# and lastOrder.

rows_without_customerNo = transact_train_database_x['customerNo'] == '?'
transact_train_database_x.loc[rows_without_customerNo, list_customer_variables] = 0

# Test set
rows_test_without_customerNo = transact_test_database_x['customerNo'] == '?'
transact_test_database_x.loc[rows_test_without_customerNo, list_customer_variables] = 0

In [None]:
# Replace the '?' string by np.nan in the numeric and string variables of
# transact_test_database and transact_train_database (working on copies),
# then cast their numeric variables to float.
transact_test_database = (
    transact_test_database.copy()
    .pipe(replaceValueMissing, list_numeric_variables, np.nan)
    .pipe(replaceValueMissing, list_string_variables, np.nan)
    .pipe(convertFloat, list_numeric_variables)
)
transact_train_database = (
    transact_train_database.copy()
    .pipe(replaceValueMissing, list_numeric_variables, np.nan)
    .pipe(replaceValueMissing, list_string_variables, np.nan)
    .pipe(convertFloat, list_numeric_variables)
)

In [None]:
# Summary statistics of every column (numeric and non-numeric) of the training features.
transact_train_database_x.describe(include='all')

In [None]:
# Summary statistics of every column (numeric and non-numeric) of the test features.
transact_test_database_x.describe(include='all')

In [None]:
# Display the first rows of transact_train_database.
transact_train_database.head()

In [None]:
# Display the first rows of transact_test_database.
transact_test_database.head()

Criando novas variáveis

In [None]:
# Use the sessionNo column as the row index of both feature frames.
transact_train_database_x = transact_train_database_x.set_index(keys='sessionNo')
transact_test_database_x = transact_test_database_x.set_index(keys='sessionNo')

In [None]:
# Create the variable bSumPrice_cSumPrice = bSumPrice / cSumPrice in both frames.
# NOTE(review): a zero or missing cSumPrice yields inf/NaN in the new column — confirm
# that this is handled downstream.
for frame in (transact_test_database_x, transact_train_database_x):
    frame['bSumPrice_cSumPrice'] = frame['bSumPrice'] / frame['cSumPrice']

# --------- Defining lastOrder_accountLifetime ---------


In [None]:
# Create the variable lastOrder_accountLifetime = lastOrder / accountLifetime in both frames.
# NOTE(review): a zero or missing accountLifetime yields inf/NaN in the new column —
# confirm that this is handled downstream.
for frame in (transact_test_database_x, transact_train_database_x):
    frame['lastOrder_accountLifetime'] = frame['lastOrder'] / frame['accountLifetime']


In [None]:
# Create the variable bCount_cCount = bCount / cCount in both frames.
# NOTE(review): a zero or missing cCount yields inf/NaN in the new column — confirm
# that this is handled downstream.
for frame in (transact_test_database_x, transact_train_database_x):
    frame['bCount_cCount'] = frame['bCount'] / frame['cCount']


In [None]:
# Display the first rows of the test feature frame.
transact_test_database_x.head()

In [None]:
# Display the first rows of the training feature frame.
transact_train_database_x.head()

# Transformação dos Dados

In [None]:
# Remove the identifier column customerNo from both feature frames.
transact_train_database_x = transact_train_database_x.drop(columns=['customerNo'])
transact_test_database_x = transact_test_database_x.drop(columns=['customerNo'])

In [None]:
# Create the dummy variables for both feature frames.
# NOTE(review): the single prefix '_' is passed for the whole frame, so the generated
# names presumably do not identify their source column, and equal categories coming
# from different columns could end up with the same name — confirm.
transact_train_database_x = pd.get_dummies(transact_train_database_x, prefix='_')
transact_test_database_x = pd.get_dummies(transact_test_database_x, prefix='_')

In [None]:
# Both feature frames must have the same number of columns.
assert len(transact_train_database_x.columns) == len(transact_test_database_x.columns), 'Número de variáveis diferente'

In [None]:
# Report the column names that exist in only one of the two feature frames.
variables_transact_train_database_x = list(transact_train_database_x.columns)
variables_transact_test_database_x = list(transact_test_database_x.columns)

in_train_not_test = [
    name for name in variables_transact_train_database_x
    if name not in variables_transact_test_database_x
]
in_test_not_train = [
    name for name in variables_transact_test_database_x
    if name not in variables_transact_train_database_x
]

if not in_train_not_test:
    print('Todas as variáveis do treinamento estão no teste')
else:
    print('Variáveis que tão no treinamneto e não no teste')
    print(in_train_not_test)

if not in_test_not_train:
    print('Todas as variáveis do teste estão no treinamento')
else:
    print('Variáveis que tão no teste e não no treinamento')
    print(in_test_not_train)

In [None]:
# Remove the dummy column '__mainly not determinable' from the training frame.
# Only the drop is performed: adding the column to itself right before dropping it
# has no effect on the resulting frame.
# NOTE(review): such an addition would presumably be meant to fold this category into
# another dummy column before the drop — confirm which one, if any.
# The membership check keeps the cell re-runnable once the column is gone.
if '__mainly not determinable' in transact_train_database_x.columns:
    transact_train_database_x = transact_train_database_x.drop('__mainly not determinable', axis=1)

In [None]:
# Both feature frames must have the same number of columns.
assert len(transact_train_database_x.columns) == len(transact_test_database_x.columns), 'Número de variáveis diferente'

In [None]:
# Check that no column has the object dtype, i.e. that no string variable is left.
# The builtin `object` is compared against the dtypes because the `np.object` alias
# no longer exists in current NumPy releases.
columns_obj = transact_train_database_x.dtypes[transact_train_database_x.dtypes == object]

assert len(columns_obj) == 0, 'Existem variáveis do tipo object'

columns_obj_test = transact_test_database_x.dtypes[transact_test_database_x.dtypes == object]

assert len(columns_obj_test) == 0, 'Existem variáveis do tipo object'

print('Passou no teste: as bases não possuem variáveis String!')

# Salvando as bases de dados

In [None]:
# Write the feature frames and the training target to CSV files.
# index=False keeps the index out of the written files.
transact_train_database_x.to_csv(path_or_buf='transact_train_database_x.csv', index=False)
transact_test_database_x.to_csv(path_or_buf='transact_test_database_x.csv', index=False)
transact_train_database_y.to_csv(path_or_buf='transact_train_database_y.csv', index=False)