# Competição DSA de Machine Learning - Edição Junho/2019
Imagine estar com fome em uma parte desconhecida da cidade e receber recomendações de restaurantes, com base em suas preferências pessoais, no momento certo. A recomendação vem com um desconto em anexo da sua operadora de cartão de crédito para um local ao virar a esquina!

Uma Startup pensou nisso e construiu parcerias com comerciantes para oferecer promoções ou descontos aos portadores de cartões de crédito. Mas essas promoções funcionam tanto para o consumidor quanto para o comerciante? Os clientes aproveitam a experiência? Os comerciantes veem resultado? A personalização é fundamental.

Os profissionais da Startup construíram modelos de aprendizado de máquina para entender os aspectos e preferências mais importantes no ciclo de vida de seus clientes, desde alimentos a compras. Mas até agora nenhum deles é especificamente adaptado para um indivíduo ou perfil. É aqui que você entra. Precisando de um modelo preditivo mais robusto, a Startup selecionou você como Cientista de Dados.


## The Goal
 
- Os arquivos dataset_treino.csv e dataset_teste.csv contêm card_ids e informações sobre o próprio cartão - o primeiro mês em que o cartão estava ativo, etc.
- Objetivo é prever um índice de lealdade para cada card_id
- O modelo é avaliado pelo Root-Mean-Squared-Error (RMSE)

## Key features of the model training process in this kernel:
- **Cross Validation:** Using 2-fold cross-validation (para testar mais rapido)
- **Models:** svr, gradient boosting, random forest, xgboost, lightgbm e keras regressors
- **Blending:** Para ter as previsoes finais eu juntei os modelos para obter uma performance melhor

## Extraindo e Carregando os Dados

In [None]:
# Importando bibliotecas que serao utilizadas neste projeto
import pandas as pd
import numpy as np
import seaborn as sns
import itertools
import pickle
import matplotlib.pyplot as plt
%matplotlib inline

# Models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.svm import SVR
from mlxtend.regressor import StackingCVRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import plot_importance


# Stats
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Misc
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import preprocessing
from sklearn import utils

# Keras
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

# Ignore useless warnings
import warnings
warnings.filterwarnings(action="ignore")
pd.options.display.max_seq_items = 8000
pd.options.display.max_rows = 8000
pd.set_option('display.max_columns', None)

import gc 
import pickle
import datetime
import os
print(os.listdir("data"))
#print(os.listdir("../input"))

In [None]:
print(datetime.datetime.now().time())

In [None]:
# Explicit narrow dtypes for the columns shared by both transaction files
# (presumably chosen to limit memory use).
transaction_dtypes = {
    'city_id': np.int16,
    'installments': np.int16,
    'merchant_category_id': np.int16,
    'month_lag': np.int8,
    'purchase_amount': np.float32,
    'state_id': np.int8,
    'subsector_id': np.int8,
}

# Historical transactions.
# Kaggle path: '../input/datadsa/transacoes_historicas/transacoes_historicas.csv'
hist = pd.read_csv(
    'data/transacoes_historicas.csv',
    parse_dates=['purchase_date'],
    dtype=dict(transaction_dtypes),
)

# New-merchant transactions (same columns and dtypes as the historical file).
# Kaggle path: '../input/competicao-dsa-machine-learning-jun-2019/novas_transacoes_comerciantes.csv'
novas = pd.read_csv(
    'data/novas_transacoes_comerciantes.csv',
    parse_dates=['purchase_date'],
    dtype=dict(transaction_dtypes),
)

# Training cards; the three feature_* columns are stored as int8.
# Kaggle path: '../input/competicao-dsa-machine-learning-jun-2019/dataset_treino.csv'
train = pd.read_csv(
    'data/dataset_treino.csv',
    parse_dates=['first_active_month'],
    dtype={name: np.int8 for name in ('feature_1', 'feature_2', 'feature_3')},
)

# Merchant attributes, loaded with pandas' default dtype inference.
# Kaggle path: '../input/competicao-dsa-machine-learning-jun-2019/comerciantes.csv'
com = pd.read_csv('data/comerciantes.csv')



In [None]:
# Load the test cards: first_active_month is parsed as a date and the three
# feature_* columns are stored as int8.
test = pd.read_csv(
    'data/dataset_teste.csv',
    parse_dates=['first_active_month'],
    dtype={name: np.int8 for name in ('feature_1', 'feature_2', 'feature_3')},
)

In [None]:
# Create an index for the training dataframe.
# NOTE(review): reset_index() without drop=True also inserts the previous index
# as a new column named "index". The feature matrices built later only drop
# 'target' and 'card_id', so this column is presumably fed to the models as a
# feature — confirm that this is intended.
train = train.reset_index()

In [None]:
test = test.reset_index()

In [None]:
tmp = pd.concat( [hist, novas],axis=0,ignore_index=True)

In [None]:
novas.shape, hist.shape, tmp.shape

In [None]:
# Attach every transaction (historical + new) to its training card, then attach
# the merchant attributes to each transaction. Both are left joins, so rows of
# the left-hand frame without a match are kept.
df = train.merge(tmp, on='card_id', how='left')
df = df.merge(com, on='merchant_id', how='left')

In [None]:
# Attach every transaction (historical + new) to its test card, then attach the
# merchant attributes to each transaction. Both are left joins, so rows of the
# left-hand frame without a match are kept.
dfTest = test.merge(tmp, on='card_id', how='left')
dfTest = dfTest.merge(com, on='merchant_id', how='left')

In [None]:
del train, hist, tmp, novas, com
gc.collect()

Removendo algumas colunas do modelo repetidas e outras que nao afetaram o resultado (pelo menos ate agora)
Pode ser que criando novas features com essas colunas seja interessante para melhorar a performance do modelo

In [None]:
# Columns discarded from the training frame: the right-hand ("_y") duplicates
# produced by the merchant merge, plus category_3 and the merchant_id join key.
redundant_train_columns = [
    "merchant_category_id_y",
    "subsector_id_y",
    "city_id_y",
    "state_id_y",
    "category_1_y",
    "category_2_y",
    "category_3",
    "merchant_id",
]
df.drop(columns=redundant_train_columns, inplace=True)

In [None]:
# Columns discarded from the test frame: the right-hand ("_y") duplicates
# produced by the merchant merge, plus category_3 and the merchant_id join key.
redundant_test_columns = [
    "merchant_category_id_y",
    "subsector_id_y",
    "city_id_y",
    "state_id_y",
    "category_1_y",
    "category_2_y",
    "category_3",
    "merchant_id",
]
dfTest.drop(columns=redundant_test_columns, inplace=True)

Comparacao estatistica entre os dois datasets (treino e teste)

In [None]:
df.describe()

In [None]:
dfTest.describe()

Algumas observacoes daqui:

1. alguns valores infinito e NaN nas colunas avg_purchases_lag3, avg_purchases_lag6 e avg_purchases_lag12 em ambos os dataset (treino e teste)
2. a media dos dados estao relativamente proximos entre os dataset de treino e teste
3. o valor medio parece estar relativamente em um range pequeno
4. a feature numerical_1 e numerical_2 sao muito parecidas, pouca variacao
5. a media de valores da coluna avg_sales_lag6 entre os dois modelos é bem diferente (vale a pena verificar se impacta na performance do modelo)

Verificando valores missing

In [None]:
def percent_missing(df):
    """Return the percentage of missing values in each column.

    Args:
        df: A DataFrame, or any object ``pd.DataFrame`` can be built from.

    Returns:
        dict mapping every column label to the share of null entries in that
        column, expressed in percent and rounded to two decimals.
    """
    frame = pd.DataFrame(df)
    return {
        column: round(frame[column].isnull().mean() * 100, 2)
        for column in list(frame)
    }

missing = percent_missing(df)
df_miss = sorted(missing.items(), key=lambda x: x[1], reverse=True)
print('Percent of missing data')
df_miss[0:50]

In [None]:
# Plot setup
sns.set_style("white")
f, ax = plt.subplots(figsize=(8, 7))
sns.set_color_codes(palette='deep')

# Percentage of missing values per column, keeping only the columns that have
# any, in ascending order
missing = df.isnull().mean().mul(100).round(2)
missing = missing[missing > 0].sort_values()
missing.plot.bar(color="b")

# Visual presentation
ax.xaxis.grid(False)
ax.set(
    ylabel="Percent of missing values",
    xlabel="Features",
    title="Percent missing data by feature",
)
sns.despine(trim=True, left=True)

Vamos substituir os valores INF por NaN e preencher os valores NaN pela mediana de cada coluna numerica (a alternativa de preencher com 0 esta comentada no codigo)

In [None]:
# Replace +/-inf with NaN, then fill the remaining NaN of every numeric column
# with that column's median.
# numeric_only=True is explicit because the frame still holds non-numeric
# columns (the parsed date columns and, presumably, the card_id strings):
# recent pandas versions raise on those instead of silently skipping them.
# Assigning the fillna() result directly avoids the extra full-frame copy that
# DataFrame.update(df.fillna(...)) needed.
# Alternative tried earlier: fill with 0 instead of the median.
df = df.replace([np.inf, -np.inf], np.nan)
df = df.fillna(df.median(numeric_only=True))

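Vamos criar o dataset dfSample a partir do dataset de Treino para facilitar e acelerar algumas analises e a avaliacao do modelo. A principio a ideia era usar uma amostra de 100.000 registros; no codigo abaixo, porem, dfSample referencia o dataset completo (nao e feita copia nem amostragem)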

In [None]:
dfSample = df
dfSample.shape

In [None]:
dfSample.head()

## Análise Exploratória de Dados

In [None]:
# Verificando os tipos de dados do dataset
dfSample.dtypes

Vamos transformar algumas variaveis categoricas em numericas

In [None]:
# Encode string-valued categorical columns as integers.
yes_no_codes = {"N": 0, "Y": 1}
range_codes = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5}

# Column name -> {original value: integer code}
cleanup_nums = {
    "authorized_flag": dict(yes_no_codes),
    "category_1_x": dict(yes_no_codes),
    "category_3": {"A": 1, "B": 2, "C": 3},
    "category_4": dict(yes_no_codes),
    "most_recent_sales_range": dict(range_codes),
    "most_recent_purchases_range": dict(range_codes),
}

dfSample.replace(cleanup_nums, inplace=True)
dfSample.head()

In [None]:
# Transformando algumas variaveis string para inteiro
dfTest.replace(cleanup_nums, inplace=True)
dfTest.head()

Split de algumas features do dataset de treino e teste

In [None]:
# Training set: expand the two date columns into calendar parts, then drop the
# raw datetime columns.
# The vectorised .dt accessor replaces a Python lambda that was applied to
# every row of the merged transaction frame. Rows whose date is missing (NaT)
# still yield NaN for every part.
first_active = pd.to_datetime(dfSample['first_active_month'])
dfSample['active_dayofweek'] = first_active.dt.dayofweek
dfSample['active_year'] = first_active.dt.year
dfSample['active_month'] = first_active.dt.month
dfSample.drop(columns=["first_active_month"], inplace=True)

# Purchase timestamp parts (this code is active; an earlier note said it had
# been disabled because it did not run on the Kaggle kernel).
purchase_date = pd.to_datetime(dfSample['purchase_date'])
for part in ('day', 'dayofweek', 'month', 'year', 'hour'):
    dfSample['purchase_date_' + part] = getattr(purchase_date.dt, part)
dfSample.drop(columns=["purchase_date"], inplace=True)

In [None]:
# Test set: expand the two date columns into calendar parts, then drop the raw
# datetime columns.
# The vectorised .dt accessor replaces a Python lambda that was applied to
# every row of the merged transaction frame. Rows whose date is missing (NaT)
# still yield NaN for every part.
first_active = pd.to_datetime(dfTest['first_active_month'])
dfTest['active_dayofweek'] = first_active.dt.dayofweek
dfTest['active_year'] = first_active.dt.year
dfTest['active_month'] = first_active.dt.month
dfTest.drop(columns=["first_active_month"], inplace=True)

# Purchase timestamp parts (this code is active; an earlier note said it had
# been disabled because it did not run on the Kaggle kernel).
purchase_date = pd.to_datetime(dfTest['purchase_date'])
for part in ('day', 'dayofweek', 'month', 'year', 'hour'):
    dfTest['purchase_date_' + part] = getattr(purchase_date.dt, part)
dfTest.drop(columns=["purchase_date"], inplace=True)

Vamos plotar um scatter plot para verificar os dados de treino. Vamos visualizar 5% dos dados. No eixo x vamos colocar as features e no eixo y a variavel target.

In [None]:
def plot_feature_scatter(df1, df2, features, ncols=6):
    """Scatter-plot each feature against the target, one subplot per feature.

    Args:
        df1: DataFrame providing the ``target`` column (y axis).
        df2: DataFrame providing the feature columns (x axis); it must have
            the same number of rows as ``df1``.
        features: Column names of ``df2`` to plot.
        ncols: Number of subplot columns. Rows are added as needed, so any
            number of features fits (the default reproduces the former 5x6
            layout for up to 30 features).
    """
    features = list(features)
    if not features:
        # Nothing to draw; avoid creating an empty figure.
        return

    sns.set_style('whitegrid')
    nrows = -(-len(features) // ncols)  # ceiling division
    # 4 x 4.8 inches per cell gives the original 24 x 24 figure for a 5x6 grid.
    fig, axes = plt.subplots(nrows, ncols,
                             figsize=(4 * ncols, 4.8 * nrows),
                             squeeze=False)
    cells = axes.ravel()

    for cell, feature in zip(cells, features):
        cell.scatter(df2[feature], df1['target'], marker='+')
        cell.set_xlabel(feature, fontsize=10)

    # Hide the grid cells that received no feature.
    for cell in cells[len(features):]:
        cell.set_visible(False)

    plt.show()

In [None]:
features = ['feature_1', 'feature_2','feature_3','month_lag', 'purchase_amount', 
            'avg_sales_lag3', 'avg_sales_lag6', 'avg_sales_lag12', 'avg_purchases_lag3', 'avg_purchases_lag6',
            'avg_purchases_lag12','active_months_lag3', 'active_months_lag6', 'active_months_lag12', 
            'active_dayofweek','active_month','active_year',
            'merchant_category_id_x', 'subsector_id_x', 'city_id_x', 'state_id_x',
            'category_1_x', 'category_2_x', 'authorized_flag', 'installments',
            'merchant_group_id', 'numerical_1', 'numerical_2'
           ]
plot_feature_scatter(dfSample,dfSample, features)

Verificando caracteristicas da variável target

In [None]:
from scipy import stats

sns.set_style("white")
sns.set_color_codes(palette='deep')
f, ax = plt.subplots(figsize=(8, 7))

target_values = dfSample['target']

# Parameters of the normal distribution fitted to the target
mu, std = norm.fit(target_values)

# Frequency distribution of the target with a fitted normal curve overlaid
sns.distplot(target_values, color="b", fit=stats.norm)
ax.xaxis.grid(False)
ax.set(
    ylabel="Frequency",
    xlabel="target",
    title="Target distribution: mu = %.2f,  std = %.2f" % (mu, std),
)
sns.despine(trim=True, left=True)

# Skewness: degree of distortion from the symmetrical bell curve.
#   |skew| <= 0.5        -> fairly symmetrical
#   0.5 < |skew| <= 1    -> moderately skewed
#   |skew| > 1           -> highly skewed
# Kurtosis: a measure of the outliers present in the distribution.
#   high kurtosis -> heavy tails / outliers
#   low kurtosis  -> light tails / lack of outliers

# Text style shared by both annotations, placed in axes coordinates
annotation_style = dict(
    transform=ax.transAxes,
    fontweight='demibold',
    fontsize=10,
    verticalalignment='top',
    horizontalalignment='right',
    backgroundcolor='white',
)
ax.text(x=1.1, y=1, s="Skewness: %f" % target_values.skew(),
        color='xkcd:poo brown', **annotation_style)
ax.text(x=1.1, y=0.95, s="Kurtosis: %f" % target_values.kurt(),
        color='xkcd:dried blood', **annotation_style)

plt.show()

A partir desses graficos com features relacionadas a variavel target é possível perceber os outliers. Entre -20 e -30 nao tem valores... mas -33 tem alguns

Plot de como as features se correlacionam com cada uma e com a variavel target

In [None]:
dfSample[dfSample.columns.drop('target')].corrwith(dfSample.target)

Poucas variaveis se correlacionam fortemente com a target

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
# plt.subplots returns (figure, axes); both are unpacked so the heatmap can be
# drawn on the axes explicitly.
fig, ax = plt.subplots(figsize=(30, 30))
sns.set(font_scale=1.5)
# Restrict to numeric/boolean columns before calling corr(): the frame still
# contains card_id, and recent pandas versions raise on non-numeric columns
# instead of silently skipping them.
numeric_frame = dfSample.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_frame.corr(), square=True, cbar=True, annot=True,
            annot_kws={'size': 10}, ax=ax)
plt.show()

Algumas observacoes daqui:

1. feature_1 e feature_3 tem um forte relacionamento positivo
2. avg_sales_lag3 e avg_purchases_lag3 tem um relacionamento praticamente 1 (talvez seja necessario retirar uma dessas variaveis ou juntar as duas por uma multiplicacao, totalizando vendas x compras)
3. o mesmo ocorre com as outras variaveis _lag6 e _lag12

Visualizar um boxplot de todas as variaveis numericas

In [None]:
# Collect every numeric column.
# 'int8' is part of the list because feature_1, feature_2 and feature_3 are
# loaded as int8; without it those columns would be left out of the plot even
# though the notes below discuss feature_3.
numeric_dtypes = ['int8', 'int16', 'int32', 'int64',
                  'float16', 'float32', 'float64']
numeric = [column for column in dfSample.columns
           if dfSample[column].dtype in numeric_dtypes]

# Horizontal box plot of all numeric features on a logarithmic x axis
sns.set_style("white")
f, ax = plt.subplots(figsize=(14, 11))
ax.set_xscale("log")
ax = sns.boxplot(data=dfSample[numeric], orient="h", palette="Set1")
ax.xaxis.grid(False)
ax.set(
    ylabel="Feature names",
    xlabel="Numeric values",
    title="Numeric Distribution of Features",
)
sns.despine(trim=True, left=True)

Consideracoes sobre esse BoxPlot:

1. as features avg_sales_lagx e avg_purchase_lagx tem muitos outliers (talvez seja necessario trata-los dependendo do modelo preditivo)
2. tambem é necessário verificar a feature 'feature_3', pois exibe um comportamento diferente das demais features

## Feature Engineering

Visualizando novamente a distribuição da variavel Target

In [None]:
# Plot setup
sns.set_style("white")
sns.set_color_codes(palette='deep')
f, ax = plt.subplots(figsize=(8, 7))

# Distribution of the target variable
sns.distplot(dfSample['target'], color="b")
ax.xaxis.grid(False)
ax.set(ylabel="Frequency", xlabel="Target", title="Target distribution")
sns.despine(trim=True, left=True)
plt.show()

Como verificado na analise exploratoria, temos alguns outliers que serao tratados

In [None]:
# Verificando mais de perto a variavel target
dfSample['target'].describe()

O valor minimo é -33.219281 e o valor maximo é de 15.994455, gerando um desvio padrão muito alto em relação a media
Neste primeiro momento vou remover estes outliers diretamente do dataset, para fazer alguns testes

In [None]:
# Remove target outliers: rows whose target is below -10 or above 10.
target_values = dfSample['target']
outlier_labels = dfSample.index[(target_values < -10) | (target_values > 10)]
dfSample.drop(outlier_labels, inplace=True)

Vale a pena realizar uma transformacao logaritmica, para auxiliar o modelo

In [None]:
# Apply a logarithmic transformation to the target.
# log(1+x) transform
# NOTE(review): np.log1p is only defined for x > -1 (it yields -inf at -1 and
# NaN below it). The outlier filter applied before this cell keeps targets in
# [-10, 10], so every target <= -1 turns into -inf/NaN here and is replaced by
# 0 in the following cell — confirm that this loss of information is intended.
dfSample["target"] = np.log1p(dfSample["target"])

Depois da transformacao, varios valores inf apareceram, por isso vamos preencher com zero

In [None]:
# Turn +/-inf into NaN across the whole frame, then zero-fill the target
# column only (other columns keep their NaN).
dfSample = dfSample.replace([np.inf, -np.inf], np.nan)
dfSample["target"] = dfSample["target"].fillna(0)

In [None]:
from scipy import stats

sns.set_style("white")
sns.set_color_codes(palette='deep')
f, ax = plt.subplots(figsize=(8, 7))

target_values = dfSample['target']

# Parameters of the normal distribution fitted to the target
mu, std = norm.fit(target_values)

# Frequency distribution of the target with a fitted normal curve overlaid
sns.distplot(target_values, color="b", fit=stats.norm)
ax.xaxis.grid(False)
ax.set(
    ylabel="Frequency",
    xlabel="target",
    title="Target distribution: mu = %.2f,  std = %.2f" % (mu, std),
)
sns.despine(trim=True, left=True)

# Skewness and kurtosis annotations, placed in axes coordinates with a shared
# text style
annotation_style = dict(
    transform=ax.transAxes,
    fontweight='demibold',
    fontsize=10,
    verticalalignment='top',
    horizontalalignment='right',
    backgroundcolor='white',
)
ax.text(x=1.1, y=1, s="Skewness: %f" % target_values.skew(),
        color='xkcd:poo brown', **annotation_style)
ax.text(x=1.1, y=0.95, s="Kurtosis: %f" % target_values.kurt(),
        color='xkcd:dried blood', **annotation_style)

plt.show()

A variavel target ficou bem proximo de uma distribuicao normal

Verificando a distribuicao de outras variaveis com outliers detectado na analise exploratoria

In [None]:
# Plot setup
sns.set_style("white")
sns.set_color_codes(palette='deep')
f, ax = plt.subplots(figsize=(8, 7))

# Distribution of purchase_amount
sns.distplot(dfSample['purchase_amount'], color="b")
ax.xaxis.grid(False)
ax.set(
    ylabel="Frequency",
    xlabel="purchase_amount",
    title="purchase_amount distribution",
)
sns.despine(trim=True, left=True)
plt.show()

In [None]:
dfSample['purchase_amount'].describe()

Feature 'purchase_amount' tem um desvio padrao muito alto. Para valores abaixo de zero, vamos atribuir 0.01 e para valores acima de 1 vamos atribuir 1 (penso que é o montante de compra, estranho estar negativo)

In [None]:
def _clip_purchase_amount(amount):
    """Map non-positive amounts to 0.01 and cap amounts above 1 at 1.

    NaN entries are left untouched, because both comparisons are False for
    NaN. The result is float64.
    """
    amount = amount.astype('float64')
    amount = amount.mask(amount <= 0, 0.01)
    return amount.mask(amount > 1, 1)


# Vectorised replacement for two per-row lambdas. The test frame goes through
# the same clipping, so the features derived from purchase_amount further down
# are computed on the same scale for both datasets.
dfSample['purchase_amount'] = _clip_purchase_amount(dfSample['purchase_amount'])
dfTest['purchase_amount'] = _clip_purchase_amount(dfTest['purchase_amount'])

In [None]:
# Plot setup
sns.set_style("white")
sns.set_color_codes(palette='deep')
f, ax = plt.subplots(figsize=(8, 7))

# Distribution of purchase_amount
sns.distplot(dfSample['purchase_amount'], color="b")
ax.xaxis.grid(False)
ax.set(
    ylabel="Frequency",
    xlabel="purchase_amount",
    title="purchase_amount distribution",
)
sns.despine(trim=True, left=True)
plt.show()

Vamos agora dar uma olhada da distribuicao de frequencia das features avg_sales_lagx e avg_purchase_lagx, pois apresentaram muitos outliers

In [None]:
def plot_feature_dist(df1, features, ncols=2):
    """Plot the distribution of each feature, one subplot per feature.

    Args:
        df1: DataFrame holding the feature columns.
        features: Column names of ``df1`` to plot.
        ncols: Number of subplot columns. Rows are added as needed, so any
            number of features fits (the default reproduces the former 3x2
            layout for up to 6 features).
    """
    features = list(features)
    if not features:
        # Nothing to draw; avoid creating an empty figure.
        return

    sns.set_style('whitegrid')
    sns.set_color_codes(palette='deep')
    nrows = -(-len(features) // ncols)  # ceiling division
    # 12 x 4 inches per cell gives the original 24 x 12 figure for a 3x2 grid.
    fig, axes = plt.subplots(nrows, ncols,
                             figsize=(12 * ncols, 4 * nrows),
                             squeeze=False)
    cells = axes.ravel()

    for cell, feature in zip(cells, features):
        sns.distplot(df1[feature], color="b", ax=cell)
        cell.set_xlabel(feature, fontsize=10)

    # Hide the grid cells that received no feature.
    for cell in cells[len(features):]:
        cell.set_visible(False)

    plt.show()

In [None]:
dfSample.head()

Realmente tem que verificar como proceder nessas variaveis, que acredito que podem ajudar na acuracia do modelo.
Vamos criar algumas variaveis multiplicando a media de venda pela media de compra

In [None]:
# Training set: derived interaction features. Columns are created in the order
# var_lag3, var_lag6, var_lag12, feature_4 ... feature_19.
lag_windows = (3, 6, 12)

# Average sales x average purchases for each lag window
for lag in lag_windows:
    dfSample[f'var_lag{lag}'] = (
        dfSample[f'avg_sales_lag{lag}'] * dfSample[f'avg_purchases_lag{lag}']
    )

# feature_4: the three card features, each weighted by its own column mean
dfSample['feature_4'] = (
    (dfSample['feature_1'] * dfSample['feature_1'].mean())
    + (dfSample['feature_2'] * dfSample['feature_2'].mean())
    + (dfSample['feature_3'] * dfSample['feature_3'].mean())
)
dfSample['feature_5'] = dfSample['month_lag'] * dfSample['purchase_date_month']

# feature_6..8: average sales x active months, per lag window
for number, lag in enumerate(lag_windows, start=6):
    dfSample[f'feature_{number}'] = (
        dfSample[f'avg_sales_lag{lag}'] * dfSample[f'active_months_lag{lag}']
    )

# feature_9..11: month lag x active months, per lag window
for number, lag in enumerate(lag_windows, start=9):
    dfSample[f'feature_{number}'] = (
        dfSample['month_lag'] * dfSample[f'active_months_lag{lag}']
    )

# feature_12..14: most recent sales range x average sales, per lag window
for number, lag in enumerate(lag_windows, start=12):
    dfSample[f'feature_{number}'] = (
        dfSample['most_recent_sales_range'] * dfSample[f'avg_sales_lag{lag}']
    )

dfSample['feature_15'] = (
    dfSample['feature_1'] * dfSample['category_1_x'] * dfSample['numerical_1']
)
dfSample['feature_16'] = (
    dfSample['feature_2'] * dfSample['category_2_x'] * dfSample['numerical_2']
)

# feature_17..19: purchase amount divided by an identifier column
id_columns = ('state_id_x', 'subsector_id_x', 'merchant_category_id_x')
for number, id_column in enumerate(id_columns, start=17):
    dfSample[f'feature_{number}'] = (
        dfSample['purchase_amount'] / dfSample[id_column]
    )

In [None]:
dfSample.head()

In [None]:
# Test set: derived interaction features, built exactly like the training
# ones. Columns are created in the order var_lag3, var_lag6, var_lag12,
# feature_4 ... feature_19.
# Every operand is read from dfTest: var_lag12 used to multiply by the
# *training* frame's avg_purchases_lag12, which mixed unrelated rows through
# index alignment.
lag_windows = (3, 6, 12)

# Average sales x average purchases for each lag window
for lag in lag_windows:
    dfTest[f'var_lag{lag}'] = (
        dfTest[f'avg_sales_lag{lag}'] * dfTest[f'avg_purchases_lag{lag}']
    )

# feature_4: the three card features, each weighted by its own column mean
dfTest['feature_4'] = (
    (dfTest['feature_1'] * dfTest['feature_1'].mean())
    + (dfTest['feature_2'] * dfTest['feature_2'].mean())
    + (dfTest['feature_3'] * dfTest['feature_3'].mean())
)
dfTest['feature_5'] = dfTest['month_lag'] * dfTest['purchase_date_month']

# feature_6..8: average sales x active months, per lag window
for number, lag in enumerate(lag_windows, start=6):
    dfTest[f'feature_{number}'] = (
        dfTest[f'avg_sales_lag{lag}'] * dfTest[f'active_months_lag{lag}']
    )

# feature_9..11: month lag x active months, per lag window
for number, lag in enumerate(lag_windows, start=9):
    dfTest[f'feature_{number}'] = (
        dfTest['month_lag'] * dfTest[f'active_months_lag{lag}']
    )

# feature_12..14: most recent sales range x average sales, per lag window
for number, lag in enumerate(lag_windows, start=12):
    dfTest[f'feature_{number}'] = (
        dfTest['most_recent_sales_range'] * dfTest[f'avg_sales_lag{lag}']
    )

dfTest['feature_15'] = (
    dfTest['feature_1'] * dfTest['category_1_x'] * dfTest['numerical_1']
)
dfTest['feature_16'] = (
    dfTest['feature_2'] * dfTest['category_2_x'] * dfTest['numerical_2']
)

# feature_17..19: purchase amount divided by an identifier column
id_columns = ('state_id_x', 'subsector_id_x', 'merchant_category_id_x')
for number, id_column in enumerate(id_columns, start=17):
    dfTest[f'feature_{number}'] = (
        dfTest['purchase_amount'] / dfTest[id_column]
    )

In [None]:
dfTest.head()

## Feature transformations

Vamos fazer algumas transformacoes nas features criadas (venda x compra) usando _log e _square

In [None]:
# Dataset de Treino
dfSample["var_lag3"] = np.log1p(dfSample["var_lag3"])
dfSample["var_lag6"] = np.log1p(dfSample["var_lag6"])
dfSample["var_lag12"] = np.log1p(dfSample["var_lag12"])
dfSample["feature_4"] = np.log1p(dfSample["feature_4"])
dfSample["feature_5"] = np.log1p(dfSample["feature_5"])
dfSample["feature_6"] = np.log1p(dfSample["feature_6"])
dfSample["feature_7"] = np.log1p(dfSample["feature_7"])
dfSample["feature_8"] = np.log1p(dfSample["feature_8"])
dfSample["feature_9"] = np.log1p(dfSample["feature_9"])
dfSample["feature_10"] = np.log1p(dfSample["feature_10"])
dfSample["feature_11"] = np.log1p(dfSample["feature_11"])
dfSample["feature_12"] = np.log1p(dfSample["feature_12"])
dfSample["feature_13"] = np.log1p(dfSample["feature_13"])
dfSample["feature_14"] = np.log1p(dfSample["feature_14"])
dfSample["feature_15"] = np.log1p(dfSample["feature_15"])
dfSample["feature_16"] = np.log1p(dfSample["feature_16"])
dfSample["feature_17"] = np.log1p(dfSample["feature_17"])
dfSample["feature_18"] = np.log1p(dfSample["feature_18"])
dfSample["feature_19"] = np.log1p(dfSample["feature_19"])

In [None]:
# Dataset de Teste
dfTest["var_lag3"] = np.log1p(dfTest["var_lag3"])
dfTest["var_lag6"] = np.log1p(dfTest["var_lag6"])
dfTest["var_lag12"] = np.log1p(dfTest["var_lag12"])
dfTest["feature_4"] = np.log1p(dfTest["feature_4"])
dfTest["feature_5"] = np.log1p(dfTest["feature_5"])
dfTest["feature_6"] = np.log1p(dfTest["feature_6"])
dfTest["feature_7"] = np.log1p(dfTest["feature_7"])
dfTest["feature_8"] = np.log1p(dfTest["feature_8"])
dfTest["feature_9"] = np.log1p(dfTest["feature_9"])
dfTest["feature_10"] = np.log1p(dfTest["feature_10"])
dfTest["feature_11"] = np.log1p(dfTest["feature_11"])
dfTest["feature_12"] = np.log1p(dfTest["feature_12"])
dfTest["feature_13"] = np.log1p(dfTest["feature_13"])
dfTest["feature_14"] = np.log1p(dfTest["feature_14"])
dfTest["feature_15"] = np.log1p(dfTest["feature_15"])
dfTest["feature_16"] = np.log1p(dfTest["feature_16"])
dfTest["feature_17"] = np.log1p(dfTest["feature_17"])
dfTest["feature_18"] = np.log1p(dfTest["feature_18"])
dfTest["feature_19"] = np.log1p(dfTest["feature_19"])

In [None]:
# Training set: turn +/-inf into NaN, then replace every NaN with 0.
dfSample = dfSample.replace([np.inf, -np.inf], np.nan).fillna(0)

In [None]:
# Test set: turn +/-inf into NaN, then replace every NaN with 0.
dfTest = dfTest.replace([np.inf, -np.inf], np.nan).fillna(0)

In [None]:
dfSample[dfSample.columns.drop('target')].corrwith(dfSample.target)

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns,
# now including the engineered features.
# plt.subplots returns (figure, axes); both are unpacked so the heatmap can be
# drawn on the axes explicitly.
fig, ax = plt.subplots(figsize=(30, 30))
sns.set(font_scale=1.5)
# Restrict to numeric/boolean columns before calling corr(): the frame still
# contains card_id, and recent pandas versions raise on non-numeric columns
# instead of silently skipping them.
numeric_frame = dfSample.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_frame.corr(), square=True, cbar=True, annot=True,
            annot_kws={'size': 10}, ax=ax)
plt.show()

Percebe-se que as features criadas ficaram com baixa correlacao com a variavel target. Mas vamos verificar como esta o modelo, qualquer coisa voltamos aqui

## Feature Selection - Método Ensemble

In [None]:
# Separate the label from the predictors (card_id is an identifier, not a
# predictor)
y = dfSample['target']
X = dfSample.drop(['target', 'card_id'], axis=1)

# Rescale every column with MinMaxScaler, then standardise the result
# (zero mean, unit standard deviation); both scalers are fitted on all of X
X = make_pipeline(MinMaxScaler(), StandardScaler()).fit_transform(X)

# Hold out 30% of the rows for validation
validation_size = 0.3
seed = 7
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, y, test_size=validation_size, random_state=seed
)

print("Fit Model")
# Small XGBoost model; the next cell reads its feature importances
xgb_params = dict(
    n_jobs=-1,
    random_state=seed,
    learning_rate=0.1,
    n_estimators=10,
    max_depth=10,
    subsample=0.9,
    colsample_bytree=0.7,
)
model = XGBRegressor(**xgb_params)
model.fit(X_train, Y_train)
print(model)

In [None]:
print("Feature Importances")
# The importances follow the column order of the matrix the model was fitted
# on, i.e. dfSample without 'target' and 'card_id'. Pairing them with all of
# dfSample.columns attached importances to the wrong names from the first
# dropped column onwards.
colsX = dfSample.columns.drop(['target', 'card_id'])
features = dict(zip(colsX, model.feature_importances_))
features_sorted = sorted(features.items(), key=lambda kv: kv[1], reverse=True)
features_sorted

## Preparando os Dados para Machine Learning

Nesse ponto vamos criar mais uma copia, agora do dataset dfSample, para facilitar o retorno ate aqui

In [None]:
all_features = dfSample#.copy()
#all_features = all_features.filter(features_new)
all_features.shape

In [None]:
all_features.head()

Esse é um ponto a verificar, pois foi a unica forma que encontrei para agrupar os registros em um do cardId. Talvez seja necessario outra estrategia, mas nao encontrei ate o momento. Usei a media de cada variavel para o agrupamento

In [None]:
# Collapse the per-transaction rows into one row per card_id, taking the mean
# of every numeric column.
# The result of groupby().mean() is used directly instead of being rebuilt
# from .to_dict(), which materialised the whole table as nested Python dicts.
# numeric_only=True makes the dropping of non-numeric columns explicit, and
# rename_axis(None) keeps the index unnamed, as the dict round trip left it.
all_features = (
    all_features.groupby('card_id')
    .mean(numeric_only=True)
    .rename_axis(None)
)

In [None]:
all_features.head()

In [None]:
# NOTE(review): sub_final is not defined in any of the cells above this one;
# run top to bottom, this line would raise NameError unless the name is
# created elsewhere — confirm.
sub_final.head()

### Normalização e Padronização de features numericas

In [None]:
# Separate the label from the predictors
y = all_features['target']
X = all_features.drop(['target'], axis=1)

# Rescale every column with MinMaxScaler, then standardise the result
# (zero mean, unit standard deviation)
X = make_pipeline(MinMaxScaler(), StandardScaler()).fit_transform(X)

In [None]:
X.shape, y.shape, dfTest.shape

O dataset dfTest tem uma coluna a mais por conta do CARD_ID

## Criação e Validação dos Modelos de Machine Learning

Coloquei somente 2 splits por causa do tempo, para testar mais rapido

In [None]:
# Setup cross validation folds
kf = KFold(n_splits=2, random_state=123, shuffle=True)

In [None]:
# Validation metric
def rmsle(y, y_pred):
    """Return the square root of the mean squared error of ``y_pred`` vs ``y``.

    Despite the name, no logarithm is applied inside this function: the value
    is a plain RMSE on whatever scale the inputs are given in.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

def cv_rmse(model, X=None):
    """Return the cross-validated RMSE of ``model``, one value per fold.

    Args:
        model: Estimator accepted by ``cross_val_score``.
        X: Feature matrix. When omitted, the notebook-level ``X`` is looked
            up at call time. The previous ``X=X`` default was bound once,
            when the function was defined, while ``y`` was read at call
            time; re-running the scaling cell could therefore pair a stale
            ``X`` with a fresh ``y``.

    Returns:
        Array of RMSE values, one per fold of the notebook-level ``kf``.
    """
    if X is None:
        # The parameter shadows the global name, so fetch it explicitly.
        X = globals()['X']
    neg_mse = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kf)
    # The scorer returns negated MSE; flip the sign before taking the root.
    return np.sqrt(-neg_mse)

Nessa parte adicionei vários modelos para comparação. A princípio utilizei alguns parâmetros default.

In [None]:
# Candidate regressors that are cross-validated, refitted and blended below.
# NOTE(review): the variable names `lightgbm` and `xgboost` would shadow the
# packages of the same name if those were imported as modules; confirm only
# the estimator classes are imported in this notebook.

# Light Gradient Boosting Regressor
lightgbm = LGBMRegressor(objective='regression', 
                       num_leaves=6,
                       learning_rate=0.01, 
                       n_estimators=3000,
                       max_bin=200, 
                       bagging_fraction=0.8,
                       bagging_freq=4, 
                       bagging_seed=8,
                       feature_fraction=0.2,
                       feature_fraction_seed=8,
                       min_sum_hessian_in_leaf = 11,
                       #verbose=0,
                       random_state=123)

# XGBoost Regressor
# NOTE(review): both `seed` (27) and `random_state` (123) are supplied; which
# one takes effect presumably depends on the installed xgboost version.
xgboost = XGBRegressor(learning_rate=0.01,
                       n_estimators=4000,
                       max_depth=4,
                       min_child_weight=0,
                       gamma=0.6,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:squarederror',
                       nthread=-1,
                       scale_pos_weight=1,
                       seed=27,
                       reg_alpha=0.00006,
                       #verbosity=3,
                       random_state=123)

# Support Vector Regressor, preceded by its own RobustScaler step
svr = make_pipeline(RobustScaler(), SVR(C= 20, epsilon= 0.008, gamma=0.0003))

# Gradient Boosting Regressor (Huber loss)
gbr = GradientBoostingRegressor(n_estimators=2000,
                                learning_rate=0.01,
                                max_depth=4,
                                max_features='sqrt',
                                min_samples_leaf=15,
                                min_samples_split=10,
                                loss='huber',
                                #verbose=True,
                                random_state=123)  

# Random Forest Regressor (out-of-bag scoring enabled)
rf = RandomForestRegressor(n_estimators=2000,
                          max_depth=15,
                          min_samples_split=5,
                          min_samples_leaf=5,
                          max_features=None,
                          oob_score=True,
                          #verbose=True,
                          random_state=123)

# KerasRegressor
def baseline_model(input_dim=54):
    """Build and compile a one-hidden-layer Keras regression network.

    Args:
        input_dim: Number of input features expected by the first layer.
            Defaults to 54, the value that used to be hard-coded; pass
            ``X.shape[1]`` when the feature count differs.

    Returns:
        The compiled ``Sequential`` model: a 13-unit ReLU hidden layer and a
        single linear output, trained with MSE loss and the Adam optimiser.
    """
    model = Sequential()
    model.add(Dense(13, input_dim=input_dim, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

# Wrap baseline_model so it can be used like the other regressors.
# NOTE(review): the name `keras` would shadow the keras package if it is
# imported under that name elsewhere in the notebook; confirm.
keras = KerasRegressor(build_fn=baseline_model, 
                       epochs=100, 
                       batch_size=5)

In [None]:
# Print the current wall-clock time before the cross-validation cells run.
print(datetime.datetime.now().time())

In [None]:
# (mean RMSE, std RMSE) per model, keyed by a short model tag
scores = {}

# Cross-validate LightGBM, record the result, then report it
score = cv_rmse(lightgbm)
scores['lgb'] = (score.mean(), score.std())
print("lightgbm: {:.4f} ({:.4f})".format(*scores['lgb']))

In [None]:
# Cross-validate XGBoost, record the result, then report it
score = cv_rmse(xgboost)
scores['xgb'] = (score.mean(), score.std())
print("xgboost: {:.4f} ({:.4f})".format(*scores['xgb']))

In [None]:
# Cross-validate the SVR pipeline, record the result, then report it
score = cv_rmse(svr)
scores['svr'] = (score.mean(), score.std())
print("svr: {:.4f} ({:.4f})".format(*scores['svr']))

In [None]:
# Cross-validate the random forest, record the result, then report it
score = cv_rmse(rf)
scores['rf'] = (score.mean(), score.std())
print("rf: {:.4f} ({:.4f})".format(*scores['rf']))

In [None]:
# Cross-validate gradient boosting, record the result, then report it
score = cv_rmse(gbr)
scores['gbr'] = (score.mean(), score.std())
print("gbr: {:.4f} ({:.4f})".format(*scores['gbr']))

In [None]:
#score = cv_rmse(keras)
#print("keras: {:.4f} ({:.4f})".format(score.mean(), score.std()))
#scores['kr_norm'] = (score.mean(), score.std())

## Fit the models

In [None]:
# Fit LightGBM on the full training matrix; blended_predictions() calls
# .predict on the object stored here.
print('lightgbm')
lgb_model_full_data = lightgbm.fit(X, y)

In [None]:
# Fit XGBoost on the full training matrix; blended_predictions() calls
# .predict on the object stored here.
print('xgboost')
xgb_model_full_data = xgboost.fit(X, y)

In [None]:
# Fit the SVR pipeline on the full training matrix; blended_predictions()
# calls .predict on the object stored here.
print('Svr')
svr_model_full_data = svr.fit(X, y)

In [None]:
# Fit the random forest on the full training matrix; blended_predictions()
# calls .predict on the object stored here.
print('RandomForest')
rf_model_full_data = rf.fit(X, y)

In [None]:
# Fit gradient boosting on the full training matrix; blended_predictions()
# calls .predict on the object stored here.
print('GradientBoosting')
gbr_model_full_data = gbr.fit(X, y)

In [None]:
#print('KerasRegressor')
#keras_model_full_data = keras.fit(X, y)


### Blend models and get predictions

In [None]:
# Final blended predictions
# (Keras is left out for now because fitting it yields a History object;
# how to include it is still being worked out.)
def blended_predictions(X):
    """Return the equally weighted average of the five fitted models.

    The previous version added the five prediction vectors without dividing,
    so the "blend" was roughly five times the scale of the target. Averaging
    keeps the result on the same scale as each individual model.

    Args:
        X: Feature matrix, scaled the same way as the data the models were
            fitted on.

    Returns:
        1-D array with one blended prediction per row of ``X``.
    """
    fitted_models = (
        svr_model_full_data,
        gbr_model_full_data,
        xgb_model_full_data,
        lgb_model_full_data,
        rf_model_full_data,
    )
    # Stack one prediction vector per model and average across models.
    return np.mean([fitted.predict(X) for fitted in fitted_models], axis=0)

In [None]:
# Score the blended predictions on the data the models were fitted on.
# NOTE: rmsle() computes plain RMSE, and this is a training-set score, so it
# is optimistic compared with the cross-validated entries already in `scores`.
blended_score = rmsle(y, blended_predictions(X))
# A single score has no spread, hence the 0 in the std slot.
scores['blended'] = (blended_score, 0)
print('RMSLE score no dataset de Treino:')
print(blended_score)

### Testando Gradient Boosting

In [None]:
# GradientBoostingRegressor Otimizado

def fit_predict(model, X, y, X_test):
    """Fit ``model`` on ``(X, y)`` and return its predictions for ``X_test``.

    ``model`` only needs ``fit(X, y)`` and ``predict(X_test)`` methods; the
    return value of ``fit`` is ignored.
    """
    model.fit(X, y)
    predictions = model.predict(X_test)
    return predictions

def rmspe(y_test, y_pred):
    """Return the square root of the mean squared error of the predictions.

    Despite the name, no percentage is involved: this is plain RMSE.
    """
    return sqrt(mean_squared_error(y_test, y_pred))

# Split X into training and validation sets
validation_size = 0.10
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=validation_size, random_state=seed)

# Best parameters found so far.
# The estimator now has its own name (gbr_otm used to be rebound from the
# model to its score) and a fixed random_state so the reported score is
# reproducible between runs.
gbr_otm_model = GradientBoostingRegressor(subsample=1.0,
                                          n_estimators=200,
                                          max_features='sqrt',
                                          max_depth=5,
                                          loss='huber',
                                          learning_rate=0.1,
                                          random_state=seed)

# Hold-out predictions and their RMSE (rmsle() computes plain RMSE).
modelo_gbr_otm = fit_predict(gbr_otm_model, X_train, Y_train, X_test)
gbr_otm = rmsle(Y_test, modelo_gbr_otm)

print('GradientBoostingRegressor - Otimizado = %0.4f' % gbr_otm)

# One hold-out split gives a single score, so the spread slot is 0, matching
# the 'blended' entry, rather than calling .mean()/.std() on a scalar.
scores['gbr_otm'] = (gbr_otm, 0)

### Visualizando a melhor performance dos modelos

In [None]:
# Point plot of the score recorded for each model
sns.set_style("white")
fig = plt.figure(figsize=(24, 12))

# x: model tags; y: first element (mean score) of every entry in `scores`
model_names = list(scores.keys())
mean_scores = [mean for mean, _ in scores.values()]
ax = sns.pointplot(x=model_names, y=mean_scores, markers=['o'], linestyles=['-'])

# Annotate each point with its numeric value
for i, score in enumerate(scores.values()):
    ax.text(
        i,
        score[0],
        '{:.6f}'.format(score[0]),
        horizontalalignment='left',
        size='22',
        color='black',
        weight='semibold',
    )

plt.title('Scores of Models', size=20)
plt.xlabel('Model', size=20, labelpad=12.5)
plt.ylabel('Score (RMSE)', size=20, labelpad=12.5)
plt.tick_params(axis='x', labelsize=13.5)
plt.tick_params(axis='y', labelsize=12.5)

plt.show()

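Nesse ponto dá pra verificar que juntando os modelos tivemos melhor performance (atenção: o score do blend foi calculado nos próprios dados de treino, então a comparação com os scores de validação cruzada é otimista)... mas aqui é o momento de usar o GridSearch para obter os melhores parâmetros e otimizar o modelo.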

Vamos agora fazer previsões nos dados de teste e visualizar no plot.

In [None]:
# Split off 33% of the rows and predict them with the blended models.
# NOTE(review): the blended models were fitted on all of X in the
# "Fit the models" cells, so these rows were seen during training and this is
# not an out-of-sample check.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 10)
predictions = blended_predictions(X_test)
predictions

In [None]:
# Overlay the true targets and the blended predictions, row by row
sns.set_style("white")
fig = plt.figure(figsize=(24, 12))

sample_axis = range(len(y_test))
plt.plot(sample_axis, y_test, label="Dados Originais")
plt.plot(sample_axis, predictions, label="Dados Previstos")

plt.title('Comparacao com dados de teste')
plt.ylabel('target')
plt.legend(loc='best')
plt.show()

## Submit Kaggle

In [None]:
# Print the current wall-clock time before building the submission.
print(datetime.datetime.now().time())

In [None]:
# One row per card in the test set: average every column per card_id.
# groupby().mean() already returns a DataFrame indexed by card_id, so the
# previous round-trip through a nested Python dict (to_dict -> pd.DataFrame)
# is dropped: it only cost time and memory.
sub_final = dfTest.groupby('card_id').mean()

In [None]:
# Treat +/-inf as missing, then fill every missing value with 0.
# A direct replace().fillna() chain gives the same frame as the previous
# update(fillna(0)) without building a filled copy only to merge it back.
sub_final = sub_final.replace([np.inf, -np.inf], np.nan).fillna(0)

In [None]:
# Scale the submission features with statistics learned from the TRAINING
# features. Previously MinMaxScaler/StandardScaler were re-fitted on the test
# data, so the test features were mapped differently from the features the
# models were trained on.
# Fitting the same two-step pipeline on the training features reproduces the
# transformation that produced X.
train_features = all_features.drop(['target'], axis=1)
submission_scaler = make_pipeline(MinMaxScaler(), StandardScaler()).fit(train_features.values)

# NOTE(review): columns are matched by position, as before; confirm sub_final
# has the same columns in the same order as the training features.
X_final = submission_scaler.transform(sub_final.values)

In [None]:
# Preview the cleaned, unscaled submission features.
sub_final.head()

In [None]:
# Display (rows, columns) of the scaled submission matrix.
X_final.shape

In [None]:
# Blended predictions for the submission rows (rebinds `predictions`).
predictions = blended_predictions(X_final)
predictions

In [None]:
# Build the submission frame: the index of sub_final supplies the card ids,
# paired with the blended predictions.
submission = pd.DataFrame({
    "card_id": sub_final.index, 
    "target": predictions
})

In [None]:
# Preview the first 30 submission rows.
submission.head(30)

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission_file.csv', index=False)