Imports:


In [253]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin # Para definição de transformadores personalizados
from sklearn.preprocessing import OneHotEncoder, Imputer, FunctionTransformer
# from category_encoders import OrdinalEncoder
from future_encoders import OrdinalEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
import time
from IPython.display import display

# 1. Introdução

Carregando dados:

In [2]:
converters = {key: lambda x: np.nan if not x else str(x) for key in ('param_1', 'param_2', 'param_3')}

df_train = pd.read_csv(
    './data/train.csv',
    parse_dates=['activation_date'],
    converters=converters                  
)

In [224]:
X_test = pd.read_csv(
    './data/test.csv',
    parse_dates=['activation_date'],
    converters=converters                  
)

In [3]:
df_train.head()

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,description,price,item_seq_number,activation_date,user_type,image,image_top_1,deal_probability
0,b912c3c6a6ad,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,,,Кокоби(кокон для сна),"Кокон для сна малыша,пользовались меньше месяц...",400.0,2,2017-03-28,Private,d10c7e016e03247a3bf2d13348fe959fe6f436c1caf64c...,1008.0,0.12789
1,2dac0150717d,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,,,Стойка для Одежды,"Стойка для одежды, под вешалки. С бутика.",3000.0,19,2017-03-26,Private,79c9392cc51a9c81c6eb91eceb8e552171db39d7142700...,692.0,0.0
2,ba83aefab5dc,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",,,Philips bluray,"В хорошем состоянии, домашний кинотеатр с blu ...",4000.0,9,2017-03-20,Private,b7f250ee3f39e1fedd77c141f273703f4a9be59db4b48a...,3032.0,0.43177
3,02996f1dd2ea,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,,,Автокресло,Продам кресло от0-25кг,2200.0,286,2017-03-25,Company,e6ef97e0725637ea84e3d203e82dadb43ed3cc0a1c8413...,796.0,0.80323
4,7c90be56d2ab,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110.0,"ВАЗ 2110, 2003",Все вопросы по телефону.,40000.0,3,2017-03-16,Private,54a687a3a0fc1d68aed99bdaaf551c5c70b761b16fd0a2...,2264.0,0.20797


In [3]:
X_train, y_train = df_train[df_train.columns.difference(['deal_probability'])].copy(), df_train[['deal_probability']].copy()

**TODO**: Visualizações de dados cairiam bem aqui. Consultar livro do Geron

# 2. Processando dados

In [17]:
print('Nº. de amostras:', len(df_train.index))

Nº. de amostras: 1503424


Justificar, com base nessa primeira olhada nos dados, as trasformações de preprocessamento a serem realizadas: imputação de dados faltantes, vetorização de dados textuais, redução de dimensionalidade com PCA (ou eliminação direta, perguntar ao Fabrício), normalização (apenas para os algoritmos que se beneficiam)

A partir dessa primeira exploração dos dados, podemos ver que o dataset é bastante heterogêneo. Nele, podemos observar features categóricas, numéricas e textuais, o que significa que teremos que executar algumas etapas de preprocessamento antes de treinar algum modelo.

In [194]:
# O atributo as_df determina se o resultado do transformador é um DataFrame. Isso é necessário caso queiramos efetuar
# seleções posteriores sobre o resultado desse transformador
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names, as_ndarray=False):
        self.attribute_names = attribute_names
        self.as_ndarray = as_ndarray
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.attribute_names] if not self.as_ndarray else X[self.attribute_names].values
    
class ToDataFrameTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return pd.DataFrame(X, columns=self.columns)

## 2.1. Removendo atributos

O dataset tem alguns atributos que não são exatamente úteis para nossa modelagem, como *item_id* e *user_id*. Ainda, não faremos qualquer processamento de imagens. então, o atributo de identificadores de imagens, *image* não nos tem serventia.Vamos declarar um seletor inicial, responsável por produzir um DataFrame sem esses atributos. Esse DataFrame servirá de base para nossas transfomações posteriores:

In [195]:
features_selector = DataFrameSelector(X_train.columns.difference(['item_id', 'user_id', 'image']))

Quando aplicado ao dataset, temos:

In [196]:
X_transf = features_selector.transform(X_train)

In [197]:
X_transf.head()

Unnamed: 0,activation_date,category_name,city,description,image_top_1,item_seq_number,param_1,param_2,param_3,parent_category_name,price,region,title,user_type
0,2017-03-28,Товары для детей и игрушки,Екатеринбург,"Кокон для сна малыша,пользовались меньше месяц...",1008.0,2,Постельные принадлежности,,,Личные вещи,400.0,Свердловская область,Кокоби(кокон для сна),Private
1,2017-03-26,Мебель и интерьер,Самара,"Стойка для одежды, под вешалки. С бутика.",692.0,19,Другое,,,Для дома и дачи,3000.0,Самарская область,Стойка для Одежды,Private
2,2017-03-20,Аудио и видео,Ростов-на-Дону,"В хорошем состоянии, домашний кинотеатр с blu ...",3032.0,9,"Видео, DVD и Blu-ray плееры",,,Бытовая электроника,4000.0,Ростовская область,Philips bluray,Private
3,2017-03-25,Товары для детей и игрушки,Набережные Челны,Продам кресло от0-25кг,796.0,286,Автомобильные кресла,,,Личные вещи,2200.0,Татарстан,Автокресло,Company
4,2017-03-16,Автомобили,Волгоград,Все вопросы по телефону.,2264.0,3,С пробегом,ВАЗ (LADA),2110.0,Транспорт,40000.0,Волгоградская область,"ВАЗ 2110, 2003",Private


## 2.2. Tratando datas

Vamos transformar a coluna *activation_date* em três atributos mais relevantes: *month*, *day* e *weekday*:

In [198]:
# Essa transformação espera uma única série de timestamps e retorna um DataFrame
class TimestampTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return pd.DataFrame(
            np.c_[
                X.dt.month,
                X.dt.day,
                X.dt.weekday
            ],
            columns=['month', 'day', 'weekday']
        )

Quando aplicamos esse transformador ao dataset obtemos:

In [199]:
process_date = Pipeline([
    ('date_series_selector', DataFrameSelector('activation_date')),
    ('date_features_transformer', TimestampTransformer())
])

X_test = process_date.transform(X_train)

In [34]:
X_test.head()

Unnamed: 0,month,day,weekday
0,3,28,1
1,3,26,6
2,3,20,0
3,3,25,5
4,3,16,3


## 2.2. Imputando dados faltantes

Para o atributo de preço, vamos imputar a média dos valores conhecidos:

In [200]:
price_imputer = Imputer(strategy='mean')

process_price = Pipeline([
    ('price_selector', DataFrameSelector(['price'])),
    ('price_imputer', price_imputer),
    ('to_data_frame', ToDataFrameTransformer(['price']))
])

In [201]:
X_price = process_price.fit_transform(df_train)

In [202]:
X_price.head()

Unnamed: 0,price
0,400.0
1,3000.0
2,4000.0
3,2200.0
4,40000.0


Para o atributo *image_top_1* vamos imputar o valor mais frequente:

In [203]:
image_imputer = Imputer(strategy='most_frequent')

process_image_class = Pipeline([
    ('image_selector', DataFrameSelector(['image_top_1'])),
    ('image_imputer', image_imputer),
    ('to_data_frame', ToDataFrameTransformer(['image_top_1']))
])

In [204]:
X_image = process_image_class.fit_transform(df_train)

In [53]:
X_image.head()

Unnamed: 0,image_top_1
0,1008.0
1,692.0
2,3032.0
3,796.0
4,2264.0


Para atributos categóricos, vamos também adotar a estratégia de imputar a classe mais frequente para cada coluna. No entanto, o imputador do SciKit Learn não consegue lidar com colunas não numéricas. Vamos então declarar um imputador personalizado para essa operação:

In [205]:
class CategoricalImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent = pd.Series(X.values.ravel()).value_counts().index[0]
        return self
        
    def transform(self, X):
        return pd.DataFrame(pd.Series(X.values.ravel()).fillna(self.most_frequent))

In [206]:
impute_cat = Pipeline([
    ('cat_imputer', FeatureUnion([
        ('region_imputer', Pipeline([
            ('region_selector', DataFrameSelector('region')),
            ('region_imputer', CategoricalImputer())
        ])),
        ('city_imputer', Pipeline([
            ('city_selector', DataFrameSelector('city')),
            ('city_imputer', CategoricalImputer())
        ])),
        ('parent_category_name_imputer', Pipeline([
            ('parent_category_name_selector', DataFrameSelector('parent_category_name')),
            ('parent_category_name_imputer', CategoricalImputer())
        ])),
        ('category_name_imputer', Pipeline([
            ('category_name_selector', DataFrameSelector('category_name')),
            ('category_name_imputer', CategoricalImputer())
        ])),
        ('param_1_imputer', Pipeline([
            ('param_1_selector', DataFrameSelector('param_1')),
            ('param_1_imputer', CategoricalImputer())
        ])),
        ('param_2_imputer', Pipeline([
            ('param_2_selector', DataFrameSelector('param_2')),
            ('param_2_imputer', CategoricalImputer())
        ])),
        ('param_3_imputer', Pipeline([
            ('param_3_selector', DataFrameSelector('param_3')),
            ('param_3_imputer', CategoricalImputer())
        ])),
        ('user_type_imputer', Pipeline([
            ('user_type_selector', DataFrameSelector('user_type')),
            ('user_type_imputer', CategoricalImputer())
        ])),   
    ])),
    ('to_data_frame', ToDataFrameTransformer([
        'region',
        'city',
        'parent_category_name',
        'category_name',
        'param_1',
        'param_2',
        'param_3',
        'user_type'
    ]))
])

In [207]:
X = impute_cat.fit_transform(X_train)

In [166]:
X.head()

Unnamed: 0,region,city,parent_category_name,category_name,param_1,param_2,param_3,user_type
0,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,Обувь,Вторичка,Private
1,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,Обувь,Вторичка,Private
2,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",Обувь,Вторичка,Private
3,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,Обувь,Вторичка,Company
4,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110,Private


Finalmente, iremos preencher as células vazias da coluna *description* com strings vazias:

In [208]:
class TextImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return pd.DataFrame(X.fillna(''))

In [136]:
process_description = Pipeline([
    ('description_selector', DataFrameSelector('description')),
    ('imputer', TextImputer())
])

In [33]:
X = process_description.transform(X_train)

In [34]:
X.head()

Unnamed: 0,description
0,"Кокон для сна малыша,пользовались меньше месяц..."
1,"Стойка для одежды, под вешалки. С бутика."
2,"В хорошем состоянии, домашний кинотеатр с blu ..."
3,Продам кресло от0-25кг
4,Все вопросы по телефону.


## 2.3. Codificando atributos categóricos

Contagem de classes nas features categóricas:

In [3]:
# print('user_ids:', len(np.unique(df_train['user_id'])))
print('Regions:', len(np.unique(df_train['region'])))
print('cities:', len(np.unique(df_train['city'])))
print('parent_category_names:', len(np.unique(df_train['parent_category_name'])))
print('category_names:', len(np.unique(df_train['category_name'])))
print('param_1:', len(np.unique(df_train['param_1'][~pd.isnull(df_train['param_1'].values)].values)))
print('param_1 as str:', len(np.unique(df_train['param_1'].astype(str).values)))
print('param_2:', len(np.unique(df_train['param_2'][~pd.isnull(df_train['param_2'].values)].values)))
print('param_3:', len(np.unique(df_train['param_3'][~pd.isnull(df_train['param_3'].values)].values)))
print('activation_dates:', len(np.unique(df_train['activation_date'])))
print('user_types:', len(np.unique(df_train['user_type'])))
print('image_top_1:', len(np.unique(df_train['image_top_1'])))

Regions: 28
cities: 1733
parent_category_names: 9
category_names: 47
param_1: 371
param_1 as str: 372
param_2: 271
param_3: 1219
activation_dates: 21
user_types: 3
image_top_1: 115650


As colunas *region*, *city*, *parent_category_name*, *category_name*, *param_1*, *param_2*, *param_3* e *user_type* são atributos categóricos nominais. Isso significa que, para cada amostra, essas entradas assumem um dentre um número finito de classes possíveis. Além disso, não existe uma ordem entre as possíveis classes de uma dessas categorias. Uma forma óbvia para codificar numericamente um atributo categórico é substituir cada classe desse atributo por um número inteiro. Entretanto, se fizésssemos isso, estaríamos inserindo uma falsa informação de ordem entre as classes. Uma técnica muito comum para lidar com essa situação é o One-Hot Encoding, onde uma coluna é criada para cada classe de um atributo categórico: 

Vamos criar um transformador personalizado para codificar essas categorias:

In [209]:
encode_cat = Pipeline([
    ('cat_selector', DataFrameSelector([
        'region',
        'city',
        'parent_category_name',
        'category_name',
        'param_1',
        'param_2',
        'param_3',
        'user_type'
    ])),
    ('ordinal_encoder', OrdinalEncoder(dtype=np.int64)),
    ('to_data_frame', ToDataFrameTransformer([
        'region',
        'city',
        'parent_category_name',
        'category_name',
        'param_1',
        'param_2',
        'param_3',
        'user_type'
    ]))
])

In [86]:
start = time.time()

X_transf = Pipeline([('impute', impute_cat), ('encode', encode_cat)]).fit_transform(X_train)

end = time.time()
print('Elapsed time:', end-start)

Elapsed time: 126.02638030052185


In [89]:
X_transf.head()

Unnamed: 0,region,city,parent_category_name,category_name,param_1,param_2,param_3,user_type
0,19,460,4,42,248,192,1173,1
1,17,1300,2,22,121,192,1173,1
2,16,1276,0,2,83,192,1173,1
3,21,940,4,42,37,192,1173,0
4,4,317,6,0,277,118,44,1


## 2.4. Codificando atributos textuais 

In [210]:
vect = TfidfVectorizer(sublinear_tf=True)

encode_title = Pipeline([
    ('title_selector', DataFrameSelector(['title'])),
    ('ravel_feature', FunctionTransformer(lambda f: f.values.ravel(), validate=False)),
    ('text_vectorizer', vect),
    ('dim_reduction', TruncatedSVD(random_state=1))
])

encode_description = Pipeline([
    ('description_selector', DataFrameSelector('description')),
    ('description_imputer', TextImputer()),
    ('ravel_feature', FunctionTransformer(lambda f: f.values.ravel(), validate=False)),
    ('text_vectorizer', vect),
    ('dim_reduction', TruncatedSVD(random_state=1))
])

encode_text = Pipeline([
    ('process_text', FeatureUnion([
        ('process_title', encode_title),
        ('process_description', encode_description)
    ])),
    ('to_data_frame', ToDataFrameTransformer(['title_svd_1', 'title_svd_2', 'description_svd_1', 'description_svd_2']))
])

In [None]:
start = time.time()

text_processed = encode_text.fit_transform(X_train)

end = time.time()

print('Elapsed time:', end - start)

In [91]:
text_processed.head()

Unnamed: 0,title_svd_1,title_svd_2,description_svd_1,description_svd_2
0,0.000258,0.006233,0.031397,0.03459
1,0.000457,0.010049,0.019601,0.024236
2,2.1e-05,2.6e-05,0.150771,-0.054072
3,5.2e-05,0.000226,0.036861,0.026744
4,1.1e-05,-2e-05,0.116751,0.134437


## 2.5. Unindo os transformadores

In [211]:
preprocess_transformer = Pipeline([
    ('working_feature_selector', features_selector),
    ('date_transf', FeatureUnion([
        ('other_features', DataFrameSelector(['category_name', 'city', 'description',
       'image_top_1', 'item_seq_number', 'param_1', 'param_2', 'param_3',
       'parent_category_name', 'price', 'region', 'title', 'user_type'])),
        ('process_date', process_date),
    ])),
    ('to_data_frame_1', ToDataFrameTransformer([['category_name', 'city', 'description',
       'image_top_1', 'item_seq_number', 'param_1', 'param_2', 'param_3',
       'parent_category_name', 'price', 'region', 'title', 'user_type', 'month', 'day', 'weekday']])),
    ('imputers', FeatureUnion([
        ('other_features', DataFrameSelector(['item_seq_number', 'title', 'month', 'day', 'weekday'])),
        ('impute_price', process_price),
        ('impute_image', process_image_class),
        ('impute_cat', impute_cat),
        ('impute_description', process_description)
    ])),
    ('to_data_frame_2', ToDataFrameTransformer([
        'item_seq_number',
        'title',
        'month',
        'day',
        'weekday',
        'price',
        'image_top_1',
        'region',
        'city',
        'parent_category_name',
        'category_name',
        'param_1',
        'param_2',
        'param_3',
        'user_type',
        'description'
    ])),
    ('encoders', FeatureUnion([
        ('other_features', DataFrameSelector(['item_seq_number', 'month', 'day', 'weekday', 'price', 'image_top_1'])),
        ('encode_cat', encode_cat),
        ('encode_text', encode_text)
    ])),
    ('to_data_frame_3', ToDataFrameTransformer([
        'item_seq_number',
        'month',
        'day',
        'weekday',
        'price',
        'image_top_1',
        'region',
        'city',
        'parent_category_name',
        'category_name',
        'param_1',
        'param_2',
        'param_3',
        'user_type',
        'title_svd_1',
        'title_svd_2',
        'description_svd_1',
        'description_svd_2'
    ]))
])

In [212]:
start = time.time()
X_train_processed = preprocess_transformer.fit_transform(X_train)
end = time.time()

print('Elapsed time:', end - start)

Selecting:  description


Unnamed: 0,description
0,"Кокон для сна малыша,пользовались меньше месяц..."
1,"Стойка для одежды, под вешалки. С бутика."


Elapsed time: 223.89729046821594


In [191]:
X_train_processed.head()

Unnamed: 0,item_seq_number,month,day,weekday,price,image_top_1,region,city,parent_category_name,category_name,param_1,param_2,param_3,user_type,title_svd_1,title_svd_2,description_svd_1,description_svd_2
0,2,3,28,1,400,1008,19,460,4,42,248,192,1173,1,0.00025798,0.00623332,0.0313975,0.0345895
1,19,3,26,6,3000,692,17,1300,2,22,121,192,1173,1,0.000457325,0.0100489,0.019601,0.0242359
2,9,3,20,0,4000,3032,16,1276,0,2,83,192,1173,1,2.09619e-05,2.58056e-05,0.150771,-0.0540722
3,286,3,25,5,2200,796,21,940,4,42,37,192,1173,0,5.20505e-05,0.000226047,0.0368607,0.0267437
4,3,3,16,3,40000,2264,4,317,6,0,277,118,44,1,1.06665e-05,-2.04614e-05,0.116751,0.134437


# 3. Treinando modelos

## 3.1 Árvore de decisão

In [234]:
dt_regressor = DecisionTreeRegressor(max_depth=10, random_state=1)

In [235]:
dt_regressor.fit(X_train_processed, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')

In [236]:
y_pred = dt_regressor.predict(X_train_processed)

In [237]:
dt_rmse = np.sqrt(mean_squared_error(y_train, y_pred))

In [238]:
dt_rmse

0.23503383842198391

Melhor avaliando o modelo com validação cruzada:

In [254]:
scores = cross_val_score(dt_regressor, X_train_processed, y_train, scoring='neg_mean_squared_error', cv=10)

In [255]:
rmse_scores = np.sqrt(-scores)

In [256]:
print('Mean:', rmse_scores.mean())
print('std:', rmse_scores.std())

Mean: 0.2360702223964954
std: 0.0007038421166696164


Verificando se a regressão não gerou resultados fora do intervalo [0, 1]. 

In [252]:
y_pred[np.where(np.logical_or(y_pred < 0, y_pred > 1))].size

0