Imports:


In [91]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin # Para definição de transformadores personalizados
from sklearn.preprocessing import OneHotEncoder, Imputer, FunctionTransformer
# from category_encoders import OrdinalEncoder
from future_encoders import OrdinalEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

import time

In [110]:
matrix = np.array(
    [
        ['a', 'b', 'c', 'd', 'e'],
        ['f', 'g', 'h', 'i', 'j'],
        ['a', 'r', 'h', 'p', 'e']
    ]
)

In [89]:
matrix

array([['a', 'b', 'c', 'd', 'e'],
       ['f', 'g', 'h', 'i', 'j']], dtype='<U1')

Selecionando colunas específicas:

In [95]:
matrix[:, [0, 2, 3]]

array([['a', 'c', 'd'],
       ['f', 'h', 'i']], dtype='<U1')

Selecionando todas exceto colunas específicas

In [96]:
matrix_except = np.delete(matrix, [0, 2, 3], axis=1)

In [97]:
matrix_except

array([['b', 'e'],
       ['g', 'j']], dtype='<U1')

Concatenando vetores coluna a uma matriz

In [99]:
matrix_concat = np.c_[matrix_except, np.array(['1','2'])]

In [100]:
matrix_concat

array([['b', 'e', '1'],
       ['g', 'j', '2']], dtype='<U1')

Aplicando o label encoder a varias colunas. Isso é problemático pois estamos perdendo o fit, que tem de ser aplicado ao dataset de testes

In [111]:
le = LabelEncoder()
matrix_transf = np.apply_along_axis(le.fit_transform, 0, matrix)

In [112]:
matrix_transf

array([[0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1],
       [0, 2, 1, 2, 0]], dtype=int64)

In [161]:
test = np.array(['w', np.nan, 'q'])
le = LabelEncoder()

X = le.fit_transform(test)

In [163]:
X

array([2, 0, 1], dtype=int64)

Teste imputer - A pergunta é: Se aplico um imputer de most frequent a múltiplas colunas ele faz a imputação coluna a coluna ou mistura tudo?

In [47]:
test = np.array([
    [1, 2, 3],
    [0, 2, 3],
    [1, 3, 3],
    [1, 4, 3],
    [0, 5, 3],
    [-1, -1, -1]
])

imp = Imputer(missing_values=-1, strategy='most_frequent')

test_transf = imp.fit_transform(test)

In [48]:
test_transf

array([[1., 2., 3.],
       [0., 2., 3.],
       [1., 3., 3.],
       [1., 4., 3.],
       [0., 5., 3.],
       [1., 2., 3.]])

Conclusão: o imputador ser comporta de forma opropriada

Teste CountVectorizer - A pergunta é: Se aplico um vetorizer a múltiplas colunas, ele faz a vetorização corretamente?

In [99]:
test = np.array([
    ['aa bb', 'cc dd'],
    ['ww', 'aa ww dd']
])

test = pd.DataFrame(test, columns=['title', 'description'])

ravelTransformer = FunctionTransformer(lambda X: X.ravel(), validate=False)

pipe = FeatureUnion([
    ('title_vect', Pipeline([
        ('title_selector', DataFrameSelector(['title'])),
        ('ravel', ravelTransformer),
        ('vect', CountVectorizer())
    ])),
    ('description_vect', Pipeline([
        ('title_selector', DataFrameSelector(['description'])),
        ('ravel', ravelTransformer),
        ('vect', CountVectorizer())
    ]))
])

In [94]:
test

Unnamed: 0,title,description
0,aa bb,cc dd
1,ww,aa ww dd


In [100]:
X = pipe.fit_transform(test)

In [101]:
X.toarray()

array([[1, 1, 0, 0, 1, 1, 0],
       [0, 0, 1, 1, 0, 1, 1]], dtype=int64)

In [72]:
cv.get_feature_names()

['aa', 'bb', 'ww']

In [102]:
X = pd.DataFrame(X.toarray(), columns=pipe.get_feature_names())

AttributeError: Transformer title_vect (type Pipeline) does not provide get_feature_names.

In [85]:
X

Unnamed: 0,aa,bb,ww
0,1,1,0
1,0,0,1


Teste OrdinalEncoder

In [107]:
test = np.array([
    ['red', 'up', 'left'],
    ['green', 'middle', 'right'],
    ['blue', 'down', 'right'],
    [np.nan, np.nan, np.nan]
])

oe = OrdinalEncoder()

X = oe.fit_transform(test)

In [108]:
X

array([[3., 3., 0.],
       [1., 1., 2.],
       [0., 0., 2.],
       [2., 2., 1.]])

In [109]:
oe.categories_

[array(['blue', 'green', 'nan', 'red'], dtype='<U6'),
 array(['down', 'middle', 'nan', 'up'], dtype='<U6'),
 array(['left', 'nan', 'right'], dtype='<U6')]

In [112]:
oe.transform(np.array([np.nan]).reshape(-1, 1).astype(str))

array([[2.]])

# 1. Introdução

Carregando dados:

In [2]:
converters = {key: lambda x: np.nan if not x else str(x) for key in ('param_1', 'param_2', 'param_3')}

df_train = pd.read_csv(
    './data/train.csv',
    parse_dates=['activation_date'],
    converters=converters                  
)

In [3]:
df_train.head()

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,description,price,item_seq_number,activation_date,user_type,image,image_top_1,deal_probability
0,b912c3c6a6ad,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,,,Кокоби(кокон для сна),"Кокон для сна малыша,пользовались меньше месяц...",400.0,2,2017-03-28,Private,d10c7e016e03247a3bf2d13348fe959fe6f436c1caf64c...,1008.0,0.12789
1,2dac0150717d,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,,,Стойка для Одежды,"Стойка для одежды, под вешалки. С бутика.",3000.0,19,2017-03-26,Private,79c9392cc51a9c81c6eb91eceb8e552171db39d7142700...,692.0,0.0
2,ba83aefab5dc,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",,,Philips bluray,"В хорошем состоянии, домашний кинотеатр с blu ...",4000.0,9,2017-03-20,Private,b7f250ee3f39e1fedd77c141f273703f4a9be59db4b48a...,3032.0,0.43177
3,02996f1dd2ea,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,,,Автокресло,Продам кресло от0-25кг,2200.0,286,2017-03-25,Company,e6ef97e0725637ea84e3d203e82dadb43ed3cc0a1c8413...,796.0,0.80323
4,7c90be56d2ab,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110.0,"ВАЗ 2110, 2003",Все вопросы по телефону.,40000.0,3,2017-03-16,Private,54a687a3a0fc1d68aed99bdaaf551c5c70b761b16fd0a2...,2264.0,0.20797


In [119]:
X_train, y_train = df_train[df_train.columns.difference(['deal_probability'])], df_train[['deal_probability']]

In [43]:
X_train.head()

Unnamed: 0,activation_date,category_name,city,description,image,image_top_1,item_id,item_seq_number,param_1,param_2,param_3,parent_category_name,price,region,title,user_id,user_type
0,2017-03-28,Товары для детей и игрушки,Екатеринбург,"Кокон для сна малыша,пользовались меньше месяц...",d10c7e016e03247a3bf2d13348fe959fe6f436c1caf64c...,1008.0,b912c3c6a6ad,2,Постельные принадлежности,,,Личные вещи,400.0,Свердловская область,Кокоби(кокон для сна),e00f8ff2eaf9,Private
1,2017-03-26,Мебель и интерьер,Самара,"Стойка для одежды, под вешалки. С бутика.",79c9392cc51a9c81c6eb91eceb8e552171db39d7142700...,692.0,2dac0150717d,19,Другое,,,Для дома и дачи,3000.0,Самарская область,Стойка для Одежды,39aeb48f0017,Private
2,2017-03-20,Аудио и видео,Ростов-на-Дону,"В хорошем состоянии, домашний кинотеатр с blu ...",b7f250ee3f39e1fedd77c141f273703f4a9be59db4b48a...,3032.0,ba83aefab5dc,9,"Видео, DVD и Blu-ray плееры",,,Бытовая электроника,4000.0,Ростовская область,Philips bluray,91e2f88dd6e3,Private
3,2017-03-25,Товары для детей и игрушки,Набережные Челны,Продам кресло от0-25кг,e6ef97e0725637ea84e3d203e82dadb43ed3cc0a1c8413...,796.0,02996f1dd2ea,286,Автомобильные кресла,,,Личные вещи,2200.0,Татарстан,Автокресло,bf5cccea572d,Company
4,2017-03-16,Автомобили,Волгоград,Все вопросы по телефону.,54a687a3a0fc1d68aed99bdaaf551c5c70b761b16fd0a2...,2264.0,7c90be56d2ab,3,С пробегом,ВАЗ (LADA),2110.0,Транспорт,40000.0,Волгоградская область,"ВАЗ 2110, 2003",ef50846afc0b,Private


In [192]:
np.dtype(df_train['param_2'][4])

UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-2: ordinal not in range(128)

In [62]:
df_train['param_2'][:1]

0    NaN
Name: param_2, dtype: object

In [78]:
len(np.unique(df_train['param_2'][~pd.isnull(df_train['param_2'].values)].values))

271

In [44]:
df_train.columns

Index(['item_id', 'user_id', 'region', 'city', 'parent_category_name',
       'category_name', 'param_1', 'param_2', 'param_3', 'title',
       'description', 'price', 'item_seq_number', 'activation_date',
       'user_type', 'image', 'image_top_1', 'deal_probability'],
      dtype='object')

**TODO**: Visualizações de dados cairiam bem aqui. Consultar livro do Geron

In [20]:
df_train.dtypes

item_id                         object
user_id                         object
region                          object
city                            object
parent_category_name            object
category_name                   object
param_1                         object
param_2                         object
param_3                         object
title                           object
description                     object
price                          float64
item_seq_number                  int64
activation_date         datetime64[ns]
user_type                       object
image                           object
image_top_1                    float64
deal_probability               float64
dtype: object

In [25]:
test_union = FeatureUnion([
    ('price_selector', DataFrameSelector(['price'], as_df=True)),
    ('item_seq_number', DataFrameSelector(['item_seq_number'], as_df=True))
])

X = test_union.fit_transform(df_train)

df_test = pd.DataFrame(X, columns=['price', 'item_seq_number'])

In [26]:
df_test.head()

Unnamed: 0,price,item_seq_number
0,400.0,2.0
1,3000.0,19.0
2,4000.0,9.0
3,2200.0,286.0
4,40000.0,3.0


Testando slicing com DataFrames:

In [19]:
# Listando colunas a serem tomadas
df_test = df_train[['activation_date']]

In [20]:
df_test.head()

Unnamed: 0,activation_date
0,2017-03-28
1,2017-03-26
2,2017-03-20
3,2017-03-25
4,2017-03-16


In [17]:
# Listando colunas a serem ignoradas
df_test = df_train[df_train.columns.difference(['param_1', 'param_2', 'param_3'])]

In [18]:
df_test

Unnamed: 0,activation_date,category_name,city,deal_probability,description,image,image_top_1,item_id,item_seq_number,parent_category_name,price,region,title,user_id,user_type
0,2017-03-28,Товары для детей и игрушки,Екатеринбург,0.12789,"Кокон для сна малыша,пользовались меньше месяц...",d10c7e016e03247a3bf2d13348fe959fe6f436c1caf64c...,1008.0,b912c3c6a6ad,2,Личные вещи,400.0,Свердловская область,Кокоби(кокон для сна),e00f8ff2eaf9,Private
1,2017-03-26,Мебель и интерьер,Самара,0.00000,"Стойка для одежды, под вешалки. С бутика.",79c9392cc51a9c81c6eb91eceb8e552171db39d7142700...,692.0,2dac0150717d,19,Для дома и дачи,3000.0,Самарская область,Стойка для Одежды,39aeb48f0017,Private
2,2017-03-20,Аудио и видео,Ростов-на-Дону,0.43177,"В хорошем состоянии, домашний кинотеатр с blu ...",b7f250ee3f39e1fedd77c141f273703f4a9be59db4b48a...,3032.0,ba83aefab5dc,9,Бытовая электроника,4000.0,Ростовская область,Philips bluray,91e2f88dd6e3,Private
3,2017-03-25,Товары для детей и игрушки,Набережные Челны,0.80323,Продам кресло от0-25кг,e6ef97e0725637ea84e3d203e82dadb43ed3cc0a1c8413...,796.0,02996f1dd2ea,286,Личные вещи,2200.0,Татарстан,Автокресло,bf5cccea572d,Company
4,2017-03-16,Автомобили,Волгоград,0.20797,Все вопросы по телефону.,54a687a3a0fc1d68aed99bdaaf551c5c70b761b16fd0a2...,2264.0,7c90be56d2ab,3,Транспорт,40000.0,Волгоградская область,"ВАЗ 2110, 2003",ef50846afc0b,Private
5,2017-03-28,Товары для детей и игрушки,Чистополь,0.80323,В хорошем состоянии,eb6ad1231c59d3dc7e4020e724ffe8e4d302023ddcbb99...,796.0,51e0962387f7,9,Личные вещи,1300.0,Татарстан,Авто люлька,bbfad0b1ad0a,Private
6,2017-03-23,Ремонт и строительство,Нижний Новгород,0.00000,Электро водонагреватель накопительный на 100 л...,0330f6ac561f5db1fa8226dd5e7e127b5671d44d075a98...,2823.0,c4f260a2b48a,125,Для дома и дачи,11000.0,Нижегородская область,Водонагреватель 100 литров нержавейка плоский,08f469d2e6f7,Private
7,2017-03-25,"Одежда, обувь, аксессуары",Пермь,0.80323,Бойфренды в хорошем состоянии.,9bab29a519e81c14f4582024adfebd4f11a4ac71d323a6...,567.0,6b71309d6a8a,61,Личные вещи,500.0,Пермский край,Бойфренды colins,fef86baa002c,Private
8,2017-03-17,"Одежда, обувь, аксессуары",Оренбург,0.00000,54 раз мер очень удобное,75ce06d1f939a31dfb2af8ac55f08fa998fa336d13ee05...,415.0,c5b969cb63a2,85,Личные вещи,500.0,Оренбургская область,Платье,055825270190,Private
9,2017-03-22,Детская одежда и обувь,Нижний Новгород,0.00000,По стельке 15.5см мерить приокский район. Цвет...,54fb8521135fda77a860bfd2fac6bf46867ab7c06796e3...,46.0,b1570962e68c,136,Личные вещи,400.0,Нижегородская область,Полу ботиночки замш натур.Бамбини,f9e8f831d94c,Company


# 2. Processando dados

In [17]:
print('Nº. de amostras:', len(df_train.index))

Nº. de amostras: 1503424


Justificar, com base nessa primeira olhada nos dados, as trasformações de preprocessamento a serem realizadas: imputação de dados faltantes, vetorização de dados textuais, redução de dimensionalidade com PCA (ou eliminação direta, perguntar ao Fabrício), normalização (apenas para os algoritmos que se beneficiam)

A partir dessa primeira exploração dos dados, podemos ver que o dataset é bastante heterogêneo. Nele, podemos observar features categóricas, numéricas e textuais, o que significa que teremos que executar algumas etapas de preprocessamento antes de treinar algum modelo.

In [5]:
# O atributo as_df determina se o resultado do transformador é um DataFrame. Isso é necessário caso queiramos efetuar
# seleções posteriores sobre o resultado desse transformador
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names, as_df=False):
        self.attribute_names = attribute_names
        self.as_df = as_df
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.attribute_names] if self.as_df else X[self.attribute_names].values

In [6]:
# Usando o transformador para tomar colunas específicas
selector = DataFrameSelector(['price'], as_df=True)
X_test = selector.transform(df_train)

In [32]:
X_test.head()

Unnamed: 0,price
0,400.0
1,3000.0
2,4000.0
3,2200.0
4,40000.0


In [33]:
# Usando o transformador para ignorar colunas específicas
selector = DataFrameSelector(df_train.columns.difference(['item_id', 'user_id', 'param_1', 'param_2', 'param_3']), as_df=True)
X_test = selector.transform(df_train)

In [34]:
X_test.head()

Unnamed: 0,activation_date,category_name,city,deal_probability,description,image,image_top_1,item_seq_number,parent_category_name,price,region,title,user_type
0,2017-03-28,Товары для детей и игрушки,Екатеринбург,0.12789,"Кокон для сна малыша,пользовались меньше месяц...",d10c7e016e03247a3bf2d13348fe959fe6f436c1caf64c...,1008.0,2,Личные вещи,400.0,Свердловская область,Кокоби(кокон для сна),Private
1,2017-03-26,Мебель и интерьер,Самара,0.0,"Стойка для одежды, под вешалки. С бутика.",79c9392cc51a9c81c6eb91eceb8e552171db39d7142700...,692.0,19,Для дома и дачи,3000.0,Самарская область,Стойка для Одежды,Private
2,2017-03-20,Аудио и видео,Ростов-на-Дону,0.43177,"В хорошем состоянии, домашний кинотеатр с blu ...",b7f250ee3f39e1fedd77c141f273703f4a9be59db4b48a...,3032.0,9,Бытовая электроника,4000.0,Ростовская область,Philips bluray,Private
3,2017-03-25,Товары для детей и игрушки,Набережные Челны,0.80323,Продам кресло от0-25кг,e6ef97e0725637ea84e3d203e82dadb43ed3cc0a1c8413...,796.0,286,Личные вещи,2200.0,Татарстан,Автокресло,Company
4,2017-03-16,Автомобили,Волгоград,0.20797,Все вопросы по телефону.,54a687a3a0fc1d68aed99bdaaf551c5c70b761b16fd0a2...,2264.0,3,Транспорт,40000.0,Волгоградская область,"ВАЗ 2110, 2003",Private


## 2.1. Removendo atributos

O dataset tem alguns atributos que não são exatamente úteis para nossa modelagem, como *item_id* e *user_id*. Ainda, não faremos qualquer processamento de imagens. então, o atributo de identificadores de imagens, *image* não nos tem serventia.Vamos declarar um seletor inicial, responsável por produzir um DataFrame sem esses atributos. Esse DataFrame servirá de base para nossas transfomações posteriores:

In [6]:
features_selector = DataFrameSelector(X_train.columns.difference(['item_id', 'user_id', 'image']), as_df=True)

Quando aplicado ao dataset, temos:

In [7]:
X_transf = features_selector.transform(X_train)

In [8]:
X_transf.head()

Unnamed: 0,activation_date,category_name,city,description,image_top_1,item_seq_number,param_1,param_2,param_3,parent_category_name,price,region,title,user_type
0,2017-03-28,Товары для детей и игрушки,Екатеринбург,"Кокон для сна малыша,пользовались меньше месяц...",1008.0,2,Постельные принадлежности,,,Личные вещи,400.0,Свердловская область,Кокоби(кокон для сна),Private
1,2017-03-26,Мебель и интерьер,Самара,"Стойка для одежды, под вешалки. С бутика.",692.0,19,Другое,,,Для дома и дачи,3000.0,Самарская область,Стойка для Одежды,Private
2,2017-03-20,Аудио и видео,Ростов-на-Дону,"В хорошем состоянии, домашний кинотеатр с blu ...",3032.0,9,"Видео, DVD и Blu-ray плееры",,,Бытовая электроника,4000.0,Ростовская область,Philips bluray,Private
3,2017-03-25,Товары для детей и игрушки,Набережные Челны,Продам кресло от0-25кг,796.0,286,Автомобильные кресла,,,Личные вещи,2200.0,Татарстан,Автокресло,Company
4,2017-03-16,Автомобили,Волгоград,Все вопросы по телефону.,2264.0,3,С пробегом,ВАЗ (LADA),2110.0,Транспорт,40000.0,Волгоградская область,"ВАЗ 2110, 2003",Private


In [65]:
df_train.isnull().sum()

region                       0
city                         0
parent_category_name         0
category_name                0
param_1                  61576
param_2                 654542
param_3                 862565
title                        0
description             116276
price                    85362
item_seq_number              0
activation_date              0
user_type                    0
image_top_1             112588
deal_probability             0
dtype: int64

In [116]:
X_train['param_1'].value_counts().index[0]

'Женская одежда'

## 2.2. Tratando datas

Vamos transformar a coluna *activation_date* em três atributos mais relevantes: *month*, *day* e *weekday*:

In [9]:
# Essa transformação espera uma única série de timestamps e retorna um DataFrame
class TimestampTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return pd.DataFrame(
            np.c_[
                X.dt.month,
                X.dt.day,
                X.dt.weekday
            ],
            columns=['month', 'day', 'weekday']
        )

Quando aplicamos esse transformador ao dataset obtemos:

In [10]:
process_date = Pipeline([
    ('date_series_selector', DataFrameSelector('activation_date', as_df=True)),
    ('date_features_transformer', TimestampTransformer())
])

X_test = process_date.transform(X_train)

In [11]:
df_test = pd.DataFrame(X_test, columns=['month', 'day', 'weekday'])

In [12]:
df_test.head()

Unnamed: 0,month,day,weekday
0,3,28,1
1,3,26,6
2,3,20,0
3,3,25,5
4,3,16,3


## 2.2. Imputando dados faltantes

Para o atributo de preço, vamos imputar a média dos valores conhecidos:

In [13]:
price_imputer = Imputer(strategy='mean')

process_price = Pipeline([
    ('price_selector', DataFrameSelector(['price'])),
    ('price_imputer', price_imputer)
])

X_price = process_price.fit_transform(df_train)

In [14]:
df_test = pd.DataFrame(X_price, columns=['price'])

In [15]:
df_test.head()

Unnamed: 0,price
0,400.0
1,3000.0
2,4000.0
3,2200.0
4,40000.0


Para o atributo *image_top_1* vamos imputar o valor mais frequente:

In [16]:
image_imputer = Imputer(strategy='most_frequent')

process_image_class = Pipeline([
    ('image_selector', DataFrameSelector(['image_top_1'])),
    ('image_imputer', image_imputer)
])

X_image = process_image_class.fit_transform(df_train)

In [17]:
df_test = pd.DataFrame(X_image, columns=['image_top_1'])

In [18]:
df_test.head()

Unnamed: 0,image_top_1
0,1008.0
1,692.0
2,3032.0
3,796.0
4,2264.0


Para atributos categóricos, vamos também adotar a estratégia de imputar a classe mais frequente para cada coluna. No entanto, o imputador do SciKit Learn não consegue lidar com colunas não numéricas. Vamos então declarar um imputador personalizado para essa operação:

In [158]:
class CategoricalImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # É esperado que X seja do tipo Pandas.Series
        if not isinstance(X, pd.Series):
            raise Exception('Tipo de entrada não esperado: %s', str(type(X)))
        self.most_frequent = X.value_counts().index[0]
        return self
        
    def transform(self, X):
        return pd.DataFrame(X.fillna(self.most_frequent))

In [159]:
cat_imputer = CategoricalImputer()
X = cat_imputer.fit_transform(X_train['param_2'])

In [160]:
X.values.shape

(1503424, 1)

In [223]:
X = DataFrameSelector(['param_2']).fit_transform(df_train)

In [167]:
pd.isnull(X).sum()

654542

In [168]:
X.shape

(1503424, 1)

In [224]:
X =  X.astype(str)

In [225]:
X

array([['nan'],
       ['nan'],
       ['nan'],
       ...,
       ['4'],
       ['Обувь'],
       ['nan']], dtype='<U34')

In [226]:
X = np.place(X, X == 'nan', np.nan)

In [227]:
X

In [202]:
le = LabelEncoder()
X_transf = le.fit_transform(X.ravel().astype(str))

In [170]:
X_transf

array([107, 107, 107, ...,   3, 193, 107], dtype=int64)

In [171]:
pd.isnull(X_transf).sum()

0

In [139]:
cat_pipeline = Pipeline([
    ('param_1_selector', DataFrameSelector(['param_1'])),
    ('label_encoder', LabelEncoder()),
    ('cat_imputer', Imputer(strategy='most_frequent'))
])
X_inp = cat_pipeline.fit_transform(df_train)

TypeError: fit_transform() takes 2 positional arguments but 3 were given

## 2.3. Codificando atributos categóricos

Contagem de classes nas features categóricas:

In [3]:
# print('user_ids:', len(np.unique(df_train['user_id'])))
print('Regions:', len(np.unique(df_train['region'])))
print('cities:', len(np.unique(df_train['city'])))
print('parent_category_names:', len(np.unique(df_train['parent_category_name'])))
print('category_names:', len(np.unique(df_train['category_name'])))
print('param_1:', len(np.unique(df_train['param_1'][~pd.isnull(df_train['param_1'].values)].values)))
print('param_1 as str:', len(np.unique(df_train['param_1'].astype(str).values)))
print('param_2:', len(np.unique(df_train['param_2'][~pd.isnull(df_train['param_2'].values)].values)))
print('param_3:', len(np.unique(df_train['param_3'][~pd.isnull(df_train['param_3'].values)].values)))
print('activation_dates:', len(np.unique(df_train['activation_date'])))
print('user_types:', len(np.unique(df_train['user_type'])))
print('image_top_1:', len(np.unique(df_train['image_top_1'])))

Regions: 28
cities: 1733
parent_category_names: 9
category_names: 47
param_1: 371
param_1 as str: 372
param_2: 271
param_3: 1219
activation_dates: 21
user_types: 3
image_top_1: 115650


As colunas *region*, *city*, *parent_category_name*, *category_name*, *param_1*, *param_2*, *param_3* e *user_type* são atributos categóricos nominais. Isso significa que, para cada amostra, essas entradas assumem um dentre um número finito de classes possíveis. Além disso, não existe uma ordem entre as possíveis classes de uma dessas categorias. Uma forma óbvia para codificar numericamente um atributo categórico é substituir cada classe desse atributo por um número inteiro. Entretanto, se fizésssemos isso, estaríamos inserindo uma falsa informação de ordem entre as classes. Uma técnica muito comum para lidar com essa situação é o One-Hot Encoding, onde uma coluna é criada para cada classe de um atributo categórico: 

**Atenção**: Aparentemente, fazer o One Hot Encoding por get_dummies é bem estúpido, pois não faz proveito das otimizações de matrizes esparsas do SciPy. Acho que deveria usar dataframes apenas para carregamento dos dados e após, usar apenas classes do numpy e scipy

Vamos criar um transformador personalizado para codificar essas categorias:

Também vamos aproveitar para fazer a imputação de categorias faltantes após a codificação

In [166]:
process_cat = Pipeline([
    ('imputed_categories', FeatureUnion([
        ('impute_region', Pipeline([
            ('region_selector', DataFrameSelector('region', as_df=True)),
            ('region_imputer', CategoricalImputer())
        ])),
        ('impute_city', Pipeline([
            ('city_selector', DataFrameSelector('city', as_df=True)),
            ('city_imputer', CategoricalImputer())
        ])),
        ('impute_parent_category_name', Pipeline([
            ('parent_category_name_selector', DataFrameSelector('parent_category_name', as_df=True)),
            ('parent_category_name_imputer', CategoricalImputer())
        ])),
        ('impute_category_name', Pipeline([
            ('category_name_selector', DataFrameSelector('category_name', as_df=True)),
            ('category_name_imputer', CategoricalImputer())
        ])),
        ('impute_param_1', Pipeline([
            ('param_1_selector', DataFrameSelector('param_1', as_df=True)),
            ('param_1_imputer', CategoricalImputer())
        ])),
        ('impute_param_2', Pipeline([
            ('param_2_selector', DataFrameSelector('param_2', as_df=True)),
            ('param_2_imputer', CategoricalImputer())
        ])),
        ('impute_param_3', Pipeline([
            ('param_3_selector', DataFrameSelector('param_3', as_df=True)),
            ('param_3_imputer', CategoricalImputer())
        ])),
        ('impute_user_type', Pipeline([
            ('user_type_selector', DataFrameSelector('user_type', as_df=True)),
            ('user_type_imputer', CategoricalImputer())
        ])),
    ])),
    ('ordinal_encoder', OrdinalEncoder(dtype=np.int64)),
])

start = time.time()

X_transf = process_cat.fit_transform(X_train)

end = time.time()
print('Elapsed time:', end-start)

Elapsed time: 143.02447032928467


In [164]:
X_transf.shape

(1503424, 8)

In [167]:
X_transf[:5]

array([[  19,  460,    4,   42,  248,  192, 1173,    1],
       [  17, 1300,    2,   22,  121,  192, 1173,    1],
       [  16, 1276,    0,    2,   83,  192, 1173,    1],
       [  21,  940,    4,   42,   37,  192, 1173,    0],
       [   4,  317,    6,    0,  277,  118,   44,    1]], dtype=int64)

## 2.4. Codificando atributos textuais 

In [61]:
vect = TfidfVectorizer(sublinear_tf=True)

process_text = Pipeline([
    ('title_selector', DataFrameSelector(['title'])),
    ('ravel_feature', FunctionTransformer(lambda f: f.ravel(), validate=False)),
    ('text_vectorizer', vect),
    ('dim_reduction', TruncatedSVD(random_state=1))
])


start = time.time()

title_processed = process_text.fit_transform(X_train)

end = time.time()

print('Elapsed time:', end - start)

Elapsed time: 16.03394341468811


In [35]:
cv.get_feature_names()

['00',
 '000',
 '0000',
 '00000',
 '0000010',
 '0000054',
 '00001',
 '00001548',
 '00002034',
 '00003',
 '000045',
 '00009',
 '0000xy',
 '0001',
 '00013198',
 '00015361',
 '0001580',
 '00017099',
 '00017230',
 '0002',
 '00020900',
 '000211',
 '00024',
 '00025375',
 '00027a',
 '0003',
 '0004',
 '0005',
 '00050',
 '00051',
 '00052',
 '00054',
 '00055',
 '00056',
 '00058',
 '00059',
 '0006',
 '00060',
 '0007',
 '00075',
 '00077',
 '0008',
 '0009',
 '000c',
 '000j00009422',
 '000mah',
 '000sr',
 '000tr',
 '000w',
 '000км',
 '000р',
 '000т',
 '000шт',
 '001',
 '0010',
 '00100xru',
 '001053',
 '0012',
 '00123',
 '0013',
 '00139',
 '0014',
 '00143',
 '0015',
 '00157a',
 '0016',
 '00160a',
 '0017',
 '00173a',
 '0018',
 '0019',
 '001eh',
 '001gb',
 '001i',
 '001m',
 '001oms',
 '001ru',
 '001uf',
 '001уз',
 '001ухл1',
 '002',
 '00201',
 '0021',
 '002100',
 '0022',
 '0023',
 '0024',
 '0025',
 '00252a',
 '00264c',
 '00266',
 '0027',
 '00289b',
 '002b',
 '002bk',
 '002c',
 '002m',
 '002ru',
 '002sr

In [60]:
title_processed.shape

(1503424, 2)

Pela quantidade de números no vocabulário, talvez devessemos preprocessar os documentos antes de vetorizar

In [4]:
df_train.isnull().sum()

item_id                      0
user_id                      0
region                       0
city                         0
parent_category_name         0
category_name                0
param_1                  61576
param_2                 654542
param_3                 862565
title                        0
description             116276
price                    85362
item_seq_number              0
activation_date              0
user_type                    0
image                   112588
image_top_1             112588
deal_probability             0
dtype: int64

param_1, param_2 e param_3 são parâmetros adicionais de classificação do modelo de anúncios da plataforma. Podem não ser muito essenciais. image é o identificador do jpeg associado ao anúncio. Como não analizaremos imagens, parece seguro ignorar essa feature. image_top_1 é uma incógnita. Não sei seu significado. Como é relacionado com imagens, irei ignorar. Talvez *item_seq_number* também seja ignorável.

In [31]:
df_train.drop(columns=['param_1', 'param_2', 'param_3' , 'image_top_1'], inplace=True)

In [33]:
df_train.drop(columns=['item_id', 'user_id', 'title', 'description', 'item_seq_number', 'activation_date', 'image'], inplace=True)

In [34]:
df_train.head(10)

Unnamed: 0,price,deal_probability,region_Башкортостан,region_Белгородская область,region_Владимирская область,region_Волгоградская область,region_Воронежская область,region_Иркутская область,region_Калининградская область,region_Кемеровская область,...,category_name_Собаки,category_name_Спорт и отдых,category_name_Телефоны,category_name_Товары для детей и игрушки,category_name_Товары для животных,category_name_Товары для компьютера,category_name_Фототехника,category_name_Часы и украшения,user_type_Private,user_type_Shop
0,400.0,0.12789,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,3000.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,4000.0,0.43177,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,2200.0,0.80323,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,40000.0,0.20797,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,1300.0,0.80323,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
6,11000.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
7,500.0,0.80323,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
8,500.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9,400.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


O que fazer quanto as datas?

In [6]:
df_train.activation_date.values[:10]

array(['2017-03-28', '2017-03-26', '2017-03-20', '2017-03-25',
       '2017-03-16', '2017-03-28', '2017-03-23', '2017-03-25',
       '2017-03-17', '2017-03-22'], dtype=object)

In [9]:
dates = pd.DatetimeIndex(df_train.activation_date.values)

In [10]:
dates

DatetimeIndex(['2017-03-28', '2017-03-26', '2017-03-20', '2017-03-25',
               '2017-03-16', '2017-03-28', '2017-03-23', '2017-03-25',
               '2017-03-17', '2017-03-22',
               ...
               '2017-03-24', '2017-03-15', '2017-03-24', '2017-03-27',
               '2017-03-17', '2017-03-20', '2017-03-28', '2017-03-21',
               '2017-03-22', '2017-03-21'],
              dtype='datetime64[ns]', length=1503424, freq=None)

In [8]:
dates.year.max()

2017

In [9]:
dates.year.min()

2017

In [16]:
dates.month.min()

3

In [17]:
dates.month.max()

4

Todas as datas são de 2017. Poderia projetá-las em 12 categorias representando os meses ou talvez uma categorização ainda mais fina

Não sei se existe uma técnica de imputação para lidar com dados textuais faltantes. Talvez tenhamos que eliminar as amostras que não tenham descrição.

Imputações deveriam ser feitas antes ou depois de eliminações de amostras?

Não gostaria de ignorar a feature 'preço', pois me parece extremamente relevante para o sucesso do anúncio. Nesse caso, uma imputação da média pode ser interessante:

In [10]:
df_train[['price']]

Unnamed: 0,price
0,400.0
1,3000.0
2,4000.0
3,2200.0
4,40000.0
5,1300.0
6,11000.0
7,500.0
8,500.0
9,400.0


In [11]:
from sklearn.preprocessing import Imputer

imr = Imputer(missing_values='NaN', strategy='mean', axis=0)

imr.fit(df_train[['price']].values)
imputed_prices = imr.transform(df_train[['price']].values)

df_train.price = imputed_prices

In [12]:
df_train.head(10)

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,title,description,price,item_seq_number,activation_date,user_type,deal_probability
0,b912c3c6a6ad,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Кокоби(кокон для сна),"Кокон для сна малыша,пользовались меньше месяц...",400.0,2,2017-03-28,Private,0.12789
1,2dac0150717d,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Стойка для Одежды,"Стойка для одежды, под вешалки. С бутика.",3000.0,19,2017-03-26,Private,0.0
2,ba83aefab5dc,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,Philips bluray,"В хорошем состоянии, домашний кинотеатр с blu ...",4000.0,9,2017-03-20,Private,0.43177
3,02996f1dd2ea,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автокресло,Продам кресло от0-25кг,2200.0,286,2017-03-25,Company,0.80323
4,7c90be56d2ab,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,"ВАЗ 2110, 2003",Все вопросы по телефону.,40000.0,3,2017-03-16,Private,0.20797
5,51e0962387f7,bbfad0b1ad0a,Татарстан,Чистополь,Личные вещи,Товары для детей и игрушки,Авто люлька,В хорошем состоянии,1300.0,9,2017-03-28,Private,0.80323
6,c4f260a2b48a,08f469d2e6f7,Нижегородская область,Нижний Новгород,Для дома и дачи,Ремонт и строительство,Водонагреватель 100 литров нержавейка плоский,Электро водонагреватель накопительный на 100 л...,11000.0,125,2017-03-23,Private,0.0
7,6b71309d6a8a,fef86baa002c,Пермский край,Пермь,Личные вещи,"Одежда, обувь, аксессуары",Бойфренды colins,Бойфренды в хорошем состоянии.,500.0,61,2017-03-25,Private,0.80323
8,c5b969cb63a2,055825270190,Оренбургская область,Оренбург,Личные вещи,"Одежда, обувь, аксессуары",Платье,54 раз мер очень удобное,500.0,85,2017-03-17,Private,0.0
9,b1570962e68c,f9e8f831d94c,Нижегородская область,Нижний Новгород,Личные вещи,Детская одежда и обувь,Полу ботиночки замш натур.Бамбини,По стельке 15.5см мерить приокский район. Цвет...,400.0,136,2017-03-22,Company,0.0


In [13]:
df_train.isnull().sum()

item_id                      0
user_id                      0
region                       0
city                         0
parent_category_name         0
category_name                0
title                        0
description             116276
price                        0
item_seq_number              0
activation_date              0
user_type                    0
deal_probability             0
dtype: int64

**Lembrar de aplicar transformações também aos dados de teste, MAS O TRANSFORMADOR DEVE SER TREINADO APENAS COM OS DADOS DE TREINO**

Agora, é preciso codificar as features categóricas nominais: region, city, parent_category_name e user_type. Provavelmente teremos de usar one-hot encoding