# Создание новых признаков для таблиц

In [36]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import seaborn as sns
import sys, os
import pathlib
from pathlib import Path
from pydantic import ValidationError
from olist_churn_prediction.paths import SRC_DIR, PROCESSED_DIR, INTERIM_DIR, RAW_DIR
from olist_churn_prediction import feature_processing
from olist_churn_prediction.feature_engineering import DateDiffTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score

## classified_data_interim

In [18]:
from olist_churn_prediction.schemas_interim import MainClassifiedSchemaInterim

classified_data = feature_processing.load_data(INTERIM_DIR / "classified_data_interim.parquet",
                      schema = MainClassifiedSchemaInterim,
                      validate = True) # в проде можно False

In [19]:
classified_data.head()

Unnamed: 0,order_status,order_products_value,order_freight_value,order_items_qty,order_purchase_timestamp,order_aproved_at,order_estimated_delivery_date,order_delivered_customer_date,customer_city,customer_state,...,votes_low_quality,votes_return,votes_not_as_anounced,votes_partial_delivery,votes_other_delivery,votes_other_order,votes_satisfied,most_voted_subclass,most_voted_class,product_category_name_english
0,delivered,89.989998,14.38,1,2017-08-30 11:41:01,2017-08-30 11:55:08.970352,2017-09-21,2017-09-08 20:35:27.276847,belo_horizonte,mg,...,0,0,0,0,0,0,3,satisfeito,satisfeito_com_pedido,health_beauty
1,delivered,69.0,15.23,1,2017-09-26 09:13:36,2017-09-26 09:28:10.922048,2017-10-24,2017-09-29 21:13:04.984841,pocos_de_caldas,mg,...,0,0,0,0,0,0,0,antes_prazo,satisfeito_com_pedido,toys
2,delivered,99.800003,15.86,2,2018-01-15 15:50:42,2018-01-17 07:29:56.419769,2018-02-05,2018-01-23 17:51:31.134866,sao_jose_dos_campos,sp,...,0,0,0,3,0,0,0,entrega_parcial,problemas_de_entrega,garden_tools
3,delivered,87.0,12.74,1,2018-02-04 11:16:42,2018-02-06 05:31:50.990164,2018-03-13,2018-02-20 19:38:06.633080,ribeirao_preto,sp,...,0,0,0,0,0,0,0,atrasado,problemas_de_entrega,computers_accessories
4,delivered,99.900002,17.950001,1,2017-12-07 11:58:42,2017-12-08 02:36:49.587515,2018-01-03,2017-12-19 22:33:18.952512,rio_de_janeiro,rj,...,0,0,0,3,0,0,0,entrega_parcial,problemas_de_entrega,bed_bath_table


In [20]:
classified_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3467 entries, 0 to 3466
Data columns (total 30 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_status                   3467 non-null   string        
 1   order_products_value           3467 non-null   float32       
 2   order_freight_value            3467 non-null   float32       
 3   order_items_qty                3467 non-null   Int64         
 4   order_purchase_timestamp       3467 non-null   datetime64[ns]
 5   order_aproved_at               3467 non-null   datetime64[ns]
 6   order_estimated_delivery_date  3467 non-null   datetime64[ns]
 7   order_delivered_customer_date  3467 non-null   datetime64[ns]
 8   customer_city                  3467 non-null   string        
 9   customer_state                 3467 non-null   string        
 10  customer_zip_code_prefix       3467 non-null   string        
 11  product_name_leng

### Создание временных признаков

#### На основе временных признаков можно создать новые полезные и хорошо интерпретируемые:
1. Время между оплатой заказа и подтверждением заказа на сайте (order_approved_at - order_purchase_timestamp): *payment_lag*
2. Время между подтверждением заказа и ориентировочным времени доставки (order_estimated_delivery_date - order_aproved_at): *estimated_shipping_lag*
3. Время между фактическим времени доставки и ожидаемым (order_delivered_customer_date - order_estimated_delivery_date): *actual_and_estimated_shipping_lag*
4. Время между рассылкой с просьбой оценить качество и ответом покупателя (review_answer_timestamp - review_creation_date): *customer_review_response_lag*
5. Время между подтверждением заказа и его прибытием (order_delivered_customer_date - order_aproved_at): *approval_and_arrival_lag*

In [21]:
diffs = FeatureUnion([
    ('payment_lag', DateDiffTransformer('order_purchase_timestamp',
                                        'order_aproved_at',
                                        'payment_lag_hours')),
    ('estimated_shipping_lag', DateDiffTransformer('order_aproved_at',
                                         'order_estimated_delivery_date',
                                         'estimated_shipping_lag_hours')),
    ('actual_and_estimated_shipping_lag', DateDiffTransformer('order_estimated_delivery_date',
                                         'order_delivered_customer_date',
                                         'actual_and_estimated_shipping_lag_hours')),
    ('customer_review_response_lag', DateDiffTransformer('review_creation_date',
                                         'review_answer_timestamp',
                                         'customer_review_response_lag_hours')),
    ('approval_and_arrival_lag', DateDiffTransformer('order_aproved_at',
                                         'order_delivered_customer_date',
                                         'approval_and_arrival_lag_hours')),
])

In [22]:
diff_matrix = diffs.fit_transform(classified_data)

In [23]:
print(diff_matrix)

[[   0  516 -292   27  224]
 [   0  662 -579   77   83]
 [  39  448 -295  233  154]
 ...
 [  31  500 -322   45  178]
 [   0 1063 -453   21  611]
 [   0  256 -204  113   52]]


In [24]:
new_cols = [tr.new_col for _, tr in diffs.transformer_list]
classified_data_diffs  = pd.DataFrame(diff_matrix,
                         columns=new_cols,
                         index=classified_data.index)

In [25]:
classified_data = classified_data.join(classified_data_diffs)

In [26]:
display(classified_data.head())

Unnamed: 0,order_status,order_products_value,order_freight_value,order_items_qty,order_purchase_timestamp,order_aproved_at,order_estimated_delivery_date,order_delivered_customer_date,customer_city,customer_state,...,votes_other_order,votes_satisfied,most_voted_subclass,most_voted_class,product_category_name_english,payment_lag_hours,estimated_shipping_lag_hours,actual_and_estimated_shipping_lag_hours,customer_review_response_lag_hours,approval_and_arrival_lag_hours
0,delivered,89.989998,14.38,1,2017-08-30 11:41:01,2017-08-30 11:55:08.970352,2017-09-21,2017-09-08 20:35:27.276847,belo_horizonte,mg,...,0,3,satisfeito,satisfeito_com_pedido,health_beauty,0,516,-292,27,224
1,delivered,69.0,15.23,1,2017-09-26 09:13:36,2017-09-26 09:28:10.922048,2017-10-24,2017-09-29 21:13:04.984841,pocos_de_caldas,mg,...,0,0,antes_prazo,satisfeito_com_pedido,toys,0,662,-579,77,83
2,delivered,99.800003,15.86,2,2018-01-15 15:50:42,2018-01-17 07:29:56.419769,2018-02-05,2018-01-23 17:51:31.134866,sao_jose_dos_campos,sp,...,0,0,entrega_parcial,problemas_de_entrega,garden_tools,39,448,-295,233,154
3,delivered,87.0,12.74,1,2018-02-04 11:16:42,2018-02-06 05:31:50.990164,2018-03-13,2018-02-20 19:38:06.633080,ribeirao_preto,sp,...,0,0,atrasado,problemas_de_entrega,computers_accessories,42,834,-485,26,350
4,delivered,99.900002,17.950001,1,2017-12-07 11:58:42,2017-12-08 02:36:49.587515,2018-01-03,2017-12-19 22:33:18.952512,rio_de_janeiro,rj,...,0,0,entrega_parcial,problemas_de_entrega,bed_bath_table,14,621,-338,76,283


### Целевая переменная для датасетов classified_data_interim и public_data_interim

#### Так как у нас нет меток покупателей "ушел" или "остался", то создадим их самостоятельно. Начнем с простого "после N времени не купил товар, значит ушел". Возьмем N = 90 дней. Так как в classified-датасете нет id пользователей, то неизвестно какие заказы были кем сделаны, а следовательно и целевую переменную создать для него невозможно. Данная задача останется только для public_data.

### Новые классы из votes-признаков

Пусть у нас для индекса 0 будет 3 голоса за votes_satisfied и 0 голосов за остальные, тогда однозначно можно отнести строку индекса 0 к новому классу satisfied=1. Порогом для класса '1' будем считать 2 голоса и больше. В случаях, где голоса распределены поровну, например [1, 1, 1], отнесем комментарий ко всем классам с голосами. В отдельном ноутбуке позже создадим эти классы.

In [28]:
votes_features = ['votes_before_estimate', 'votes_delayed', 'votes_low_quality',
                  'votes_return', 'votes_not_as_anounced', 'votes_partial_delivery',
                  'votes_other_delivery', 'votes_other_order', 'votes_satisfied']

In [29]:
display(classified_data[votes_features].head(50))

Unnamed: 0,votes_before_estimate,votes_delayed,votes_low_quality,votes_return,votes_not_as_anounced,votes_partial_delivery,votes_other_delivery,votes_other_order,votes_satisfied
0,0,0,0,0,0,0,0,0,3
1,3,0,0,0,0,0,0,0,0
2,0,0,0,0,0,3,0,0,0
3,0,3,0,0,0,0,0,0,0
4,0,0,0,0,0,3,0,0,0
5,0,0,0,0,0,1,1,1,0
6,0,0,0,0,0,0,3,0,0
7,2,0,0,0,0,0,0,0,1
8,1,0,0,0,0,0,0,0,2
9,0,0,0,0,0,0,0,0,3


In [30]:
classified_data = classified_data.dropna()

In [31]:
classified_data.shape

(3303, 35)

In [38]:
base = classified_data.columns.drop(votes_features).drop(['most_voted_class', 'most_voted_subclass']) # все прочие фичи
with_votes = classified_data.columns
no_votes = classified_data.columns.drop(votes_features)

for cols in [with_votes, no_votes]:
    m = CatBoostClassifier(loss_function='MultiClass')
    cv = cross_val_score(m, classified_data[cols], y, cv=5, metric='macro_f1')
    print(cols, cv.mean())

NameError: name 'y' is not defined

In [39]:
# max_votes: максимальное число голосов в строке
# ties_mask: True там, где столбец == max_votes
max_votes  = classified_data[votes_features].max(axis=1)
ties_mask  = classified_data[votes_features].eq(max_votes, axis=0) # широкая матрица True/False такой же формы

display(max_votes.head())
display(ties_mask.head())

0    3
1    3
2    3
3    3
4    3
dtype: int64

Unnamed: 0,votes_before_estimate,votes_delayed,votes_low_quality,votes_return,votes_not_as_anounced,votes_partial_delivery,votes_other_delivery,votes_other_order,votes_satisfied
0,False,False,False,False,False,False,False,False,True
1,True,False,False,False,False,False,False,False,False
2,False,False,False,False,False,True,False,False,False
3,False,True,False,False,False,False,False,False,False
4,False,False,False,False,False,True,False,False,False


In [40]:
# формируем новые индикаторы
new_cols = [c.replace('votes_', '') for c in votes_features] # ['satisfied', 'partial_delivery', 'not_as_anounced', ...]
classified_data[new_cols] = ties_mask.astype(int).values # True→1, False→0

In [41]:
display(classified_data[new_cols].head())

Unnamed: 0,before_estimate,delayed,low_quality,return,not_as_anounced,partial_delivery,other_delivery,other_order,satisfied
0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0
3,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0


In [42]:
display(classified_data.head())

Unnamed: 0,order_status,order_products_value,order_freight_value,order_items_qty,order_purchase_timestamp,order_aproved_at,order_estimated_delivery_date,order_delivered_customer_date,customer_city,customer_state,...,approval_and_arrival_lag_hours,before_estimate,delayed,low_quality,return,not_as_anounced,partial_delivery,other_delivery,other_order,satisfied
0,delivered,89.989998,14.38,1,2017-08-30 11:41:01,2017-08-30 11:55:08.970352,2017-09-21,2017-09-08 20:35:27.276847,belo_horizonte,mg,...,224,0,0,0,0,0,0,0,0,1
1,delivered,69.0,15.23,1,2017-09-26 09:13:36,2017-09-26 09:28:10.922048,2017-10-24,2017-09-29 21:13:04.984841,pocos_de_caldas,mg,...,83,1,0,0,0,0,0,0,0,0
2,delivered,99.800003,15.86,2,2018-01-15 15:50:42,2018-01-17 07:29:56.419769,2018-02-05,2018-01-23 17:51:31.134866,sao_jose_dos_campos,sp,...,154,0,0,0,0,0,1,0,0,0
3,delivered,87.0,12.74,1,2018-02-04 11:16:42,2018-02-06 05:31:50.990164,2018-03-13,2018-02-20 19:38:06.633080,ribeirao_preto,sp,...,350,0,1,0,0,0,0,0,0,0
4,delivered,99.900002,17.950001,1,2017-12-07 11:58:42,2017-12-08 02:36:49.587515,2018-01-03,2017-12-19 22:33:18.952512,rio_de_janeiro,rj,...,283,0,0,0,0,0,1,0,0,0


In [44]:
print(classified_data['most_voted_class'].value_counts())

most_voted_class
satisfeito_com_pedido     1976
problemas_de_entrega       868
problemas_de_qualidade     459
Name: count, dtype: Int64


In [45]:
print(classified_data['most_voted_subclass'].value_counts())

most_voted_subclass
satisfeito                1486
antes_prazo                490
entrega_parcial            397
atrasado                   366
diferente_do_anunciado     163
baixa_qualidade            146
devolucao                  127
outro_entrega              105
outro_pedido                23
Name: count, dtype: Int64
