## <center> Feature engineering

Read files from ETL notebook

In [1]:
%load_ext jupyternotify
%store -r item_cat
%store -r item
%store -r sub
%store -r shops
%store -r sales_test
%store -r sales_train

In [95]:
from IPython.display import clear_output


import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

import xgboost as xgb
from xgboost import plot_importance

import warnings

from plotly.offline import init_notebook_mode, iplot
from plotly import graph_objs as go
from plotly.subplots import make_subplots
import plotly.express as px

# Initialize plotly
init_notebook_mode(connected=True)

from IPython.core.debugger import set_trace
from time import time


from tqdm import tqdm
tqdm.pandas(desc="")
warnings.filterwarnings("ignore")
import os


# SKLEARN
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit, KFold
from scipy.stats import randint as sp_randint
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans

# TSFRESH
from tsfresh.feature_extraction import ComprehensiveFCParameters, extract_features, MinimalFCParameters, EfficientFCParameters
from tsfresh.utilities.dataframe_functions import impute
from tsfresh import extract_relevant_features

# LightGBM
from lightgbm import LGBMRegressor

# Sklearn-pandas
from sklearn_pandas import CategoricalImputer, FunctionTransformer, DataFrameMapper

# Bayessian Optimization
from bayes_opt import BayesianOptimization

# vecstack
from vecstack import stacking

# Transformers
from features_transformers import DateFeatureExtractor, TSFreshTransformer
from split_dataset import split_dataset, split_and_transform

# PCA
from sklearn.decomposition import PCA

class TSFreshTransformer(BaseEstimator, TransformerMixin):
    """Scikit-learn style wrapper around tsfresh's ``extract_features``.

    NOTE(review): a class of the same name is also imported from
    ``features_transformers`` in the import cell; once this cell runs, this
    definition shadows the imported one.

    Parameters
    ----------
    column_id : str
        Forwarded to ``extract_features`` as ``column_id``.
    column_sort : str
        Forwarded to ``extract_features`` as ``column_sort``.
    column_value : str
        Forwarded to ``extract_features`` as ``column_value``.
    extraction_settings
        Forwarded to ``extract_features`` as ``default_fc_parameters``
        (the notebook passes ``EfficientFCParameters()`` /
        ``MinimalFCParameters()``).
    """

    def __init__(self, column_id, column_sort, column_value, extraction_settings):
        self.column_id = column_id
        self.column_sort = column_sort
        self.column_value = column_value
        self.extraction_settings = extraction_settings

    def fit(self, train_x, train_y=None, **fit_params):
        """Do nothing (the transformer is stateless) and return ``self``."""
        return self

    def transform(self, X_train, y_train=None, **fit_params):
        """Extract tsfresh features from ``X_train`` and return the feature frame.

        ``y_train`` and ``fit_params`` are accepted for API compatibility and
        are not used.
        """
        X_features = extract_features(
            X_train,
            column_id=self.column_id,
            column_sort=self.column_sort,
            column_value=self.column_value,
            default_fc_parameters=self.extraction_settings)

        # The return value is deliberately ignored: this relies on tsfresh's
        # ``impute`` replacing non-finite values in place.
        impute(X_features)
        return X_features

    def get_params(self, deep=True):
        """Return the constructor arguments as a dict.

        ``deep`` is accepted because scikit-learn calls
        ``get_params(deep=False)`` from ``clone`` and ``get_params(deep=True)``
        from ``Pipeline``; without it those calls raise ``TypeError``. There are
        no nested estimators here, so the flag does not change the result.
        """
        return {'column_id': self.column_id,
                'column_sort': self.column_sort,
                'column_value': self.column_value,
                'extraction_settings': self.extraction_settings}

    def set_params(self, **parameters):
        """Set each given parameter as an attribute and return ``self``."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def fit_transform(self, X, y=None, **fit_params):
        """Fit (a no-op) and transform ``X``; ``y`` and ``fit_params`` are ignored."""
        return self.fit(X).transform(X)

Denormalize and join the tables, then create the following features:
- key - combined key: the label-encoded `shop_id` + `item_id` pair
- is_actual - 1 if the item was sold (in any shop) during the most recent month, otherwise 0
- item_price {number} - one-hot item price category (clusters created by KMeans on the median item price)
- months_period_of_product - life period of the item in months
- item_price_{statistic} - item_price statistics
- date_block_num_{statistic} - date_block_num statistics
- item_cnt_day - monthly sum of sold items
- returned_items_{statistic} - count of returned items
- pca_{number}_{month}_{statistic + ts features} - time series features derived from item_cnt_day for each period of months. PCA is used for dimensionality reduction


In [91]:
%%notify
# Work on a copy of the daily sales loaded from the ETL notebook.
df = sales_train.reset_index().copy()
# sales train + item: attach each item's category id (left join keeps every sales row)
df_cat_id = df.set_index('item_id').join(item[['item_id', 'item_category_id']].set_index('item_id'), how='left').reset_index()

# presumably DateFeatureExtractor adds the 'month' and 'year' columns used below - TODO confirm
df_gf = DateFeatureExtractor('date').fit_transform(df_cat_id)

# One row per group: median price and summed item count.
df_cleaned = df_gf\
.groupby(['shop_id', 'item_id', 'date_block_num', 'month', 'year', 'item_category_id'], as_index=False)\
.agg({'item_price':'median', 'item_cnt_day':'sum'})

# Build the "<shop_id>_<item_id>" string key column-wise instead of with a
# row-wise apply; the resulting strings (and therefore the encoded labels)
# are the same, but no Python function is called per row.
df_cleaned['key'] = (
    df_cleaned['shop_id'].astype(int).astype(str)
    + "_"
    + df_cleaned['item_id'].astype(int).astype(str)
)
df_cleaned['key'] = LabelEncoder().fit_transform(df_cleaned['key'])

UsageError: Cell magic `%%notify` not found.


Product is actual

In [9]:
# Rows that belong to the latest month present in the data.
latest_block_rows = df_cleaned['date_block_num'] == df_cleaned['date_block_num'].max()
# Ids of the items sold in that month (regardless of shop) ...
actual_items_id = df_cleaned.loc[latest_block_rows, 'item_id']
# ... turned into a per-row boolean mask over the whole frame.
actual_items_id = np.isin(df_cleaned['item_id'], actual_items_id)
df_cleaned['is_actual'] = actual_items_id.astype(int)

item_price {number} - one-hot item price category (clusters created by KMeans)

In [10]:
def generate_name_for_item_price_clust(col_name="item_price ", clusters_count=6):
    """Return the column names used for the one-hot encoded price clusters.

    Parameters
    ----------
    col_name : str
        Prefix put in front of each cluster number. The default ends with a
        space, so the names look like ``"item_price 0"``.
    clusters_count : int
        How many names to generate, numbered from 0. Defaults to 6, the value
        that used to be hard-coded here.

    Returns
    -------
    list of str
        ``[col_name + "0", ..., col_name + str(clusters_count - 1)]``.
    """
    return [col_name + str(cluster_no) for cluster_no in range(clusters_count)]


# Median price per key, broadcast back to every row of that key.
df_cleaned['median_item_price'] = df_cleaned[['key', 'item_price']].groupby('key').transform('median')
it = df_cleaned['median_item_price'].values.reshape(-1, 1)

# The number of clusters is taken from the generated column names so the two
# can never disagree.
price_cluster_names = generate_name_for_item_price_clust()

# Fixed seed: KMeans starts from random centroids, so without it the cluster
# numbering (and therefore the meaning of each one-hot column) changes on
# every run of the notebook.
kmeans = KMeans(n_clusters=len(price_cluster_names), random_state=42)
kmeans.fit(it)

clusters = kmeans.predict(it)
ohe_clusters = pd.get_dummies(clusters)
ohe_clusters.columns = price_cluster_names
# `clusters` is positional (one label per row of df_cleaned); reuse df_cleaned's
# index so the column-wise concat pairs rows by position whatever that index is.
ohe_clusters.index = df_cleaned.index
df_cleaned = pd.concat([df_cleaned, ohe_clusters], axis=1)

Split methods:

In [15]:
def generate_features_from_train_set(X_train, X_test, key=('key',)):
    """Add per-key aggregate features, computed on the training rows only.

    The aggregates are calculated from ``X_train`` and then left-merged into
    both ``X_train`` and ``X_test``, so no statistic is derived from test rows.
    Keys that occur only in ``X_test`` get no match; the resulting NaNs (and
    any other NaNs in either frame) are replaced by 0.

    Added columns, in this order:

    * ``item_price_{max,min,std,mean,median}``
    * ``months_period_of_product`` - last minus first ``date_block_num`` of the key
    * ``date_block_num_{max,min,std,mean,median}``

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Must contain the ``key`` column(s), ``item_price`` and
        ``date_block_num``.
    key : str or sequence of str
        Column name(s) to group and merge on.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The enriched train and test frames. The shape of the train frame is
        printed after each merge.
    """
    key = [key] if isinstance(key, str) else list(key)
    stat_names = ['max', 'min', 'std', 'mean', 'median']

    def min_max_transformation(df, col):
        """Return one row per key with ``{col}_{stat}`` columns for ``stat_names``."""
        assert len(df) != 0
        assert col != ""
        # Selecting the column before aggregating gives flat column names and
        # keeps the key in the index, independent of how a given pandas
        # version treats `as_index=False` with a list of aggregations.
        features = df.groupby(key)[col].agg(stat_names)
        features.columns = [f"{col}_{stat}" for stat in features.columns]
        return features.reset_index()

    def merge_train_test(train_df, test_df, result):
        """Left-merge ``result`` into both frames on the key column(s)."""
        train_df = train_df.merge(result, on=key, how='left')
        test_df = test_df.merge(result, on=key, how='left')
        return train_df, test_df

    ## Features

    x_item_price = min_max_transformation(X_train, 'item_price')
    x_date_block_num = min_max_transformation(X_train, 'date_block_num')

    # Vectorised max - min per key (replaces a Python lambda run once per group).
    blocks_by_key = X_train.groupby(key)['date_block_num']
    months_period_of_product = (blocks_by_key.max() - blocks_by_key.min())\
        .rename('months_period_of_product')\
        .reset_index()

    for tr in [x_item_price, months_period_of_product, x_date_block_num]:
        X_train, X_test = merge_train_test(X_train, X_test, tr)
        print(X_train.shape)

    X_train = X_train.fillna(0)
    X_test = X_test.fillna(0)

    return X_train, X_test

def split_dataset(gb_df, date_col, pred_col, prediction_size = 5):
    """Split a frame into an earlier train/validation part and a later test part.

    The frame is sorted by ``date_col`` and rows containing NaN are dropped.
    With ``cutoff = max(date_col) - prediction_size``, rows with
    ``date_col < cutoff`` go to train/validation and rows with
    ``date_col >= cutoff`` go to test. The test part therefore includes the
    cut-off value itself as well as everything after it.

    Parameters
    ----------
    gb_df : pandas.DataFrame
        Input data; not modified.
    date_col : str
        Column that orders the rows in time.
    pred_col : str
        Target column; it is removed from the returned feature frames.
    prediction_size : int
        Distance of the cut-off from the maximum of ``date_col``.

    Returns
    -------
    X_train_val, y_train_val, X_test, y_test
        Each with a fresh 0..n-1 index.
    """
    ordered = gb_df.copy().sort_values(by=[date_col]).dropna()
    target = ordered[pred_col]
    predictors = ordered.drop(pred_col, axis=1)

    cutoff = predictors[date_col].max() - prediction_size
    is_train_val = predictors[date_col] < cutoff
    is_test = predictors[date_col] >= cutoff

    def _take(data, mask):
        # Keep the masked rows and renumber them from zero.
        return data[mask].reset_index(drop=True)

    return (
        _take(predictors, is_train_val),
        _take(target, is_train_val),
        _take(predictors, is_test),
        _take(target, is_test),
    )

def split_and_transform(df, **kwargs):
    """Split ``df`` with ``split_dataset`` and add train-derived features.

    ``kwargs`` are forwarded unchanged to ``split_dataset``. The two feature
    frames are then passed through ``generate_features_from_train_set``. The
    shape of the train/validation frame is printed before and after that step.

    Returns
    -------
    X_train_val, y_train_val, X_test, y_test
    """
    ## Add transformers
    train_X, train_y, test_X, test_y = split_dataset(df, **kwargs)
    print(train_X.shape)

    train_X, test_X = generate_features_from_train_set(train_X, test_X)
    print(train_X.shape)

    return train_X, train_y, test_X, test_y

In [144]:
X_train_val.head()

Unnamed: 0,shop_id,item_id,date_block_num,month,year,item_category_id,item_price,key,is_actual,median_item_price,...,item_price 2,item_price 3,item_price 4,item_price 5,item_price_max,item_price_min,item_price_std,item_price_mean,item_price_median,months_period_of_product
0,10,4885,0,1,0,23,716.0,8163,1,808.0,...,0,0,0,0,1165.339966,716.0,144.775803,823.758179,716.0,22.0
1,23,3533,0,1,0,30,999.0,94453,0,999.0,...,0,0,0,0,999.0,999.0,0.0,999.0,999.0,0.0
2,23,3527,0,1,0,77,199.0,94451,0,199.0,...,1,0,0,0,199.0,199.0,0.0,199.0,199.0,0.0
3,23,3470,0,3,0,30,349.0,94450,0,349.0,...,1,0,0,0,349.0,349.0,0.0,349.0,349.0,0.0
4,23,3470,0,1,0,30,349.0,94450,0,349.0,...,1,0,0,0,349.0,349.0,0.0,349.0,349.0,0.0


In [14]:
# ///// CODE TO TEST FEATURE ENGINEERING FEATURES

# first_ten_products = df_cleaned[df_cleaned['key'] == 303844]
# #full = df_cleaned[np.isin(df_cleaned['key'], first_ten_products)]
# X_train_val, y_train_val, X_test, y_test = split_and_transform(first_ten_products, date_col='date_block_num',\
#                                                               pred_col='item_cnt_day', prediction_size=1)

Save model

In [26]:
%%notify
# Split the monthly frame on `date_block_num` with `item_cnt_day` as the target,
# then add the train-derived aggregate features to both parts.
split_outputs = split_and_transform(
    df_cleaned,
    date_col='date_block_num',
    pred_col='item_cnt_day',
    prediction_size=1,
)
X_train_val, y_train_val, X_test, y_test = split_outputs

# Remove everything printed by the call above from the cell output.
clear_output()

In [60]:
def generate_ts_features(df, X_train, X_test, it, month_threshold):
    """Add PCA-compressed tsfresh features for keys with a long enough history.

    Only rows of ``df`` with ``months_period_of_product + 1 >= month_threshold``
    are used. Two tsfresh extractions are run on them, both grouped by ``key``
    and ordered by ``date_block_num``:

    1. on ``item_cnt_day`` with ``EfficientFCParameters``;
    2. on ``returned_items`` (``item_cnt_day`` with positive values replaced
       by 0) with ``MinimalFCParameters``.

    Each feature matrix is reduced to 3 PCA components named
    ``pca_{n}_{month_threshold}`` and left-merged into ``X_train`` and
    ``X_test`` on ``key``. Because both extractions use the same component
    names, the columns of the second merge clash with the first and receive
    the suffix ``_{it}`` (e.g. ``pca_0_12_12``); the helper ``id`` column is
    duplicated the same way (``id``, ``id_{it}``).

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows; needs ``key``, ``date_block_num``, ``item_cnt_day`` and
        ``months_period_of_product``.
    X_train, X_test : pandas.DataFrame
        Frames to enrich; must contain ``key``.
    it
        Used only to build the suffix for clashing column names.
    month_threshold : int
        Minimum history length (see above); also part of the PCA column names.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Enriched ``X_train`` and ``X_test``; returned unchanged when no row of
        ``df`` passes the threshold.
    """
    def merge(df, features, it):
        # Left-join the per-key features; on a name clash the incoming column
        # gets the `_{it}` suffix and the existing one keeps its name.
        return df.merge(features, how='left', left_on='key', right_on='id', suffixes=('', f'_{it}'))

    def reduce_dimensions(features, month_threshold):
        # NOTE(review): PCA is fit on the raw tsfresh features. The recorded runs
        # print a first-component explained-variance ratio of ~0.9997 or more, so
        # one feature probably dominates - consider standardising first.
        n_components = 3
        pca = PCA(n_components=n_components)
        pca_features = pca.fit_transform(features)

        print(f"Explained variance {pca.explained_variance_ratio_}")

        column_names = [f"pca_{component}_{month_threshold}" for component in range(n_components)]
        pca_df = pd.DataFrame(pca_features, columns=column_names)
        # Relies on the tsfresh result index being named 'id'.
        return pd.concat([features.reset_index()['id'], pca_df], axis=1)

    df_more_than_0 = df[(df['months_period_of_product'] + 1) >= month_threshold]

    if df_more_than_0.shape[0] == 0:
        return X_train, X_test

    print("item_cnt_day")
    # `axis=0` is accepted through **fit_params by the TSFreshTransformer class
    # defined in this notebook and has no effect there.
    features = TSFreshTransformer('key', 'date_block_num', 'item_cnt_day', EfficientFCParameters())\
        .fit_transform(df_more_than_0, axis=0)

    features = reduce_dimensions(features, month_threshold)

    X_train = merge(X_train, features, it)
    X_test = merge(X_test, features, it)

    print("returned_items")
    # return products tsfresh: keep only the non-positive counts. `clip`
    # replaces a per-row lambda, and `assign` adds the column without the
    # extra index columns a reset_index + concat would introduce.
    pred_ts_data = df_more_than_0\
        .assign(returned_items=df_more_than_0['item_cnt_day'].clip(upper=0))\
        .reset_index(drop=True)

    features = TSFreshTransformer('key', 'date_block_num', 'returned_items', MinimalFCParameters())\
        .fit_transform(pred_ts_data, axis=0)

    features = reduce_dimensions(features, month_threshold)

    X_train = merge(X_train, features, it)
    X_test = merge(X_test, features, it)

    return X_train, X_test

In [61]:
def generate_ts(X_train_val, y_train_val, X_test, month_thresholds=(12, 24)):
    """Add time-series features for each history-length threshold.

    The target is joined back onto the training features, and
    ``generate_ts_features`` is called once per entry of ``month_thresholds``
    with that combined frame as its source. Only training rows are used as
    source data; the resulting features are merged into copies of both
    ``X_train_val`` and ``X_test``. Remaining NaNs are replaced by 0.

    Parameters
    ----------
    X_train_val : pandas.DataFrame
    y_train_val : pandas.Series
        Must have as many rows as ``X_train_val`` and be in the same order.
    X_test : pandas.DataFrame
        Must have fewer rows than ``X_train_val`` (asserted).
    month_thresholds : iterable of int
        Thresholds passed to ``generate_ts_features``; defaults to the
        previously hard-coded ``(12, 24)``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Enriched copies of ``X_train_val`` and ``X_test``; the inputs are not
        modified.
    """
    assert X_train_val.shape[0] > X_test.shape[0]
    assert X_train_val.shape[0] == y_train_val.shape[0]

    X_train_val_func = X_train_val.copy()
    X_test_func = X_test.copy()

    # drop=True pairs features and target by position without adding two
    # 'index' columns to the combined frame.
    X = pd.concat(
        [X_train_val.reset_index(drop=True), y_train_val.reset_index(drop=True)],
        axis=1,
    )

    # Iterating the thresholds directly lets tqdm show the total.
    for month in tqdm(month_thresholds):
        X_train_val_func, X_test_func = generate_ts_features(X, X_train_val_func, X_test_func, month, month)

    X_train_val_func = X_train_val_func.fillna(0)
    X_test_func = X_test_func.fillna(0)

    return X_train_val_func, X_test_func

In [62]:
#%%notify
# Long-running cell: the progress output recorded below ends at
# "2it [2:07:01, ...]", i.e. about two hours for the two thresholds.
X_train_test, X_test_test = generate_ts(X_train_val, y_train_val, X_test)







0it [00:00, ?it/s][A[A[A[A[A[A

returned_items









Feature Extraction:   0%|          | 0/10 [00:00<?, ?it/s][A[A[A[A[A[A[A






Feature Extraction:  10%|█         | 1/10 [14:46<2:12:57, 886.37s/it][A[A[A[A[A[A[A






Feature Extraction:  20%|██        | 2/10 [15:16<1:23:55, 629.38s/it][A[A[A[A[A[A[A






Feature Extraction:  30%|███       | 3/10 [28:02<1:18:13, 670.49s/it][A[A[A[A[A[A[A






Feature Extraction:  40%|████      | 4/10 [28:49<48:20, 483.50s/it]  [A[A[A[A[A[A[A






Feature Extraction:  50%|█████     | 5/10 [41:56<47:52, 574.51s/it][A[A[A[A[A[A[A






Feature Extraction:  60%|██████    | 6/10 [42:53<27:56, 419.22s/it][A[A[A[A[A[A[A






Feature Extraction:  70%|███████   | 7/10 [53:56<24:36, 492.24s/it][A[A[A[A[A[A[A






Feature Extraction:  80%|████████  | 8/10 [54:59<12:07, 363.51s/it][A[A[A[A[A[A[A






Feature Extraction:  90%|█████████ | 9/10 [1:07:53<08:06, 486.65s/it][A[A[A[A[A[A[A






Feature Extraction: 100%|██████████| 10/10

Explained variance [9.99999810e-01 1.89930168e-07 3.92175999e-22]
returned_items









Feature Extraction:   0%|          | 0/10 [00:00<?, ?it/s][A[A[A[A[A[A[A






Feature Extraction:  10%|█         | 1/10 [00:03<00:30,  3.39s/it][A[A[A[A[A[A[A






Feature Extraction:  20%|██        | 2/10 [00:04<00:22,  2.75s/it][A[A[A[A[A[A[A






Feature Extraction:  30%|███       | 3/10 [00:05<00:15,  2.27s/it][A[A[A[A[A[A[A






Feature Extraction:  40%|████      | 4/10 [00:07<00:11,  1.97s/it][A[A[A[A[A[A[A






Feature Extraction:  50%|█████     | 5/10 [00:08<00:09,  1.88s/it][A[A[A[A[A[A[A






Feature Extraction:  60%|██████    | 6/10 [00:10<00:07,  2.00s/it][A[A[A[A[A[A[A






Feature Extraction:  70%|███████   | 7/10 [00:12<00:05,  1.94s/it][A[A[A[A[A[A[A






Feature Extraction:  80%|████████  | 8/10 [00:14<00:03,  1.84s/it][A[A[A[A[A[A[A






Feature Extraction:  90%|█████████ | 9/10 [00:15<00:01,  1.62s/it][A[A[A[A[A[A[A






Feature Extraction: 100%|██████████| 10/10 [00:16<00:00,  1.4

Explained variance [9.99745621e-01 2.49204164e-04 2.94584582e-06]








1it [1:50:23, 6623.81s/it][A[A[A[A[A[A

returned_items









Feature Extraction:   0%|          | 0/10 [00:00<?, ?it/s][A[A[A[A[A[A[A






Feature Extraction:  10%|█         | 1/10 [02:35<23:15, 155.09s/it][A[A[A[A[A[A[A






Feature Extraction:  20%|██        | 2/10 [02:39<14:40, 110.02s/it][A[A[A[A[A[A[A






Feature Extraction:  30%|███       | 3/10 [05:26<14:48, 126.88s/it][A[A[A[A[A[A[A






Feature Extraction:  40%|████      | 4/10 [05:37<09:13, 92.17s/it] [A[A[A[A[A[A[A






Feature Extraction:  50%|█████     | 5/10 [08:24<09:32, 114.59s/it][A[A[A[A[A[A[A






Feature Extraction:  60%|██████    | 6/10 [08:27<05:24, 81.03s/it] [A[A[A[A[A[A[A






Feature Extraction:  70%|███████   | 7/10 [11:05<05:13, 104.36s/it][A[A[A[A[A[A[A






Feature Extraction:  80%|████████  | 8/10 [11:13<02:30, 75.38s/it] [A[A[A[A[A[A[A






Feature Extraction:  90%|█████████ | 9/10 [13:57<01:42, 102.03s/it][A[A[A[A[A[A[A






Feature Extraction: 100%|██████████| 10/10 [13:59<00

Explained variance [1.00000000e+00 2.59181987e-21 2.02406738e-21]
returned_items









Feature Extraction:   0%|          | 0/10 [00:00<?, ?it/s][A[A[A[A[A[A[A






Feature Extraction:  10%|█         | 1/10 [00:00<00:08,  1.05it/s][A[A[A[A[A[A[A






Feature Extraction:  30%|███       | 3/10 [00:02<00:06,  1.15it/s][A[A[A[A[A[A[A






Feature Extraction:  40%|████      | 4/10 [00:02<00:04,  1.48it/s][A[A[A[A[A[A[A






Feature Extraction:  50%|█████     | 5/10 [00:03<00:03,  1.56it/s][A[A[A[A[A[A[A






Feature Extraction:  60%|██████    | 6/10 [00:03<00:02,  1.95it/s][A[A[A[A[A[A[A






Feature Extraction:  70%|███████   | 7/10 [00:03<00:01,  1.94it/s][A[A[A[A[A[A[A






Feature Extraction:  80%|████████  | 8/10 [00:04<00:00,  2.15it/s][A[A[A[A[A[A[A






Feature Extraction:  90%|█████████ | 9/10 [00:04<00:00,  2.22it/s][A[A[A[A[A[A[A






Feature Extraction: 100%|██████████| 10/10 [00:04<00:00,  2.71it/s][A[A[A[A[A[A[A

Explained variance [9.99818392e-01 1.79126054e-04 1.50188990e-06]








2it [2:07:01, 4935.92s/it][A[A[A[A[A[A

In [82]:
def change_types(df):
    """Return a down-cast copy of ``df`` without the helper ``id*`` columns.

    * Columns whose name starts with ``"id"`` are dropped.
    * ``shop_id``, ``item_id``, ``key``, ``is_actual``,
      ``months_period_of_product``, the price-cluster columns and the
      integer-valued ``date_block_num*`` columns become ``int32``.
    * Every other column becomes ``float32``.

    Differences from the previous implementation:

    * ``date_block_num_std``, ``date_block_num_mean`` and
      ``date_block_num_median`` stay ``float32``. They used to be cast to
      ``int32`` together with every other ``date_block_num*`` column, which
      cut off their fractional part.
    * Integer columns are cast directly instead of going through ``float32``
      first, which cannot represent every integer above 2**24 exactly.
    * The input frame is no longer modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain all the integer columns listed above and only numeric
        columns without NaN.

    Returns
    -------
    pandas.DataFrame
    """
    id_cols = [col for col in df.columns if col.startswith("id")]
    df = df.drop(id_cols, axis=1)

    int_cols = [
        'shop_id', 'item_id', 'key',
        'is_actual', 'months_period_of_product',
        *generate_name_for_item_price_clust(),
    ]

    # Aggregates that are fractional in general and must not be truncated.
    fractional_suffixes = ("_std", "_mean", "_median")
    int_cols.extend(
        col for col in df.columns
        if col.startswith("date_block_num") and not col.endswith(fractional_suffixes)
    )

    target_dtypes = {col: 'float32' for col in df.columns}
    target_dtypes.update({col: 'int32' for col in int_cols})

    return df.astype(target_dtypes)

X_train_test = change_types(X_train_test)
X_test_test = change_types(X_test_test)

In [78]:
# NOTE(review): change_types() already drops the columns starting with "id",
# so when the cells are run top to bottom this selection is empty and both
# drops leave the frames unchanged.
is_id_column = [name.startswith("id") for name in X_train_test.columns]
drop_cols = X_train_test.columns[is_id_column].values
X_train_test = X_train_test.drop(drop_cols, axis=1)
X_test_test = X_test_test.drop(drop_cols, axis=1)

In [84]:
# Re-bind the engineered frames to the names that are stored and pickled below.
X_train_val, X_test = X_train_test, X_test_test
# The targets are not changed by feature engineering; these re-bindings are no-ops.
y_train_val, y_test = y_train_val, y_test

In [90]:
%store X_train_val
%store X_test
%store y_train_val
%store y_test

Stored 'X_train_val' (DataFrame)
Stored 'X_test' (DataFrame)
Stored 'y_train_val' (Series)
Stored 'y_test' (Series)


In [86]:
# Write the four splits as pickle files; the paths are relative, so the files
# land in the current working directory.
for split, target_file in (
    (X_train_val, 'X_train_val.pickle'),
    (y_train_val, 'y_train_val.pickle'),
    (X_test, 'X_test.pickle'),
    (y_test, 'y_test.pickle'),
):
    split.to_pickle(target_file)

In [89]:
X_train_val.columns

Index(['shop_id', 'item_id', 'date_block_num', 'month', 'year',
       'item_category_id', 'item_price', 'key', 'is_actual',
       'median_item_price', 'item_price 0', 'item_price 1', 'item_price 2',
       'item_price 3', 'item_price 4', 'item_price 5', 'item_price_max',
       'item_price_min', 'item_price_std', 'item_price_mean',
       'item_price_median', 'months_period_of_product', 'date_block_num_max',
       'date_block_num_min', 'date_block_num_std', 'date_block_num_mean',
       'date_block_num_median', 'pca_0_12', 'pca_1_12', 'pca_2_12',
       'pca_0_12_12', 'pca_1_12_12', 'pca_2_12_12', 'pca_0_24', 'pca_1_24',
       'pca_2_24', 'pca_0_24_24', 'pca_1_24_24', 'pca_2_24_24'],
      dtype='object')