# <center> Feature engineering

## Summary
1. Shop features generation
2. Shop features generation

## Initial Setup

In [1]:
does_it_for_submission = True

In [2]:
%load_ext jupyternotify

%store -r item_cat
%store -r item
%store -r shops
%store -r sales_train
%store -r train
%store -r train_test

The jupyternotify extension is already loaded. To reload it, use:
  %reload_ext jupyternotify


In [3]:
__ipy

Helper ipython script loaded


In [4]:
__da

  from pandas import Panel


Basic Data Analysis tools was loaded


In [5]:
import googlemaps
import plotly.express as px
from functools import partial

# SKLEARN
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit, KFold
from scipy.stats import randint as sp_randint
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# TSFRESH
from tsfresh.feature_extraction import ComprehensiveFCParameters, extract_features, MinimalFCParameters, EfficientFCParameters
from tsfresh.utilities.dataframe_functions import impute
from tsfresh import extract_relevant_features

# Sklearn-pandas
from sklearn_pandas import CategoricalImputer, FunctionTransformer, DataFrameMapper

# SCIPY
from scipy.sparse import csr_matrix

# My files
from basic_text_preprocessing import BasicPreprocessText

gmaps = googlemaps.Client(key='AIzaSyCW4PTjjIz6yGUgAmqrG2cLy9euzbim23M')

from math import cos, asin, sqrt
import time

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/maksymsuprunenko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Shop features generation

Features:
1. **lat** - latitude
2. **lng** - longitude
3. **distance_to_moskov** - distance to Moscow city (Label Encoded)
4. **city** - city (Label Encoded)

In [6]:
def distance(lat1, lon1, lat2, lon2):
    p = 0.017453292519943295     #Pi/180
    a = 0.5 - np.cos((lat2 - lat1) * p)/2 + np.cos(lat1 * p) * np.cos(lat2 * p) * (1 - np.cos((lon2 - lon1) * p)) / 2
    return 12742 * np.arcsin(sqrt(a))

def not_city_str(x, t):
    return 1 if t in "".join(x.split()[1:]) else 0

def get_location(x):
    loc = gmaps.geocode(x)
    return loc[0]['geometry']['location'] if len(loc) != 0 else {'lat': 0, 'lng': 0}

moskov_lat, moskov_lng = get_location('Moscow')

# new_shops = shops.copy()
# cleaned_shop_name = BasicPreprocessText().vectorize_process_text(shops['shop_name'])
# new_shops['shop_name'] = cleaned_shop_name
# new_shops['city'] = new_shops['shop_name'].apply(lambda x: x.split()[0])
# city = new_shops['city'] .value_counts()\
# .to_frame().reset_index().rename(columns={'index': 'shop_name', 'city': 'count_shops'})


# new_shops['is_mal'] = new_shops['shop_name'].apply(partial(not_city_str, t='тц')).astype(np.int8)
# new_shops['is_en_mal'] = new_shops['shop_name'].apply(partial(not_city_str, t='трк')).astype(np.int8)

# locations = new_shops['shop_name'].progress_apply(get_location) 

# new_shops_with_coords = pd.concat([new_shops, pd.DataFrame.from_records(locations.values)], axis=1)

# new_shops_with_coords.to_pickle("new_shops_with_coords.pickle")

new_shops_with_coords = pd.read_pickle("new_shops_with_coords.pickle")
moskov_lat, moskov_lng = list(get_location('Moscow').values())

new_shops_with_coords['lat'] = new_shops_with_coords['lat'].astype(np.float16, copy=False)
new_shops_with_coords['lng'] = new_shops_with_coords['lng'].astype(np.float16, copy=False)

new_shops_with_coords['distance_to_moskov'] = \
    new_shops_with_coords[['lat', 'lng']].apply(lambda x: distance(x[0], x[1], moskov_lat, moskov_lng), axis=1)\
    .astype(np.float16)

le_shop_dtm = LabelEncoder().fit(new_shops_with_coords['distance_to_moskov'].sort_values().values)

new_shops_with_coords['distance_to_moskov'] = \
    le_shop_dtm.transform(new_shops_with_coords['distance_to_moskov']).astype(np.float16)

new_shops_with_coords['city'] = LabelEncoder().fit_transform(new_shops_with_coords['city']).astype(np.int8)

new_shops_with_coords = new_shops_with_coords.drop('shop_name', axis=1)

### Item feature generation

Remove item name from dataset.

In [7]:
%%notify
item_cleaned = item.copy()
item_cleaned = item_cleaned.drop('item_name', axis=1)

<IPython.core.display.Javascript object>

## Item category generation

Features:
1. item_category_names_category_1_enc
2. item_category_names_category_2_enc
3. is_digital

In [8]:
item_cat_cleaned = item_cat.copy()

item_category_names = pd.Series(
    BasicPreprocessText().vectorize_process_text(item_cat_cleaned['item_category_name'], ['-'])
)

idx = [8, 9, 32, 79, 80, 81, 82, 83]
fixed_first_level = item_category_names[np.isin(item_cat_cleaned.index, idx)].apply(lambda x: str(np.abs(np.random.normal())) + "-" + x)
item_category_names[idx] = fixed_first_level

item_cat_cleaned['item_category_names_category_1'] = item_category_names.apply(lambda x: x.split("-")[0])
item_cat_cleaned['item_category_names_category_2'] = item_category_names.apply(lambda x: " ".join(x.split("-")[1:]))

item_cat_cleaned['item_category_names_category_1_enc'] = \
    LabelEncoder().fit_transform(item_cat_cleaned['item_category_names_category_1']).astype(np.int8)

item_cat_cleaned['item_category_names_category_2_enc'] = \
    LabelEncoder().fit_transform(item_cat_cleaned['item_category_names_category_2']).astype(np.int8)

item_cat_cleaned['is_digital'] = item_cat_cleaned.apply(lambda x: 'цыфра' in x).astype(np.int8)

item_cat_cleaned = item_cat_cleaned.drop(
    ['item_category_name', 'item_category_names_category_1', 'item_category_names_category_2'], 
    axis=1)

## Join on everthing

Join tables:
1. item
2. item_categories
3. sales_train
4. shops

In [11]:
%%notify

predict_month = 34 if does_it_for_submission else 33
train_df = train_test.copy() if does_it_for_submission else train.copy()

train_df = train_df.merge(item_cleaned[['item_id', 'item_category_id']], how='left', on='item_id', suffixes=("", "_item"), right_index=False)
train_df = train_df.merge(new_shops_with_coords, how='left', on='shop_id', suffixes=("", "_shops"), right_index=False)
train_df = train_df.merge(item_cat_cleaned, how='left', on='item_category_id', suffixes=("", "_item_cat"), right_index=False)

#train_df_file_name = "submission" if does_it_for_submission else "validation"
#train_df.to_pickle(f"train_df_{train_df_file_name}_.pickle")

<IPython.core.display.Javascript object>

## General features
1. key - compound key of shop_id and item_id
2. year 
3. month

In [12]:
train_df['key'] = train_df.progress_apply(lambda x: str(int(x['shop_id'])) + "_" + str(int(x['item_id'])), axis=1)
train_df['key'] = LabelEncoder().fit_transform(train_df['key']).astype(np.int32)

train_df['year'] = (train_df['date_block_num'] // 12).astype(np.int8)
train_df['month'] = (train_df['date_block_num'] % 12).astype(np.int8)


train_df['revenue'] = train_df['item_price'] * train_df['item_cnt_month']


  0%|          | 0/11128050 [00:00<?, ?it/s][A
  0%|          | 1/11128050 [00:04<12547:01:45,  4.06s/it][A
  0%|          | 460/11128050 [00:04<8782:45:37,  2.84s/it][A
  0%|          | 3941/11128050 [00:04<6146:02:08,  1.99s/it][A
  0%|          | 8323/11128050 [00:04<4300:33:04,  1.39s/it][A
  0%|          | 12430/11128050 [00:04<3009:17:47,  1.03it/s][A
  0%|          | 16719/11128050 [00:04<2105:42:59,  1.47it/s][A
  0%|          | 21046/11128050 [00:04<1473:26:56,  2.09it/s][A
  0%|          | 25184/11128050 [00:04<1031:03:08,  2.99it/s][A
  0%|          | 29762/11128050 [00:04<721:27:33,  4.27it/s] [A
  0%|          | 34547/11128050 [00:04<504:49:22,  6.10it/s][A
  0%|          | 38762/11128050 [00:05<353:15:50,  8.72it/s][A
  0%|          | 43180/11128050 [00:05<247:12:25, 12.46it/s][A
  0%|          | 47493/11128050 [00:05<172:59:56, 17.79it/s][A
  0%|          | 52179/11128050 [00:05<121:04:04, 25.41it/s][A
  1%|          | 56628/11128050 [00:05<84:44:03, 36.2

  5%|▍         | 547605/11128050 [00:18<03:59, 44140.79it/s][A
  5%|▍         | 552506/11128050 [00:18<03:52, 45494.71it/s][A
  5%|▌         | 557386/11128050 [00:18<03:47, 46437.91it/s][A
  5%|▌         | 562054/11128050 [00:18<03:54, 45100.29it/s][A
  5%|▌         | 566590/11128050 [00:18<04:09, 42263.35it/s][A
  5%|▌         | 570901/11128050 [00:18<04:08, 42513.10it/s][A
  5%|▌         | 575406/11128050 [00:18<04:04, 43243.01it/s][A
  5%|▌         | 580256/11128050 [00:18<03:55, 44695.82it/s][A
  5%|▌         | 584896/11128050 [00:18<03:53, 45191.43it/s][A
  5%|▌         | 589619/11128050 [00:19<03:50, 45780.93it/s][A
  5%|▌         | 594451/11128050 [00:19<03:46, 46512.56it/s][A
  5%|▌         | 599119/11128050 [00:19<03:51, 45421.75it/s][A
  5%|▌         | 603794/11128050 [00:19<03:49, 45812.11it/s][A
  5%|▌         | 608388/11128050 [00:19<04:02, 43436.09it/s][A
  6%|▌         | 613225/11128050 [00:19<03:54, 44805.23it/s][A
  6%|▌         | 617914/11128050 [00:19<

 10%|▉         | 1112150/11128050 [00:31<04:30, 36969.29it/s][A
 10%|█         | 1115892/11128050 [00:31<04:53, 34131.32it/s][A
 10%|█         | 1119588/11128050 [00:31<04:46, 34931.21it/s][A
 10%|█         | 1123138/11128050 [00:31<05:33, 29990.47it/s][A
 10%|█         | 1126300/11128050 [00:31<05:37, 29634.41it/s][A
 10%|█         | 1131060/11128050 [00:32<04:59, 33417.69it/s][A
 10%|█         | 1135064/11128050 [00:32<04:44, 35161.89it/s][A
 10%|█         | 1138829/11128050 [00:32<04:38, 35869.53it/s][A
 10%|█         | 1142555/11128050 [00:32<04:39, 35686.11it/s][A
 10%|█         | 1146221/11128050 [00:32<04:40, 35526.66it/s][A
 10%|█         | 1149842/11128050 [00:32<05:10, 32149.58it/s][A
 10%|█         | 1154665/11128050 [00:32<04:39, 35722.20it/s][A
 10%|█         | 1159664/11128050 [00:32<04:15, 39066.21it/s][A
 10%|█         | 1164110/11128050 [00:32<04:05, 40538.18it/s][A
 11%|█         | 1168682/11128050 [00:32<03:57, 41963.22it/s][A
 11%|█         | 1173190/

 15%|█▍        | 1618069/11128050 [00:45<03:39, 43411.34it/s][A
 15%|█▍        | 1622933/11128050 [00:45<03:31, 44857.36it/s][A
 15%|█▍        | 1627604/11128050 [00:45<03:29, 45397.14it/s][A
 15%|█▍        | 1632170/11128050 [00:45<03:34, 44218.56it/s][A
 15%|█▍        | 1636617/11128050 [00:46<03:40, 43061.02it/s][A
 15%|█▍        | 1640948/11128050 [00:46<03:41, 42922.28it/s][A
 15%|█▍        | 1645355/11128050 [00:46<03:39, 43259.73it/s][A
 15%|█▍        | 1649833/11128050 [00:46<03:36, 43704.11it/s][A
 15%|█▍        | 1654220/11128050 [00:46<03:36, 43751.92it/s][A
 15%|█▍        | 1658603/11128050 [00:46<03:47, 41670.87it/s][A
 15%|█▍        | 1663075/11128050 [00:46<03:42, 42540.15it/s][A
 15%|█▍        | 1667946/11128050 [00:46<03:33, 44219.92it/s][A
 15%|█▌        | 1672709/11128050 [00:46<03:29, 45184.64it/s][A
 15%|█▌        | 1677470/11128050 [00:46<03:25, 45883.38it/s][A
 15%|█▌        | 1682183/11128050 [00:47<03:24, 46250.24it/s][A
 15%|█▌        | 1687135/

 19%|█▉        | 2147700/11128050 [00:59<03:36, 41456.69it/s][A
 19%|█▉        | 2151934/11128050 [00:59<03:35, 41717.53it/s][A
 19%|█▉        | 2156405/11128050 [00:59<03:30, 42571.87it/s][A
 19%|█▉        | 2161122/11128050 [00:59<03:24, 43850.19it/s][A
 19%|█▉        | 2165717/11128050 [00:59<03:21, 44457.81it/s][A
 20%|█▉        | 2170180/11128050 [00:59<03:30, 42483.75it/s][A
 20%|█▉        | 2174459/11128050 [00:59<03:36, 41431.93it/s][A
 20%|█▉        | 2178630/11128050 [00:59<03:35, 41498.49it/s][A
 20%|█▉        | 2182799/11128050 [01:00<03:36, 41398.89it/s][A
 20%|█▉        | 2187505/11128050 [01:00<03:28, 42945.67it/s][A
 20%|█▉        | 2192179/11128050 [01:00<03:23, 44015.58it/s][A
 20%|█▉        | 2196957/11128050 [01:00<03:18, 45080.85it/s][A
 20%|█▉        | 2201488/11128050 [01:00<03:39, 40717.78it/s][A
 20%|█▉        | 2206196/11128050 [01:00<03:30, 42436.64it/s][A
 20%|█▉        | 2210771/11128050 [01:00<03:25, 43378.64it/s][A
 20%|█▉        | 2215175/

 25%|██▍       | 2728927/11128050 [01:12<03:34, 39130.84it/s][A
 25%|██▍       | 2732901/11128050 [01:12<03:36, 38767.56it/s][A
 25%|██▍       | 2737386/11128050 [01:12<03:27, 40411.50it/s][A
 25%|██▍       | 2741883/11128050 [01:12<03:21, 41678.15it/s][A
 25%|██▍       | 2746095/11128050 [01:12<03:22, 41336.41it/s][A
 25%|██▍       | 2750260/11128050 [01:12<03:36, 38695.36it/s][A
 25%|██▍       | 2754186/11128050 [01:13<04:27, 31344.38it/s][A
 25%|██▍       | 2757825/11128050 [01:13<04:15, 32704.68it/s][A
 25%|██▍       | 2761297/11128050 [01:13<04:57, 28080.36it/s][A
 25%|██▍       | 2764917/11128050 [01:13<04:37, 30083.53it/s][A
 25%|██▍       | 2768895/11128050 [01:13<04:17, 32456.81it/s][A
 25%|██▍       | 2772784/11128050 [01:13<04:04, 34151.26it/s][A
 25%|██▍       | 2776368/11128050 [01:13<04:06, 33874.58it/s][A
 25%|██▍       | 2780544/11128050 [01:13<03:52, 35907.91it/s][A
 25%|██▌       | 2784647/11128050 [01:13<03:43, 37303.71it/s][A
 25%|██▌       | 2788473/

 29%|██▉       | 3240596/11128050 [01:26<03:03, 43001.97it/s][A
 29%|██▉       | 3245209/11128050 [01:26<02:59, 43893.67it/s][A
 29%|██▉       | 3249624/11128050 [01:26<03:10, 41328.63it/s][A
 29%|██▉       | 3253806/11128050 [01:27<03:25, 38272.89it/s][A
 29%|██▉       | 3257713/11128050 [01:27<03:42, 35407.50it/s][A
 29%|██▉       | 3261352/11128050 [01:27<03:41, 35440.60it/s][A
 29%|██▉       | 3265069/11128050 [01:27<03:38, 35938.34it/s][A
 29%|██▉       | 3268713/11128050 [01:27<03:57, 33140.12it/s][A
 29%|██▉       | 3272631/11128050 [01:27<03:46, 34745.41it/s][A
 29%|██▉       | 3276389/11128050 [01:27<03:40, 35548.03it/s][A
 29%|██▉       | 3280774/11128050 [01:27<03:28, 37687.20it/s][A
 30%|██▉       | 3284886/11128050 [01:27<03:22, 38653.24it/s][A
 30%|██▉       | 3289537/11128050 [01:28<03:12, 40716.28it/s][A
 30%|██▉       | 3293679/11128050 [01:28<03:24, 38349.05it/s][A
 30%|██▉       | 3297592/11128050 [01:28<03:23, 38444.17it/s][A
 30%|██▉       | 3302034/

 34%|███▍      | 3784091/11128050 [01:40<02:54, 42052.90it/s][A
 34%|███▍      | 3788355/11128050 [01:40<02:54, 42078.31it/s][A
 34%|███▍      | 3792604/11128050 [01:40<02:59, 40791.11it/s][A
 34%|███▍      | 3796858/11128050 [01:40<02:57, 41297.63it/s][A
 34%|███▍      | 3801016/11128050 [01:40<03:03, 39849.46it/s][A
 34%|███▍      | 3805512/11128050 [01:40<02:57, 41255.20it/s][A
 34%|███▍      | 3810430/11128050 [01:40<02:48, 43349.93it/s][A
 34%|███▍      | 3815130/11128050 [01:40<02:44, 44383.63it/s][A
 34%|███▍      | 3819611/11128050 [01:40<02:47, 43702.08it/s][A
 34%|███▍      | 3824014/11128050 [01:41<02:51, 42598.21it/s][A
 34%|███▍      | 3828303/11128050 [01:41<02:55, 41523.67it/s][A
 34%|███▍      | 3832482/11128050 [01:41<02:58, 40812.76it/s][A
 34%|███▍      | 3836967/11128050 [01:41<02:53, 41942.56it/s][A
 35%|███▍      | 3841729/11128050 [01:41<02:47, 43498.02it/s][A
 35%|███▍      | 3846312/11128050 [01:41<02:44, 44171.89it/s][A
 35%|███▍      | 3850754/

 39%|███▉      | 4330247/11128050 [01:53<02:55, 38774.99it/s][A
 39%|███▉      | 4334829/11128050 [01:53<02:47, 40649.49it/s][A
 39%|███▉      | 4339544/11128050 [01:53<02:40, 42402.87it/s][A
 39%|███▉      | 4343975/11128050 [01:54<02:37, 42951.66it/s][A
 39%|███▉      | 4348340/11128050 [01:54<02:39, 42540.27it/s][A
 39%|███▉      | 4353397/11128050 [01:54<02:31, 44666.85it/s][A
 39%|███▉      | 4357927/11128050 [01:54<02:35, 43592.64it/s][A
 39%|███▉      | 4362336/11128050 [01:54<03:28, 32454.26it/s][A
 39%|███▉      | 4366033/11128050 [01:54<03:42, 30392.28it/s][A
 39%|███▉      | 4369753/11128050 [01:54<03:30, 32135.03it/s][A
 39%|███▉      | 4373894/11128050 [01:54<03:16, 34449.58it/s][A
 39%|███▉      | 4377573/11128050 [01:54<03:14, 34679.37it/s][A
 39%|███▉      | 4381205/11128050 [01:55<03:15, 34598.21it/s][A
 39%|███▉      | 4384780/11128050 [01:55<03:16, 34297.59it/s][A
 39%|███▉      | 4389445/11128050 [01:55<03:00, 37256.91it/s][A
 39%|███▉      | 4393302/

 44%|████▍     | 4883995/11128050 [02:06<02:23, 43490.44it/s][A
 44%|████▍     | 4888349/11128050 [02:06<02:23, 43414.21it/s][A
 44%|████▍     | 4892984/11128050 [02:06<02:20, 44253.73it/s][A
 44%|████▍     | 4897416/11128050 [02:07<02:21, 44040.40it/s][A
 44%|████▍     | 4901959/11128050 [02:07<02:20, 44433.51it/s][A
 44%|████▍     | 4906602/11128050 [02:07<02:18, 45012.24it/s][A
 44%|████▍     | 4911108/11128050 [02:07<02:19, 44697.05it/s][A
 44%|████▍     | 4915582/11128050 [02:07<02:19, 44646.46it/s][A
 44%|████▍     | 4920098/11128050 [02:07<02:18, 44797.56it/s][A
 44%|████▍     | 4924580/11128050 [02:07<02:19, 44610.72it/s][A
 44%|████▍     | 4929043/11128050 [02:07<02:19, 44594.83it/s][A
 44%|████▍     | 4933504/11128050 [02:07<02:20, 44212.86it/s][A
 44%|████▍     | 4937977/11128050 [02:07<02:19, 44364.61it/s][A
 44%|████▍     | 4942600/11128050 [02:08<02:17, 44907.06it/s][A
 44%|████▍     | 4947141/11128050 [02:08<02:17, 45056.39it/s][A
 44%|████▍     | 4951649/

 49%|████▉     | 5456991/11128050 [02:19<02:07, 44443.77it/s][A
 49%|████▉     | 5461439/11128050 [02:19<02:07, 44299.26it/s][A
 49%|████▉     | 5466066/11128050 [02:19<02:06, 44871.67it/s][A
 49%|████▉     | 5470788/11128050 [02:19<02:04, 45543.24it/s][A
 49%|████▉     | 5475399/11128050 [02:19<02:03, 45611.81it/s][A
 49%|████▉     | 5480098/11128050 [02:20<02:02, 46015.20it/s][A
 49%|████▉     | 5484703/11128050 [02:20<02:03, 45541.05it/s][A
 49%|████▉     | 5489365/11128050 [02:20<02:02, 45857.69it/s][A
 49%|████▉     | 5493954/11128050 [02:20<02:03, 45699.95it/s][A
 49%|████▉     | 5498527/11128050 [02:20<02:04, 45146.75it/s][A
 49%|████▉     | 5503113/11128050 [02:20<02:04, 45355.03it/s][A
 49%|████▉     | 5507747/11128050 [02:20<02:03, 45645.66it/s][A
 50%|████▉     | 5512346/11128050 [02:20<02:02, 45745.05it/s][A
 50%|████▉     | 5516923/11128050 [02:20<02:05, 44647.49it/s][A
 50%|████▉     | 5521614/11128050 [02:20<02:03, 45302.47it/s][A
 50%|████▉     | 5526152/

 54%|█████▍    | 6029665/11128050 [02:32<01:54, 44695.10it/s][A
 54%|█████▍    | 6034139/11128050 [02:32<01:54, 44469.81it/s][A
 54%|█████▍    | 6038590/11128050 [02:32<01:54, 44457.30it/s][A
 54%|█████▍    | 6043247/11128050 [02:32<01:52, 45069.13it/s][A
 54%|█████▍    | 6047758/11128050 [02:32<01:54, 44428.93it/s][A
 54%|█████▍    | 6052206/11128050 [02:32<01:54, 44227.10it/s][A
 54%|█████▍    | 6056633/11128050 [02:33<01:56, 43657.66it/s][A
 54%|█████▍    | 6061003/11128050 [02:33<01:58, 42919.56it/s][A
 55%|█████▍    | 6065617/11128050 [02:33<01:55, 43836.86it/s][A
 55%|█████▍    | 6070180/11128050 [02:33<01:54, 44359.69it/s][A
 55%|█████▍    | 6074624/11128050 [02:33<01:55, 43705.58it/s][A
 55%|█████▍    | 6079315/11128050 [02:33<01:53, 44619.72it/s][A
 55%|█████▍    | 6083787/11128050 [02:33<01:54, 44074.56it/s][A
 55%|█████▍    | 6088203/11128050 [02:33<01:55, 43496.81it/s][A
 55%|█████▍    | 6092839/11128050 [02:33<01:53, 44316.27it/s][A
 55%|█████▍    | 6097280/

 59%|█████▉    | 6596247/11128050 [02:45<01:49, 41282.16it/s][A
 59%|█████▉    | 6600494/11128050 [02:45<01:48, 41629.41it/s][A
 59%|█████▉    | 6604757/11128050 [02:45<01:47, 41923.60it/s][A
 59%|█████▉    | 6609010/11128050 [02:45<01:47, 42103.68it/s][A
 59%|█████▉    | 6613574/11128050 [02:45<01:44, 43103.58it/s][A
 59%|█████▉    | 6618180/11128050 [02:45<01:42, 43947.49it/s][A
 60%|█████▉    | 6622587/11128050 [02:46<01:50, 40881.43it/s][A
 60%|█████▉    | 6626859/11128050 [02:46<01:48, 41413.88it/s][A
 60%|█████▉    | 6631357/11128050 [02:46<01:45, 42421.70it/s][A
 60%|█████▉    | 6636047/11128050 [02:46<01:42, 43670.66it/s][A
 60%|█████▉    | 6640447/11128050 [02:46<01:43, 43391.47it/s][A
 60%|█████▉    | 6644809/11128050 [02:46<01:43, 43205.13it/s][A
 60%|█████▉    | 6649268/11128050 [02:46<01:42, 43610.60it/s][A
 60%|█████▉    | 6653766/11128050 [02:46<01:41, 44011.55it/s][A
 60%|█████▉    | 6658185/11128050 [02:46<01:41, 44064.66it/s][A
 60%|█████▉    | 6662599/

 64%|██████▍   | 7153317/11128050 [02:58<01:29, 44587.51it/s][A
 64%|██████▍   | 7157806/11128050 [02:58<01:28, 44676.47it/s][A
 64%|██████▍   | 7162340/11128050 [02:58<01:28, 44873.24it/s][A
 64%|██████▍   | 7166832/11128050 [02:58<01:29, 44480.56it/s][A
 64%|██████▍   | 7171284/11128050 [02:58<01:30, 43863.62it/s][A
 64%|██████▍   | 7175978/11128050 [02:58<01:28, 44741.68it/s][A
 65%|██████▍   | 7180460/11128050 [02:58<01:29, 44007.47it/s][A
 65%|██████▍   | 7184990/11128050 [02:59<01:28, 44385.84it/s][A
 65%|██████▍   | 7189435/11128050 [02:59<01:29, 44190.65it/s][A
 65%|██████▍   | 7193859/11128050 [02:59<01:30, 43642.00it/s][A
 65%|██████▍   | 7198472/11128050 [02:59<01:28, 44359.00it/s][A
 65%|██████▍   | 7202915/11128050 [02:59<01:29, 43794.96it/s][A
 65%|██████▍   | 7207553/11128050 [02:59<01:28, 44537.36it/s][A
 65%|██████▍   | 7212014/11128050 [02:59<01:29, 43856.47it/s][A
 65%|██████▍   | 7216519/11128050 [02:59<01:28, 44205.49it/s][A
 65%|██████▍   | 7220997/

 69%|██████▉   | 7719326/11128050 [03:11<01:16, 44732.16it/s][A
 69%|██████▉   | 7723803/11128050 [03:11<01:16, 44529.90it/s][A
 69%|██████▉   | 7728389/11128050 [03:11<01:15, 44917.84it/s][A
 69%|██████▉   | 7732884/11128050 [03:11<01:15, 44819.84it/s][A
 70%|██████▉   | 7737392/11128050 [03:11<01:15, 44897.46it/s][A
 70%|██████▉   | 7741919/11128050 [03:11<01:15, 45006.83it/s][A
 70%|██████▉   | 7746630/11128050 [03:11<01:14, 45615.94it/s][A
 70%|██████▉   | 7751195/11128050 [03:11<01:15, 44946.14it/s][A
 70%|██████▉   | 7755694/11128050 [03:12<01:15, 44716.81it/s][A
 70%|██████▉   | 7760169/11128050 [03:12<01:17, 43709.85it/s][A
 70%|██████▉   | 7764716/11128050 [03:12<01:16, 44222.11it/s][A
 70%|██████▉   | 7769182/11128050 [03:12<01:15, 44350.35it/s][A
 70%|██████▉   | 7773743/11128050 [03:12<01:15, 44719.42it/s][A
 70%|██████▉   | 7778219/11128050 [03:12<01:15, 44198.62it/s][A
 70%|██████▉   | 7782712/11128050 [03:12<01:15, 44414.21it/s][A
 70%|██████▉   | 7787319/

 74%|███████▍  | 8287019/11128050 [03:24<01:03, 44496.82it/s][A
 75%|███████▍  | 8291472/11128050 [03:24<01:04, 44220.12it/s][A
 75%|███████▍  | 8296042/11128050 [03:24<01:03, 44649.48it/s][A
 75%|███████▍  | 8300589/11128050 [03:24<01:02, 44890.49it/s][A
 75%|███████▍  | 8305128/11128050 [03:24<01:02, 45036.64it/s][A
 75%|███████▍  | 8309776/11128050 [03:24<01:01, 45459.08it/s][A
 75%|███████▍  | 8314324/11128050 [03:24<01:02, 44693.57it/s][A
 75%|███████▍  | 8318852/11128050 [03:24<01:02, 44865.94it/s][A
 75%|███████▍  | 8323342/11128050 [03:24<01:02, 44637.35it/s][A
 75%|███████▍  | 8327809/11128050 [03:25<01:03, 44265.92it/s][A
 75%|███████▍  | 8332293/11128050 [03:25<01:02, 44434.18it/s][A
 75%|███████▍  | 8336906/11128050 [03:25<01:02, 44929.05it/s][A
 75%|███████▍  | 8341402/11128050 [03:25<01:02, 44516.94it/s][A
 75%|███████▍  | 8345857/11128050 [03:25<01:02, 44503.35it/s][A
 75%|███████▌  | 8350310/11128050 [03:25<01:02, 44243.21it/s][A
 75%|███████▌  | 8354736/

 80%|███████▉  | 8855999/11128050 [03:36<00:52, 43679.91it/s][A
 80%|███████▉  | 8860377/11128050 [03:37<00:51, 43709.75it/s][A
 80%|███████▉  | 8864752/11128050 [03:37<00:51, 43637.05it/s][A
 80%|███████▉  | 8869118/11128050 [03:37<00:51, 43521.62it/s][A
 80%|███████▉  | 8873771/11128050 [03:37<00:50, 44381.61it/s][A
 80%|███████▉  | 8878215/11128050 [03:37<00:51, 43701.75it/s][A
 80%|███████▉  | 8882862/11128050 [03:37<00:50, 44494.93it/s][A
 80%|███████▉  | 8887320/11128050 [03:37<00:50, 44071.74it/s][A
 80%|███████▉  | 8891968/11128050 [03:37<00:49, 44766.23it/s][A
 80%|███████▉  | 8896452/11128050 [03:37<00:51, 43745.28it/s][A
 80%|███████▉  | 8900858/11128050 [03:37<00:50, 43838.55it/s][A
 80%|████████  | 8905580/11128050 [03:38<00:49, 44798.84it/s][A
 80%|████████  | 8910071/11128050 [03:38<00:49, 44530.88it/s][A
 80%|████████  | 8914752/11128050 [03:38<00:48, 45190.29it/s][A
 80%|████████  | 8919279/11128050 [03:38<00:49, 44903.80it/s][A
 80%|████████  | 8923776/

 85%|████████▍ | 9423973/11128050 [03:49<00:38, 44252.50it/s][A
 85%|████████▍ | 9428403/11128050 [03:49<00:38, 43663.61it/s][A
 85%|████████▍ | 9432775/11128050 [03:50<00:39, 43143.97it/s][A
 85%|████████▍ | 9437141/11128050 [03:50<00:39, 43296.61it/s][A
 85%|████████▍ | 9441546/11128050 [03:50<00:38, 43519.49it/s][A
 85%|████████▍ | 9446078/11128050 [03:50<00:38, 44043.05it/s][A
 85%|████████▍ | 9450498/11128050 [03:50<00:38, 44087.64it/s][A
 85%|████████▍ | 9454910/11128050 [03:50<00:38, 43879.59it/s][A
 85%|████████▌ | 9459300/11128050 [03:50<00:38, 43162.74it/s][A
 85%|████████▌ | 9463621/11128050 [03:50<00:38, 43103.51it/s][A
 85%|████████▌ | 9467969/11128050 [03:50<00:38, 43215.63it/s][A
 85%|████████▌ | 9472293/11128050 [03:50<00:41, 39576.43it/s][A
 85%|████████▌ | 9476380/11128050 [03:51<00:41, 39954.41it/s][A
 85%|████████▌ | 9480420/11128050 [03:51<00:41, 39293.42it/s][A
 85%|████████▌ | 9484383/11128050 [03:51<00:42, 38878.69it/s][A
 85%|████████▌ | 9488295/

 90%|████████▉ | 9984107/11128050 [04:02<00:25, 44381.24it/s][A
 90%|████████▉ | 9988568/11128050 [04:02<00:25, 44447.22it/s][A
 90%|████████▉ | 9993018/11128050 [04:02<00:25, 44390.53it/s][A
 90%|████████▉ | 9997489/11128050 [04:03<00:25, 44483.53it/s][A
 90%|████████▉ | 10002160/11128050 [04:03<00:24, 45127.94it/s][A
 90%|████████▉ | 10006677/11128050 [04:03<00:25, 44675.23it/s][A
 90%|████████▉ | 10011149/11128050 [04:03<00:25, 44503.50it/s][A
 90%|█████████ | 10015850/11128050 [04:03<00:24, 45224.70it/s][A
 90%|█████████ | 10020378/11128050 [04:03<00:24, 44441.89it/s][A
 90%|█████████ | 10024983/11128050 [04:03<00:24, 44911.63it/s][A
 90%|█████████ | 10029549/11128050 [04:03<00:24, 45132.45it/s][A
 90%|█████████ | 10034067/11128050 [04:03<00:24, 45098.85it/s][A
 90%|█████████ | 10038580/11128050 [04:03<00:24, 44413.92it/s][A
 90%|█████████ | 10043026/11128050 [04:04<00:25, 42472.01it/s][A
 90%|█████████ | 10047496/11128050 [04:04<00:25, 43115.49it/s][A
 90%|█████████

 95%|█████████▍| 10568445/11128050 [04:15<00:11, 46771.33it/s][A
 95%|█████████▌| 10573133/11128050 [04:15<00:11, 46667.06it/s][A
 95%|█████████▌| 10577807/11128050 [04:15<00:11, 45998.02it/s][A
 95%|█████████▌| 10582694/11128050 [04:15<00:11, 46820.21it/s][A
 95%|█████████▌| 10587385/11128050 [04:15<00:11, 46081.53it/s][A
 95%|█████████▌| 10592057/11128050 [04:15<00:11, 46268.59it/s][A
 95%|█████████▌| 10596691/11128050 [04:16<00:12, 44012.21it/s][A
 95%|█████████▌| 10601120/11128050 [04:16<00:12, 42058.00it/s][A
 95%|█████████▌| 10605914/11128050 [04:16<00:11, 43663.44it/s][A
 95%|█████████▌| 10610724/11128050 [04:16<00:11, 44905.29it/s][A
 95%|█████████▌| 10615738/11128050 [04:16<00:11, 46355.28it/s][A
 95%|█████████▌| 10620456/11128050 [04:16<00:10, 46597.62it/s][A
 95%|█████████▌| 10625144/11128050 [04:16<00:10, 46140.71it/s][A
 96%|█████████▌| 10630156/11128050 [04:16<00:10, 47259.72it/s][A
 96%|█████████▌| 10634903/11128050 [04:16<00:10, 45830.13it/s][A
 96%|█████

### Group sale stats in recent
create stats (mean/var) of sales of certain groups during the past 12 months

In [13]:
def add_group_stats(matrix_, groupby_feats, target, enc_feat, last_periods):
    if not 'date_block_num' in groupby_feats:
        print ('date_block_num must in groupby_feats')
        return matrix_
    
    group = matrix_.groupby(groupby_feats)[target].sum().reset_index()
    max_lags = np.max(last_periods)
    
    for i in tqdm(range(1, max_lags+1)):
        shifted = group[groupby_feats+[target]].copy(deep=True)
        shifted['date_block_num'] += i
        shifted.rename({target:target+'_lag_'+str(i)},axis=1,inplace=True)
        group = group.merge(shifted, on=groupby_feats, how='left')
    group.fillna(0,inplace=True)
    
    for period in tqdm(last_periods):
        lag_feats = [target+'_lag_'+str(lag) for lag in np.arange(1,period+1)]
        # we do not use mean and std directly because we want to include months with sales = 0
        mean = group[lag_feats].sum(axis=1)/float(period)
        mean2 = (group[lag_feats]**2).sum(axis=1)/float(period)
        group[enc_feat+'_avg_sale_last_'+str(period)] = mean
        group[enc_feat+'_std_sale_last_'+str(period)] = (mean2 - mean**2).apply(np.sqrt)
        group[enc_feat+'_std_sale_last_'+str(period)].replace(np.inf,0,inplace=True)
        # divide by mean, this scales the features for NN
        group[enc_feat+'_avg_sale_last_'+str(period)] /= group[enc_feat+'_avg_sale_last_'+str(period)].mean()
        group[enc_feat+'_std_sale_last_'+str(period)] /= group[enc_feat+'_std_sale_last_'+str(period)].mean()
        
        group[enc_feat+'_avg_sale_last_'+str(period)] = group[enc_feat+'_avg_sale_last_'+str(period)].astype(np.float16) 
        group[enc_feat+'_std_sale_last_'+str(period)] = group[enc_feat+'_std_sale_last_'+str(period)].astype(np.float16)
        
        group[enc_feat+'_min_sale_last_'+str(period)] = group[lag_feats].sum(axis=1).min()
        group[enc_feat+'_max_sale_last_'+str(period)] = group[lag_feats].sum(axis=1).max()
        
        group[enc_feat+'_min_sale_last_'+str(period)] = group[enc_feat+'_min_sale_last_'+str(period)].astype(np.float16)
        group[enc_feat+'_max_sale_last_'+str(period)] = group[enc_feat+'_max_sale_last_'+str(period)].astype(np.float16)
                
    cols = groupby_feats + [f_ for f_ in group.columns.values if f_.find('_sale_last_')>=0]
    matrix = matrix_.merge(group[cols], on=groupby_feats, how='left')
    return matrix

In [14]:
ts = time.time()

X_target_encoded = train_df

X_target_encoded = add_group_stats(X_target_encoded, ['date_block_num', 'item_id'], 'item_cnt_month', 'item', [6,12])
X_target_encoded = add_group_stats(X_target_encoded, ['date_block_num', 'shop_id'], 'item_cnt_month', 'shop', [6,12])
X_target_encoded = add_group_stats(X_target_encoded, ['date_block_num', 'item_category_id'], 'item_cnt_month', 'category', [12])
X_target_encoded = add_group_stats(X_target_encoded, ['date_block_num', 'city'], 'item_cnt_month', 'city', [12])
X_target_encoded = add_group_stats(X_target_encoded, ['date_block_num', 'item_category_names_category_1_enc'], 'item_cnt_month', 'family', [12])
X_target_encoded = add_group_stats(X_target_encoded, ['date_block_num', 'item_category_names_category_2_enc'], 'item_cnt_month', 'subfamily', [12])

time.time() - ts


  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:00<00:01,  9.72it/s][A
 25%|██▌       | 3/12 [00:00<00:00, 10.84it/s][A
 42%|████▏     | 5/12 [00:00<00:00, 11.90it/s][A
 58%|█████▊    | 7/12 [00:00<00:00, 12.52it/s][A
 75%|███████▌  | 9/12 [00:00<00:00, 13.06it/s][A
100%|██████████| 12/12 [00:00<00:00, 13.36it/s][A

  0%|          | 0/2 [00:00<?, ?it/s][A
100%|██████████| 2/2 [00:00<00:00,  8.99it/s][A

100%|██████████| 12/12 [00:00<00:00, 179.60it/s]

100%|██████████| 2/2 [00:00<00:00, 69.34it/s]

  0%|          | 0/12 [00:00<?, ?it/s][A
100%|██████████| 12/12 [00:00<00:00, 117.34it/s][A

100%|██████████| 1/1 [00:00<00:00, 86.98it/s]

100%|██████████| 12/12 [00:00<00:00, 162.78it/s]

100%|██████████| 1/1 [00:00<00:00, 68.57it/s]

100%|██████████| 12/12 [00:00<00:00, 229.66it/s]

100%|██████████| 1/1 [00:00<00:00, 55.93it/s]

  0%|          | 0/12 [00:00<?, ?it/s][A
100%|██████████| 12/12 [00:00<00:00, 113.06it/s][A

100%|██████████| 1/1 [00:00<00:00, 7

31.148726224899292

In [15]:
def lag_feature(df, lags, col):    
    tmp = df[['date_block_num', 'shop_id','item_id', col]]
    for i in tqdm(lags):
        shifted = tmp.copy()
        shifted.columns = ['date_block_num','shop_id','item_id', col+'_lag_'+str(i)]
        shifted['date_block_num'] = shifted['date_block_num'] + i
        df = pd.merge(df, shifted, on=['date_block_num','shop_id','item_id'], how='left')
    return df

def mean_encoding(df, groupby_feats, target, enc, lags):
    print('Features: ' , groupby_feats)
    features = df[[*groupby_feats, target]]\
             .groupby(groupby_feats, as_index=False)\
             .agg(['mean'])
   
    features.columns = [enc]
    
    df = df.merge(features, on=groupby_feats, how='left')
    df[enc] = df[enc].astype(np.float16)
    df = lag_feature(df, lags, enc).fillna(0)
    df.drop(enc, axis=1, inplace=True)
    return df

ts = time.time()

periods = [1, 2, 3, 6, 12]

X_target_encoded = mean_encoding(X_target_encoded, ['date_block_num'], 'item_cnt_month', 
                                 'date_avg_item_cnt', periods)

X_target_encoded = mean_encoding(X_target_encoded, ['date_block_num', 'item_id'], 
                                'item_cnt_month', 'date_item_avg_item_cnt', periods)

X_target_encoded = mean_encoding(X_target_encoded, ['date_block_num', 'shop_id'], 
                                 'item_cnt_month', 'date_shop_avg_item_cnt', periods)

X_target_encoded = mean_encoding(X_target_encoded, ['date_block_num', 'item_category_id'], 
                                 'item_cnt_month', 'date_cat_avg_item_cnt', periods)

X_target_encoded = mean_encoding(X_target_encoded, ['date_block_num', 'shop_id', 'item_category_id'], 
                                 'item_cnt_month', 'date_shop_cat_avg_item_cnt', periods)

X_target_encoded = mean_encoding(X_target_encoded, ['date_block_num', 'item_id', 'item_category_id'], 
                                 'item_cnt_month', 
                                 'date_item_id_cat_avg_item_cnt', periods)

X_target_encoded = mean_encoding(X_target_encoded, ['date_block_num', 'city'], 
                                 'item_cnt_month', 'date_city_avg_item_cnt', periods)

X_target_encoded = mean_encoding(X_target_encoded, ['date_block_num', 'item_id', 'city'], 
                                 'item_cnt_month', 'date_item_city_avg_item_cnt', [1, 6]) 

X_target_encoded = mean_encoding(X_target_encoded, ['date_block_num', 'shop_id', 'city'], 
                                 'item_cnt_month', 'date_shop_city_avg_item_cnt', [1, 6])

X_target_encoded = mean_encoding(X_target_encoded, ['date_block_num', 'item_id', 
                                                    'item_category_names_category_1_enc'], 
                                 'item_cnt_month', 'date_item_category_1_avg_item_cnt', [1, 6])

time.time() - ts

Features:  ['date_block_num']



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:14<00:57, 14.38s/it][A
 40%|████      | 2/5 [00:23<00:38, 12.91s/it][A
 60%|██████    | 3/5 [00:32<00:23, 11.60s/it][A
 80%|████████  | 4/5 [00:40<00:10, 10.57s/it][A
100%|██████████| 5/5 [00:48<00:00,  9.78s/it][A


Features:  ['date_block_num', 'item_id']



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:15<01:02, 15.51s/it][A
 40%|████      | 2/5 [00:25<00:41, 13.88s/it][A
 60%|██████    | 3/5 [00:34<00:24, 12.32s/it][A
 80%|████████  | 4/5 [00:43<00:11, 11.48s/it][A
100%|██████████| 5/5 [00:52<00:00, 10.60s/it][A


Features:  ['date_block_num', 'shop_id']



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:16<01:06, 16.67s/it][A
 40%|████      | 2/5 [00:32<00:49, 16.51s/it][A
 60%|██████    | 3/5 [00:47<00:32, 16.06s/it][A
 80%|████████  | 4/5 [01:00<00:14, 14.94s/it][A
100%|██████████| 5/5 [01:12<00:00, 14.56s/it][A


Features:  ['date_block_num', 'item_category_id']



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:17<01:08, 17.19s/it][A
 40%|████      | 2/5 [00:28<00:46, 15.41s/it][A
 60%|██████    | 3/5 [00:38<00:27, 13.92s/it][A
 80%|████████  | 4/5 [00:48<00:12, 12.72s/it][A
100%|██████████| 5/5 [00:58<00:00, 11.77s/it][A


Features:  ['date_block_num', 'shop_id', 'item_category_id']



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:19<01:15, 18.91s/it][A
 40%|████      | 2/5 [00:31<00:51, 17.05s/it][A
 60%|██████    | 3/5 [00:44<00:31, 15.72s/it][A
 80%|████████  | 4/5 [00:56<00:14, 14.58s/it][A
100%|██████████| 5/5 [01:07<00:00, 13.45s/it][A


Features:  ['date_block_num', 'item_id', 'item_category_id']



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:21<01:27, 21.98s/it][A
 40%|████      | 2/5 [00:35<00:58, 19.56s/it][A
 60%|██████    | 3/5 [00:47<00:34, 17.28s/it][A
 80%|████████  | 4/5 [01:00<00:15, 15.74s/it][A
100%|██████████| 5/5 [01:13<00:00, 14.79s/it][A


Features:  ['date_block_num', 'city']



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:24<01:39, 24.95s/it][A
 40%|████      | 2/5 [00:43<01:08, 22.89s/it][A
 60%|██████    | 3/5 [00:56<00:40, 20.17s/it][A
 80%|████████  | 4/5 [01:10<00:18, 18.11s/it][A
100%|██████████| 5/5 [01:23<00:00, 16.64s/it][A


Features:  ['date_block_num', 'item_id', 'city']



  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:24<00:24, 24.53s/it][A
100%|██████████| 2/2 [00:37<00:00, 18.76s/it][A


Features:  ['date_block_num', 'shop_id', 'city']



  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:24<00:24, 24.73s/it][A
100%|██████████| 2/2 [00:38<00:00, 19.43s/it][A


Features:  ['date_block_num', 'item_id', 'item_category_names_category_1_enc']



  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:25<00:25, 25.06s/it][A
100%|██████████| 2/2 [00:38<00:00, 19.16s/it][A


871.5711209774017

In [16]:
X_target_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11128050 entries, 0 to 11128049
Data columns (total 92 columns):
date_block_num                             int8
shop_id                                    int8
item_id                                    int16
item_cnt_month                             float32
item_price                                 float32
item_category_id                           int64
city                                       int8
is_mal                                     int8
is_en_mal                                  int8
lat                                        float16
lng                                        float16
distance_to_moskov                         float16
item_category_names_category_1_enc         int8
item_category_names_category_2_enc         int8
is_digital                                 float64
key                                        int32
year                                       int8
month                                      int8


## Fix sales_train to train dataset

In [17]:
ts = time.time()
group = sales_train.groupby(['item_id']).agg({'item_price': ['mean']})
group.columns = ['item_avg_item_price']
group.reset_index(inplace=True)

matrix = pd.merge(X_target_encoded, group, on=['item_id'], how='left')
matrix['item_avg_item_price'] = matrix['item_avg_item_price'].astype(np.float16)

group = sales_train.groupby(['date_block_num','item_id']).agg({'item_price': ['mean']})
group.columns = ['date_item_avg_item_price']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num','item_id'], how='left')
matrix['date_item_avg_item_price'] = matrix['date_item_avg_item_price'].astype(np.float16)

lags = [1,2,3,4,5,6]
matrix = lag_feature(matrix, lags, 'date_item_avg_item_price')

for i in lags:
    matrix['delta_price_lag_'+str(i)] = \
        (matrix['date_item_avg_item_price_lag_'+str(i)] - matrix['item_avg_item_price']) / matrix['item_avg_item_price']

def select_trend(row):
    for i in lags:
        if row['delta_price_lag_'+str(i)]:
            return row['delta_price_lag_'+str(i)]
    return 0
    
matrix['delta_price_lag'] = matrix.apply(select_trend, axis=1)
matrix['delta_price_lag'] = matrix['delta_price_lag'].astype(np.float16)
matrix['delta_price_lag'].fillna(0, inplace=True)

fetures_to_drop = ['item_avg_item_price', 'date_item_avg_item_price']
for i in lags:
    fetures_to_drop += ['date_item_avg_item_price_lag_'+str(i)]
    fetures_to_drop += ['delta_price_lag_'+str(i)]

matrix.drop(fetures_to_drop, axis=1, inplace=True)

time.time() - ts


  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:29<02:26, 29.31s/it][A
 33%|███▎      | 2/6 [00:43<01:39, 24.87s/it][A
 50%|█████     | 3/6 [00:58<01:05, 21.69s/it][A
 67%|██████▋   | 4/6 [01:11<00:38, 19.35s/it][A
 83%|████████▎ | 5/6 [01:25<00:17, 17.67s/it][A
100%|██████████| 6/6 [01:40<00:00, 16.73s/it][A


525.4093079566956

In [26]:
ts = time.time()

group = matrix.groupby(['date_block_num','shop_id']).agg({'revenue': ['sum']})
group.columns = ['date_shop_revenue']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num','shop_id'], how='left')
matrix['date_shop_revenue'] = matrix['date_shop_revenue'].astype(np.float32)

group = group.groupby(['shop_id']).agg({'date_shop_revenue': ['mean']})
group.columns = ['shop_avg_revenue']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['shop_id'], how='left')
matrix['shop_avg_revenue'] = matrix['shop_avg_revenue'].astype(np.float32)

matrix['delta_revenue'] = (matrix['date_shop_revenue'] - matrix['shop_avg_revenue']) / matrix['shop_avg_revenue']
matrix['delta_revenue'] = matrix['delta_revenue'].astype(np.float16)

matrix = lag_feature(matrix, [1], 'delta_revenue')

matrix.drop(['date_shop_revenue','shop_avg_revenue','delta_revenue'], axis=1, inplace=True)
time.time() - ts

KeyError: 'date_shop_revenue'

In [None]:
matrix['date_shop_revenue'] = matrix.drop(['date_shop_revenue_y', 'date_shop_revenue_x'], axis=1)

In [31]:
matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_price,item_category_id,city,is_mal,is_en_mal,lat,...,date_item_city_avg_item_cnt_lag_1,date_item_city_avg_item_cnt_lag_6,date_shop_city_avg_item_cnt_lag_1,date_shop_city_avg_item_cnt_lag_6,date_item_category_1_avg_item_cnt_lag_1,date_item_category_1_avg_item_cnt_lag_6,delta_price_lag,date_shop_revenue_x,date_shop_revenue_y,date_shop_revenue
0,0,0,19,0.0,0.0,40,29,0,0,62.03,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2905738.0,2905738.0,2905738.0
1,0,0,27,0.0,0.0,19,29,0,0,62.03,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2905738.0,2905738.0,2905738.0
2,0,0,28,0.0,0.0,30,29,0,0,62.03,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2905738.0,2905738.0,2905738.0
3,0,0,29,0.0,0.0,23,29,0,0,62.03,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2905738.0,2905738.0,2905738.0
4,0,0,32,6.0,221.0,40,29,0,0,62.03,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2905738.0,2905738.0,2905738.0


In [None]:

#Month since last sale for each shop/item pair.
ts = time.time()
last_sale = pd.DataFrame()
for month in range(1,35):    
    last_month = matrix.loc[(matrix['date_block_num']<month)&(matrix['item_cnt_month']>0)].groupby(['item_id','shop_id'])['date_block_num'].max()
    df = pd.DataFrame({'date_block_num':np.ones([last_month.shape[0],])*month,
                       'item_id': last_month.index.get_level_values(0).values,
                       'shop_id': last_month.index.get_level_values(1).values,
                       'item_shop_last_sale': last_month.values})
    last_sale = last_sale.append(df)
last_sale['date_block_num'] = last_sale['date_block_num'].astype(np.int8)

matrix = matrix.merge(last_sale, on=['date_block_num','item_id','shop_id'], how='left')
time.time() - ts

In [None]:
#Month since last sale for each item.
ts = time.time()
last_sale = pd.DataFrame()
for month in range(1,35):    
    last_month = matrix.loc[(matrix['date_block_num']<month)&(matrix['item_cnt_month']>0)].groupby('item_id')['date_block_num'].max()
    df = pd.DataFrame({'date_block_num':np.ones([last_month.shape[0],])*month,
                       'item_id': last_month.index.values,
                       'item_last_sale': last_month.values})
    last_sale = last_sale.append(df)
last_sale['date_block_num'] = last_sale['date_block_num'].astype(np.int8)

matrix = matrix.merge(last_sale, on=['date_block_num','item_id'], how='left')
time.time() - ts

In [None]:
# Months since the first sale for each shop/item pair and for item only.
ts = time.time()
matrix['item_shop_first_sale'] = matrix['date_block_num'] - matrix.groupby(['item_id','shop_id'])['date_block_num'].transform('min')
matrix['item_first_sale'] = matrix['date_block_num'] - matrix.groupby('item_id')['date_block_num'].transform('min')
time.time() - ts

In [None]:
X, y = matrix.drop('item_cnt_month', axis=1), matrix['item_cnt_month']

X_train, X_test, y_train, y_test = \
    X[X['date_block_num'] != predict_month], X[X['date_block_num'] == predict_month], \
    y[X['date_block_num'] != predict_month], y[X['date_block_num'] == predict_month]

In [None]:
if does_it_for_submission:
    X_train_sub = X_train
    X_test_sub = X_test
    y_train_sub = y_train
    y_test_sub = y_test
    
    %store X_train_sub
    %store X_test_sub
    %store y_train_sub
    %store y_test_sub
    
    X_train_sub.to_pickle('X_train_sub.pkl')
    X_test_sub.to_pickle('X_test_sub.pkl')
    y_train_sub.to_pickle('y_train_sub.pkl')
    y_test_sub.to_pickle('y_test_sub.pkl')
    
else:
    
    %store X_train
    %store X_test
    %store y_train
    %store y_test
    
    X_train.to_pickle('X_train.pkl')
    X_test.to_pickle('X_test.pkl')
    y_train.to_pickle('y_train.pkl')
    y_test.to_pickle('y_test.pkl')

In [None]:
%%notify -m "Kernel sales-prediction.feature_eng.python.2.0 executed successfuly"
import gc
gc.collect()