# <center> Feature engineering

## Summary
1. Shop features generation
2. Shop features generation

## Initial Setup

In [1]:
does_it_for_submission = True

In [2]:
%load_ext jupyternotify

%store -r item_cat
%store -r item
%store -r shops
%store -r sales_train
%store -r train
%store -r train_test

The jupyternotify extension is already loaded. To reload it, use:
  %reload_ext jupyternotify


In [3]:
__ipy

Helper ipython script loaded


In [4]:
__da

  from pandas import Panel


Basic Data Analysis tools was loaded


In [5]:
import googlemaps
import plotly.express as px
from functools import partial

# SKLEARN
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit, KFold
from scipy.stats import randint as sp_randint
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# TSFRESH
from tsfresh.feature_extraction import ComprehensiveFCParameters, extract_features, MinimalFCParameters, EfficientFCParameters
from tsfresh.utilities.dataframe_functions import impute
from tsfresh import extract_relevant_features

# Sklearn-pandas
from sklearn_pandas import CategoricalImputer, FunctionTransformer, DataFrameMapper

# SCIPY
from scipy.sparse import csr_matrix

# My files
from basic_text_preprocessing import BasicPreprocessText

gmaps = googlemaps.Client(key='AIzaSyCW4PTjjIz6yGUgAmqrG2cLy9euzbim23M')

from math import cos, asin, sqrt
import time

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/maksymsuprunenko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Shop features generation

Features:
1. **lat** - latitude
2. **lng** - longitude
3. **distance_to_moskov** - distance to Moscow city (Label Encoded)
4. **city** - city (Label Encoded)

In [6]:
def distance(lat1, lon1, lat2, lon2):
    p = 0.017453292519943295     #Pi/180
    a = 0.5 - np.cos((lat2 - lat1) * p)/2 + np.cos(lat1 * p) * np.cos(lat2 * p) * (1 - np.cos((lon2 - lon1) * p)) / 2
    return 12742 * np.arcsin(sqrt(a))

def not_city_str(x, t):
    return 1 if t in "".join(x.split()[1:]) else 0

def get_location(x):
    loc = gmaps.geocode(x)
    return loc[0]['geometry']['location'] if len(loc) != 0 else {'lat': 0, 'lng': 0}

moskov_lat, moskov_lng = get_location('Moscow')

# new_shops = shops.copy()
# cleaned_shop_name = BasicPreprocessText().vectorize_process_text(shops['shop_name'])
# new_shops['shop_name'] = cleaned_shop_name
# new_shops['city'] = new_shops['shop_name'].apply(lambda x: x.split()[0])
# city = new_shops['city'] .value_counts()\
# .to_frame().reset_index().rename(columns={'index': 'shop_name', 'city': 'count_shops'})


# new_shops['is_mal'] = new_shops['shop_name'].apply(partial(not_city_str, t='тц')).astype(np.int8)
# new_shops['is_en_mal'] = new_shops['shop_name'].apply(partial(not_city_str, t='трк')).astype(np.int8)

# locations = new_shops['shop_name'].progress_apply(get_location) 

# new_shops_with_coords = pd.concat([new_shops, pd.DataFrame.from_records(locations.values)], axis=1)

# new_shops_with_coords.to_pickle("new_shops_with_coords.pickle")

new_shops_with_coords = pd.read_pickle("new_shops_with_coords.pickle")
moskov_lat, moskov_lng = list(get_location('Moscow').values())

new_shops_with_coords['lat'] = new_shops_with_coords['lat'].astype(np.float16, copy=False)
new_shops_with_coords['lng'] = new_shops_with_coords['lng'].astype(np.float16, copy=False)

new_shops_with_coords['distance_to_moskov'] = \
    new_shops_with_coords[['lat', 'lng']].apply(lambda x: distance(x[0], x[1], moskov_lat, moskov_lng), axis=1)\
    .astype(np.float16)

le_shop_dtm = LabelEncoder().fit(new_shops_with_coords['distance_to_moskov'].sort_values().values)

new_shops_with_coords['distance_to_moskov'] = \
    le_shop_dtm.transform(new_shops_with_coords['distance_to_moskov']).astype(np.float16)

new_shops_with_coords['city'] = LabelEncoder().fit_transform(new_shops_with_coords['city']).astype(np.int8)

new_shops_with_coords = new_shops_with_coords.drop('shop_name', axis=1)

### Item feature generation

Remove item name from dataset.

In [7]:
%%notify
item_cleaned = item.copy()
item_cleaned = item_cleaned.drop('item_name', axis=1)

<IPython.core.display.Javascript object>

## Item category generation

Features:
1. item_category_names_category_1_enc
2. item_category_names_category_2_enc
3. is_digital

In [8]:
item_cat_cleaned = item_cat.copy()

item_category_names = pd.Series(
    BasicPreprocessText().vectorize_process_text(item_cat_cleaned['item_category_name'], ['-'])
)

idx = [8, 9, 32, 79, 80, 81, 82, 83]
fixed_first_level = item_category_names[np.isin(item_cat_cleaned.index, idx)].apply(lambda x: str(np.abs(np.random.normal())) + "-" + x)
item_category_names[idx] = fixed_first_level

item_cat_cleaned['item_category_names_category_1'] = item_category_names.apply(lambda x: x.split("-")[0])
item_cat_cleaned['item_category_names_category_2'] = item_category_names.apply(lambda x: " ".join(x.split("-")[1:]))

item_cat_cleaned['item_category_names_category_1_enc'] = \
    LabelEncoder().fit_transform(item_cat_cleaned['item_category_names_category_1']).astype(np.int8)

item_cat_cleaned['item_category_names_category_2_enc'] = \
    LabelEncoder().fit_transform(item_cat_cleaned['item_category_names_category_2']).astype(np.int8)

item_cat_cleaned['is_digital'] = item_cat_cleaned.apply(lambda x: 'цыфра' in x).astype(np.int8)

item_cat_cleaned = item_cat_cleaned.drop(
    ['item_category_name', 'item_category_names_category_1', 'item_category_names_category_2'], 
    axis=1)

## Join on everthing

Join tables:
1. item
2. item_categories
3. sales_train
4. shops

In [9]:
%%notify

predict_month = 34 if does_it_for_submission else 33
train_df = train_test.copy() if does_it_for_submission else train.copy()

train_df = train_df.merge(item_cleaned[['item_id', 'item_category_id']], how='left', on='item_id', suffixes=("", "_item"), right_index=False)
train_df = train_df.merge(new_shops_with_coords, how='left', on='shop_id', suffixes=("", "_shops"), right_index=False)
train_df = train_df.merge(item_cat_cleaned, how='left', on='item_category_id', suffixes=("", "_item_cat"), right_index=False)

#train_df_file_name = "submission" if does_it_for_submission else "validation"
#train_df.to_pickle(f"train_df_{train_df_file_name}_.pickle")

<IPython.core.display.Javascript object>

## General features
1. key - compound key of shop_id and item_id
2. year 
3. month

In [10]:
train_df['key'] = train_df.progress_apply(lambda x: str(int(x['shop_id'])) + "_" + str(int(x['item_id'])), axis=1)
train_df['key'] = LabelEncoder().fit_transform(train_df['key']).astype(np.int32)

train_df['year'] = (train_df['date_block_num'] // 12).astype(np.int8)
train_df['month'] = (train_df['date_block_num'] % 12).astype(np.int8)

100%|██████████| 11128050/11128050 [03:43<00:00, 49807.54it/s]


### Group sale stats in recent
create stats (mean/var) of sales of certain groups during the past 12 months

In [12]:
def add_group_stats(matrix_, groupby_feats, target, enc_feat, last_periods):
    if not 'date_block_num' in groupby_feats:
        print ('date_block_num must in groupby_feats')
        return matrix_
    
    group = matrix_.groupby(groupby_feats)[target].sum().reset_index()
    max_lags = np.max(last_periods)
    
    for i in tqdm(range(1, max_lags+1)):
        shifted = group[groupby_feats+[target]].copy(deep=True)
        shifted['date_block_num'] += i
        shifted.rename({target:target+'_lag_'+str(i)},axis=1,inplace=True)
        group = group.merge(shifted, on=groupby_feats, how='left')
    group.fillna(0,inplace=True)
    
    for period in tqdm(last_periods):
        lag_feats = [target+'_lag_'+str(lag) for lag in np.arange(1,period+1)]
        # we do not use mean and std directly because we want to include months with sales = 0
        mean = group[lag_feats].sum(axis=1)/float(period)
        mean2 = (group[lag_feats]**2).sum(axis=1)/float(period)
        group[enc_feat+'_avg_sale_last_'+str(period)] = mean
        group[enc_feat+'_std_sale_last_'+str(period)] = (mean2 - mean**2).apply(np.sqrt)
        group[enc_feat+'_std_sale_last_'+str(period)].replace(np.inf,0,inplace=True)
        # divide by mean, this scales the features for NN
        group[enc_feat+'_avg_sale_last_'+str(period)] /= group[enc_feat+'_avg_sale_last_'+str(period)].mean()
        group[enc_feat+'_std_sale_last_'+str(period)] /= group[enc_feat+'_std_sale_last_'+str(period)].mean()
        
        group[enc_feat+'_avg_sale_last_'+str(period)] = group[enc_feat+'_avg_sale_last_'+str(period)].astype(np.float16) 
        group[enc_feat+'_std_sale_last_'+str(period)] = group[enc_feat+'_std_sale_last_'+str(period)].astype(np.float16)
        
        group[enc_feat+'_min_sale_last_'+str(period)] = group[lag_feats].sum(axis=1).min()
        group[enc_feat+'_max_sale_last_'+str(period)] = group[lag_feats].sum(axis=1).max()
        
        group[enc_feat+'_min_sale_last_'+str(period)] = group[enc_feat+'_min_sale_last_'+str(period)].astype(np.float16)
        group[enc_feat+'_max_sale_last_'+str(period)] = group[enc_feat+'_max_sale_last_'+str(period)].astype(np.float16)
                
    cols = groupby_feats + [f_ for f_ in group.columns.values if f_.find('_sale_last_')>=0]
    matrix = matrix_.merge(group[cols], on=groupby_feats, how='left')
    return matrix

In [13]:
ts = time.time()

X_target_encoded = train_df

X_target_encoded = add_group_stats(X_target_encoded, ['date_block_num', 'item_id'], 'item_cnt_month', 'item', [6,12])
X_target_encoded = add_group_stats(X_target_encoded, ['date_block_num', 'shop_id'], 'item_cnt_month', 'shop', [6,12])
X_target_encoded = add_group_stats(X_target_encoded, ['date_block_num', 'item_category_id'], 'item_cnt_month', 'category', [12])
X_target_encoded = add_group_stats(X_target_encoded, ['date_block_num', 'city'], 'item_cnt_month', 'city', [12])
X_target_encoded = add_group_stats(X_target_encoded, ['date_block_num', 'item_category_names_category_1_enc'], 'item_cnt_month', 'family', [12])
X_target_encoded = add_group_stats(X_target_encoded, ['date_block_num', 'item_category_names_category_2_enc'], 'item_cnt_month', 'subfamily', [12])

time.time() - ts

100%|██████████| 12/12 [00:00<00:00, 13.49it/s]
100%|██████████| 2/2 [00:00<00:00,  8.86it/s]
100%|██████████| 12/12 [00:00<00:00, 164.39it/s]
100%|██████████| 2/2 [00:00<00:00, 81.57it/s]
100%|██████████| 12/12 [00:00<00:00, 120.30it/s]
100%|██████████| 1/1 [00:00<00:00, 70.83it/s]
100%|██████████| 12/12 [00:00<00:00, 155.42it/s]
100%|██████████| 1/1 [00:00<00:00, 67.25it/s]
100%|██████████| 12/12 [00:00<00:00, 215.22it/s]
100%|██████████| 1/1 [00:00<00:00, 60.00it/s]
100%|██████████| 12/12 [00:00<00:00, 120.70it/s]
100%|██████████| 1/1 [00:00<00:00, 66.82it/s]


31.28959584236145

In [14]:
def lag_feature(df, lags, col):    
    tmp = df[['date_block_num', 'shop_id','item_id', col]]
    for i in tqdm(lags):
        shifted = tmp.copy()
        shifted.columns = ['date_block_num','shop_id','item_id', col+'_lag_'+str(i)]
        shifted['date_block_num'] = shifted['date_block_num'] + i
        df = pd.merge(df, shifted, on=['date_block_num','shop_id','item_id'], how='left')
    return df

def mean_encoding(df, groupby_feats, target, enc, lags):
    print('Features: ' , groupby_feats)
    features = df[[*groupby_feats, target]]\
             .groupby(groupby_feats, as_index=False)\
             .agg(['mean'])
   
    features.columns = [enc]
    
    df = df.merge(features, on=groupby_feats, how='left')
    df[enc] = df[enc].astype(np.float16)
    df = lag_feature(df, lags, enc).fillna(0)
    df.drop(enc, axis=1, inplace=True)
    return df

ts = time.time()

periods = [1, 2, 3, 6, 12]

X_target_encoded = mean_encoding(X_target_encoded, ['date_block_num'], 'item_cnt_month', 
                                 'date_avg_item_cnt', periods)

X_target_encoded = mean_encoding(X_target_encoded, ['date_block_num', 'item_id'], 
                                'item_cnt_month', 'date_item_avg_item_cnt', periods)

X_target_encoded = mean_encoding(X_target_encoded, ['date_block_num', 'shop_id'], 
                                 'item_cnt_month', 'date_shop_avg_item_cnt', periods)

X_target_encoded = mean_encoding(X_target_encoded, ['date_block_num', 'item_category_id'], 
                                 'item_cnt_month', 'date_cat_avg_item_cnt', periods)

X_target_encoded = mean_encoding(X_target_encoded, ['date_block_num', 'shop_id', 'item_category_id'], 
                                 'item_cnt_month', 'date_shop_cat_avg_item_cnt', periods)

X_target_encoded = mean_encoding(X_target_encoded, ['date_block_num', 'item_id', 'item_category_id'], 
                                 'item_cnt_month', 
                                 'date_item_id_cat_avg_item_cnt', periods)

X_target_encoded = mean_encoding(X_target_encoded, ['date_block_num', 'city'], 
                                 'item_cnt_month', 'date_city_avg_item_cnt', periods)

X_target_encoded = mean_encoding(X_target_encoded, ['date_block_num', 'item_id', 'city'], 
                                 'item_cnt_month', 'date_item_city_avg_item_cnt', [1, 6]) 

X_target_encoded = mean_encoding(X_target_encoded, ['date_block_num', 'shop_id', 'city'], 
                                 'item_cnt_month', 'date_shop_city_avg_item_cnt', [1, 6])

X_target_encoded = mean_encoding(X_target_encoded, ['date_block_num', 'item_id', 
                                                    'item_category_names_category_1_enc'], 
                                 'item_cnt_month', 'date_item_category_1_avg_item_cnt', [1, 6])

time.time() - ts

Features:  ['date_block_num']


100%|██████████| 5/5 [00:43<00:00,  8.63s/it]


Features:  ['date_block_num', 'item_id']


100%|██████████| 5/5 [00:47<00:00,  9.43s/it]


Features:  ['date_block_num', 'shop_id']


100%|██████████| 5/5 [00:49<00:00, 10.00s/it]


Features:  ['date_block_num', 'item_category_id']


100%|██████████| 5/5 [00:59<00:00, 11.99s/it]


Features:  ['date_block_num', 'shop_id', 'item_category_id']


100%|██████████| 5/5 [01:03<00:00, 12.75s/it]


Features:  ['date_block_num', 'item_id', 'item_category_id']


100%|██████████| 5/5 [01:08<00:00, 13.76s/it]


Features:  ['date_block_num', 'city']


100%|██████████| 5/5 [01:14<00:00, 14.98s/it]


Features:  ['date_block_num', 'item_id', 'city']


100%|██████████| 2/2 [00:44<00:00, 22.34s/it]


Features:  ['date_block_num', 'shop_id', 'city']


100%|██████████| 2/2 [00:47<00:00, 24.00s/it]


Features:  ['date_block_num', 'item_id', 'item_category_names_category_1_enc']


100%|██████████| 2/2 [00:43<00:00, 21.89s/it]


805.8235862255096

In [15]:
X_target_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11128050 entries, 0 to 11128049
Data columns (total 92 columns):
date_block_num                             int8
shop_id                                    int8
item_id                                    int16
item_cnt_month                             float32
item_price                                 float32
item_category_id                           int64
city                                       int8
is_mal                                     int8
is_en_mal                                  int8
lat                                        float16
lng                                        float16
distance_to_moskov                         float16
item_category_names_category_1_enc         int8
item_category_names_category_2_enc         int8
is_digital                                 float64
key                                        int32
year                                       int8
month                                      int8


## Fix sales_train to train dataset

In [16]:
ts = time.time()
group = sales_train.groupby(['item_id']).agg({'item_price': ['mean']})
group.columns = ['item_avg_item_price']
group.reset_index(inplace=True)

matrix = pd.merge(X_target_encoded, group, on=['item_id'], how='left')
matrix['item_avg_item_price'] = matrix['item_avg_item_price'].astype(np.float16)

group = sales_train.groupby(['date_block_num','item_id']).agg({'item_price': ['mean']})
group.columns = ['date_item_avg_item_price']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num','item_id'], how='left')
matrix['date_item_avg_item_price'] = matrix['date_item_avg_item_price'].astype(np.float16)

lags = [1,2,3,4,5,6]
matrix = lag_feature(matrix, lags, 'date_item_avg_item_price')

for i in lags:
    matrix['delta_price_lag_'+str(i)] = \
        (matrix['date_item_avg_item_price_lag_'+str(i)] - matrix['item_avg_item_price']) / matrix['item_avg_item_price']

def select_trend(row):
    for i in lags:
        if row['delta_price_lag_'+str(i)]:
            return row['delta_price_lag_'+str(i)]
    return 0
    
matrix['delta_price_lag'] = matrix.apply(select_trend, axis=1)
matrix['delta_price_lag'] = matrix['delta_price_lag'].astype(np.float16)
matrix['delta_price_lag'].fillna(0, inplace=True)

fetures_to_drop = ['item_avg_item_price', 'date_item_avg_item_price']
for i in lags:
    fetures_to_drop += ['date_item_avg_item_price_lag_'+str(i)]
    fetures_to_drop += ['delta_price_lag_'+str(i)]

matrix.drop(fetures_to_drop, axis=1, inplace=True)

time.time() - ts

100%|██████████| 6/6 [01:43<00:00, 17.22s/it]


507.49088978767395

In [17]:
ts = time.time()
group = train.groupby(['date_block_num','shop_id']).agg({'revenue': ['sum']})
group.columns = ['date_shop_revenue']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num','shop_id'], how='left')
matrix['date_shop_revenue'] = matrix['date_shop_revenue'].astype(np.float32)

group = group.groupby(['shop_id']).agg({'date_shop_revenue': ['mean']})
group.columns = ['shop_avg_revenue']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['shop_id'], how='left')
matrix['shop_avg_revenue'] = matrix['shop_avg_revenue'].astype(np.float32)

matrix['delta_revenue'] = (matrix['date_shop_revenue'] - matrix['shop_avg_revenue']) / matrix['shop_avg_revenue']
matrix['delta_revenue'] = matrix['delta_revenue'].astype(np.float16)

matrix = lag_feature(matrix, [1], 'delta_revenue')

matrix.drop(['date_shop_revenue','shop_avg_revenue','delta_revenue'], axis=1, inplace=True)
time.time() - ts

KeyError: "Column 'revenue' does not exist!"

In [18]:

#Month since last sale for each shop/item pair.
ts = time.time()
last_sale = pd.DataFrame()
for month in range(1,35):    
    last_month = matrix.loc[(matrix['date_block_num']<month)&(matrix['item_cnt_month']>0)].groupby(['item_id','shop_id'])['date_block_num'].max()
    df = pd.DataFrame({'date_block_num':np.ones([last_month.shape[0],])*month,
                       'item_id': last_month.index.get_level_values(0).values,
                       'shop_id': last_month.index.get_level_values(1).values,
                       'item_shop_last_sale': last_month.values})
    last_sale = last_sale.append(df)
last_sale['date_block_num'] = last_sale['date_block_num'].astype(np.int8)

matrix = matrix.merge(last_sale, on=['date_block_num','item_id','shop_id'], how='left')
time.time() - ts

56.19337439537048

In [19]:
#Month since last sale for each item.
ts = time.time()
last_sale = pd.DataFrame()
for month in range(1,35):    
    last_month = matrix.loc[(matrix['date_block_num']<month)&(matrix['item_cnt_month']>0)].groupby('item_id')['date_block_num'].max()
    df = pd.DataFrame({'date_block_num':np.ones([last_month.shape[0],])*month,
                       'item_id': last_month.index.values,
                       'item_last_sale': last_month.values})
    last_sale = last_sale.append(df)
last_sale['date_block_num'] = last_sale['date_block_num'].astype(np.int8)

matrix = matrix.merge(last_sale, on=['date_block_num','item_id'], how='left')
time.time() - ts

58.104480028152466

In [20]:
# Months since the first sale for each shop/item pair and for item only.
ts = time.time()
matrix['item_shop_first_sale'] = matrix['date_block_num'] - matrix.groupby(['item_id','shop_id'])['date_block_num'].transform('min')
matrix['item_first_sale'] = matrix['date_block_num'] - matrix.groupby('item_id')['date_block_num'].transform('min')
time.time() - ts

5.262052059173584

In [21]:
X, y = matrix.drop('item_cnt_month', axis=1), matrix['item_cnt_month']

X_train, X_test, y_train, y_test = \
    X[X['date_block_num'] != predict_month], X[X['date_block_num'] == predict_month], \
    y[X['date_block_num'] != predict_month], y[X['date_block_num'] == predict_month]

In [22]:
if does_it_for_submission:
    X_train_sub = X_train
    X_test_sub = X_test
    y_train_sub = y_train
    y_test_sub = y_test
    
    %store X_train_sub
    %store X_test_sub
    %store y_train_sub
    %store y_test_sub
    
    X_train_sub.to_pickle('X_train_sub.pkl')
    X_test_sub.to_pickle('X_test_sub.pkl')
    y_train_sub.to_pickle('y_train_sub.pkl')
    y_test_sub.to_pickle('y_test_sub.pkl')
    
else:
    
    %store X_train
    %store X_test
    %store y_train
    %store y_test
    
    X_train.to_pickle('X_train.pkl')
    X_test.to_pickle('X_test.pkl')
    y_train.to_pickle('y_train.pkl')
    y_test.to_pickle('y_test.pkl')

Stored 'X_train_sub' (DataFrame)
Stored 'X_test_sub' (DataFrame)
Stored 'y_train_sub' (Series)
Stored 'y_test_sub' (Series)


In [None]:
%%notify -m "Kernel sales-prediction.feature_eng.python.2.0 executed successfuly"
import gc
gc.collect()