In [None]:
from itertools import product

import numpy as np
import pandas as pd
import scipy
import sklearn
# import gc
# import matplotlib.pyplot as plt
# %matplotlib inline 

# Notebook display limits: show up to 600 rows and 50 columns before truncating.
pd.options.display.max_rows = 600
pd.options.display.max_columns = 50

# from itertools import product


def downcast_dtypes(df):
    '''
        Halve the width of 64-bit numeric columns to save memory:

                `float64` columns become `float32`
                `int64`   columns become `int32`

        The frame is modified in place and also returned.
        Columns of any other dtype are left untouched.
    '''
    conversions = (("float64", np.float32), ("int64", np.int32))

    # Resolve both column selections up front, before any cast is applied.
    plan = [
        ([name for name in df if df[name].dtype == wide], narrow)
        for wide, narrow in conversions
    ]

    for names, narrow in plan:
        df[names] = df[names].astype(narrow)

    return df

In [None]:
def lag_feature(df, lags, col):
    """Append lagged copies of `col` to `df`.

    For every lag `i` in `lags`, the value `col` had for the same
    (`shop_id`, `item_id`) pair `i` date blocks earlier is left-joined onto
    `df` as a new column named `<col>_lag_<i>`. Rows with no matching earlier
    record get NaN. The input frame is not modified; the joined frame is
    returned.
    """
    keys = ['date_block_num', 'shop_id', 'item_id']
    history = df[keys + [col]]
    for lag in lags:
        lagged_name = col + '_lag_' + str(lag)
        # Moving the history forward by `lag` blocks makes block t line up
        # with block t + lag in the join below.
        lagged = history.rename(columns={col: lagged_name}).assign(
            date_block_num=history['date_block_num'] + lag
        )
        df = pd.merge(df, lagged, on=keys, how='left')
    return df

In [None]:
# Raw sales records; later cells read its date_block_num, shop_id, item_id
# and item_cnt_day columns.
sales = pd.read_csv(filepath_or_buffer="sales_train_v2.csv")

In [None]:
# Quick look at the first five rows of the raw sales data.
sales.head(n=5)

In [None]:
# Create "grid" with columns
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month, build the cartesian product of all shops and all items
# that appear in that month, so shop/item pairs with no sales that month
# still get a row.
# `product` comes from itertools, imported at the top of the notebook.
grid = []
for block_num in sales['date_block_num'].unique():
    # Compute the month mask once and reuse it for both look-ups.
    in_block = sales['date_block_num'] == block_num
    cur_shops = sales.loc[in_block, 'shop_id'].unique()
    cur_items = sales.loc[in_block, 'item_id'].unique()
    grid.append(
        np.array(list(product(cur_shops, cur_items, [block_num])), dtype='int32')
    )

In [None]:
# Stack the per-month arrays into a single frame of (shop, item, month) rows.
grid = pd.DataFrame(np.vstack(grid), columns=index_cols, dtype=np.int32)

# Group by shop, item AND month to get shop-item-month aggregates.
# Without `date_block_num` in the keys, every month would receive the
# all-time total for the pair, which leaks future sales into the target and
# makes every lag feature meaningless.
# Plain `.sum()` plus `.rename()` replaces the nested-dict renamer in `agg`,
# which raises SpecificationError on pandas >= 1.0.
gb = (
    sales.groupby(index_cols, as_index=False)['item_cnt_day']
    .sum()
    .rename(columns={'item_cnt_day': 'target'})
)

# Join it to the grid; shop/item/month combinations with no sales get target 0.
all_data = pd.merge(grid, gb, how='left', on=index_cols).fillna(0)

In [None]:
# Lookup tables joined onto the grid below: item_cat is keyed by item_id,
# shop by shop_id.
# NOTE(review): presumably these supply item_category_id / type_code /
# subtype_code and city_code respectively; confirm against the CSV headers.
item_cat, shop = (
    pd.read_csv(csv_path) for csv_path in ('item_cat.csv', 'shop_city.csv')
)

In [None]:
# Attach the per-item attributes to every grid row (left join keeps all rows).
all_data = all_data.merge(item_cat, how='left', on=['item_id'])

In [None]:
# Attach the per-shop attributes to every grid row (left join keeps all rows).
all_data = all_data.merge(shop, how='left', on=['shop_id'])

In [None]:
# Lagged group-mean features for the training frame.
#
# Each entry is (group-by keys, name of the per-group mean of `target`,
# lags to keep). For every entry the mean is computed, joined onto
# `all_data`, turned into `<name>_lag_<i>` columns by `lag_feature`, and then
# the un-lagged mean itself is dropped so only past information remains.
LAGGED_MEAN_FEATURES = [
    (['date_block_num', 'item_id', 'city_code'],        'date_item_city_avg_item_cnt', [1]),
    (['date_block_num', 'item_id'],                     'date_item_avg_item_cnt',      [1, 2, 3]),
    (['date_block_num', 'shop_id', 'type_code'],        'date_shop_type_avg_item_cnt', [1]),
    (['date_block_num', 'shop_id', 'item_category_id'], 'date_shop_cat_avg_item_cnt',  [1]),
    (['date_block_num', 'subtype_code'],                'date_subtype_avg_item_cnt',   [1]),
    (['date_block_num'],                                'date_avg_item_cnt',           [1]),
    (['date_block_num', 'type_code'],                   'date_type_avg_item_cnt',      [1]),
    (['date_block_num', 'shop_id'],                     'date_shop_avg_item_cnt',      [1, 2]),
    (['date_block_num', 'city_code'],                   'date_city_avg_item_cnt',      [1]),
]

for step, (group_cols, feature, lags) in enumerate(LAGGED_MEAN_FEATURES, start=1):
    print(step)  # progress marker, one number per feature
    temp = (
        all_data.groupby(group_cols)['target']
        .mean()
        .rename(feature)
        .reset_index()
    )
    all_data = pd.merge(all_data, temp, on=group_cols, how='left')
    all_data = lag_feature(all_data, lags, feature)
    # The same-month mean is derived from the target itself; keep only its lags.
    all_data = all_data.drop(columns=[feature])

In [None]:
# Mean (target) encoding of the categorical id columns.
#
# Each id column is replaced by the mean `target` of its category.
# NOTE(review): the means are computed over the full frame, including each
# row's own target, so the encodings leak the label; consider an
# out-of-fold or expanding-mean scheme.
# NOTE(review): this cell removes item_id / shop_id / the *_code columns from
# `all_data`, but the test-set lag cells further down still read those
# columns from `all_data`. On a fresh top-to-bottom run they will not find them.

# Value used when a category has no entry in the mapping.
# NOTE(review): presumably the global target mean; confirm.
TARGET_ENC_FILL = 0.3343

# (label printed as progress, id column, name of the encoded column)
TARGET_ENCODINGS = [
    ("item id",          'item_id',          'item_target_enc'),
    ("shop id",          'shop_id',          'shop_target_enc'),
    ("item_category_id", 'item_category_id', 'item_cat_target_enc'),
    ("type_code",        'type_code',        'type_code_target_enc'),
    ("subtype_code",     'subtype_code',     'subtype_code_target_enc'),
    ("city_code",        'city_code',        'city_code_target_enc'),
]

target_means = {}
for label, key_col, enc_col in TARGET_ENCODINGS:
    print("mean encoding " + label)
    target_means[key_col] = all_data.groupby(key_col).target.mean()
    # Assign the filled result back instead of `df[col].fillna(..., inplace=True)`:
    # that form is chained assignment and does not update the frame under
    # pandas Copy-on-Write.
    all_data[enc_col] = all_data[key_col].map(target_means[key_col]).fillna(TARGET_ENC_FILL)

# Keep the per-category means under the names the test-set encoding cell uses.
item_id_target_mean = target_means['item_id']
shop_id_target_mean = target_means['shop_id']
item_cat_id_target_mean = target_means['item_category_id']
type_code_target_mean = target_means['type_code']
subtype_code_target_mean = target_means['subtype_code']
city_code_target_mean = target_means['city_code']

# The raw ids are now represented by their encodings.
all_data = all_data.drop(columns=[key_col for _, key_col, _ in TARGET_ENCODINGS])

all_data = downcast_dtypes(all_data)


In [None]:
# Inspect the first five rows of the encoded training frame.
all_data.head(n=5)

In [None]:
# Persist the training features without the row index.
all_data.to_csv(path_or_buf="all_data2.csv", index=False)

## Test data preprocessing

In [None]:
def tlag_feature(test, lags, col, source=None):
    """Append lagged copies of `col`, taken from the training frame, to `test`.

    For every lag `i` in `lags`, the value `col` had in `source` for the same
    (`shop_id`, `item_id`) pair `i` date blocks earlier is left-joined onto
    `test` as `<col>_lag_<i>`. Rows with no matching history get NaN.

    Parameters
    ----------
    test : DataFrame with `date_block_num`, `shop_id` and `item_id` columns.
    lags : iterable of int, number of date blocks to look back.
    col : name of the column in `source` to lag.
    source : optional DataFrame holding `date_block_num`, `shop_id`,
        `item_id` and `col`. Defaults to the module-level `all_data`, which
        is what this function always read before the parameter existed.

    Returns
    -------
    The joined frame; `test` itself is not modified.
    """
    if source is None:
        # Original behaviour: read the history from the notebook-global frame.
        source = all_data
    keys = ['date_block_num', 'shop_id', 'item_id']
    history = source[keys + [col]]
    for lag in lags:
        lagged_name = col + '_lag_' + str(lag)
        # Moving the history forward by `lag` blocks makes block t line up
        # with block t + lag in the join below.
        lagged = history.rename(columns={col: lagged_name}).assign(
            date_block_num=history['date_block_num'] + lag
        )
        test = pd.merge(test, lagged, on=keys, how='left')
    return test

In [None]:
test = pd.read_csv('test.csv')

# Every test row is assigned date block 34 so the lag joins below can look
# back into the training blocks. The column is sized from the frame itself
# rather than a hard-coded row count, so any test file length works.
TEST_BLOCK_NUM = 34
test['date_block_num'] = np.full(len(test), TEST_BLOCK_NUM, dtype=np.int32)

In [None]:
# Re-read the item and shop lookup tables for joining onto the test frame.
item_cat, shop = (
    pd.read_csv(csv_path) for csv_path in ('item_cat.csv', 'shop_city.csv')
)

In [None]:
# Attach item attributes, then shop attributes, to every test row.
test = (
    test
    .merge(item_cat, how='left', on=['item_id'])
    .merge(shop, how='left', on=['shop_id'])
)
# The lookup tables are not needed after the joins.
del item_cat, shop

In [None]:
# Lagged group-mean features for the test frame.
#
# Mirrors the training feature cell: the same nine (keys, name, lags)
# entries in the same order, so `test` ends up with the same lag columns as
# the training data. Entries 8 and 9 were previously missing here, and entry
# 5 passed an extra positional argument to `tlag_feature` (a TypeError).
TEST_LAGGED_MEAN_FEATURES = [
    (['date_block_num', 'item_id', 'city_code'],        'date_item_city_avg_item_cnt', [1]),
    (['date_block_num', 'item_id'],                     'date_item_avg_item_cnt',      [1, 2, 3]),
    (['date_block_num', 'shop_id', 'type_code'],        'date_shop_type_avg_item_cnt', [1]),
    (['date_block_num', 'shop_id', 'item_category_id'], 'date_shop_cat_avg_item_cnt',  [1]),
    (['date_block_num', 'subtype_code'],                'date_subtype_avg_item_cnt',   [1]),
    (['date_block_num'],                                'date_avg_item_cnt',           [1]),
    (['date_block_num', 'type_code'],                   'date_type_avg_item_cnt',      [1]),
    (['date_block_num', 'shop_id'],                     'date_shop_avg_item_cnt',      [1, 2]),
    (['date_block_num', 'city_code'],                   'date_city_avg_item_cnt',      [1]),
]

# The group means are computed from `all_data`, which must still carry the
# raw id columns. The training mean-encoding cell drops them, so fail early
# with an explicit message instead of a bare KeyError deep inside groupby.
_needed = {'target'}.union(*(keys for keys, _, _ in TEST_LAGGED_MEAN_FEATURES))
_needed.update(['shop_id', 'item_id'])
_missing = sorted(_needed.difference(all_data.columns))
if _missing:
    raise KeyError(
        "all_data is missing columns required for the test lag features: "
        + ", ".join(_missing)
        + ". They are dropped by the training mean-encoding cell."
    )

for step, (group_cols, feature, lags) in enumerate(TEST_LAGGED_MEAN_FEATURES, start=1):
    print(step)  # progress marker, one number per feature
    temp = (
        all_data.groupby(group_cols)['target']
        .mean()
        .rename(feature)
        .reset_index()
    )
    # `tlag_feature` reads the notebook-global `all_data`, so the feature has
    # to be attached to it before the call and removed again afterwards.
    all_data = pd.merge(all_data, temp, on=group_cols, how='left')
    test = tlag_feature(test, lags, feature)
    all_data = all_data.drop(columns=[feature])



In [None]:
# Mean (target) encoding of the test frame.
#
# Each id column is replaced by the per-category target mean that was
# computed on the training data, because the test frame has no target.
# Categories absent from the training means get the fallback value.
# NOTE(review): 0.3343 is presumably the global target mean; confirm.
TEST_ENC_FILL = 0.3343

# (label printed as progress, id column, encoded column, training means)
TEST_TARGET_ENCODINGS = [
    # item_id was previously mapped through shop_id_target_mean by mistake.
    ("item id",          'item_id',          'item_target_enc',         item_id_target_mean),
    ("shop id",          'shop_id',          'shop_target_enc',         shop_id_target_mean),
    ("item_category_id", 'item_category_id', 'item_cat_target_enc',     item_cat_id_target_mean),
    ("type_code",        'type_code',        'type_code_target_enc',    type_code_target_mean),
    ("subtype_code",     'subtype_code',     'subtype_code_target_enc', subtype_code_target_mean),
    ("city_code",        'city_code',        'city_code_target_enc',    city_code_target_mean),
]

for label, key_col, enc_col, train_means in TEST_TARGET_ENCODINGS:
    print("mean encoding " + label)
    # Assign the filled result back instead of `df[col].fillna(..., inplace=True)`:
    # that form is chained assignment and does not update the frame under
    # pandas Copy-on-Write.
    test[enc_col] = test[key_col].map(train_means).fillna(TEST_ENC_FILL)

# The raw ids are now represented by their encodings.
test = test.drop(columns=[key_col for _, key_col, _, _ in TEST_TARGET_ENCODINGS])

test = downcast_dtypes(test)

In [None]:
# Persist the test features without the row index.
test.to_csv(path_or_buf="modified_test.csv", index=False)