In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
train_dtype = {
    'Store':'int16',
    'DayOfWeek':'int16',
    'Date':'str',
    'Sales':'float',
    'Customers':'float',
    'Open':'int16',
    'Promo':'int16',
    'StateHoliday':'str',
    'SchoolHoliday':'int16'
}
test_dtype = {
    'Id':'int32',
    'Store':'int16',
    'DayOfWeek':'int16',
    'Date':'str',
    'Promo':'int16',
    'StateHoliday':'str',
    'SchoolHoliday':'int16'
}
parse_dates = ['Date']
store_col = ['Store', 'StoreType', 'Assortment']

In [3]:
store = pd.read_csv('../data/rossman_data/store.csv',usecols=store_col)
train = pd.read_csv('../data/rossman_data/train.csv',dtype=train_dtype,parse_dates=parse_dates)
test = pd.read_csv('../data/rossman_data/test.csv',dtype=test_dtype,parse_dates=parse_dates)
test.Open =test.copy().Open.fillna(0.0).astype('int16')
test['Sales']=np.zeros(test.shape[0])

In [4]:
train_store_set = set(train.Store)
test_store_set = set(test.Store)
store_list = []
for s in train_store_set:
    if s in test_store_set:
        store_list.append(s)
train = train[train.Store.isin(store_list)]

In [5]:
train.shape

(773231, 9)

In [6]:
train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263.0,555.0,1,1,0,1
2,3,5,2015-07-31,8314.0,821.0,1,1,0,1
6,7,5,2015-07-31,15344.0,1414.0,1,1,0,1
7,8,5,2015-07-31,8492.0,833.0,1,1,0,1
8,9,5,2015-07-31,8565.0,687.0,1,1,0,1


In [7]:
test.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,Sales
0,1,1,4,2015-09-17,1,1,0,0,0.0
1,2,3,4,2015-09-17,1,1,0,0,0.0
2,3,7,4,2015-09-17,1,1,0,0,0.0
3,4,8,4,2015-09-17,1,1,0,0,0.0
4,5,9,4,2015-09-17,1,1,0,0,0.0


In [8]:
store.head()

Unnamed: 0,Store,StoreType,Assortment
0,1,c,a
1,2,a,a
2,3,a,a
3,4,c,c
4,5,a,a


In [9]:
train['dom'] = train.Date.dt.day
test['dom'] = test.Date.dt.day
train['doy'] = train.Date.dt.dayofyear
test['doy'] = test.Date.dt.dayofyear
train['quarter'] = train.Date.dt.quarter
test['quarter'] = test.Date.dt.quarter
train['month'] = train.Date.dt.month
test['month'] = test.Date.dt.month
train['week'] = train.Date.dt.week
test['week'] = test.Date.dt.week

In [10]:
train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,dom,doy,quarter,month,week
0,1,5,2015-07-31,5263.0,555.0,1,1,0,1,31,212,3,7,31
2,3,5,2015-07-31,8314.0,821.0,1,1,0,1,31,212,3,7,31
6,7,5,2015-07-31,15344.0,1414.0,1,1,0,1,31,212,3,7,31
7,8,5,2015-07-31,8492.0,833.0,1,1,0,1,31,212,3,7,31
8,9,5,2015-07-31,8565.0,687.0,1,1,0,1,31,212,3,7,31


In [11]:
train.Date.max(),train.Date.min()

(Timestamp('2015-07-31 00:00:00'), Timestamp('2013-01-01 00:00:00'))

In [12]:
# to get the train and validation marker as Id column, which is good to concat with test set
special_date = train.Date.iloc[49060]
ID = np.zeros(train.shape[0])
ID[train.Date>special_date] = np.ones(len(ID[train.Date>special_date]))*-50000 # validation set
ID[train.Date<=special_date] = np.ones(len(ID[train.Date<=special_date]))*50000 # train set
train['Id'] = ID.astype('int32')

In [13]:
train.shape,test.shape

((773231, 15), (41088, 14))

In [14]:
full_data = pd.concat([train.drop('Customers',axis=1)[::-1],test[::-1]],axis=0)

In [15]:
full_data.head()

Unnamed: 0,Date,DayOfWeek,Id,Open,Promo,Sales,SchoolHoliday,StateHoliday,Store,dom,doy,month,quarter,week
1017208,2013-01-01,2,50000,0,0,0.0,1,a,1115,1,1,1,1,1
1017207,2013-01-01,2,50000,0,0,0.0,1,a,1114,1,1,1,1,1
1017206,2013-01-01,2,50000,0,0,0.0,1,a,1113,1,1,1,1,1
1017205,2013-01-01,2,50000,0,0,0.0,1,a,1112,1,1,1,1,1
1017204,2013-01-01,2,50000,0,0,0.0,1,a,1111,1,1,1,1,1


In [16]:
full_data = full_data.merge(store,how='left',on='Store')

In [17]:
full_data.head()

Unnamed: 0,Date,DayOfWeek,Id,Open,Promo,Sales,SchoolHoliday,StateHoliday,Store,dom,doy,month,quarter,week,StoreType,Assortment
0,2013-01-01,2,50000,0,0,0.0,1,a,1115,1,1,1,1,1,d,c
1,2013-01-01,2,50000,0,0,0.0,1,a,1114,1,1,1,1,1,a,c
2,2013-01-01,2,50000,0,0,0.0,1,a,1113,1,1,1,1,1,a,c
3,2013-01-01,2,50000,0,0,0.0,1,a,1112,1,1,1,1,1,c,c
4,2013-01-01,2,50000,0,0,0.0,1,a,1111,1,1,1,1,1,a,a


In [18]:
lbl = LabelEncoder()
for col in ['DayOfWeek','SchoolHoliday','StateHoliday','Store','dom','doy',
           'month','quarter','week','StoreType','Assortment']:
    full_data[col] = lbl.fit_transform(full_data[col])

In [19]:
full_data.reset_index(drop=True,inplace=True)

In [20]:
full_data.head()

Unnamed: 0,Date,DayOfWeek,Id,Open,Promo,Sales,SchoolHoliday,StateHoliday,Store,dom,doy,month,quarter,week,StoreType,Assortment
0,2013-01-01,1,50000,0,0,0.0,1,1,855,0,0,0,0,0,3,2
1,2013-01-01,1,50000,0,0,0.0,1,1,854,0,0,0,0,0,0,2
2,2013-01-01,1,50000,0,0,0.0,1,1,853,0,0,0,0,0,0,2
3,2013-01-01,1,50000,0,0,0.0,1,1,852,0,0,0,0,0,2,2
4,2013-01-01,1,50000,0,0,0.0,1,1,851,0,0,0,0,0,0,0


In [32]:
full_data['Store_'] = full_data['Store'].copy()

In [33]:
feature_list = ['DayOfWeek','Id','Open','Promo',
               'Sales','SchoolHoliday','StateHoliday','Store','dom',
               'doy','month','quarter','week','StoreType','Assortment']

In [34]:
feature_dict = {i:None for i in feature_list}

In [38]:
for k in feature_dict.keys():
    feature_dict[k]=full_data[['Store_','Date',k]].set_index(['Store_','Date']).unstack(-1,fill_value=0)
    print(k,feature_dict[k].shape)

DayOfWeek (856, 990)
Id (856, 990)
Open (856, 990)
Promo (856, 990)
Sales (856, 990)
SchoolHoliday (856, 990)
StateHoliday (856, 990)
Store (856, 990)
dom (856, 990)
doy (856, 990)
month (856, 990)
quarter (856, 990)
week (856, 990)
StoreType (856, 990)
Assortment (856, 990)


In [43]:
feature_dict['Sales'].iloc[:,-160:]

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Date,2015-04-11,2015-04-12,2015-04-13,2015-04-14,2015-04-15,2015-04-16,2015-04-17,2015-04-18,2015-04-19,2015-04-20,...,2015-09-08,2015-09-09,2015-09-10,2015-09-11,2015-09-12,2015-09-13,2015-09-14,2015-09-15,2015-09-16,2015-09-17
Store_,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,4173.0,0.0,5377.0,4648.0,4110.0,4116.0,4718.0,4594.0,0.0,3722.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4008.0,0.0,9563.0,9274.0,8651.0,8336.0,7735.0,3772.0,0.0,5238.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5479.0,0.0,11693.0,11185.0,9020.0,8168.0,9300.0,5534.0,0.0,7587.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3546.0,0.0,9881.0,8357.0,6489.0,8516.0,6049.0,3834.0,0.0,5591.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6837.0,0.0,11968.0,8416.0,7444.0,7644.0,8346.0,6685.0,0.0,6429.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4586.0,0.0,8442.0,7002.0,5948.0,5915.0,6521.0,5807.0,0.0,5130.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,5550.0,0.0,9625.0,8499.0,8632.0,7523.0,7262.0,5331.0,0.0,6556.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,6235.0,0.0,11377.0,9775.0,8764.0,8602.0,8576.0,6901.0,0.0,6416.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,4411.0,0.0,7761.0,6119.0,5239.0,6301.0,5889.0,4748.0,0.0,3534.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,4214.0,0.0,7946.0,6872.0,5422.0,6360.0,6947.0,3421.0,0.0,5158.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
train_dict = {i:None for i in feature_dict.keys()}
val_dict = {i:None for i in feature_dict.keys()}
test_dict = {i:None for i in feature_dict.keys()}

In [70]:
for k in test_dict.keys():
    test_dict[k] = feature_dict[k].iloc[:,-160:]
    val_dict[k] = feature_dict[k].iloc[:,-208:-48]

In [71]:
val_dict['Sales'].shape

(856, 160)

In [217]:
test[test.Store==115].shape

(48, 14)

In [218]:
def get_lookback(data,n=65,f=5):
    length = len(data)
    result = []
    for i in range(n,length,f):
        result.append(data[i-n:i])
    return np.expand_dims(np.array(result),2)

In [220]:
full_data.Id.min()

-50000

In [191]:
len(set(train.Store)),len(set(test.Store))

(856, 856)

In [226]:
get_lookback(train.Sales[train.Store==1],n=160,f=48).shape

(17, 160, 1)

In [221]:
full_data.columns

Index(['Date', 'DayOfWeek', 'Id', 'Open', 'Promo', 'Sales', 'SchoolHoliday',
       'StateHoliday', 'Store', 'dom', 'doy', 'month', 'quarter', 'week',
       'StoreType', 'Assortment'],
      dtype='object')

In [285]:
attr_list = ['Id','Sales','DayOfWeek',  'Open', 'Promo',  'SchoolHoliday',
       'StateHoliday', 'Store', 'dom', 'doy', 'month', 'quarter', 'week',
       'StoreType', 'Assortment']
store_list = full_data.Store.unique()

In [289]:
store_list.shape

(856,)

In [231]:
np.concatenate?

In [292]:
def print_data(data):
    '''
    def get_one_store(col,s):
        return get_lookback(data[col][data.Store==s],160,48)
    
    result = get_one_store(attr_list[0],store_list[0])
    print(result.shape)
    print(get_one_store(attr_list[0],store_list[1]).shape)
    '''
    result = get_lookback(data[attr_list[0]][data.Store==store_list[0]],n=160,f=48)
    for s in store_list[1:]:
        print(get_lookback(data[attr_list[0]][data.Store==s],n=160,f=48).shape)
        result = np.concatenate((result,get_lookback(data[attr_list[0]][data.Store==s],n=160,f=48)),axis=0)
    for col in attr_list[1:]:
        store_tmp = get_lookback(data[col][data.Store==store_list[0]],n=160,f=48)
        for s in store_list[1:]:
            tmp = get_lookback(data[col][data.Store==s],n=160,f=48)
            #print(tmp.shape,store_tmp.shape)
            store_tmp = np.concatenate((store_tmp,tmp),axis=0)
        result = np.concatenate((result,store_tmp),axis=2)
    
    return result

In [296]:
data = print_data(full_data)

(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(14, 160, 1)
(14, 160, 1)
(18, 160, 1)
(18, 160, 1)
(14, 160, 1)
(18, 160, 1)
(14, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(14, 160, 1)
(14, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(14, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(14, 160, 1)
(18, 160, 1)
(14, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(14, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(14, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(14, 160, 1)
(18, 160, 1)
(18, 160, 1)
(14, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(14, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(14, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)

(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(14, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(14, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(14, 160, 1)
(18, 160, 1)
(18, 160, 1)
(14, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(14, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(14, 160, 1)
(14, 160, 1)
(14, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(14, 160, 1)
(18, 160, 1)
(14, 160, 1)
(18, 160, 1)
(14, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(14, 160, 1)
(14, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(14, 160, 1)
(14, 160, 1)
(18, 160, 1)
(18, 160, 1)
(18, 160, 1)
(14, 160, 1)
(18, 160, 1)
(18, 160, 1)
(14, 160, 1)
(18, 160, 1)
(18, 160, 1)

In [300]:
np.transpose(data,[2,0,1]).shape

(15, 14688, 160)

In [298]:
data[:,:,[data[:,:,0]==50000]].shape

IndexError: too many indices for array

In [294]:
result = get_lookback(full_data.StoreType[full_data.Store==store_list[0]],n=160,f=48)
result2 = get_lookback(full_data.StoreType[full_data.Store==store_list[1]],n=160,f=48)

In [295]:
for s in store_list[1:]:
    print(result.shape,s)
    result = np.concatenate((result,get_lookback(full_data.StoreType[full_data.Store==s],n=160,f=48)),axis=0)

(18, 160, 1) 854
(36, 160, 1) 853
(54, 160, 1) 852
(72, 160, 1) 851
(90, 160, 1) 850
(104, 160, 1) 849
(118, 160, 1) 848
(136, 160, 1) 847
(154, 160, 1) 846
(168, 160, 1) 845
(186, 160, 1) 844
(200, 160, 1) 843
(218, 160, 1) 842
(236, 160, 1) 841
(254, 160, 1) 840
(272, 160, 1) 839
(290, 160, 1) 838
(304, 160, 1) 837
(318, 160, 1) 836
(336, 160, 1) 835
(354, 160, 1) 834
(372, 160, 1) 833
(390, 160, 1) 832
(408, 160, 1) 831
(426, 160, 1) 830
(444, 160, 1) 829
(458, 160, 1) 828
(476, 160, 1) 827
(494, 160, 1) 826
(512, 160, 1) 825
(530, 160, 1) 824
(548, 160, 1) 823
(566, 160, 1) 822
(584, 160, 1) 821
(602, 160, 1) 820
(620, 160, 1) 819
(634, 160, 1) 818
(652, 160, 1) 817
(666, 160, 1) 816
(684, 160, 1) 815
(702, 160, 1) 814
(720, 160, 1) 813
(738, 160, 1) 812
(756, 160, 1) 811
(774, 160, 1) 810
(792, 160, 1) 809
(810, 160, 1) 808
(824, 160, 1) 807
(842, 160, 1) 806
(860, 160, 1) 805
(878, 160, 1) 804
(896, 160, 1) 803
(914, 160, 1) 802
(928, 160, 1) 801
(946, 160, 1) 800
(964, 160, 1) 7

(7924, 160, 1) 392
(7938, 160, 1) 391
(7956, 160, 1) 390
(7974, 160, 1) 389
(7992, 160, 1) 388
(8010, 160, 1) 387
(8028, 160, 1) 386
(8046, 160, 1) 385
(8064, 160, 1) 384
(8082, 160, 1) 383
(8100, 160, 1) 382
(8114, 160, 1) 381
(8128, 160, 1) 380
(8146, 160, 1) 379
(8160, 160, 1) 378
(8178, 160, 1) 377
(8196, 160, 1) 376
(8214, 160, 1) 375
(8228, 160, 1) 374
(8246, 160, 1) 373
(8260, 160, 1) 372
(8278, 160, 1) 371
(8296, 160, 1) 370
(8314, 160, 1) 369
(8328, 160, 1) 368
(8346, 160, 1) 367
(8364, 160, 1) 366
(8378, 160, 1) 365
(8396, 160, 1) 364
(8414, 160, 1) 363
(8432, 160, 1) 362
(8446, 160, 1) 361
(8464, 160, 1) 360
(8482, 160, 1) 359
(8500, 160, 1) 358
(8518, 160, 1) 357
(8536, 160, 1) 356
(8554, 160, 1) 355
(8572, 160, 1) 354
(8590, 160, 1) 353
(8608, 160, 1) 352
(8622, 160, 1) 351
(8640, 160, 1) 350
(8658, 160, 1) 349
(8676, 160, 1) 348
(8694, 160, 1) 347
(8712, 160, 1) 346
(8730, 160, 1) 345
(8748, 160, 1) 344
(8766, 160, 1) 343
(8784, 160, 1) 342
(8802, 160, 1) 341
(8820, 160, 

In [284]:
store_list[-1]

1115