In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch import nn
import sklearn
from torch.utils.data import DataLoader



### Load data
Load the training and test data that was created in the notebook 'create_data_sets.ipynb'

In [2]:
train_big = pd.read_csv('datasets/train_big.csv')
# select only interesting columns to train on
train_big = train_big.drop(['Unnamed: 0','id','type','family'],axis=1)
train_big.index = train_big['date']
train_big = train_big.drop('date',axis=1)
# permute columns (thats just my prefered order without any reason)
col_lst = ['store_nbr','family_id','onpromotion','day','weekday','month','year','holiday','oilprice','sales']
train_big = train_big[col_lst]

# do the same for the test frame

test_big = pd.read_csv('datasets/test_big.csv')
# select only interesting columns to train on
test_big = test_big.drop(['Unnamed: 0','id','type','family'],axis=1)
test_big.index = test_big['date']
test_big = test_big.drop('date',axis=1)
# permute columns (thats just my prefered order without any reason)
col_lst = ['store_nbr','family_id','onpromotion','day','weekday','month','year','holiday','oilprice']
test_big = test_big[col_lst]

In [3]:
train_big

Unnamed: 0_level_0,store_nbr,family_id,onpromotion,day,weekday,month,year,holiday,oilprice,sales
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2013-01-01,1,0,0,1,1,1,2013,2,93.14,0.000
2013-01-01,1,1,0,1,1,1,2013,2,93.14,0.000
2013-01-01,1,2,0,1,1,1,2013,2,93.14,0.000
2013-01-01,1,3,0,1,1,1,2013,2,93.14,0.000
2013-01-01,1,4,0,1,1,1,2013,2,93.14,0.000
...,...,...,...,...,...,...,...,...,...,...
2017-08-15,9,28,0,15,1,8,2017,0,47.57,438.133
2017-08-15,9,29,1,15,1,8,2017,0,47.57,154.553
2017-08-15,9,30,148,15,1,8,2017,0,47.57,2419.729
2017-08-15,9,31,8,15,1,8,2017,0,47.57,121.000


In [4]:
test_big

Unnamed: 0_level_0,store_nbr,family_id,onpromotion,day,weekday,month,year,holiday,oilprice
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-08-16,1,0,0,16,2,8,2017,0,46.80
2017-08-16,1,1,0,16,2,8,2017,0,46.80
2017-08-16,1,2,2,16,2,8,2017,0,46.80
2017-08-16,1,3,20,16,2,8,2017,0,46.80
2017-08-16,1,4,0,16,2,8,2017,0,46.80
...,...,...,...,...,...,...,...,...,...
2017-08-31,9,28,1,31,3,8,2017,0,47.26
2017-08-31,9,29,0,31,3,8,2017,0,47.26
2017-08-31,9,30,1,31,3,8,2017,0,47.26
2017-08-31,9,31,9,31,3,8,2017,0,47.26


In [4]:
#split train_big in train and test set
test_start = "2016-08-16"

df_train = train_big.loc[:test_start].copy()
df_test = train_big.loc[test_start:].copy()

#compte percentage of training test split
p = len(df_train)/len(train_big)
print('Size of training set is %s percent of total data.'%np.round(p,2))

Size of training set is 0.78 percent of total data.


### Standardize data
Hier schon überlegen ob man nur (stor_nbr,family_id) betrachted oder wie man am besten standardisiert.

In [5]:
from sklearn import preprocessing
store_nbr = 1
family_id = 0

In [6]:
X = df_train.loc[(df_train['store_nbr']==store_nbr)&(df_train['family_id']==family_id)]
scaler = preprocessing.StandardScaler().fit(X)
X_scaled = scaler.transform(X)
df_train_scaled  = pd.DataFrame(X_scaled,index=X.index,columns = X.columns)

X_test = df_test.loc[(df_test['store_nbr']==store_nbr)&(df_test['family_id']==family_id)]
scaler = preprocessing.StandardScaler().fit(X_test)
X_test_scaled = scaler.transform(X_test)
df_test_scaled  = pd.DataFrame(X_test_scaled,index=X_test.index,columns = X_test.columns)


Create data class such that pytorch dataloader can handle the data

In [9]:
import torch
from torch.utils.data import Dataset, DataLoader

class SequenceDataset(Dataset):
    def __init__(self, dataframe, target, features, sequence_length=5):
        self.features = features
        self.target = target
        self.sequence_length = sequence_length
        self.y = torch.tensor(dataframe[target].values).float()
        self.X = torch.tensor(dataframe[features].values).float()

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, i): 
        if i >= self.sequence_length - 1:
            i_start = i - self.sequence_length + 1
            x = self.X[i_start:(i + 1), :]
        else:
            padding = self.X[0].repeat(self.sequence_length - i - 1, 1)
            x = self.X[0:(i + 1), :]
            x = torch.cat((padding, x), 0)

        return x, self.y[i]


In [11]:
features = ['onpromotion','day','weekday','month','year','holiday','oilprice']
train_dataset = SequenceDataset(df_train_scaled,'sales',features)

In [12]:
X, y = train_dataset[3]

In [13]:
torch.manual_seed(99)

train_loader = DataLoader(train_dataset, batch_size=3, shuffle=True)

X, y = next(iter(train_loader))
print(X.shape)
print(X)


torch.Size([3, 5, 7])
tensor([[[-1.0350e-01,  1.1816e+00, -9.9830e-01,  1.4487e+00, -1.2701e+00,
          -6.3332e-01,  7.6071e-01],
         [-1.0350e-01,  1.2953e+00, -4.9877e-01,  1.4487e+00, -1.2701e+00,
          -6.3332e-01,  7.0973e-01],
         [-1.0350e-01,  1.4091e+00,  7.5629e-04,  1.4487e+00, -1.2701e+00,
          -6.3332e-01,  7.1910e-01],
         [-1.0350e-01,  1.5228e+00,  5.0028e-01,  1.4487e+00, -1.2701e+00,
          -6.3332e-01,  7.2848e-01],
         [-1.0350e-01,  1.6366e+00,  9.9981e-01,  1.4487e+00, -1.2701e+00,
           1.2425e+00,  7.4172e-01]],

        [[-1.0350e-01,  1.6366e+00,  7.5629e-04,  1.1515e+00, -3.2698e-01,
          -6.3332e-01,  2.9781e-01],
         [-1.0350e-01,  1.7503e+00,  5.0028e-01,  1.1515e+00, -3.2698e-01,
          -6.3332e-01,  2.7794e-01],
         [-1.0350e-01, -1.6621e+00,  9.9981e-01,  1.4487e+00, -3.2698e-01,
           1.2425e+00,  2.5595e-01],
         [-1.0350e-01, -1.5484e+00,  1.4993e+00,  1.4487e+00, -3.2698e-01,
     

In [19]:
X[0].shape

torch.Size([5, 7])