In [303]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mlutils.features import *

In [304]:
# let's create a dummy dataset
store_ids = np.arange(1,11)
month_indicator = np.arange(1,6)
values = np.random.randint(100, size = len(store_ids)*len(month_indicator))

data = []
i = 0
for s in store_ids:
    for m in month_indicator:
        data.append([s, m, values[i]])
        i = i + 1

df = pd.DataFrame(data)
df.columns = ['Store','Month','Sales']
df.head(20)

Unnamed: 0,Store,Month,Sales
0,1,1,9
1,1,2,65
2,1,3,79
3,1,4,92
4,1,5,74
5,2,1,46
6,2,2,21
7,2,3,5
8,2,4,60
9,2,5,99


In [305]:
# to test our functions, let's reorder everything
df = df.sample(frac=1)
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,Store,Month,Sales
0,10,4,47
1,7,1,80
2,1,3,79
3,10,3,50
4,4,2,27
5,1,4,92
6,4,5,25
7,7,3,51
8,10,5,51
9,3,4,53


In [328]:
%%time
def getPreviousValue(df, value_column, group_column, iterator_column):
    df = df.sort_values([group_column, iterator_column])
    df2 = df[[value_column, group_column, iterator_column ]].copy()
    df2.columns = ['Previous'+value_column, group_column, iterator_column]
    df2[iterator_column] = df2[iterator_column].apply(lambda x: x + 1)
    return df.merge(df2, how='left', on=[group_column, iterator_column])

df = getPreviousValue(df,'Sales', 'Store', 'Month')
df.head(10)

CPU times: user 9.18 ms, sys: 96 µs, total: 9.28 ms
Wall time: 8.54 ms


In [329]:
%%time
df = df.sort_values(['Store', 'Month'])
df['new'] = df.Sales.shift()
df.loc[df.Month == 1,'new'] = 0
df

CPU times: user 9.02 ms, sys: 83 µs, total: 9.11 ms
Wall time: 8.55 ms


In [316]:
temp = df.set_index('Month')
temp['Cum'] = temp.groupby(['Store'])['PreviousSales'].cumsum()
#temp['Cum'] = temp['Cum'].shift()
temp.reset_index(inplace=True)
#temp.loc[temp['Month'] == 1,'Cum'] = np.nan
temp

Unnamed: 0,Month,Store,Sales,PreviousSales,Cum
0,1,1,9,,
1,2,1,65,9.0,9.0
2,3,1,79,65.0,74.0
3,4,1,92,79.0,153.0
4,5,1,74,92.0,245.0
5,1,2,46,,
6,2,2,21,46.0,46.0
7,3,2,5,21.0,67.0
8,4,2,60,5.0,72.0
9,5,2,99,60.0,132.0


In [317]:
temp = df.set_index('Month')
temp['Avg'] = temp.groupby(['Store'])['PreviousSales'].rolling(5, min_periods=1).mean().values
temp.reset_index(inplace=True)
#temp['Avg'] = temp['Avg'].shift()
#temp.loc[temp['Month'] == 1,'Avg'] = np.nan
temp

Unnamed: 0,Month,Store,Sales,PreviousSales,Avg
0,1,1,9,,
1,2,1,65,9.0,9.0
2,3,1,79,65.0,37.0
3,4,1,92,79.0,51.0
4,5,1,74,92.0,61.25
5,1,2,46,,
6,2,2,21,46.0,46.0
7,3,2,5,21.0,33.5
8,4,2,60,5.0,24.0
9,5,2,99,60.0,33.0


In [309]:
temp = df.set_index('Month')
temp['Std'] = temp.groupby(['Store'])['Sales'].rolling(5, min_periods=1).std().values
temp.reset_index(inplace=True)
temp['Std'] = temp['Std'].shift()
temp.loc[temp['Month'] == 1,'Std'] = np.nan
temp

Unnamed: 0,Month,Store,Sales,PreviousSales,Std
0,1,1,9,,
1,2,1,65,9.0,
2,3,1,79,65.0,39.59798
3,4,1,92,79.0,37.040518
4,5,1,74,92.0,36.536511
5,1,2,46,,
6,2,2,21,46.0,
7,3,2,5,21.0,17.67767
8,4,2,60,5.0,20.663978
9,5,2,99,60.0,24.671171
