In [1]:
import pandas as pd
import numpy as np
import os
import gc
import joblib
from datetime import datetime as dt

In [2]:
class Dataset():
    def __init__(self):
        prefix = ''
        suffix = ''
        self.dir = '../features/'
        
        
    def reduce_mem_usage(self,df):
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        start_mem = df.memory_usage().sum() / 1024**2
        for col in df.columns:
            col_type = df[col].dtypes
            if col_type in numerics:
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)  
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)    
        return df

    
    def get_features(self ,features = None):
        
        #作成した特徴量の取得
        if features == None:
            print('features not selected')
            exit(0)
        else:
            dfs = [pd.read_feather(f'.features/{f}.pickle') for f in features]
            dfs = reduce_mem_usage(dfs)
            
            return dfs
        
    def get_dataset(self):
        #生データの取得
        with open('../data/data_full.joblib', mode="rb") as f:
            self.data = joblib.load(f)
        
        
    def run(self):
            self.create_features()
            prefix = self.prefix + '_' if self.prefix else ''
            suffix = '_' + self.suffix if self.suffix else ''
            self.train.columns = prefix + self.train.columns + suffix
            self.test.columns = prefix + self.test.columns + suffix
        

In [13]:
class Feature(Dataset):
    #
        
        
    def __init__(self):
        super().__init__()
        self.file_dir = self.__class__.__name__ 
        if not os.path.exists(self.dir + self.file_dir):
            os.mkdir(self.dir + self.file_dir)
        
        
    
    def save(self ,feature):
        with open((self.dir + self.file_dir+'/'+'{0}.joblib'.format(feature.name), mode="wb") as f:
            joblib.dump(feature, f, compress=3)

        
    def create_features_SNAP_LAG(self):
        #作成する特徴量について記述
        
 

                
                

SyntaxError: invalid syntax (<ipython-input-13-8751e2535af7>, line 14)

In [3]:
class rolling_lag_Feature_shift1(Dataset):
    #単純な移動平均

        
        
    def __init__(self):
        super().__init__()
        self.file_dir = self.__class__.__name__
        if not os.path.exists(self.dir + self.file_dir):
            os.mkdir(self.dir + self.file_dir)
        
        
    
    def save(self ,feature):
        with open(self.dir + self.file_dir+'/'+'{0}.joblib'.format(feature.name), mode="wb") as f:
            joblib.dump(feature, f, compress=3)

        
    def create_features(self):
        DAYS_PRED = 1
        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_lag_median_t{size}"] = self.data.groupby(["id"]
                                            )["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).median())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_lag_median_t{size}"])
        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_lag_mean_t{size}"] = self.data.groupby(["id"]
                                            )["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).mean())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_lag_mean_t{size}"])
   
        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_lag_sum_t{size}"] = self.data.groupby(["id"]
                                            )["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).sum())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_lag_sum_t{size}"])
            
            
        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_lag_std_t{size}"] = self.data.groupby(["id"]
                                            )["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).std())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_lag_std_t{size}"])
            
        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_lag_max_t{size}"] = self.data.groupby(["id"]
                                            )["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).max())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_lag_max_t{size}"])

        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_lag_min_t{size}"] = self.data.groupby(["id"]
                                            )["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).min())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_lag_min_t{size}"])
            
            
        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_lag_min_t{size}"] = self.data.groupby(["id"]
                                            )["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).min())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_lag_min_t{size}"])

            
        for i in range(1,8):
            self.data['lag_'+str(i)] = self.data.groupby(['id'])['demand'].transform(lambda x: x.shift(i))
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data['lag_'+str(i)])
        
    

                
                

In [None]:
tmp = rolling_lag_Feature_shift1()
tmp.get_dataset()
tmp.create_features()
del tmp
gc.collect()

In [43]:
class calendar_feature(Dataset):
    #
        
        
    def __init__(self):
        super().__init__()
        self.file_dir = self.__class__.__name__ 
        if not os.path.exists(self.dir + self.file_dir):
            os.mkdir(self.dir + self.file_dir)
        
        
    
    def save(self ,feature):
        with open(self.dir + self.file_dir+'/'+'{0}.joblib'.format(feature.name), mode="wb") as f:
            joblib.dump(feature, f, compress=3)
        
    def create_feature(self):
        dt_col = "date"
        self.data[dt_col] = pd.to_datetime(self.data[dt_col])

        attrs = [
            "year",
            "quarter",
            "month",
            "week",
            "day",
            "dayofweek",
            "is_year_end",
            "is_year_start",
            "is_quarter_end",
            "is_quarter_start",
            "is_month_end",
            "is_month_start",]

        for attr in attrs:
            dtype = np.int16 if attr == "year" else np.int8
            self.data[attr] = getattr(self.data[dt_col].dt, attr).astype(dtype)
            
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[attr])
        

In [44]:
tmp = calendar_feature()
tmp.get_dataset()
tmp.create_feature()
del tmp
gc.collect()

NameError: name 'tmp' is not defined

In [None]:
self.data['price_max'] = self.data.groupby(['store_id','item_id'])['sell_price'].transform('max')
self.data['price_min'] = self.data.groupby(['store_id','item_id'])['sell_price'].transform('min')
self.data['price_std'] = self.data.groupby(['store_id','item_id'])['sell_price'].transform('std')
self.data['price_mean'] = self.data.groupby(['store_id','item_id'])['sell_price'].transform('mean')

# and do price normalization (min/max scaling)
self.data['price_norm'] = self.data['sell_price']/self.data['price_max']

# Some items are can be inflation dependent
# and some items are very "stable"
self.data['price_nunique'] = self.data.groupby(['store_id','item_id'])['sell_price'].transform('nunique')
self.data['item_nunique'] = self.data.groupby(['store_id','sell_price'])['item_id'].transform('nunique')


# I would like some "rolling" aggregations
# but would like months and years as "window"
calendar_df = pd.read_pickle('../../../tmp_M5/calender.pickle')
calendar_prices = calendar_df[['wm_yr_wk','month','year']]
calendar_prices = calendar_prices.drop_duplicates(subset=['wm_yr_wk'])
prices_df = seld.data.merge(calendar_prices[['wm_yr_wk','month','year']], on=['wm_yr_wk'], how='left')
del calendar_prices

# Now we can add price "momentum" (some sort of)
# Shifted by week 
# by month mean
# by year mean
self.data['price_momentum'] = self.data['sell_price']/self.data.groupby(['store_id','item_id'])['sell_price'].transform(lambda x: x.shift(1))
self.data['price_momentum_m'] = self.data['sell_price']/self.data.groupby(['store_id','item_id','month'])['sell_price'].transform('mean')
self.data['price_momentum_y'] = self.data['sell_price']/self.data.groupby(['store_id','item_id','year'])['sell_price'].transform('mean')

In [53]:
!

/bin/sh: ../../: is a directory


In [54]:
pd.read_pickle('../../../tmp_M5/calender.pickle')

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1964,2016-06-15,11620,Wednesday,5,6,2016,d_1965,,,,,0,1,1
1965,2016-06-16,11620,Thursday,6,6,2016,d_1966,,,,,0,0,0
1966,2016-06-17,11620,Friday,7,6,2016,d_1967,,,,,0,0,0
1967,2016-06-18,11621,Saturday,1,6,2016,d_1968,,,,,0,0,0


In [None]:
self.data['price_max'] = self.data.groupby(['store_id','item_id'])['sell_price'].transform('max')
self.data['price_min'] = self.data.groupby(['store_id','item_id'])['sell_price'].transform('min')
self.data['price_std'] = self.data.groupby(['store_id','item_id'])['sell_price'].transform('std')
self.data['price_mean'] = self.data.groupby(['store_id','item_id'])['sell_price'].transform('mean')

# and do price normalization (min/max scaling)
self.data['price_norm'] = self.data['sell_price']/self.data['price_max']

# Some items are can be inflation dependent
# and some items are very "stable"
self.data['price_nunique'] = prices_df.groupby(['store_id','item_id'])['sell_price'].transform('nunique')
self.data['item_nunique'] = prices_df.groupby(['store_id','sell_price'])['item_id'].transform('nunique')

# I would like some "rolling" aggregations
# but would like months and years as "window"
calendar_prices = calendar_df[['wm_yr_wk','month','year']]
calendar_prices = calendar_prices.drop_duplicates(subset=['wm_yr_wk'])
prices_df = prices_df.merge(calendar_prices[['wm_yr_wk','month','year']], on=['wm_yr_wk'], how='left')
del calendar_prices

# Now we can add price "momentum" (some sort of)
# Shifted by week 
# by month mean
# by year mean
prices_df['price_momentum'] = prices_df['sell_price']/prices_df.groupby(['store_id','item_id'])['sell_price'].transform(lambda x: x.shift(1))
prices_df['price_momentum_m'] = prices_df['sell_price']/prices_df.groupby(['store_id','item_id','month'])['sell_price'].transform('mean')
prices_df['price_momentum_y'] = prices_df['sell_price']/prices_df.groupby(['store_id','item_id','year'])['sell_price'].transform('mean')

In [59]:
class price_feature(Dataset):
    #
        
        
    def __init__(self):
        super().__init__()
        self.file_dir = self.__class__.__name__ 
        if not os.path.exists(self.dir + self.file_dir):
            os.mkdir(self.dir + self.file_dir)
        
        
    
    def save(self ,feature):
        with open(self.dir + self.file_dir+'/'+'{0}.joblib'.format(feature.name), mode="wb") as f:
            joblib.dump(feature, f, compress=3)
        
    def create_price_feature(self):
        
        DAYS_PRED = 28
        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_price_mean_t{size}"] = self.data.groupby(["id"]
                                            )["sell_price"].transform(lambda x: x.rolling(size).mean())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_price_mean_t{size}"])
            
        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_price_std_t{size}"] = self.data.groupby(["id"]
                                            )["sell_price"].transform(lambda x: x.rolling(size).std())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_price_std_t{size}"])
            
        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_price_max_t{size}"] = self.data.groupby(["id"]
                                            )["sell_price"].transform(lambda x: x.rolling(size).max())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_price_max_t{size}"])

        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_price_min_t{size}"] = self.data.groupby(["id"]
                                            )["sell_price"].transform(lambda x: x.rolling(size).min())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_price_min_t{size}"])
        
    def create_price_feature2(self):
        self.data['price_max'] = self.data.groupby(['store_id','item_id'])['sell_price'].transform('max')
        self.data['price_min'] = self.data.groupby(['store_id','item_id'])['sell_price'].transform('min')
        self.data['price_std'] = self.data.groupby(['store_id','item_id'])['sell_price'].transform('std')
        self.data['price_mean'] = self.data.groupby(['store_id','item_id'])['sell_price'].transform('mean')

        # and do price normalization (min/max scaling)
        self.data['price_norm'] = self.data['sell_price']/self.data['price_max']

        # Some items are can be inflation dependent
        # and some items are very "stable"
        self.data['price_nunique'] = self.data.groupby(['store_id','item_id'])['sell_price'].transform('nunique')
        self.data['item_nunique'] = self.data.groupby(['store_id','sell_price'])['item_id'].transform('nunique')


        # I would like some "rolling" aggregations
        # but would like months and years as "window"
        calendar_df = pd.read_pickle('../../../tmp_M5/calender.pickle')
        calendar_prices = calendar_df[['wm_yr_wk','month','year']]
        calendar_prices = calendar_prices.drop_duplicates(subset=['wm_yr_wk'])
        self.data = self.data.merge(calendar_prices[['wm_yr_wk','month','year']], on=['wm_yr_wk'], how='left')
        del calendar_prices

        # Now we can add price "momentum" (some sort of)
        # Shifted by week 
        # by month mean
        # by year mean
        self.data['price_momentum'] = self.data['sell_price']/self.data.groupby(['store_id','item_id'])['sell_price'].transform(lambda x: x.shift(1))
        self.data['price_momentum_m'] = self.data['sell_price']/self.data.groupby(['store_id','item_id','month'])['sell_price'].transform('mean')
        self.data['price_momentum_y'] = self.data['sell_price']/self.data.groupby(['store_id','item_id','year'])['sell_price'].transform('mean')
        
        
        self.data = self.reduce_mem_usage(self.data)
        self.save(feature=self.data['price_max'])
        self.save(feature=self.data['price_min'])
        self.save(feature=self.data['price_std'])
        self.save(feature=self.data['price_mean'])
        
        self.save(feature=self.data['price_norm'])
        self.save(feature=self.data['price_nunique'])
        self.save(feature=self.data['item_nunique'])
        self.save(feature=self.data['price_momentum'])
        self.save(feature=self.data['price_momentum_m'])
        self.save(feature=self.data['price_momentum_y'])

In [60]:
tmp = price_feature()
tmp.get_dataset()
tmp.create_price_feature2()
del tmp
gc.collect()

0

In [54]:
tmp.data.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,demand,part,date,wm_yr_wk,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_008_CA_1_validation,1444,3,1,0,0,d_1,12.0,train,2011-01-29,11101,1,30,4,4,2,0,0,0,0.459961
1,HOBBIES_1_009_CA_1_validation,1445,3,1,0,0,d_1,2.0,train,2011-01-29,11101,1,30,4,4,2,0,0,0,1.55957
2,HOBBIES_1_015_CA_1_validation,1451,3,1,0,0,d_1,4.0,train,2011-01-29,11101,1,30,4,4,2,0,0,0,0.700195
3,HOBBIES_1_016_CA_1_validation,1452,3,1,0,0,d_1,5.0,train,2011-01-29,11101,1,30,4,4,2,0,0,0,0.700195
4,HOBBIES_1_022_CA_1_validation,1458,3,1,0,0,d_1,2.0,train,2011-01-29,11101,1,30,4,4,2,0,0,0,6.859375


In [44]:
class Ordered_TS_mean_encoding(Dataset):
    #
        
        
    def __init__(self):
        super().__init__()
        self.file_dir = self.__class__.__name__ 
        if not os.path.exists(self.dir + self.file_dir):
            os.mkdir(self.dir + self.file_dir)
        
        
    
    def save(self ,feature):
        with open(self.dir + self.file_dir+'/'+'{0}.joblib'.format(feature.name), mode="wb") as f:
            joblib.dump(feature, f, compress=3)
        
    def create_Ordered_TS_mean_encoding(self):
        
        
        self.data['demand'] = self.data['demand'].astype(np.int32)

        tmp = self.data.groupby(['id']).agg({'demand': ['cumsum', 'cumcount']})
        self.data['Ordered_TS_id'] = (tmp[('demand', 'cumsum')] / (tmp[('demand', 'cumcount')] + 1))
        self.data['Ordered_TS_id'] = self.data.groupby(['id'])['Ordered_TS_id'].shift(28)
        self.data = self.reduce_mem_usage(self.data)
        self.save(feature=self.data['Ordered_TS_id'])
        
        
        tmp = self.data.groupby(['store_id']).agg({'demand': ['cumsum', 'cumcount']})
        self.data['Ordered_TS_store'] = (tmp[('demand', 'cumsum')] / (tmp[('demand', 'cumcount')] + 1))
        self.data['Ordered_TS_store'] = self.data.groupby(['store_id'])['Ordered_TS_store'].shift(28)
        self.data = self.reduce_mem_usage(self.data)
        self.save(feature=self.data['Ordered_TS_store'])

        tmp = self.data.groupby(['cat_id']).agg({'demand': ['cumsum', 'cumcount']})
        self.data['Ordered_TS_cat'] = (tmp[('demand', 'cumsum')] / (tmp[('demand', 'cumcount')] + 1))
        self.data['Ordered_TS_cat'] = self.data.groupby(['cat_id'])['Ordered_TS_cat'].shift(28)
        self.data = self.reduce_mem_usage(self.data)
        self.save(feature=self.data['Ordered_TS_cat'])

        
        tmp = self.data.groupby(['state_id']).agg({'demand': ['cumsum', 'cumcount']})
        self.data['Ordered_TS_state'] = (tmp[('demand', 'cumsum')] / (tmp[('demand', 'cumcount')] + 1))
        self.data['Ordered_TS_state'] = self.data.groupby(['state_id'])['Ordered_TS_state'].shift(28)
        self.data = self.reduce_mem_usage(self.data)
        self.save(feature=self.data['Ordered_TS_state'])

        tmp = self.data.groupby(['dept_id']).agg({'demand': ['cumsum', 'cumcount']})
        self.data['Ordered_TS_dept'] = (tmp[('demand', 'cumsum')] / (tmp[('demand', 'cumcount')] + 1))
        self.data['Ordered_TS_dept'] = self.data.groupby(['dept_id'])['Ordered_TS_dept'].shift(28)
        self.data = self.reduce_mem_usage(self.data)
        self.save(feature=self.data['Ordered_TS_dept'])
        
        

In [45]:
tmp = Ordered_TS_mean_encoding()
tmp.get_dataset()
tmp.create_Ordered_TS_mean_encoding()
del tmp
gc.collect()

1346

In [None]:
with

In [12]:
class SNAP_Feature(Dataset):
    #
        
        
    def __init__(self):
        super().__init__()
        self.file_dir = self.__class__.__name__ 
        if not os.path.exists(self.dir + self.file_dir):
            os.mkdir(self.dir + self.file_dir)
        
        
    
    def save(self ,feature):
        with open(self.dir + self.file_dir+'/'+'{0}.joblib'.format(feature.name), mode="wb") as f:
            joblib.dump(feature, f, compress=3)
        
    def create_features_SNAP_LAG(self):
        tmp = self.data.groupby(['date'])['snap_CA'].mean()
        lag_CA = []
        tmp2 = 0
        for i in tmp.values:
            if i == 0:
                tmp2 += 1
                lag_CA.append(tmp2)
            else:
                lag_CA.append(0)
                tmp2 = 0
                    
        tmp = self.data.groupby(['date'])['snap_TX'].mean()
        lag_TX = []
        tmp2 = 0
        for i in tmp.values:
            if i == 0:
                tmp2 += 1
                lag_TX.append(tmp2)
            else:
                lag_TX.append(0)
                tmp2 = 0
                
        tmp = self.data.groupby(['date'])['snap_WI'].mean()
        lag_WI = []
        tmp2 = 0
        for i in tmp.values:
            if i == 0:
                tmp2 += 1
                lag_WI.append(tmp2)
            else:
                lag_WI.append(0)
                tmp2 = 0
        self.data =  self.data['date']
        
        a= pd.DataFrame({'Lag_SNAP_CA':lag_CA,'Lag_SNAP_TX':lag_TX,'Lag_SNAP_WI':lag_WI,'date':tmp.index})
        self.data = pd.merge(self.data,a,how = 'left' ,on='date')
        self.data = self.reduce_mem_usage(self.data)
        self.save(feature=self.data['Lag_SNAP_CA'])
        self.save(feature=self.data['Lag_SNAP_TX'])
        self.save(feature=self.data['Lag_SNAP_WI'])
        

                
                
        
 

                
                

In [13]:
tmp = SNAP_Feature()
tmp.get_dataset()
tmp.create_features_SNAP_LAG()
del tmp
gc.collect()

3542

In [14]:
class rolling_lag_store_Feature(Dataset):
    #単純な移動平均

        
        
    def __init__(self):
        super().__init__()
        self.file_dir = self.__class__.__name__
        if not os.path.exists(self.dir + self.file_dir):
            os.mkdir(self.dir + self.file_dir)
        
        
    
    def save(self ,feature):
        with open(self.dir + self.file_dir+'/'+'{0}.joblib'.format(feature.name), mode="wb") as f:
            joblib.dump(feature, f, compress=3)

        
    def create_features(self):
        DAYS_PRED = 28
        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_store_median_t{size}"] = self.data.groupby(["store_id"]
                                            )["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).median())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_store_median_t{size}"])
        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_store_mean_t{size}"] = self.data.groupby(["store_id"]
                                            )["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).mean())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_store_mean_t{size}"])
   
        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_store_sum_t{size}"] = self.data.groupby(["store_id"]
                                            )["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).sum())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_store_sum_t{size}"])
            
            
        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_store_std_t{size}"] = self.data.groupby(["store_id"]
                                            )["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).std())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_store_std_t{size}"])
            
        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_store_max_t{size}"] = self.data.groupby(["store_id"]
                                            )["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).max())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_store_max_t{size}"])

        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_store_min_t{size}"] = self.data.groupby(["store_id"]
                                            )["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).min())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_store_min_t{size}"])



                
                

In [15]:
tmp = rolling_lag_store_Feature()
tmp.get_dataset()
tmp.create_features()
del tmp
gc.collect()

0

In [16]:
class rolling_lag_dept_Feature(Dataset):
    #単純な移動平均

        
        
    def __init__(self):
        super().__init__()
        self.file_dir = self.__class__.__name__
        if not os.path.exists(self.dir + self.file_dir):
            os.mkdir(self.dir + self.file_dir)
        
        
    
    def save(self ,feature):
        with open(self.dir + self.file_dir+'/'+'{0}.joblib'.format(feature.name), mode="wb") as f:
            joblib.dump(feature, f, compress=3)

        
    def create_features(self):
        DAYS_PRED = 28
        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_dept_median_t{size}"] = self.data.groupby(["dept_id"]
                                            )["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).median())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_dept_median_t{size}"])
        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_dept_mean_t{size}"] = self.data.groupby(["dept_id"]
                                            )["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).mean())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_dept_mean_t{size}"])
   
        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_dept_sum_t{size}"] = self.data.groupby(["dept_id"]
                                            )["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).sum())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_dept_sum_t{size}"])
            
            
        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_dept_std_t{size}"] = self.data.groupby(["dept_id"]
                                            )["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).std())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_dept_std_t{size}"])
            
        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_dept_max_t{size}"] = self.data.groupby(["dept_id"]
                                            )["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).max())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_dept_max_t{size}"])

        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_dept_min_t{size}"] = self.data.groupby(["dept_id"]
                                            )["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).min())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_dept_min_t{size}"])



                
                

In [17]:
tmp = rolling_lag_dept_Feature()
tmp.get_dataset()
tmp.create_features()
del tmp
gc.collect()

0

In [18]:
class rolling_lag_Feature(Dataset):
    #単純な移動平均

        
        
    def __init__(self):
        super().__init__()
        self.file_dir = self.__class__.__name__
        if not os.path.exists(self.dir + self.file_dir):
            os.mkdir(self.dir + self.file_dir)
        
        
    
    def save(self ,feature):
        with open(self.dir + self.file_dir+'/'+'{0}.joblib'.format(feature.name), mode="wb") as f:
            joblib.dump(feature, f, compress=3)

        
    def create_features(self):
        DAYS_PRED = 28
        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_lag_median_t{size}"] = self.data.groupby(["id"]
                                            )["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).median())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_lag_median_t{size}"])
        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_lag_mean_t{size}"] = self.data.groupby(["id"]
                                            )["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).mean())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_lag_mean_t{size}"])
   
        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_lag_sum_t{size}"] = self.data.groupby(["id"]
                                            )["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).sum())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_lag_sum_t{size}"])
            
            
        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_lag_std_t{size}"] = self.data.groupby(["id"]
                                            )["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).std())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_lag_std_t{size}"])
            
        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_lag_max_t{size}"] = self.data.groupby(["id"]
                                            )["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).max())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_lag_max_t{size}"])

        for size in [7, 28, 56, 84, 168]:
            self.data[f"rolling_lag_min_t{size}"] = self.data.groupby(["id"]
                                            )["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).min())
            self.data = self.reduce_mem_usage(self.data)
            self.save(feature=self.data[f"rolling_lag_min_t{size}"])



                
                

In [19]:
tmp = rolling_lag_Feature()
tmp.get_dataset()
tmp.create_features()
del tmp
gc.collect()

0

In [4]:
def denoise_signal(x, wavelet='db4', level=1):
    coeff = pywt.wavedec(x, wavelet, mode="per")
    sigma = (1/0.6745) * maddest(coeff[-level])
    uthresh = sigma * np.sqrt(2*np.log(len(x)))
    coeff[1:] = (pywt.threshold(i, value=uthresh, mode='hard') for i in coeff[1:])
    return pywt.waverec(coeff, wavelet, mode='per')

def maddest(x):
    return np.median(np.abs(x-np.median(x)))