In [1]:
import os

import numpy as np
import pandas as pd

In [22]:
class Dataset():
    """Base class for feature builders.

    Holds the feature directory and column prefix/suffix, loads the raw data
    and previously saved features, and drives feature creation via ``run``.
    Subclasses are expected to override ``create_features``.
    """

    def __init__(self):
        # Kept on the instance because run() reads self.prefix / self.suffix.
        self.prefix = ''
        self.suffix = ''
        self.dir = './features/'

    @staticmethod
    def reduce_mem_usage(df, verbose=True):
        """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

        The frame is modified in place and also returned.

        Args:
            df: DataFrame whose int/float columns should be downcast.
            verbose: when True, print the resulting size and the reduction.

        Returns:
            The same DataFrame object with downcast columns.
        """
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        int_candidates = (np.int8, np.int16, np.int32, np.int64)
        float_candidates = (np.float16, np.float32)

        start_mem = df.memory_usage().sum() / 1024**2
        for col in df.columns:
            col_type = df[col].dtypes
            if col_type not in numerics:
                continue
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                # First (smallest) integer type whose open range contains the
                # column; if none matches the column is left untouched.
                for candidate in int_candidates:
                    limits = np.iinfo(candidate)
                    if c_min > limits.min and c_max < limits.max:
                        df[col] = df[col].astype(candidate)
                        break
            else:
                # NOTE: only the value range is checked, so a float16/float32
                # downcast can lose precision.
                target = np.float64
                for candidate in float_candidates:
                    limits = np.finfo(candidate)
                    if c_min > limits.min and c_max < limits.max:
                        target = candidate
                        break
                df[col] = df[col].astype(target)
        end_mem = df.memory_usage().sum() / 1024**2
        if verbose:
            # Guard the percentage against a zero-sized starting frame.
            reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
            print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
        return df

    def get_features(self, features=None):
        """Load previously saved features and return them as one DataFrame.

        Args:
            features: a feature name or an iterable of feature names. Each name
                is a path relative to ``self.dir`` without the ``.pickle``
                suffix (e.g. ``'rolling_lag_Feature/rolling_mean_t7'`` for a
                file written under ``self.dir + file_dir``).

        Returns:
            A DataFrame with the loaded features concatenated column-wise and
            downcast by ``reduce_mem_usage``.

        Raises:
            ValueError: if no feature name is given.
        """
        if isinstance(features, str):
            features = [features]
        if features is None or len(features) == 0:
            # Raise instead of exit(0): exit() would shut down the notebook kernel.
            raise ValueError('features not selected')

        # Features are written with to_pickle, so they are read back with
        # read_pickle. NOTE: unpickling can execute arbitrary code; only load
        # feature files produced by this project.
        loaded = [pd.read_pickle(f'{self.dir}{f}.pickle') for f in features]
        combined = pd.concat(loaded, axis=1)
        return self.reduce_mem_usage(combined)

    def get_dataset(self):
        """Load the raw data from ``./data_full.pickle`` into ``self.data``."""
        self.data = pd.read_pickle('./data_full.pickle')

    def create_features(self):
        """Build the features; must be overridden by subclasses."""
        raise NotImplementedError('subclasses must implement create_features()')

    def run(self):
        """Create the features, then add the prefix/suffix to the column names.

        NOTE(review): this reads ``self.train`` and ``self.test``, which nothing
        in this class assigns; presumably ``create_features`` is expected to
        set them -- confirm before relying on ``run``.
        """
        self.create_features()
        prefix = self.prefix + '_' if self.prefix else ''
        suffix = '_' + self.suffix if self.suffix else ''
        self.train.columns = prefix + self.train.columns + suffix
        self.test.columns = prefix + self.test.columns + suffix
        

In [17]:
class rolling_lag_Feature(Dataset):
    """Lagged rolling-window statistics of ``demand``, computed per ``id``.

    For every statistic in ``STATISTICS`` and every window in ``WINDOW_SIZES``
    the demand of each ``id`` group is shifted by ``DAYS_PRED`` rows and then
    aggregated over a rolling window. Each result is stored in ``self.data`` as
    ``rolling_<stat>_t<size>`` and pickled under ``self.dir + file_dir``.
    """

    # Number of rows each group's demand is shifted before the window is applied.
    DAYS_PRED = 28
    # Rolling window lengths (in rows).
    WINDOW_SIZES = (7, 30, 60, 90, 180)
    # Names of the Rolling aggregation methods, in the order features are built.
    STATISTICS = ('mean', 'sum', 'std', 'max', 'min')

    def __init__(self):
        super().__init__()
        # Features are written into a sub-directory named after the class.
        self.file_dir = self.__class__.__name__

    def save(self, feature):
        """Pickle ``feature`` to ``<self.dir><file_dir>/<feature.name>.pickle``.

        The target directory is created when it does not exist yet.
        """
        target_dir = os.path.join(self.dir, self.file_dir)
        os.makedirs(target_dir, exist_ok=True)
        feature.to_pickle(os.path.join(target_dir, '{0}.pickle'.format(feature.name)))

    def create_features(self):
        """Compute every lagged rolling feature on ``self.data`` and save it.

        Requires ``self.data`` (see ``get_dataset``) with ``id`` and ``demand``
        columns.
        """
        demand_by_id = self.data.groupby(["id"])["demand"]
        for stat in self.STATISTICS:
            for size in self.WINDOW_SIZES:
                column = f"rolling_{stat}_t{size}"
                # Look up the aggregation by name so that "mean" really uses
                # Rolling.mean() (it was previously computed with sum()).
                self.data[column] = demand_by_id.transform(
                    lambda x: getattr(x.shift(self.DAYS_PRED).rolling(size), stat)()
                )
                self.save(feature=self.data[column])


                
                

In [18]:
# Build the lagged rolling-window demand features and pickle each one.
tmp = rolling_lag_Feature()
tmp.get_dataset()  # loads ./data_full.pickle into tmp.data
tmp.create_features()  # adds the rolling_* columns to tmp.data and saves them

In [25]:
class wavelet_lag_Feature(Dataset):
    """Lagged rolling-window statistics of wavelet-denoised ``demand`` per ``id``.

    ``create_features`` first replaces ``self.data['demand']`` with a denoised
    version, then builds ``wl_rolling_<stat>_t<size>`` columns from it and
    pickles each one under ``self.dir + file_dir``.
    """

    # Number of rows each group's demand is shifted before the window is applied.
    DAYS_PRED = 28
    # Rolling window lengths (in rows).
    WINDOW_SIZES = (7, 30, 60, 90, 180)
    # Names of the Rolling aggregation methods, in the order features are built.
    STATISTICS = ('mean', 'sum', 'std', 'max', 'min')

    @staticmethod
    def maddest(x):
        """Return the median absolute deviation of ``x`` around its median."""
        return np.median(np.abs(x - np.median(x)))

    @staticmethod
    def denoise_signal(x, wavelet='db4', level=1):
        """Denoise ``x`` by hard-thresholding its wavelet detail coefficients.

        Args:
            x: 1-D array-like signal.
            wavelet: wavelet name passed to PyWavelets.
            level: which detail level (counted from the finest) is used to
                estimate the noise scale.

        Returns:
            The reconstructed signal, with the same length as ``x``.
        """
        # The notebook's import cell does not import pywt, so it is imported
        # here to keep this method usable on a fresh kernel.
        import pywt

        coeff = pywt.wavedec(x, wavelet, mode="per")
        # Noise scale estimated from the MAD of the chosen detail coefficients.
        sigma = (1 / 0.6745) * wavelet_lag_Feature.maddest(coeff[-level])
        uthresh = sigma * np.sqrt(2 * np.log(len(x)))
        coeff[1:] = [pywt.threshold(c, value=uthresh, mode='hard') for c in coeff[1:]]
        # Periodization mode can reconstruct one extra sample for odd-length
        # input; trim so the result can be assigned back to the source column.
        return pywt.waverec(coeff, wavelet, mode='per')[:len(x)]

    def __init__(self):
        super().__init__()
        # Features are written into a sub-directory named after the class.
        self.file_dir = self.__class__.__name__

    def save(self, feature):
        """Pickle ``feature`` to ``<self.dir><file_dir>/<feature.name>.pickle``.

        The target directory is created when it does not exist yet.
        """
        target_dir = os.path.join(self.dir, self.file_dir)
        os.makedirs(target_dir, exist_ok=True)
        feature.to_pickle(os.path.join(target_dir, '{0}.pickle'.format(feature.name)))

    def create_features(self):
        """Denoise ``demand`` and compute/save every lagged rolling feature.

        Requires ``self.data`` (see ``get_dataset``) with ``id`` and ``demand``
        columns.

        NOTE(review): the whole ``demand`` column is denoised as one signal
        (across all ids) and overwritten in place, so calling this twice on the
        same data denoises twice -- confirm this is intended.
        """
        self.data['demand'] = self.denoise_signal(self.data['demand'].values)
        demand_by_id = self.data.groupby(["id"])["demand"]
        for stat in self.STATISTICS:
            for size in self.WINDOW_SIZES:
                column = f"wl_rolling_{stat}_t{size}"
                # Look up the aggregation by name so that "mean" really uses
                # Rolling.mean() (it was previously computed with sum()).
                self.data[column] = demand_by_id.transform(
                    lambda x: getattr(x.shift(self.DAYS_PRED).rolling(size), stat)()
                )
                self.save(feature=self.data[column])

                
                

In [26]:
# Build the wavelet-denoised rolling demand features and pickle each one.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, so the
# saved run presumably did not finish -- re-run before relying on the files.
tmp = wavelet_lag_Feature()
tmp.get_dataset()  # loads ./data_full.pickle into tmp.data
tmp.create_features()  # denoises demand, adds wl_rolling_* columns, saves them

KeyboardInterrupt: 