# Feature Engineering

In [1]:
import gc
import os
import time

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from tqdm import tqdm

# Remove the column limit for DataFrame display so wide frames are shown in full.
pd.set_option('display.max_columns', None)

## Daten laden

In [18]:
# Switch between the training and the validation split; the flag is reused
# further down to choose the output folder.
IS_TRAINING = False
FILE = (
    '../../data/processed/train_series_split_normalized.parquet'
    if IS_TRAINING
    else '../../data/processed/validation_series_split_normalized.parquet'
)
# Only the id column is loaded here; the full rows are read per series later.
series = pd.read_parquet(FILE, columns=['num_series_id'])

## Features

In [19]:
# Names of the lag columns: positive suffixes (1..24) are earlier rows,
# negative suffixes (-1..-24) are later rows.
LAGS_PAST = [f"t_lag_{i}" for i in range(1, 25)]
LAGS_FUTURE = [f"t_lag_{i}" for i in range(-1, -25, -1)]
# Current step first, then all past lags, then all future lags.
FEATURES = ['t_0'] + LAGS_PAST + LAGS_FUTURE

In [20]:
# Sanity check: 1 current step + 24 past lags + 24 future lags = 49 features.
len(FEATURES)

49

## Neue Features erstellen

In [21]:
# Output location for the per-series feature files, chosen by split.
if IS_TRAINING:
    folder = "train/"
else:
    folder = "validation/"
file_path_prefix = "../../data/processed/transformer/" + folder

def save_chunk(num_series_id, chunk):
    """Write one series' feature frame to ``<file_path_prefix><num_series_id>.feather``.

    The target directory is created first when it does not exist yet, so the
    notebook also runs against an output folder that has never been written to.

    Parameters
    ----------
    num_series_id
        Identifier of the series; its string form becomes the file name.
    chunk
        Object exposing ``to_feather(path)`` (the per-series DataFrame).
    """
    target_path = file_path_prefix + str(num_series_id) + ".feather"
    target_dir = os.path.dirname(target_path)
    if target_dir:  # empty when the prefix carries no directory part
        os.makedirs(target_dir, exist_ok=True)
    chunk.to_feather(target_path)

In [22]:
def make_features_chunk(num_series_id, n_lags=24):
    """Load one series and attach the current, past and future readings to every row.

    Reads the rows of ``FILE`` whose ``num_series_id`` equals the given id and
    adds these columns, each holding an ``[anglez, enmo]`` list per row:

    * ``t_0``: the values of the row itself,
    * ``t_lag_1`` .. ``t_lag_{n_lags}``: the values 1 .. ``n_lags`` rows earlier,
    * ``t_lag_-1`` .. ``t_lag_-{n_lags}``: the values 1 .. ``n_lags`` rows later.

    ``shift`` leaves NaN where no earlier (later) row exists; those are
    back-filled for past lags and forward-filled for future lags. The fill
    also applies to any NaN already present in the data.

    Parameters
    ----------
    num_series_id
        Id used in the parquet row filter.
    n_lags : int, default 24
        Number of past and of future offsets to generate.

    Returns
    -------
    pandas.DataFrame
        The loaded rows plus the new columns, with a fresh 0..n-1 index.
    """
    df = pd.read_parquet(FILE, filters=[('num_series_id', '=', num_series_id)])

    # Independent two-column copy: shifting it never sees the columns added below.
    signal = df[['anglez', 'enmo']]
    df['t_0'] = signal.values.tolist()

    # Past offsets first, then future ones, to keep the column order stable.
    past_offsets = range(1, n_lags + 1)
    future_offsets = range(-1, -n_lags - 1, -1)
    for offset in (*past_offsets, *future_offsets):
        shifted = signal.shift(offset)
        # Positive shifts leave NaN at the top, negative shifts at the bottom.
        filled = shifted.bfill() if offset > 0 else shifted.ffill()
        df[f't_lag_{offset}'] = filled.values.tolist()

    return df.reset_index(drop=True)

In [23]:
def make_features(series):
    """Build and save the features of every series and return a row overview.

    For each distinct ``num_series_id`` in ``series`` the feature frame is
    created with ``make_features_chunk`` and written with ``save_chunk``.

    Returns
    -------
    pandas.DataFrame
        One row per feature row with the columns ``num_series_id``, ``step``
        and ``series_index`` (the row's position inside its own chunk),
        concatenated over all series with a fresh index.
    """
    overview_columns = ['num_series_id', 'step', 'series_index']
    per_series_overviews = []

    series_ids = series.num_series_id.unique()
    for num_series_id in tqdm(series_ids):
        chunk = make_features_chunk(num_series_id)
        save_chunk(num_series_id, chunk)

        # Turn the chunk's index into an explicit position column.
        positions = chunk[['num_series_id', 'step']].reset_index()
        positions = positions.rename(columns={'index': 'series_index'})
        per_series_overviews.append(positions[overview_columns].copy())

        # Release the large feature frame before loading the next series.
        del chunk
        gc.collect()

    combined = pd.concat(per_series_overviews)
    return combined.reset_index(drop=True)

In [24]:
# Run the feature pipeline over every series and report the wall-clock duration.
start_time = time.time()
# `overview` holds one row per feature row: num_series_id, step, series_index.
overview = make_features(series)
print(f'Feature Engineering took {time.time() - start_time} seconds')

100%|██████████████████████████████████████████████████████████████████████████████████| 54/54 [17:32<00:00, 19.49s/it]


Feature Engineering took 1053.1359159946442 seconds


## Overview speichern

In [25]:
# Display the combined overview; the rendered table is truncated to its first and last rows.
overview

Unnamed: 0,num_series_id,step,series_index
0,7,0,0
1,7,1,1
2,7,2,2
3,7,3,3
4,7,4,4
...,...,...,...
20009335,276,620635,620635
20009336,276,620636,620636
20009337,276,620637,620637
20009338,276,620638,620638


In [26]:
# Store the overview in the same output folder as the per-series feather files.
overview.to_parquet(file_path_prefix + "overview.parquet")

In [87]:
# Reload the feather file that was written for series id 7.
series7 = pd.read_feather(file_path_prefix + "7.feather")

In [90]:
# NOTE(review): `LABEL` is not defined in any cell visible above, and this
# cell's execution count (90) jumps far past the previous one (26) — it looks
# like it depends on leftover kernel state; confirm where `LABEL` comes from.
# Casts the selected label column(s) of series 7 to int64, wraps them in a
# tensor, squeezes dimension 1 and shows the first element.
torch.from_numpy(series7[LABEL].astype('int64').to_numpy()).squeeze(1)[0]

tensor(1)

In [98]:
# Shape demo: squeezing the trailing singleton dimension turns a
# (64, 49, 1) tensor into a (64, 49) tensor.
torch.manual_seed(0)  # fixed seed so the printed values are the same on every run
tensor = torch.randn(64, 49, 1)
squeezed = torch.squeeze(tensor, dim=2)
print(tensor)
print(squeezed)

tensor([[[ 1.0666],
         [-1.8279],
         [-0.4424],
         ...,
         [ 0.6216],
         [ 1.1704],
         [ 0.7218]],

        [[-1.5844],
         [-0.7324],
         [-0.8466],
         ...,
         [ 1.4535],
         [ 0.0270],
         [ 1.3010]],

        [[-0.6601],
         [-0.0892],
         [ 0.4895],
         ...,
         [-0.2421],
         [ 0.5774],
         [ 0.5033]],

        ...,

        [[-1.3609],
         [-0.2776],
         [ 0.2342],
         ...,
         [-0.1636],
         [ 0.2715],
         [-1.0235]],

        [[ 0.8998],
         [ 1.0288],
         [ 0.7142],
         ...,
         [-0.2692],
         [-0.8795],
         [-0.5863]],

        [[-2.7479],
         [ 1.3100],
         [-0.5185],
         ...,
         [ 0.5344],
         [-1.2897],
         [ 0.3658]]])
tensor([[ 1.0666, -1.8279, -0.4424,  ...,  0.6216,  1.1704,  0.7218],
        [-1.5844, -0.7324, -0.8466,  ...,  1.4535,  0.0270,  1.3010],
        [-0.6601, -0.0892,  0.