# Feature Engineering

This notebook will compute all the features.

The features will consist of:
- Rolling aggregates (mean, max, std) of anglez and enmo over a variety of window sizes, from 5 minutes to 8 hours.
- Rolling aggregates (mean, max, std) of anglez and enmo total variation (or first variation, i.e. 1v) over a variety of window sizes, from 5 minutes to 8 hours.


https://www.kaggle.com/code/lccburk/feature-engineering-and-random-forest-prediction

**Imports**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import polars as pl

from tqdm import tqdm

**Parameters**

In [2]:
WINDOW_SIZES = [5, 30, 2*60, 8*60] # 5min, 30min, 2hours, 8hours

ID_COLUMNS = ['series_id', 'step', 'timestamp']

FEATURE_TIME_NAMES = [
    "hour_sin",
    "hour_cos",
    "month_sin",
    "month_cos",
    "minute_sin",
    "minute_cos",
    "anglez_sin",
    "anglez_cos",
]


**Import Data**

In [3]:
df = pl.read_parquet('data/78569a801a38.parquet')

In [4]:
print(df.head())

shape: (5, 8)
┌──────────────┬──────┬─────────────────────────┬──────────┬──────────┬───────┬────────┬───────┐
│ series_id    ┆ step ┆ timestamp               ┆ anglez   ┆ enmo     ┆ state ┆ wakeup ┆ onset │
│ ---          ┆ ---  ┆ ---                     ┆ ---      ┆ ---      ┆ ---   ┆ ---    ┆ ---   │
│ str          ┆ u32  ┆ datetime[μs, UTC]       ┆ f32      ┆ f32      ┆ i32   ┆ bool   ┆ bool  │
╞══════════════╪══════╪═════════════════════════╪══════════╪══════════╪═══════╪════════╪═══════╡
│ 78569a801a38 ┆ 0    ┆ 2017-08-17 20:45:00 UTC ┆ 0.49135  ┆ 0.028832 ┆ 0     ┆ false  ┆ false │
│ 78569a801a38 ┆ 1    ┆ 2017-08-17 20:45:05 UTC ┆ 0.646426 ┆ 0.022966 ┆ 0     ┆ false  ┆ false │
│ 78569a801a38 ┆ 2    ┆ 2017-08-17 20:45:10 UTC ┆ 0.385461 ┆ 0.017266 ┆ 0     ┆ false  ┆ false │
│ 78569a801a38 ┆ 3    ┆ 2017-08-17 20:45:15 UTC ┆ 0.167985 ┆ 0.026895 ┆ 0     ┆ false  ┆ false │
│ 78569a801a38 ┆ 4    ┆ 2017-08-17 20:45:20 UTC ┆ 0.154722 ┆ 0.019369 ┆ 0     ┆ false  ┆ false │
└──────────────┴

In [5]:
df = df.with_columns(
    year=df['timestamp'].dt.year().cast(pl.Int16),
    month=df['timestamp'].dt.month().cast(pl.Int8),
    day=df['timestamp'].dt.day().cast(pl.Int8),
    hour=df['timestamp'].dt.hour().cast(pl.Int8),
    minute=df['timestamp'].dt.minute().cast(pl.Int8),
    second=df['timestamp'].dt.second().cast(pl.Int8),
)

**Compute Features**

In [6]:
features, feature_cols = [pl.col('hour')], ['hour']

for mins in WINDOW_SIZES :
    for var in ['enmo', 'anglez'] :
        
        # Getting basic features
        features += [
            pl.col(var).rolling_mean(2 * mins, center=True, min_periods=1).abs().cast(pl.UInt16).alias(f'{var}_{mins}m_mean'),
            pl.col(var).rolling_max(2 * mins, center=True, min_periods=1).abs().cast(pl.UInt16).alias(f'{var}_{mins}m_max'),
            pl.col(var).rolling_std(2 * mins, center=True, min_periods=1).abs().cast(pl.UInt16).alias(f'{var}_{mins}m_std')
        ]

        feature_cols += [ 
            f'{var}_{mins}m_mean', f'{var}_{mins}m_max', f'{var}_{mins}m_std'
        ]

        # Getting first variations
        features += [
            (pl.col(var).diff().abs().rolling_mean(2 * mins, center=True, min_periods=1)*10).abs().cast(pl.UInt32).alias(f'{var}_1v_{mins}m_mean'),
            (pl.col(var).diff().abs().rolling_max(2 * mins, center=True, min_periods=1)*10).abs().cast(pl.UInt32).alias(f'{var}_1v_{mins}m_max'),
            (pl.col(var).diff().abs().rolling_std(2 * mins, center=True, min_periods=1)*10).abs().cast(pl.UInt32).alias(f'{var}_1v_{mins}m_std')
        ]

        feature_cols += [ 
            f'{var}_1v_{mins}m_mean', f'{var}_1v_{mins}m_max', f'{var}_1v_{mins}m_std'
        ]



In [7]:
train_series = df.with_columns(
    features
).select(ID_COLUMNS + feature_cols)


In [11]:
def to_coord(x: pl.Expr, max_: int, name: str) -> list[pl.Expr]:
    rad = 2 * np.pi * (x % max_) / max_
    x_sin = rad.sin()
    x_cos = rad.cos()

    return [x_sin.alias(f"{name}_sin"), x_cos.alias(f"{name}_cos")]


def deg_to_rad(x: pl.Expr) -> pl.Expr:
    return np.pi / 180 * x


def add_feature(series_df: pl.DataFrame) -> pl.DataFrame:
    series_df = (
        series_df.with_row_count("step")
        .with_columns(
            *to_coord(pl.col("hour"), 24, "hour"),
            *to_coord(pl.col("month"), 12, "month"),
            *to_coord(pl.col("minute"), 60, "minute"),
            pl.col('anglez_rad').sin().alias('anglez_sin'),
            pl.col('anglez_rad').cos().alias('anglez_cos'),
        )
    )
    return series_df

In [12]:
add_feature(train_series)

DuplicateError: column with name 'step' has more than one occurrences