In [1]:
import pandas as pd

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

from tsfresh import extract_features, extract_relevant_features
from tsfresh.feature_extraction import settings

In [2]:
# Read the raw time series and the target.

X = pd.read_csv("occupancy.csv", parse_dates=["date"])

# The target file is indexed by its "id" column; only the "occupancy"
# column is kept, as a Series.
y = pd.Series(
    pd.read_csv("occupancy_target.csv", index_col="id")["occupancy"]
)

In [3]:
# Build tsfresh features for every series (grouped by "id", ordered by
# "date") and keep only those tsfresh selects with respect to the target y.

features = extract_relevant_features(
    timeseries_container=X,
    y=y,
    column_sort="date",
    column_id="id",
)

features.shape

Feature Extraction: 100%|█████████████████████████████████████████████████████████| 10/10 [00:29<00:00,  2.97s/it]


(135, 969)

In [4]:
# select features with lasso

# L1-penalised logistic regression; SelectFromModel keeps the features
# whose fitted coefficients are not shrunk to (near) zero.
cls = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    random_state=10,
    C=0.05,
    max_iter=1000,
)

selector = SelectFromModel(cls)

selector.fit(features, y)

# Keep the names in their own variable instead of overwriting `features`.
# Rebinding the feature matrix to an array of strings would make this cell
# fail on a second run, because `selector.fit` would then receive the names
# rather than the feature values.
selected_features = selector.get_feature_names_out()

selected_features

array(['light__sum_of_reoccurring_data_points',
       'co2__spkt_welch_density__coeff_2', 'co2__variance',
       'temperature__c3__lag_1', 'temperature__abs_energy',
       'temperature__c3__lag_2', 'temperature__c3__lag_3',
       'co2__sum_of_reoccurring_data_points',
       'light__spkt_welch_density__coeff_8', 'light__variance',
       'light__agg_linear_trend__attr_"slope"__chunk_len_50__f_agg_"var"',
       'light__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"var"'],
      dtype=object)

In [5]:
# Translate the selected column names back into tsfresh extraction settings,
# grouped by the time series each feature is computed from.

kind_to_fc_parameters = settings.from_columns(
    columns=selector.get_feature_names_out(),
)

kind_to_fc_parameters

{'light': {'sum_of_reoccurring_data_points': None,
  'spkt_welch_density': [{'coeff': 8}],
  'variance': None,
  'agg_linear_trend': [{'attr': 'slope', 'chunk_len': 50, 'f_agg': 'var'},
   {'attr': 'intercept', 'chunk_len': 10, 'f_agg': 'var'}]},
 'co2': {'spkt_welch_density': [{'coeff': 2}],
  'variance': None,
  'sum_of_reoccurring_data_points': None},
 'temperature': {'c3': [{'lag': 1}, {'lag': 2}, {'lag': 3}],
  'abs_energy': None}}

In [6]:
# Run the extraction again, this time computing only the features described
# by kind_to_fc_parameters.

features = extract_features(
    timeseries_container=X,
    kind_to_fc_parameters=kind_to_fc_parameters,
    column_sort="date",
    column_id="id",
)

features.shape

Feature Extraction: 100%|█████████████████████████████████████████████████████████| 10/10 [00:03<00:00,  2.60it/s]


(135, 12)

In [7]:
# Preview the first five rows of the reduced feature matrix.
features.head(n=5)

Unnamed: 0,temperature__c3__lag_1,temperature__c3__lag_2,temperature__c3__lag_3,temperature__abs_energy,light__sum_of_reoccurring_data_points,light__spkt_welch_density__coeff_8,light__variance,"light__agg_linear_trend__attr_""slope""__chunk_len_50__f_agg_""var""","light__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""var""",co2__spkt_welch_density__coeff_2,co2__variance,co2__sum_of_reoccurring_data_points
1,11585.127934,11581.20359,11578.178807,30721.568703,2514.0,332.221295,18086.371875,-21130.3425,19305.1375,1523.529443,756.700664,14124.0
2,10751.99961,10752.682508,10753.119812,29225.254374,0.0,0.0,0.0,0.0,0.0,350.067478,377.280895,13202.0
3,10187.865226,10187.206963,10186.28404,28198.13995,0.0,0.0,0.0,0.0,0.0,91.896894,115.269298,20885.666667
4,9908.900224,9909.731388,9910.163947,27680.800184,0.0,0.0,0.0,0.0,0.0,42.394905,35.473216,18285.5
5,9705.989789,9706.404551,9706.707963,27299.097469,0.0,0.0,0.0,0.0,0.0,4.811303,27.53908,17670.5
